From e8c630d30374ac9eeea71a41304c021246a6b7cf Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 24 Jun 2025 18:44:50 -0700 Subject: [PATCH 01/19] EVEX.ZU --- src/coreclr/jit/codegenxarch.cpp | 680 ++++++++++++++++--------------- src/coreclr/jit/emit.h | 13 + src/coreclr/jit/emitxarch.cpp | 89 +++- src/coreclr/jit/emitxarch.h | 27 ++ src/coreclr/jit/instr.h | 4 + src/coreclr/jit/instrsxarch.h | 19 + 6 files changed, 488 insertions(+), 344 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7561351315bd4a..a93f9badb3351f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9111,345 +9111,347 @@ void CodeGen::genAmd64EmitterUnitTestsApx() genDefineTempLabel(genCreateTempLabel()); // This test suite needs REX2 enabled. - if (!theEmitter->UseRex2Encoding() && !theEmitter->emitComp->DoJitStressRex2Encoding()) - { - return; - } - - theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX); - - theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX); - - theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false); - theEmitter->emitIns_Mov(INS_movsx, 
EA_2BYTE, REG_EAX, REG_ECX, false); - theEmitter->emitIns_Mov(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, false); - - theEmitter->emitIns_R_R(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX); - theEmitter->emitIns_R_R(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX); - - theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_add, EA_2BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_adc, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_sbb, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05); - theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05); - - theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0); - - // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64. 
- theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000); - - // shf reg, cl - theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_rcr, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_shl, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_shr, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_sar, EA_4BYTE, REG_EAX); - - // shf reg, 1 - theEmitter->emitIns_R(INS_rol_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_ror_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_rcl_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_rcr_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_shl_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_shr_1, EA_4BYTE, REG_EAX); - theEmitter->emitIns_R(INS_sar_1, EA_4BYTE, REG_EAX); - - // shf reg, imm8 - theEmitter->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_shr_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_sar_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_rol_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_ror_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_rcl_N, EA_4BYTE, REG_ECX, 0x05); - theEmitter->emitIns_R_I(INS_rcr_N, EA_4BYTE, REG_ECX, 0x05); - - theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_EAX); - theEmitter->emitIns_R(INS_not, EA_2BYTE, REG_EAX); - - theEmitter->emitIns_R_AR(INS_lea, EA_4BYTE, REG_ECX, REG_EAX, 4); - - theEmitter->emitIns_R_AR(INS_mov, EA_1BYTE, REG_ECX, REG_EAX, 4); - theEmitter->emitIns_R_AR(INS_mov, EA_2BYTE, REG_ECX, REG_EAX, 4); - theEmitter->emitIns_R_AR(INS_mov, EA_4BYTE, REG_ECX, REG_EAX, 4); - theEmitter->emitIns_R_AR(INS_mov, EA_8BYTE, REG_ECX, REG_EAX, 4); - - theEmitter->emitIns_R_AR(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); - 
theEmitter->emitIns_R_AR(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - - theEmitter->emitIns_AR_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_AR_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); - - theEmitter->emitIns_R_AR(INS_movsx, EA_2BYTE, REG_ECX, REG_EAX, 4); - theEmitter->emitIns_R_AR(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, 4); - theEmitter->emitIns_R_AR(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX, 4); - - 
theEmitter->emitIns_AR_R(INS_xadd, EA_4BYTE, REG_EAX, REG_EDX, 2); - - theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); - theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); - - theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); - - theEmitter->emitIns_R(INS_seto, EA_1BYTE, REG_EDX); - - theEmitter->emitIns_R(INS_bswap, EA_8BYTE, REG_EDX); - - // INS_bt only has reg-to-reg form. - theEmitter->emitIns_R_R(INS_bt, EA_2BYTE, REG_EAX, REG_EDX); - - theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_EDX); - - theEmitter->emitIns_R_R(INS_xchg, EA_8BYTE, REG_EAX, REG_EDX); - - theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX); - theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX); - - GenTreePhysReg physReg(REG_EDX); - physReg.SetRegNum(REG_EDX); - GenTreeIndir load = indirForm(TYP_INT, &physReg); - - theEmitter->emitIns_R_A(INS_add, EA_1BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_add, EA_2BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_add, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_add, EA_8BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_or, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_adc, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_sbb, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_and, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_sub, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_xor, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_cmp, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_test, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_bsf, EA_4BYTE, REG_EAX, &load); - theEmitter->emitIns_R_A(INS_bsr, EA_4BYTE, REG_EAX, &load); - - // Note: - // All the tests below rely on the runtime status of the stack this unit tests attaching to, - // it might fail due to stack value unavailable/mismatch, since these tests are mainly for - // encoding correctness check, this kind of failures may be considered as 
not harmful. - - theEmitter->emitIns_R_S(INS_add, EA_1BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_add, EA_2BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_add, EA_8BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_adc, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_sbb, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_cmp, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_test, EA_4BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_S_R(INS_xadd, EA_2BYTE, REG_EAX, 0, 0); - - theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4); - theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4); - - theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 0, 0); - theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 0, 0); - theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 0, 0); - - theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX); - theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); - theEmitter->emitIns_R(INS_pop_hide, EA_PTRSIZE, REG_EAX); - theEmitter->emitIns_R(INS_push_hide, EA_PTRSIZE, REG_EAX); - - theEmitter->emitIns_S(INS_pop, EA_PTRSIZE, 0, 0); - theEmitter->emitIns_I(INS_push, EA_PTRSIZE, 50); - - theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_EAX); - theEmitter->emitIns_AR(INS_inc, EA_2BYTE, REG_EAX, 2); - theEmitter->emitIns_S(INS_inc, EA_2BYTE, 0, 0); - theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_EAX); - theEmitter->emitIns_AR(INS_dec, EA_2BYTE, REG_EAX, 2); - theEmitter->emitIns_S(INS_dec, EA_2BYTE, 0, 0); - - theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); - theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); - - // APX-EVEX - - theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - 
theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_I(INS_rcr_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_I(INS_rcl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); - - theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, 
REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_R(INS_tzcnt_apx, EA_8BYTE, 
REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_lzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_popcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_S(INS_tzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_lzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_popcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, - (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); - - theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); - - theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); - theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); - - theEmitter->emitIns_AR(INS_inc, EA_4BYTE, REG_EAX, 0, INS_OPTS_EVEX_NoApxPromotion); - - theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12); - theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5); - - // testing for EGPR encodings. - GenTreePhysReg eGPR(REG_R16); - eGPR.SetRegNum(REG_R16); - GenTreeIndir loadGPR = indirForm(TYP_SIMD32, &eGPR); - - // // SIMD instructions - // // In most of the cases, EGPR will only be used as BASE/INDEX registers in SIMD instructions. 
- theEmitter->emitIns_R_R_A(INS_addps, EA_32BYTE, REG_XMM16, REG_XMM16, &loadGPR); - - // // Legacy instructions - theEmitter->emitIns_R_ARX(INS_add, EA_4BYTE, REG_R16, REG_R17, REG_R18, 1, 0); - - theEmitter->emitIns_AR_R(INS_movnti64, EA_8BYTE, REG_R17, REG_R16, 10); - theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R17, REG_R16, REG_R18); - - theEmitter->emitIns_Mov(INS_kmovb_gpr, EA_4BYTE, REG_R16, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovb_msk, EA_4BYTE, REG_K5, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovw_gpr, EA_4BYTE, REG_R16, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovw_msk, EA_4BYTE, REG_K5, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovd_gpr, EA_4BYTE, REG_R16, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovd_msk, EA_4BYTE, REG_K5, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovq_gpr, EA_8BYTE, REG_R16, REG_K0, false); - theEmitter->emitIns_Mov(INS_kmovq_msk, EA_8BYTE, REG_K5, REG_K0, false); - - theEmitter->emitIns_R_R(INS_crc32_apx, EA_1BYTE, REG_R16, REG_R17); - theEmitter->emitIns_R_R(INS_crc32_apx, EA_2BYTE, REG_R16, REG_R17); - theEmitter->emitIns_R_R(INS_crc32_apx, EA_8BYTE, REG_R16, REG_R17); - theEmitter->emitIns_R_A(INS_crc32_apx, EA_8BYTE, REG_R18, &loadGPR); - theEmitter->emitIns_R_S(INS_crc32_apx, EA_8BYTE, REG_R18, 0, 0); - - // Note that BZHI has a reversed src operands due to special handling at import. 
- theEmitter->emitIns_R_R_R(INS_bzhi, EA_4BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_bzhi, EA_8BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_mulx, EA_4BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_mulx, EA_8BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_pdep, EA_4BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_pdep, EA_8BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17); - theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17); - - theEmitter->emitIns_R_R(INS_push2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); - theEmitter->emitIns_R_R(INS_pop2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); - theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); - theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); - theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); - theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); - - theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false); - theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false); + // if (!theEmitter->UseRex2Encoding() && !theEmitter->emitComp->DoJitStressRex2Encoding()) + // { + // return; + // } + + // theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, 
REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX); + + // theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX); + + // theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false); + // theEmitter->emitIns_Mov(INS_movsx, EA_2BYTE, REG_EAX, REG_ECX, false); + // theEmitter->emitIns_Mov(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, false); + + // theEmitter->emitIns_R_R(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX); + // theEmitter->emitIns_R_R(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX); + + // theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_add, EA_2BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_adc, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_sbb, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05); + // theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05); + + // theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0); + + // // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64. 
+ // theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000); + + // // shf reg, cl + // theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_rcr, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_shl, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_shr, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_sar, EA_4BYTE, REG_EAX); + + // // shf reg, 1 + // theEmitter->emitIns_R(INS_rol_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_ror_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_rcl_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_rcr_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_shl_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_shr_1, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_sar_1, EA_4BYTE, REG_EAX); + + // // shf reg, imm8 + // theEmitter->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_shr_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_sar_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_rol_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_ror_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_rcl_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_rcr_N, EA_4BYTE, REG_ECX, 0x05); + + // theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_EAX); + // theEmitter->emitIns_R(INS_not, EA_2BYTE, REG_EAX); + + // theEmitter->emitIns_R_AR(INS_lea, EA_4BYTE, REG_ECX, REG_EAX, 4); + + // theEmitter->emitIns_R_AR(INS_mov, EA_1BYTE, REG_ECX, REG_EAX, 4); + // theEmitter->emitIns_R_AR(INS_mov, EA_2BYTE, REG_ECX, REG_EAX, 4); + // theEmitter->emitIns_R_AR(INS_mov, EA_4BYTE, REG_ECX, REG_EAX, 4); + // theEmitter->emitIns_R_AR(INS_mov, EA_8BYTE, REG_ECX, REG_EAX, 4); + + // theEmitter->emitIns_R_AR(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_add, EA_2BYTE, 
REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + + // theEmitter->emitIns_AR_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_AR_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + + // theEmitter->emitIns_R_AR(INS_movsx, 
EA_2BYTE, REG_ECX, REG_EAX, 4); + // theEmitter->emitIns_R_AR(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, 4); + // theEmitter->emitIns_R_AR(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX, 4); + + // theEmitter->emitIns_AR_R(INS_xadd, EA_4BYTE, REG_EAX, REG_EDX, 2); + + // theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); + // theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); + + // theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); + + // theEmitter->emitIns_R(INS_seto, EA_1BYTE, REG_EDX); + + // theEmitter->emitIns_R(INS_bswap, EA_8BYTE, REG_EDX); + + // // INS_bt only has reg-to-reg form. + // theEmitter->emitIns_R_R(INS_bt, EA_2BYTE, REG_EAX, REG_EDX); + + // theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_EDX); + + // theEmitter->emitIns_R_R(INS_xchg, EA_8BYTE, REG_EAX, REG_EDX); + + // theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX); + // theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX); + + // GenTreePhysReg physReg(REG_EDX); + // physReg.SetRegNum(REG_EDX); + // GenTreeIndir load = indirForm(TYP_INT, &physReg); + + // theEmitter->emitIns_R_A(INS_add, EA_1BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_add, EA_2BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_add, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_add, EA_8BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_or, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_adc, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_sbb, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_and, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_sub, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_xor, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_cmp, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_test, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_bsf, EA_4BYTE, REG_EAX, &load); + // theEmitter->emitIns_R_A(INS_bsr, EA_4BYTE, REG_EAX, &load); + + // 
// Note: + // // All the tests below rely on the runtime status of the stack this unit tests attaching to, + // // it might fail due to stack value unavailable/mismatch, since these tests are mainly for + // // encoding correctness check, this kind of failures may be considered as not harmful. + + // theEmitter->emitIns_R_S(INS_add, EA_1BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_add, EA_2BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_add, EA_8BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_adc, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_sbb, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_cmp, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_test, EA_4BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_S_R(INS_xadd, EA_2BYTE, REG_EAX, 0, 0); + + // theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4); + // theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4); + + // theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 0, 0); + // theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 0, 0); + // theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 0, 0); + + // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX); + // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); + // theEmitter->emitIns_R(INS_pop_hide, EA_PTRSIZE, REG_EAX); + // theEmitter->emitIns_R(INS_push_hide, EA_PTRSIZE, REG_EAX); + + // theEmitter->emitIns_S(INS_pop, EA_PTRSIZE, 0, 0); + // theEmitter->emitIns_I(INS_push, EA_PTRSIZE, 50); + + // theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_AR(INS_inc, EA_2BYTE, REG_EAX, 2); + // theEmitter->emitIns_S(INS_inc, EA_2BYTE, 0, 0); + // 
theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_EAX); + // theEmitter->emitIns_AR(INS_dec, EA_2BYTE, REG_EAX, 2); + // theEmitter->emitIns_S(INS_dec, EA_2BYTE, 0, 0); + + // theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); + // theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); + + // // APX-EVEX + + // theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_I(INS_rcr_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_I(INS_rcl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R_R(INS_imul, 
EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + // theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); + + // theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, 
INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_R(INS_tzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_lzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_popcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_S(INS_tzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_lzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_popcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, + // (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); + + // theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + + // theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + // theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); + + // theEmitter->emitIns_AR(INS_inc, EA_4BYTE, REG_EAX, 0, INS_OPTS_EVEX_NoApxPromotion); + + // theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12); + // theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5); + + // // testing for EGPR encodings. 
+ // GenTreePhysReg eGPR(REG_R16); + // eGPR.SetRegNum(REG_R16); + // GenTreeIndir loadGPR = indirForm(TYP_SIMD32, &eGPR); + + // // // SIMD instructions + // // // In most of the cases, EGPR will only be used as BASE/INDEX registers in SIMD instructions. + // theEmitter->emitIns_R_R_A(INS_addps, EA_32BYTE, REG_XMM16, REG_XMM16, &loadGPR); + + // // // Legacy instructions + // theEmitter->emitIns_R_ARX(INS_add, EA_4BYTE, REG_R16, REG_R17, REG_R18, 1, 0); + + // theEmitter->emitIns_AR_R(INS_movnti64, EA_8BYTE, REG_R17, REG_R16, 10); + // theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R17, REG_R16, REG_R18); + + // theEmitter->emitIns_Mov(INS_kmovb_gpr, EA_4BYTE, REG_R16, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovb_msk, EA_4BYTE, REG_K5, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovw_gpr, EA_4BYTE, REG_R16, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovw_msk, EA_4BYTE, REG_K5, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovd_gpr, EA_4BYTE, REG_R16, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovd_msk, EA_4BYTE, REG_K5, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovq_gpr, EA_8BYTE, REG_R16, REG_K0, false); + // theEmitter->emitIns_Mov(INS_kmovq_msk, EA_8BYTE, REG_K5, REG_K0, false); + + // theEmitter->emitIns_R_R(INS_crc32_apx, EA_1BYTE, REG_R16, REG_R17); + // theEmitter->emitIns_R_R(INS_crc32_apx, EA_2BYTE, REG_R16, REG_R17); + // theEmitter->emitIns_R_R(INS_crc32_apx, EA_8BYTE, REG_R16, REG_R17); + // theEmitter->emitIns_R_A(INS_crc32_apx, EA_8BYTE, REG_R18, &loadGPR); + // theEmitter->emitIns_R_S(INS_crc32_apx, EA_8BYTE, REG_R18, 0, 0); + + // // Note that BZHI has a reversed src operands due to special handling at import. 
+ // theEmitter->emitIns_R_R_R(INS_bzhi, EA_4BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_bzhi, EA_8BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_mulx, EA_4BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_mulx, EA_8BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_pdep, EA_4BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_pdep, EA_8BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17); + // theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17); + + // theEmitter->emitIns_R_R(INS_push2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); + // theEmitter->emitIns_R_R(INS_pop2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); + // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); + // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); + // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); + // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); + + // theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false); + // theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false); + + theEmitter->emitIns_R(INS_setzuo, EA_1BYTE, REG_R11, INS_OPTS_EVEX_zu); } void CodeGen::genAmd64EmitterUnitTestsAvx10v2() diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c398e4c26058e6..4adfddfc250a15 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -848,6 +848,8 @@ class emitter (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ */ #define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ +#define _idEvexZuContext _idCustom5 /* bits used for the APX-EVEX.zu context for promoted legacy instructions */ + #define 
_idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ // We repurpose 4 bits for the default flag value bits for ccmp instructions. @@ -1787,12 +1789,23 @@ class emitter return _idEvexNdContext != 0; } + bool idIsEvexZuContextSet() const + { + return _idEvexZuContext != 0; + } + void idSetEvexNdContext() { assert(!idIsEvexNdContextSet()); _idEvexNdContext = 1; } + void idSetEvexZuContext() + { + assert(!idIsEvexZuContextSet()); + _idEvexZuContext = 1; + } + bool idIsEvexNfContextSet() const { return _idEvexNfContext != 0; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index a45b298bdb7293..24c6e0c82c9c27 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -432,6 +432,12 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const return true; } + if (ins >= INS_setzuo && ins <= INS_setzug) + { + // SETcc can use EVEX.ZU feature. + return true; + } + if (IsApxOnlyInstruction(ins)) { return true; @@ -1967,16 +1973,25 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const return false; } - if (id->idIsEvexNdContextSet()) + if (id->idIsEvexNdContextSet() && HasApxNdd(ins)) { + // The instruction uses APX-ND hint, and it requires EVEX. return true; } - if (id->idIsEvexNfContextSet()) + if (id->idIsEvexNfContextSet() && HasApxNf(ins)) { + // The instruction uses APX-NF hint, and it requires EVEX. return true; } + if (ins >= INS_setzuo && ins <= INS_setzug) + { + // These are promoted forms of SETcc instruction with EVEX.ZU. + // TODO-XArch-APX: maybe consider return true as we may only use those instructions with ZU set. + return id->idIsEvexZuContextSet(); + } + if (ins == INS_crc32_apx || ins == INS_movbe_apx) { return true; @@ -2085,11 +2100,17 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt // TODO-XArch-APX: // verify if it is actually safe to reuse the EVEX.ND with EVEX.B on instrDesc. 
- if (id->idIsEvexNdContextSet()) + if (id->idIsEvexNdContextSet() && HasApxNdd(ins)) { code |= ND_BIT_IN_BYTE_EVEX_PREFIX; } + if (id->idIsEvexZuContextSet()) + { + // EVEX.ZU reuses the EVEX.ND bit for SETcc and IMUL. + code |= ND_BIT_IN_BYTE_EVEX_PREFIX; + } + if (id->idIsEvexNfContextSet()) { code |= NF_BIT_IN_BYTE_EVEX_PREFIX; @@ -2117,6 +2138,13 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt code |= ((size_t)id->idGetEvexDFV()) << 43; code |= ((size_t)GetCCFromCCMP(ins)) << 32; } + + if (ins >= INS_setzuo && ins <= INS_setzug) + { + // SETcc in EVEX space are assigned with new opcode: EVEX.LLZ.F2.MAP4.IGNORED 4x. + // Here we need to hard code the EVEX.pp for F2 prefix. + code |= 0x30000000000ULL; + } #endif return code; @@ -2937,7 +2965,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co // // 00 - None (0F - packed float) // 01 - 66 (66 0F - packed double) - // 10 - F3 (F3 0F - scalar float + // 10 - F3 (F3 0F - scalar float) // 11 - F2 (F2 0F - scalar double) switch (sizePrefix) { @@ -6928,6 +6956,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts i } SetEvexNfIfNeeded(id, instOptions); + SetEvexZuIfNeeded(id, instOptions); // Vex bytes sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); @@ -16601,7 +16630,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) case INS_setge: case INS_setle: case INS_setg: - + { assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); @@ -16610,6 +16639,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) if (TakesRex2Prefix(id)) { code = AddRex2Prefix(ins, code); + code = insEncodeReg012(id, reg, EA_1BYTE, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code & 0x0000FFFF); } @@ -16624,6 +16654,37 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) dst += emitOutputWord(dst, code & 0x0000FFFF); } break; + } + +#ifdef TARGET_AMD64 + 
case INS_setzuo: + case INS_setzuno: + case INS_setzub: + case INS_setzuae: + case INS_setzue: + case INS_setzune: + case INS_setzube: + case INS_setzua: + case INS_setzus: + case INS_setzuns: + case INS_setzup: + case INS_setzunp: + case INS_setzul: + case INS_setzuge: + case INS_setzule: + case INS_setzug: + { + assert(TakesApxExtendedEvexPrefix(id)); + assert(size == EA_1BYTE); + + code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); + code = AddEvexPrefix(id, code, size); + unsigned regcode = insEncodeReg012(id, reg, size, &code); + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + dst += emitOutputWord(dst, code & 0x0000FFFF); + break; + } +#endif case INS_mulEAX: case INS_imulEAX: @@ -20804,6 +20865,24 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_setge: case INS_setle: case INS_setg: +#ifdef TARGET_AMD64 + case INS_setzuo: + case INS_setzuno: + case INS_setzub: + case INS_setzuae: + case INS_setzue: + case INS_setzune: + case INS_setzube: + case INS_setzua: + case INS_setzus: + case INS_setzuns: + case INS_setzup: + case INS_setzunp: + case INS_setzul: + case INS_setzuge: + case INS_setzule: + case INS_setzug: +#endif result.insLatency += PERFSCORE_LATENCY_1C; if (insFmt == IF_RRD) { diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 96c3bcf6881060..e80723bdf48819 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -570,6 +570,33 @@ void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexZuIfNeeded: set Evex.zu on instrDesc +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexZuIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + instruction ins = id->idIns(); +#ifdef TARGET_AMD64 + assert((ins >= INS_setzuo && ins <=
INS_setzug) || (ins >= INS_imul_AX && ins <= INS_imul_31)); +#else + // This method is not expected to be used on 32-bit systems. + unreached(); +#endif + id->idSetEvexZuContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_zu_MASK) == 0); + } +} + //------------------------------------------------------------------------ // SetApxPpxIfNeeded: set APX.ppx on instrDesc // diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index b4cd09b7f0a429..9c10793a762136 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -299,6 +299,10 @@ enum insOpts: unsigned // One-bit: 0b10_0000_0000_0000 INS_OPTS_APX_ppx_MASK = 0x2000, // mask for APX-EVEX.ppx feature. + INS_OPTS_EVEX_zu = 1 << 14, // Zero Upper for APX-EVEX + // One-bit: 0b100_0000_0000_0000 + INS_OPTS_EVEX_zu_MASK = 0x4000, // mask for APX-EVEX.zu feature. + }; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index ac5440537e1772..564b5a563eaae0 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -1284,6 +1284,25 @@ INST1(setge, "setge", IUM_WR, 0x0F009D, INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +#ifdef TARGET_AMD64 +INST1(setzuo, "setzuo", IUM_WR, 0x40, INS_TT_NONE, Reads_OF) +INST1(setzuno, "setzuno", IUM_WR, 0x41, INS_TT_NONE, Reads_OF) +INST1(setzub, "setzub", IUM_WR, 0x42, INS_TT_NONE, Reads_CF) +INST1(setzuae, "setzuae", IUM_WR, 0x43, INS_TT_NONE, Reads_CF) +INST1(setzue, "setzue", IUM_WR, 0x44, INS_TT_NONE, Reads_ZF) +INST1(setzune, "setzune", IUM_WR, 0x45, INS_TT_NONE, Reads_ZF) +INST1(setzube, "setzube", IUM_WR, 0x46, INS_TT_NONE, Reads_ZF | Reads_CF) +INST1(setzua, "setzua", IUM_WR, 0x47, INS_TT_NONE, Reads_ZF | Reads_CF) +INST1(setzus, "setzus", IUM_WR, 
0x48, INS_TT_NONE, Reads_SF) +INST1(setzuns, "setzuns", IUM_WR, 0x49, INS_TT_NONE, Reads_SF) +INST1(setzup, "setzup", IUM_WR, 0x4A, INS_TT_NONE, Reads_PF) +INST1(setzunp, "setzunp", IUM_WR, 0x4B, INS_TT_NONE, Reads_PF) +INST1(setzul, "setzul", IUM_WR, 0x4C, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setzuge, "setzuge", IUM_WR, 0x4D, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setzule, "setzule", IUM_WR, 0x4E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(setzug, "setzug", IUM_WR, 0x4F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +#endif + // Indirect jump used for tailcalls. We differentiate between func-internal // indirect jump (e.g. used for switch) and tailcall indirect jumps because the // x64 unwinder might require the latter to be rex.w prefixed. From 6d977af108d732b7383f4eb6d49583fadb2a613c Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 25 Jun 2025 12:25:28 -0700 Subject: [PATCH 02/19] revert changes: commenting out emitter unit tests. --- src/coreclr/jit/codegenxarch.cpp | 678 +++++++++++++++---------------- 1 file changed, 339 insertions(+), 339 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a93f9badb3351f..7ca0415ccc4fc4 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9111,345 +9111,345 @@ void CodeGen::genAmd64EmitterUnitTestsApx() genDefineTempLabel(genCreateTempLabel()); // This test suite needs REX2 enabled. 
- // if (!theEmitter->UseRex2Encoding() && !theEmitter->emitComp->DoJitStressRex2Encoding()) - // { - // return; - // } - - // theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX); - - // theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX); - - // theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false); - // theEmitter->emitIns_Mov(INS_movsx, EA_2BYTE, REG_EAX, REG_ECX, false); - // theEmitter->emitIns_Mov(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, false); - - // theEmitter->emitIns_R_R(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX); - // theEmitter->emitIns_R_R(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX); - - // theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_add, EA_2BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_adc, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_sbb, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_sub, 
EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05); - // theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05); - - // theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0); - - // // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64. - // theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000); - - // // shf reg, cl - // theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_rcr, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_shl, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_shr, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_sar, EA_4BYTE, REG_EAX); - - // // shf reg, 1 - // theEmitter->emitIns_R(INS_rol_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_ror_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_rcl_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_rcr_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_shl_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_shr_1, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_sar_1, EA_4BYTE, REG_EAX); - - // // shf reg, imm8 - // theEmitter->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_shr_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_sar_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_rol_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_ror_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_rcl_N, EA_4BYTE, REG_ECX, 0x05); - // theEmitter->emitIns_R_I(INS_rcr_N, EA_4BYTE, REG_ECX, 0x05); - - // theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_EAX); - // theEmitter->emitIns_R(INS_not, EA_2BYTE, REG_EAX); - - // theEmitter->emitIns_R_AR(INS_lea, EA_4BYTE, REG_ECX, REG_EAX, 4); - 
- // theEmitter->emitIns_R_AR(INS_mov, EA_1BYTE, REG_ECX, REG_EAX, 4); - // theEmitter->emitIns_R_AR(INS_mov, EA_2BYTE, REG_ECX, REG_EAX, 4); - // theEmitter->emitIns_R_AR(INS_mov, EA_4BYTE, REG_ECX, REG_EAX, 4); - // theEmitter->emitIns_R_AR(INS_mov, EA_8BYTE, REG_ECX, REG_EAX, 4); - - // theEmitter->emitIns_R_AR(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); - - // theEmitter->emitIns_AR_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); - 
// theEmitter->emitIns_AR_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_AR_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); - - // theEmitter->emitIns_R_AR(INS_movsx, EA_2BYTE, REG_ECX, REG_EAX, 4); - // theEmitter->emitIns_R_AR(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, 4); - // theEmitter->emitIns_R_AR(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX, 4); - - // theEmitter->emitIns_AR_R(INS_xadd, EA_4BYTE, REG_EAX, REG_EDX, 2); - - // theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); - // theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); - - // theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); - - // theEmitter->emitIns_R(INS_seto, EA_1BYTE, REG_EDX); - - // theEmitter->emitIns_R(INS_bswap, EA_8BYTE, REG_EDX); - - // // INS_bt only has reg-to-reg form. 
- // theEmitter->emitIns_R_R(INS_bt, EA_2BYTE, REG_EAX, REG_EDX); - - // theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_EDX); - - // theEmitter->emitIns_R_R(INS_xchg, EA_8BYTE, REG_EAX, REG_EDX); - - // theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX); - // theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX); - - // GenTreePhysReg physReg(REG_EDX); - // physReg.SetRegNum(REG_EDX); - // GenTreeIndir load = indirForm(TYP_INT, &physReg); - - // theEmitter->emitIns_R_A(INS_add, EA_1BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_add, EA_2BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_add, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_add, EA_8BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_or, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_adc, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_sbb, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_and, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_sub, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_xor, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_cmp, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_test, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_bsf, EA_4BYTE, REG_EAX, &load); - // theEmitter->emitIns_R_A(INS_bsr, EA_4BYTE, REG_EAX, &load); - - // // Note: - // // All the tests below rely on the runtime status of the stack this unit tests attaching to, - // // it might fail due to stack value unavailable/mismatch, since these tests are mainly for - // // encoding correctness check, this kind of failures may be considered as not harmful. 
- - // theEmitter->emitIns_R_S(INS_add, EA_1BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_add, EA_2BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_add, EA_8BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_adc, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_sbb, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_cmp, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_test, EA_4BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_S_R(INS_xadd, EA_2BYTE, REG_EAX, 0, 0); - - // theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4); - // theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4); - - // theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 0, 0); - // theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 0, 0); - // theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 0, 0); - - // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX); - // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); - // theEmitter->emitIns_R(INS_pop_hide, EA_PTRSIZE, REG_EAX); - // theEmitter->emitIns_R(INS_push_hide, EA_PTRSIZE, REG_EAX); - - // theEmitter->emitIns_S(INS_pop, EA_PTRSIZE, 0, 0); - // theEmitter->emitIns_I(INS_push, EA_PTRSIZE, 50); - - // theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_AR(INS_inc, EA_2BYTE, REG_EAX, 2); - // theEmitter->emitIns_S(INS_inc, EA_2BYTE, 0, 0); - // theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_EAX); - // theEmitter->emitIns_AR(INS_dec, EA_2BYTE, REG_EAX, 2); - // theEmitter->emitIns_S(INS_dec, EA_2BYTE, 0, 0); - - // theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); - // theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); - - // // APX-EVEX - - // 
theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_I(INS_rcr_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_I(INS_rcl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); - // theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); - - // theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, 
INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_mulEAX, 
EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_R(INS_tzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_lzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_popcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_S(INS_tzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_lzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_popcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, - // (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); - - // theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); - - // theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); - // theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); - - // theEmitter->emitIns_AR(INS_inc, EA_4BYTE, REG_EAX, 0, INS_OPTS_EVEX_NoApxPromotion); - - // theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12); - // theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5); - - // // testing for EGPR encodings. - // GenTreePhysReg eGPR(REG_R16); - // eGPR.SetRegNum(REG_R16); - // GenTreeIndir loadGPR = indirForm(TYP_SIMD32, &eGPR); - - // // // SIMD instructions - // // // In most of the cases, EGPR will only be used as BASE/INDEX registers in SIMD instructions. 
- // theEmitter->emitIns_R_R_A(INS_addps, EA_32BYTE, REG_XMM16, REG_XMM16, &loadGPR); - - // // // Legacy instructions - // theEmitter->emitIns_R_ARX(INS_add, EA_4BYTE, REG_R16, REG_R17, REG_R18, 1, 0); - - // theEmitter->emitIns_AR_R(INS_movnti64, EA_8BYTE, REG_R17, REG_R16, 10); - // theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R17, REG_R16, REG_R18); - - // theEmitter->emitIns_Mov(INS_kmovb_gpr, EA_4BYTE, REG_R16, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovb_msk, EA_4BYTE, REG_K5, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovw_gpr, EA_4BYTE, REG_R16, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovw_msk, EA_4BYTE, REG_K5, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovd_gpr, EA_4BYTE, REG_R16, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovd_msk, EA_4BYTE, REG_K5, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovq_gpr, EA_8BYTE, REG_R16, REG_K0, false); - // theEmitter->emitIns_Mov(INS_kmovq_msk, EA_8BYTE, REG_K5, REG_K0, false); - - // theEmitter->emitIns_R_R(INS_crc32_apx, EA_1BYTE, REG_R16, REG_R17); - // theEmitter->emitIns_R_R(INS_crc32_apx, EA_2BYTE, REG_R16, REG_R17); - // theEmitter->emitIns_R_R(INS_crc32_apx, EA_8BYTE, REG_R16, REG_R17); - // theEmitter->emitIns_R_A(INS_crc32_apx, EA_8BYTE, REG_R18, &loadGPR); - // theEmitter->emitIns_R_S(INS_crc32_apx, EA_8BYTE, REG_R18, 0, 0); - - // // Note that BZHI has a reversed src operands due to special handling at import. 
- // theEmitter->emitIns_R_R_R(INS_bzhi, EA_4BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_bzhi, EA_8BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_mulx, EA_4BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_mulx, EA_8BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_pdep, EA_4BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_pdep, EA_8BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17); - // theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17); - - // theEmitter->emitIns_R_R(INS_push2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); - // theEmitter->emitIns_R_R(INS_pop2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); - // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); - // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); - // theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); - // theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); - - // theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false); - // theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false); + if (!theEmitter->UseRex2Encoding() && !theEmitter->emitComp->DoJitStressRex2Encoding()) + { + return; + } + + theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_sub, 
EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false); + theEmitter->emitIns_Mov(INS_movsx, EA_2BYTE, REG_EAX, REG_ECX, false); + theEmitter->emitIns_Mov(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, false); + + theEmitter->emitIns_R_R(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_add, EA_2BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_adc, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_sbb, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05); + + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0); + + // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64. 
+ theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000); + + // shf reg, cl + theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcr, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shl, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shr, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_sar, EA_4BYTE, REG_EAX); + + // shf reg, 1 + theEmitter->emitIns_R(INS_rol_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_ror_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcl_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcr_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shl_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shr_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_sar_1, EA_4BYTE, REG_EAX); + + // shf reg, imm8 + theEmitter->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_shr_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_sar_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_rol_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_ror_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_rcl_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_rcr_N, EA_4BYTE, REG_ECX, 0x05); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_EAX); + theEmitter->emitIns_R(INS_not, EA_2BYTE, REG_EAX); + + theEmitter->emitIns_R_AR(INS_lea, EA_4BYTE, REG_ECX, REG_EAX, 4); + + theEmitter->emitIns_R_AR(INS_mov, EA_1BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_2BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_4BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_8BYTE, REG_ECX, REG_EAX, 4); + + theEmitter->emitIns_R_AR(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + 
theEmitter->emitIns_R_AR(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + + theEmitter->emitIns_AR_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + + theEmitter->emitIns_R_AR(INS_movsx, EA_2BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX, 4); + + 
theEmitter->emitIns_AR_R(INS_xadd, EA_4BYTE, REG_EAX, REG_EDX, 2); + + theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); + theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); + + theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); + + theEmitter->emitIns_R(INS_seto, EA_1BYTE, REG_EDX); + + theEmitter->emitIns_R(INS_bswap, EA_8BYTE, REG_EDX); + + // INS_bt only has reg-to-reg form. + theEmitter->emitIns_R_R(INS_bt, EA_2BYTE, REG_EAX, REG_EDX); + + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_EDX); + + theEmitter->emitIns_R_R(INS_xchg, EA_8BYTE, REG_EAX, REG_EDX); + + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX); + + GenTreePhysReg physReg(REG_EDX); + physReg.SetRegNum(REG_EDX); + GenTreeIndir load = indirForm(TYP_INT, &physReg); + + theEmitter->emitIns_R_A(INS_add, EA_1BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_2BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_8BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_or, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_adc, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_sbb, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_and, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_sub, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_xor, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_cmp, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_test, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_bsf, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_bsr, EA_4BYTE, REG_EAX, &load); + + // Note: + // All the tests below rely on the runtime status of the stack this unit tests attaching to, + // it might fail due to stack value unavailable/mismatch, since these tests are mainly for + // encoding correctness check, this kind of failures may be considered as 
not harmful. + + theEmitter->emitIns_R_S(INS_add, EA_1BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_2BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_8BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_adc, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_sbb, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_cmp, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_test, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_S_R(INS_xadd, EA_2BYTE, REG_EAX, 0, 0); + + theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4); + theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4); + + theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 0, 0); + theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 0, 0); + + theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_pop_hide, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_push_hide, EA_PTRSIZE, REG_EAX); + + theEmitter->emitIns_S(INS_pop, EA_PTRSIZE, 0, 0); + theEmitter->emitIns_I(INS_push, EA_PTRSIZE, 50); + + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_EAX); + theEmitter->emitIns_AR(INS_inc, EA_2BYTE, REG_EAX, 2); + theEmitter->emitIns_S(INS_inc, EA_2BYTE, 0, 0); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_EAX); + theEmitter->emitIns_AR(INS_dec, EA_2BYTE, REG_EAX, 2); + theEmitter->emitIns_S(INS_dec, EA_2BYTE, 0, 0); + + theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); + theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); + + // APX-EVEX + + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + 
theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_rcr_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_rcl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, 
REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_tzcnt_apx, EA_8BYTE, 
REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_lzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_popcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_tzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_lzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_popcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, + (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); + + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); + + theEmitter->emitIns_AR(INS_inc, EA_4BYTE, REG_EAX, 0, INS_OPTS_EVEX_NoApxPromotion); + + theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12); + theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5); + + // testing for EGPR encodings. + GenTreePhysReg eGPR(REG_R16); + eGPR.SetRegNum(REG_R16); + GenTreeIndir loadGPR = indirForm(TYP_SIMD32, &eGPR); + + // // SIMD instructions + // // In most of the cases, EGPR will only be used as BASE/INDEX registers in SIMD instructions. 
+ theEmitter->emitIns_R_R_A(INS_addps, EA_32BYTE, REG_XMM16, REG_XMM16, &loadGPR); + + // // Legacy instructions + theEmitter->emitIns_R_ARX(INS_add, EA_4BYTE, REG_R16, REG_R17, REG_R18, 1, 0); + + theEmitter->emitIns_AR_R(INS_movnti64, EA_8BYTE, REG_R17, REG_R16, 10); + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R17, REG_R16, REG_R18); + + theEmitter->emitIns_Mov(INS_kmovb_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovb_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovw_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovw_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovd_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovd_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovq_gpr, EA_8BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovq_msk, EA_8BYTE, REG_K5, REG_K0, false); + + theEmitter->emitIns_R_R(INS_crc32_apx, EA_1BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_R(INS_crc32_apx, EA_2BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_R(INS_crc32_apx, EA_8BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_A(INS_crc32_apx, EA_8BYTE, REG_R18, &loadGPR); + theEmitter->emitIns_R_S(INS_crc32_apx, EA_8BYTE, REG_R18, 0, 0); + + // Note that BZHI has a reversed src operands due to special handling at import. 
+ theEmitter->emitIns_R_R_R(INS_bzhi, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_bzhi, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_mulx, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_mulx, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pdep, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pdep, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17); + + theEmitter->emitIns_R_R(INS_push2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); + theEmitter->emitIns_R_R(INS_pop2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx)); + theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); + theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx); + theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); + theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx); + + theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false); + theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false); theEmitter->emitIns_R(INS_setzuo, EA_1BYTE, REG_R11, INS_OPTS_EVEX_zu); } From 361b73377f5d7f7a595d2e05c462c039e663c2f5 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 25 Jun 2025 16:04:59 -0700 Subject: [PATCH 03/19] code gen surface for setzux --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenxarch.cpp | 8 ++++++-- src/coreclr/jit/emitxarch.cpp | 5 ----- src/coreclr/jit/emitxarch.h | 2 +- src/coreclr/jit/instr.cpp | 13 ++++++++++--- src/coreclr/jit/jitconfigvalues.h | 1 + 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index b099e7df8cba1e..ef668b59cd780f 100644 --- a/src/coreclr/jit/codegen.h +++ 
b/src/coreclr/jit/codegen.h @@ -1338,7 +1338,7 @@ class CodeGen final : public CodeGenInterface void inst_JMP(emitJumpKind jmp, BasicBlock* tgtBlock); #endif - void inst_SET(emitJumpKind condition, regNumber reg); + void inst_SET(emitJumpKind condition, regNumber reg, insOpts instOptions = INS_OPTS_NONE); void inst_RV(instruction ins, regNumber reg, var_types type, emitAttr size = EA_UNKNOWN); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7ca0415ccc4fc4..f60bee2a099635 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1811,15 +1811,19 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe inst_SET(desc.jumpKind1, dstReg); + + const bool useZU = compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxZU(); + insOpts instOptions = useZU ? INS_OPTS_EVEX_zu : INS_OPTS_NONE; if (desc.oper != GT_NONE) { BasicBlock* labelNext = genCreateTempLabel(); inst_JMP((desc.oper == GT_OR) ? desc.jumpKind1 : emitter::emitReverseJumpKind(desc.jumpKind1), labelNext); - inst_SET(desc.jumpKind2, dstReg); + inst_SET(desc.jumpKind2, dstReg, instOptions); genDefineTempLabel(labelNext); } - if (!varTypeIsByte(type)) + // TODO-XArch-Apx: we can apply EVEX.ZU to avoid this movzx. + if (!varTypeIsByte(type) && !useZU) { GetEmitter()->emitIns_Mov(INS_movzx, EA_1BYTE, dstReg, dstReg, /* canSkip */ false); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 24c6e0c82c9c27..dd33a2520b528b 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -2124,11 +2124,6 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt if (instrIsExtendedReg3opImul(ins)) { // EVEX.R3 - // TODO-XArch-APX: - // A few side notes: based on how JIT defined IMUL, we may need to extend - // the definition to `IMUL_31` to cover EGPRs. 
And it can be defined in a - // similar way that opcodes comes with built-in REX2 prefix, and convert - // it to EVEX when needed with some helper functions. code &= 0xFF7FFFFFFFFFFFFFULL; } #ifdef TARGET_AMD64 diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index e80723bdf48819..4b52d106ca5506 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -584,7 +584,7 @@ void SetEvexZuIfNeeded(instrDesc* id, insOpts instOptions) assert(UsePromotedEVEXEncoding()); instruction ins = id->idIns(); #ifdef TARGET_AMD64 - assert((ins >= INS_setzuo && ins <= INS_setzug) || (ins >= INS_imul_AX && ins <= INS_imul_31)); + assert(ins >= INS_setzuo && ins <= INS_setzug); #else // This method is not expected to be used on 32-bit systems. unreached(); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d41a7607c22ed2..ac24da89926e30 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -441,13 +441,12 @@ unsigned CodeGenInterface::instKMaskBaseSize(instruction ins) * Generate a set instruction. */ -void CodeGen::inst_SET(emitJumpKind condition, regNumber reg) +void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOptions) { #ifdef TARGET_XARCH instruction ins; /* Convert the condition to an instruction opcode */ - switch (condition) { case EJ_js: @@ -501,10 +500,18 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg) return; } +#ifdef TARGET_AMD64 + // If using ZU feature, we need to promote the SETcc to the new instruction. 
+ if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) + { + ins = (instruction)(ins + 8); + } +#endif + assert(genRegMask(reg) & RBM_BYTE_REGS); // These instructions only write the low byte of 'reg' - GetEmitter()->emitIns_R(ins, EA_1BYTE, reg); + GetEmitter()->emitIns_R(ins, EA_1BYTE, reg, instOptions); #elif defined(TARGET_ARM64) GetEmitter()->emitIns_R_COND(INS_cset, EA_8BYTE, reg, JumpKindToInsCond(condition)); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 4c6a30de6bc50b..c6c1096afd04ca 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -435,6 +435,7 @@ RELEASE_CONFIG_INTEGER(EnableEmbeddedMasking, "EnableEmbeddedMasking", RELEASE_CONFIG_INTEGER(EnableApxNDD, "EnableApxNDD", 0) // Allows APX NDD feature to be disabled RELEASE_CONFIG_INTEGER(EnableApxConditionalChaining, "EnableApxConditionalChaining", 0) // Allows APX conditional compare chaining RELEASE_CONFIG_INTEGER(EnableApxPPX, "EnableApxPPX", 0) // Allows APX PPX feature to be disabled +RELEASE_CONFIG_INTEGER(EnableApxZU, "EnableApxZU", 0) // Allows APX ZU feature to be disabled // clang-format on From f6a4fee5e784aab4a58adbff2fadefad1a3cf163 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 26 Jun 2025 17:46:15 -0700 Subject: [PATCH 04/19] bug fixes. 
--- src/coreclr/jit/codegenxarch.cpp | 8 ++++---- src/coreclr/jit/emitxarch.cpp | 4 ++-- src/coreclr/jit/instr.cpp | 2 +- src/coreclr/jit/instrsxarch.h | 18 +++++++++--------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f60bee2a099635..33108ebea9e977 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1808,12 +1808,12 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); const GenConditionDesc& desc = GenConditionDesc::Get(condition); + const bool useZU = compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); + insOpts instOptions = useZU ? INS_OPTS_EVEX_zu : INS_OPTS_NONE; - inst_SET(desc.jumpKind1, dstReg); + inst_SET(desc.jumpKind1, dstReg, instOptions); - const bool useZU = compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxZU(); - insOpts instOptions = useZU ? INS_OPTS_EVEX_zu : INS_OPTS_NONE; if (desc.oper != GT_NONE) { BasicBlock* labelNext = genCreateTempLabel(); @@ -1823,7 +1823,7 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe } // TODO-XArch-Apx: we can apply EVEX.ZU to avoid this movzx. 
- if (!varTypeIsByte(type) && !useZU) + if (!useZU) { GetEmitter()->emitIns_Mov(INS_movzx, EA_1BYTE, dstReg, dstReg, /* canSkip */ false); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index dd33a2520b528b..dd4f4271621930 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -16672,9 +16672,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(TakesApxExtendedEvexPrefix(id)); assert(size == EA_1BYTE); - code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); + code = insCodeMR(ins); code = AddEvexPrefix(id, code, size); - unsigned regcode = insEncodeReg012(id, reg, size, &code); + code = insEncodeMRreg(id, reg, EA_1BYTE, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code & 0x0000FFFF); break; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index ac24da89926e30..e6ad919d33fec2 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -504,7 +504,7 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption // If using ZU feature, we need to promote the SETcc to the new instruction. 
if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) { - ins = (instruction)(ins + 8); + ins = (instruction)(ins + 16); } #endif diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 564b5a563eaae0..640ec4b0ca81e8 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -1279,14 +1279,14 @@ INST1(sets, "sets", IUM_WR, 0x0F0098, INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF | Encoding_REX2) INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF | Encoding_REX2) INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) #ifdef TARGET_AMD64 INST1(setzuo, "setzuo", IUM_WR, 0x40, INS_TT_NONE, Reads_OF) -INST1(setzuno, "setzuno", IUM_WR, 0x41, INS_TT_NONE, Reads_OF) +INST1(setzuno, "setzuno", IUM_WR, 0x41, INS_TT_NONE, Reads_OF) INST1(setzub, "setzub", IUM_WR, 0x42, INS_TT_NONE, Reads_CF) INST1(setzuae, "setzuae", IUM_WR, 0x43, INS_TT_NONE, Reads_CF) INST1(setzue, "setzue", IUM_WR, 0x44, INS_TT_NONE, Reads_ZF) @@ -1297,10 +1297,10 @@ INST1(setzus, "setzus", IUM_WR, 0x48, INST1(setzuns, "setzuns", IUM_WR, 0x49, INS_TT_NONE, Reads_SF) INST1(setzup, "setzup", IUM_WR, 0x4A, INS_TT_NONE, Reads_PF) INST1(setzunp, "setzunp", IUM_WR, 0x4B, 
INS_TT_NONE, Reads_PF) -INST1(setzul, "setzul", IUM_WR, 0x4C, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setzuge, "setzuge", IUM_WR, 0x4D, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setzule, "setzule", IUM_WR, 0x4E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST1(setzug, "setzug", IUM_WR, 0x4F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(setzul, "setzul", IUM_WR, 0x4C, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setzuge, "setzuge", IUM_WR, 0x4D, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setzule, "setzule", IUM_WR, 0x4E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(setzug, "setzug", IUM_WR, 0x4F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) #endif // Indirect jump used for tailcalls. We differentiate between func-internal From e0359dc61506720f0425528ec58ab8b27852a289 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 26 Jun 2025 20:23:39 -0700 Subject: [PATCH 05/19] bug fixes --- src/coreclr/jit/emitxarch.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index dd4f4271621930..f1a63c60637aac 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -16629,17 +16629,18 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); - code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); + code = insCodeMR(ins); if (TakesRex2Prefix(id)) { code = AddRex2Prefix(ins, code); - code = insEncodeReg012(id, reg, EA_1BYTE, &code); + code = insEncodeMRreg(id, reg, EA_1BYTE, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code & 0x0000FFFF); } else { + code = insEncodeMRreg(id, reg, EA_1BYTE, code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); // We expect this to always be a 'big' opcode From 040ec5a4486ce3fc1b7bd9f3f6c7e055eb36e049 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 30 Jun 2025 14:25:44 -0700 Subject: 
[PATCH 06/19] resolve comments. --- src/coreclr/jit/emitxarch.h | 2 +- src/coreclr/jit/instr.cpp | 16 ++++++++++++++++ src/coreclr/jit/instrsxarch.h | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 4b52d106ca5506..43c8d3019814ee 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -571,7 +571,7 @@ void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) } //------------------------------------------------------------------------ -// SetEvexNdIfNeeded: set Evex.zu on instrDesc +// SetEvexZuIfNeeded: set Evex.zu on instrDesc // // Arguments: // id - instruction descriptor diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index e6ad919d33fec2..83f3f764b443da 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -504,6 +504,22 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption // If using ZU feature, we need to promote the SETcc to the new instruction. 
if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) { + assert(INS_setzuo == (INS_seto + 16)); + assert(INS_setzuno == (INS_setno + 16)); + assert(INS_setzub == (INS_setb + 16)); + assert(INS_setzuae == (INS_setae + 16)); + assert(INS_setzue == (INS_sete + 16)); + assert(INS_setzune == (INS_setne + 16)); + assert(INS_setzube == (INS_setbe + 16)); + assert(INS_setzua == (INS_seta + 16)); + assert(INS_setzus == (INS_sets + 16)); + assert(INS_setzuns == (INS_setns + 16)); + assert(INS_setzup == (INS_setp + 16)); + assert(INS_setzunp == (INS_setnp + 16)); + assert(INS_setzul == (INS_setl + 16)); + assert(INS_setzuge == (INS_setge + 16)); + assert(INS_setzule == (INS_setle + 16)); + assert(INS_setzug == (INS_setg + 16)); ins = (instruction)(ins + 16); } #endif diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 640ec4b0ca81e8..22f756d38b5b2f 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -1285,6 +1285,8 @@ INST1(setle, "setle", IUM_WR, 0x0F009E, INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) #ifdef TARGET_AMD64 +// The following instructions shall always be next to SETcc instructions group, the offset between the original instruction and the ZU variant should be 16. +// No new instruction should be inserted from INS_seto to setzug. 
INST1(setzuo, "setzuo", IUM_WR, 0x40, INS_TT_NONE, Reads_OF) INST1(setzuno, "setzuno", IUM_WR, 0x41, INS_TT_NONE, Reads_OF) INST1(setzub, "setzub", IUM_WR, 0x42, INS_TT_NONE, Reads_CF) From 765ce32806537740d5e627f65eaa39936bc11191 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 1 Jul 2025 12:05:03 -0700 Subject: [PATCH 07/19] resolve comment --- src/coreclr/jit/codegenxarch.cpp | 6 +++--- src/coreclr/jit/compiler.h | 11 +++++++++++ src/coreclr/jit/emitxarch.cpp | 26 +++++++++++++++++++++++--- src/coreclr/jit/emitxarch.h | 3 ++- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 33108ebea9e977..2a878f87e390f9 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1808,7 +1808,7 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); const GenConditionDesc& desc = GenConditionDesc::Get(condition); - const bool useZU = compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); + const bool useZU = compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); insOpts instOptions = useZU ? 
INS_OPTS_EVEX_zu : INS_OPTS_NONE; inst_SET(desc.jumpKind1, dstReg, instOptions); @@ -10357,7 +10357,7 @@ void CodeGen::genPushCalleeSavedRegisters() #endif // DEBUG #ifdef TARGET_AMD64 - if (compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxPPX()) + if (compiler->canUseApxEvexEncoding() && JitConfig.EnableApxPPX()) { genPushCalleeSavedRegistersFromMaskAPX(rsPushRegs); return; @@ -10483,7 +10483,7 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) return; } - if (compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxPPX()) + if (compiler->canUseApxEvexEncoding() && JitConfig.EnableApxPPX()) { regMaskTP rsPopRegs = regSet.rsGetModifiedIntCalleeSavedRegsMask(); const unsigned popCount = genPopCalleeSavedRegistersFromMaskAPX(rsPopRegs); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 99ebd6ac1eed82..fb1229b6cef96f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9723,6 +9723,17 @@ class Compiler return compOpportunisticallyDependsOn(InstructionSet_APX); } + //------------------------------------------------------------------------ + // canUseApxEvexEncoding - Answer the question: Are APX-EVEX encodings supported on this target. + // + // Returns: + // `true` if APX-EVEX encoding is supported, `false` if not. + // + bool canUseApxEvexEncoding() const + { + return canUseApxEncoding() && canUseEvexEncoding(); + } + private: //------------------------------------------------------------------------ // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding. 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index f1a63c60637aac..15b79e1af157bc 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -156,6 +156,26 @@ bool emitter::IsKMOVInstruction(instruction ins) } } + +//------------------------------------------------------------------------ +// IsSETZUccInstruction: Is this a SETcc instruction with APX-ZU feature? +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if it is a SETcc instruction with APX-ZU feature. +// +bool emitter::IsSETZUccInstruction(instruction ins) +{ +#ifdef TARGET_AMD64 + return ((ins >= INS_setzuo) && (ins <= INS_setzug)); +#else + return false; +#endif +} + + regNumber emitter::getBmiRegNumber(instruction ins) { switch (ins) @@ -432,7 +452,7 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const return true; } - if (ins >= INS_setzuo && ins <= INS_setzug) + if (IsSETZUccInstruction(ins)) { // SETcc can use EVEX.ZU feature. return true; @@ -1985,7 +2005,7 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const return true; } - if (ins >= INS_setzuo && ins <= INS_setzug) + if (IsSETZUccInstruction(ins)) { // These are promoted forms of SETcc instruction with EVEX.ZU. // TODO-XArch-APX: maybe consider return true as we may only use those instructions with ZU set. @@ -2134,7 +2154,7 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt code |= ((size_t)GetCCFromCCMP(ins)) << 32; } - if (ins >= INS_setzuo && ins <= INS_setzug) + if (IsSETZUccInstruction(ins)) { // SETcc in EVEX space are assigned with new opcode: EVEX.LLZ.F2.MAP4.IGNORED 4x. // Here we need to hard code the EVEX.pp for F2 prefix. 
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 43c8d3019814ee..415f481c577e59 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -121,6 +121,7 @@ static bool IsSSEOrAVXInstruction(instruction ins); static bool IsAVXOnlyInstruction(instruction ins); static bool IsAvx512OnlyInstruction(instruction ins); static bool IsKMOVInstruction(instruction ins); +static bool IsSETZUccInstruction(instruction ins); static bool Is3OpRmwInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); static bool IsKInstruction(instruction ins); @@ -584,7 +585,7 @@ void SetEvexZuIfNeeded(instrDesc* id, insOpts instOptions) assert(UsePromotedEVEXEncoding()); instruction ins = id->idIns(); #ifdef TARGET_AMD64 - assert(ins >= INS_setzuo && ins <= INS_setzug); + assert(IsSETZUccInstruction(ins)); #else // This method is not expected to be used on 32-bit systems. unreached(); From 18b8297fa32469e21a0ba5925d596a56016e0350 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 2 Jul 2025 20:43:17 -0700 Subject: [PATCH 08/19] formatting --- src/coreclr/jit/codegenxarch.cpp | 5 ++--- src/coreclr/jit/emitxarch.cpp | 2 -- src/coreclr/jit/instr.cpp | 16 ++++++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2a878f87e390f9..322b368c9ad50c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1808,12 +1808,11 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); const GenConditionDesc& desc = GenConditionDesc::Get(condition); - const bool useZU = compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); - insOpts instOptions = useZU ? 
INS_OPTS_EVEX_zu : INS_OPTS_NONE; + const bool useZU = compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); + insOpts instOptions = useZU ? INS_OPTS_EVEX_zu : INS_OPTS_NONE; inst_SET(desc.jumpKind1, dstReg, instOptions); - if (desc.oper != GT_NONE) { BasicBlock* labelNext = genCreateTempLabel(); diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index cd6fe92571dcad..bffe1d040b2f1b 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -156,7 +156,6 @@ bool emitter::IsKMOVInstruction(instruction ins) } } - //------------------------------------------------------------------------ // IsSETZUccInstruction: Is this a SETcc instruction with APX-ZU feature? // @@ -175,7 +174,6 @@ bool emitter::IsSETZUccInstruction(instruction ins) #endif } - regNumber emitter::getBmiRegNumber(instruction ins) { switch (ins) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c92d940a95cdb2..0c850178e5d8fb 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -675,22 +675,22 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption // If using ZU feature, we need to promote the SETcc to the new instruction. 
if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) { - assert(INS_setzuo == (INS_seto + 16)); + assert(INS_setzuo == (INS_seto + 16)); assert(INS_setzuno == (INS_setno + 16)); - assert(INS_setzub == (INS_setb + 16)); + assert(INS_setzub == (INS_setb + 16)); assert(INS_setzuae == (INS_setae + 16)); - assert(INS_setzue == (INS_sete + 16)); + assert(INS_setzue == (INS_sete + 16)); assert(INS_setzune == (INS_setne + 16)); assert(INS_setzube == (INS_setbe + 16)); - assert(INS_setzua == (INS_seta + 16)); - assert(INS_setzus == (INS_sets + 16)); + assert(INS_setzua == (INS_seta + 16)); + assert(INS_setzus == (INS_sets + 16)); assert(INS_setzuns == (INS_setns + 16)); - assert(INS_setzup == (INS_setp + 16)); + assert(INS_setzup == (INS_setp + 16)); assert(INS_setzunp == (INS_setnp + 16)); - assert(INS_setzul == (INS_setl + 16)); + assert(INS_setzul == (INS_setl + 16)); assert(INS_setzuge == (INS_setge + 16)); assert(INS_setzule == (INS_setle + 16)); - assert(INS_setzug == (INS_setg + 16)); + assert(INS_setzug == (INS_setg + 16)); ins = (instruction)(ins + 16); } #endif From 48aab514315f3dbd7170d1c12b694d170cb4b2b0 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 7 Jul 2025 14:13:45 -0700 Subject: [PATCH 09/19] Make sure movzx is emitted only when needed. --- src/coreclr/jit/codegenxarch.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 322b368c9ad50c..2d16ea88f6b816 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1808,8 +1808,15 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); const GenConditionDesc& desc = GenConditionDesc::Get(condition); - const bool useZU = compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU() && !varTypeIsByte(type); - insOpts instOptions = useZU ? 
INS_OPTS_EVEX_zu : INS_OPTS_NONE; + insOpts instOptions = INS_OPTS_NONE; + if (compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU()) + { + instOptions = varTypeIsByte(type) ? INS_OPTS_NONE : INS_OPTS_EVEX_zu; + } + else + { + assert(instOptions == INS_OPTS_NONE); + } inst_SET(desc.jumpKind1, dstReg, instOptions); @@ -1822,7 +1829,7 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe } // TODO-XArch-Apx: we can apply EVEX.ZU to avoid this movzx. - if (!useZU) + if ((instOptions == INS_OPTS_NONE) && !varTypeIsByte(type)) { GetEmitter()->emitIns_Mov(INS_movzx, EA_1BYTE, dstReg, dstReg, /* canSkip */ false); } From ba51151d9e5b04fd18dd703135bb9f8f03329b26 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 7 Jul 2025 14:37:50 -0700 Subject: [PATCH 10/19] formatting. --- src/coreclr/jit/codegenxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2d16ea88f6b816..22c319494bc2cf 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1807,8 +1807,8 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe assert(varTypeIsIntegral(type)); assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); - const GenConditionDesc& desc = GenConditionDesc::Get(condition); - insOpts instOptions = INS_OPTS_NONE; + const GenConditionDesc& desc = GenConditionDesc::Get(condition); + insOpts instOptions = INS_OPTS_NONE; if (compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU()) { instOptions = varTypeIsByte(type) ? INS_OPTS_NONE : INS_OPTS_EVEX_zu; From c00ca9465f7593fc26897526997cd5300ec20dd2 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 15 Jul 2025 15:58:44 -0700 Subject: [PATCH 11/19] resolve comments. 
--- src/coreclr/jit/codegenxarch.cpp | 2 +- src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitxarch.cpp | 127 ++++++++++++++++++------------- src/coreclr/jit/instr.cpp | 32 ++++---- src/coreclr/jit/instrsxarch.h | 32 ++++---- 5 files changed, 109 insertions(+), 86 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2d960aae2ab93b..0b304d06a5b116 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9461,7 +9461,7 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false); theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false); - theEmitter->emitIns_R(INS_setzuo, EA_1BYTE, REG_R11, INS_OPTS_EVEX_zu); + theEmitter->emitIns_R(INS_seto_apx, EA_1BYTE, REG_R11, INS_OPTS_EVEX_zu); } void CodeGen::genAmd64EmitterUnitTestsAvx10v2() diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 93be4548ec62c1..bc3e8882e0cae8 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1800,7 +1800,7 @@ class emitter bool idIsEvexZuContextSet() const { - return _idEvexZuContext != 0; + return (_idEvexZuContext != 0) && (IsSETZUccInstruction(_idIns)); } void idSetEvexNdContext() diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b90e2a91656dd5..3ea4e5766f3838 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -179,7 +179,7 @@ bool emitter::IsKMOVInstruction(instruction ins) bool emitter::IsSETZUccInstruction(instruction ins) { #ifdef TARGET_AMD64 - return ((ins >= INS_setzuo) && (ins <= INS_setzug)); + return ((ins >= INS_seto_apx) && (ins <= INS_setg_apx)); #else return false; #endif @@ -2027,7 +2027,6 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const if (IsSETZUccInstruction(ins)) { // These are promoted forms of SETcc instruction with EVEX.ZU. 
- // TODO-XArch-APX: maybe consider return true as we may only use those instructions with ZU set. return id->idIsEvexZuContextSet(); } @@ -2172,13 +2171,6 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt code |= ((size_t)id->idGetEvexDFV()) << 43; code |= ((size_t)GetCCFromCCMP(ins)) << 32; } - - if (IsSETZUccInstruction(ins)) - { - // SETcc in EVEX space are assigned with new opcode: EVEX.LLZ.F2.MAP4.IGNORED 4x. - // Here we need to hard code the EVEX.pp for F2 prefix. - code |= 0x30000000000ULL; - } #endif return code; @@ -3081,8 +3073,8 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co // 1. An escape byte 0F (For isa before AVX10.2) // 2. A map number from 0 to 7 (For AVX10.2 and above) leadingBytes = check; - assert(leadingBytes == 0x0F || (emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && - leadingBytes >= 0x00 && leadingBytes <= 0x07)); + assert((leadingBytes == 0x0F) || + ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())) && (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); // Get rid of both sizePrefix and escape byte code &= 0x0000FFFFLL; @@ -3153,6 +3145,13 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co break; } + case 0x04: + { + assert((emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())); + evexPrefix |= (0x04 << 16); + break; + } + case 0x05: { assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2)); @@ -3163,7 +3162,6 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x01: case 0x02: case 0x03: - case 0x04: case 0x06: case 0x07: default: @@ -5160,7 +5158,8 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); dspIsZero = (dsp == 0); - + + // APX extended EVEX 
instructions have constant disp8 displacement, no need to compress. bool tryCompress = true; if (EBPbased) @@ -5192,7 +5191,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, { ssize_t compressedDsp; - if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins)) { SetEvexCompressedDisplacement(id); } @@ -5365,7 +5364,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) { ssize_t compressedDsp; - if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins)) { SetEvexCompressedDisplacement(id); } @@ -14669,10 +14668,22 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) assert(isCompressed && dspInByte); dsp = compressedDsp; } - else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + else if (TakesEvexPrefix(id)) { - assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); - dspInByte = false; + assert(!(TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins))); + if (IsBMIInstruction(ins)) + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } + else + { + dspInByte = false; + } + } + else if (TakesApxExtendedEvexPrefix(id)) + { + // the scaling factor of extended EVEX instructions is always 1. 
+ dspInByte = ((signed char)dsp == (ssize_t)dsp); } else { @@ -15553,7 +15564,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) assert(isCompressed && dspInByte); dsp = (int)compressedDsp; } - else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) + else if (TakesEvexPrefix(id)) { #if FEATURE_FIXED_OUT_ARGS // TODO-AMD64-CQ: We should be able to accurately predict this when FEATURE_FIXED_OUT_ARGS @@ -15563,7 +15574,19 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); #endif - dspInByte = false; + if (IsBMIInstruction(ins)) + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } + else + { + dspInByte = false; + } + } + else if (TakesApxExtendedEvexPrefix(id)) + { + // Apx extended EVEX instructions do not support compressed displacement. + dspInByte = ((signed char)dsp == (ssize_t)dsp); } else { @@ -16496,22 +16519,22 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } #ifdef TARGET_AMD64 - case INS_setzuo: - case INS_setzuno: - case INS_setzub: - case INS_setzuae: - case INS_setzue: - case INS_setzune: - case INS_setzube: - case INS_setzua: - case INS_setzus: - case INS_setzuns: - case INS_setzup: - case INS_setzunp: - case INS_setzul: - case INS_setzuge: - case INS_setzule: - case INS_setzug: + case INS_seto_apx: + case INS_setno_apx: + case INS_setb_apx: + case INS_setae_apx: + case INS_sete_apx: + case INS_setne_apx: + case INS_setbe_apx: + case INS_seta_apx: + case INS_sets_apx: + case INS_setns_apx: + case INS_setp_apx: + case INS_setnp_apx: + case INS_setl_apx: + case INS_setge_apx: + case INS_setle_apx: + case INS_setg_apx: { assert(TakesApxExtendedEvexPrefix(id)); assert(size == EA_1BYTE); @@ -20734,22 +20757,22 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_setle: case INS_setg: #ifdef TARGET_AMD64 - case INS_setzuo: - case INS_setzuno: - case 
INS_setzub: - case INS_setzuae: - case INS_setzue: - case INS_setzune: - case INS_setzube: - case INS_setzua: - case INS_setzus: - case INS_setzuns: - case INS_setzup: - case INS_setzunp: - case INS_setzul: - case INS_setzuge: - case INS_setzule: - case INS_setzug: + case INS_seto_apx: + case INS_setno_apx: + case INS_setb_apx: + case INS_setae_apx: + case INS_sete_apx: + case INS_setne_apx: + case INS_setbe_apx: + case INS_seta_apx: + case INS_sets_apx: + case INS_setns_apx: + case INS_setp_apx: + case INS_setnp_apx: + case INS_setl_apx: + case INS_setge_apx: + case INS_setle_apx: + case INS_setg_apx: #endif { result.insLatency += PERFSCORE_LATENCY_1C; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index b117e937bd9d7a..1d01a703900fba 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -675,22 +675,22 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption // If using ZU feature, we need to promote the SETcc to the new instruction. 
if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) { - assert(INS_setzuo == (INS_seto + 16)); - assert(INS_setzuno == (INS_setno + 16)); - assert(INS_setzub == (INS_setb + 16)); - assert(INS_setzuae == (INS_setae + 16)); - assert(INS_setzue == (INS_sete + 16)); - assert(INS_setzune == (INS_setne + 16)); - assert(INS_setzube == (INS_setbe + 16)); - assert(INS_setzua == (INS_seta + 16)); - assert(INS_setzus == (INS_sets + 16)); - assert(INS_setzuns == (INS_setns + 16)); - assert(INS_setzup == (INS_setp + 16)); - assert(INS_setzunp == (INS_setnp + 16)); - assert(INS_setzul == (INS_setl + 16)); - assert(INS_setzuge == (INS_setge + 16)); - assert(INS_setzule == (INS_setle + 16)); - assert(INS_setzug == (INS_setg + 16)); + assert(INS_seto_apx == (INS_seto + 16)); + assert(INS_setno_apx == (INS_setno + 16)); + assert(INS_setb_apx == (INS_setb + 16)); + assert(INS_setae_apx == (INS_setae + 16)); + assert(INS_sete_apx == (INS_sete + 16)); + assert(INS_setne_apx == (INS_setne + 16)); + assert(INS_setbe_apx == (INS_setbe + 16)); + assert(INS_seta_apx == (INS_seta + 16)); + assert(INS_sets_apx == (INS_sets + 16)); + assert(INS_setns_apx == (INS_setns + 16)); + assert(INS_setp_apx == (INS_setp + 16)); + assert(INS_setnp_apx == (INS_setnp + 16)); + assert(INS_setl_apx == (INS_setl + 16)); + assert(INS_setge_apx == (INS_setge + 16)); + assert(INS_setle_apx == (INS_setle + 16)); + assert(INS_setg_apx == (INS_setg + 16)); ins = (instruction)(ins + 16); } #endif diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 238afff3bf953d..69cfcaf4bf124b 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -1296,22 +1296,22 @@ INST1(setg, "setg", IUM_WR, 0x0F009F, #ifdef TARGET_AMD64 // The following instructions shall always be next to SETcc instructions group, the offset between the original instruction and the ZU variant should be 16. // No new instruction should be inserted from INS_seto to setzug. 
-INST1(setzuo, "setzuo", IUM_WR, 0x40, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST1(setzuno, "setzuno", IUM_WR, 0x41, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST1(setzub, "setzub", IUM_WR, 0x42, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST1(setzuae, "setzuae", IUM_WR, 0x43, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST1(setzue, "setzue", IUM_WR, 0x44, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST1(setzune, "setzune", IUM_WR, 0x45, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST1(setzube, "setzube", IUM_WR, 0x46, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(setzua, "setzua", IUM_WR, 0x47, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(setzus, "setzus", IUM_WR, 0x48, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST1(setzuns, "setzuns", IUM_WR, 0x49, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST1(setzup, "setzup", IUM_WR, 0x4A, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST1(setzunp, "setzunp", IUM_WR, 0x4B, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST1(setzul, "setzul", IUM_WR, 0x4C, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setzuge, "setzuge", IUM_WR, 0x4D, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setzule, "setzule", IUM_WR, 0x4E, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST1(setzug, "setzug", IUM_WR, 0x4F, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(seto_apx, "setzuo", IUM_WR, SSEDBLMAP(4, 0x40), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) +INST1(setno_apx, "setzuno", IUM_WR, SSEDBLMAP(4, 0x41), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) +INST1(setb_apx, "setzub", IUM_WR, SSEDBLMAP(4, 0x42), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) +INST1(setae_apx, "setzuae", IUM_WR, SSEDBLMAP(4, 0x43), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) +INST1(sete_apx, "setzue", IUM_WR, SSEDBLMAP(4, 0x44), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) +INST1(setne_apx, "setzune", IUM_WR, SSEDBLMAP(4, 0x45), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) +INST1(setbe_apx, "setzube", 
IUM_WR, SSEDBLMAP(4, 0x46), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) +INST1(seta_apx, "setzua", IUM_WR, SSEDBLMAP(4, 0x47), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) +INST1(sets_apx, "setzus", IUM_WR, SSEDBLMAP(4, 0x48), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) +INST1(setns_apx, "setzuns", IUM_WR, SSEDBLMAP(4, 0x49), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) +INST1(setp_apx, "setzup", IUM_WR, SSEDBLMAP(4, 0x4A), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) +INST1(setnp_apx, "setzunp", IUM_WR, SSEDBLMAP(4, 0x4B), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) +INST1(setl_apx, "setzul", IUM_WR, SSEDBLMAP(4, 0x4C), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setge_apx, "setzuge", IUM_WR, SSEDBLMAP(4, 0x4D), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) +INST1(setle_apx, "setzule", IUM_WR, SSEDBLMAP(4, 0x4E), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(setg_apx, "setzug", IUM_WR, SSEDBLMAP(4, 0x4F), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) #endif // Indirect jump used for tailcalls. We differentiate between func-internal From 41ea1777abb163b0086633bbd07f2f08855ef408 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 16 Jul 2025 15:25:37 -0700 Subject: [PATCH 12/19] revert compressed displacement bug fix changes. --- src/coreclr/jit/emitxarch.cpp | 40 ++++++----------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 3ea4e5766f3838..ecf402de10abc9 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -5159,7 +5159,6 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, dspIsZero = (dsp == 0); - // APX extended EVEX instructions have constant disp8 displacement, no need to compress. 
bool tryCompress = true; if (EBPbased) @@ -5191,7 +5190,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, { ssize_t compressedDsp; - if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins)) + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) { SetEvexCompressedDisplacement(id); } @@ -5364,7 +5363,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) { ssize_t compressedDsp; - if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins)) + if (TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)) { SetEvexCompressedDisplacement(id); } @@ -14668,22 +14667,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) assert(isCompressed && dspInByte); dsp = compressedDsp; } - else if (TakesEvexPrefix(id)) - { - assert(!(TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte) && hasTupleTypeInfo(ins))); - if (IsBMIInstruction(ins)) - { - dspInByte = ((signed char)dsp == (ssize_t)dsp); - } - else - { - dspInByte = false; - } - } - else if (TakesApxExtendedEvexPrefix(id)) + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { - // the scaling factor of extended EVEX instructions is always 1. 
- dspInByte = ((signed char)dsp == (ssize_t)dsp); + assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); + dspInByte = false; } else { @@ -15564,7 +15551,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) assert(isCompressed && dspInByte); dsp = (int)compressedDsp; } - else if (TakesEvexPrefix(id)) + else if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { #if FEATURE_FIXED_OUT_ARGS // TODO-AMD64-CQ: We should be able to accurately predict this when FEATURE_FIXED_OUT_ARGS @@ -15573,20 +15560,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // // assert(!TryEvexCompressDisp8Byte(id, dsp, &compressedDsp, &dspInByte)); #endif - - if (IsBMIInstruction(ins)) - { - dspInByte = ((signed char)dsp == (ssize_t)dsp); - } - else - { - dspInByte = false; - } - } - else if (TakesApxExtendedEvexPrefix(id)) - { - // Apx extended EVEX instructions do not support compressed displacement. - dspInByte = ((signed char)dsp == (ssize_t)dsp); + dspInByte = false; } else { From bd045fb5588efd7ec323eb4569e2633a16890bc1 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 16 Jul 2025 15:38:27 -0700 Subject: [PATCH 13/19] formatting --- src/coreclr/jit/emitxarch.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ecf402de10abc9..92bd41702c09a7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3073,8 +3073,10 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co // 1. An escape byte 0F (For isa before AVX10.2) // 2. 
A map number from 0 to 7 (For AVX10.2 and above) leadingBytes = check; - assert((leadingBytes == 0x0F) || - ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())) && (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); + assert((leadingBytes == 0x0F) || + ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || + (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())) && + (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); // Get rid of both sizePrefix and escape byte code &= 0x0000FFFFLL; @@ -5158,7 +5160,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); dspIsZero = (dsp == 0); - + bool tryCompress = true; if (EBPbased) From f57207e36134d35553ca3041ae5634eae64f0523 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 17 Jul 2025 14:21:00 -0700 Subject: [PATCH 14/19] resolve comments --- src/coreclr/jit/codegenxarch.cpp | 16 ++-- src/coreclr/jit/emit.h | 3 +- src/coreclr/jit/emitxarch.cpp | 124 ++++++++++++++++++------------- src/coreclr/jit/emitxarch.h | 16 ++-- 4 files changed, 92 insertions(+), 67 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 0b304d06a5b116..3d6479ccbc896d 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1809,13 +1809,12 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe const GenConditionDesc& desc = GenConditionDesc::Get(condition); insOpts instOptions = INS_OPTS_NONE; - if (compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU()) - { - instOptions = varTypeIsByte(type) ? 
INS_OPTS_NONE : INS_OPTS_EVEX_zu; - } - else + + bool needsMovzx = !varTypeIsByte(type); + if (needsMovzx && compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU()) { - assert(instOptions == INS_OPTS_NONE); + instOptions = INS_OPTS_EVEX_zu; + needsMovzx = false; } inst_SET(desc.jumpKind1, dstReg, instOptions); @@ -1828,8 +1827,9 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe genDefineTempLabel(labelNext); } - // TODO-XArch-Apx: we can apply EVEX.ZU to avoid this movzx. - if ((instOptions == INS_OPTS_NONE) && !varTypeIsByte(type)) + // we can apply EVEX.ZU to avoid this movzx. + // TODO-XArch-apx: evaluate setcc + movzx and xor + set + if (needsMovzx) { GetEmitter()->emitIns_Mov(INS_movzx, EA_1BYTE, dstReg, dstReg, /* canSkip */ false); } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index bc3e8882e0cae8..1374a6331e5caf 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1795,12 +1795,13 @@ class emitter bool idIsEvexNdContextSet() const { + assert(!IsApxZuCompatibleInstruction(_idIns)); return _idEvexNdContext != 0; } bool idIsEvexZuContextSet() const { - return (_idEvexZuContext != 0) && (IsSETZUccInstruction(_idIns)); + return (_idEvexZuContext != 0); } void idSetEvexNdContext() diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 92bd41702c09a7..b2b9c2fd540dd6 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -167,24 +167,6 @@ bool emitter::IsKMOVInstruction(instruction ins) } } -//------------------------------------------------------------------------ -// IsSETZUccInstruction: Is this a SETcc instruction with APX-ZU feature? -// -// Arguments: -// ins - The instruction to check. -// -// Returns: -// `true` if it is a SETcc instruction with APX-ZU feature. 
-// -bool emitter::IsSETZUccInstruction(instruction ins) -{ -#ifdef TARGET_AMD64 - return ((ins >= INS_seto_apx) && (ins <= INS_setg_apx)); -#else - return false; -#endif -} - regNumber emitter::getBmiRegNumber(instruction ins) { switch (ins) @@ -273,18 +255,55 @@ bool emitter::HasRex2Encoding(instruction ins) return (flags & Encoding_REX2) != 0; } -bool emitter::HasApxNdd(instruction ins) +//------------------------------------------------------------------------ +// IsApxNddCompatibleInstruction: Is this a APX-EVEX.ND compatible instruction? +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if it is a APX-EVEX.ND compatible instruction. +// +bool emitter::IsApxNddCompatibleInstruction(instruction ins) { insFlags flags = CodeGenInterface::instInfo[ins]; return (flags & INS_Flags_Has_NDD) != 0; } -bool emitter::HasApxNf(instruction ins) +//------------------------------------------------------------------------ +// IsApxNfCompatibleInstruction: Is this a APX-EVEX.NF compatible instruction? +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if it is a APX-EVEX.NF compatible instruction. +// +bool emitter::IsApxNfCompatibleInstruction(instruction ins) { insFlags flags = CodeGenInterface::instInfo[ins]; return (flags & INS_Flags_Has_NF) != 0; } +//------------------------------------------------------------------------ +// IsApxZuCompatibleInstruction: Is this a APX-EVEX.ZU compatible instruction? +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if it is a APX-EVEX.ZU compatible instruction. +// +bool emitter::IsApxZuCompatibleInstruction(instruction ins) +{ +#ifdef TARGET_AMD64 + // For now, we only have SETZUcc enabled for EVEX.ZU. 
+ return ((ins >= INS_seto_apx) && (ins <= INS_setg_apx)); +#else + return false; +#endif +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) @@ -435,7 +454,7 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const } //------------------------------------------------------------------------ -// IsApxNDDEncodableInstruction: Answer the question- does this instruction have apx ndd form. +// IsApxNddEncodableInstruction: Answer the question- does this instruction have apx ndd form. // // Arguments: // ins - The instruction to check. @@ -443,18 +462,18 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const // Returns: // `true` if ins has apx ndd form. // -bool emitter::IsApxNDDEncodableInstruction(instruction ins) const +bool emitter::IsApxNddEncodableInstruction(instruction ins) const { if (!UsePromotedEVEXEncoding()) { return false; } - return HasApxNdd(ins); + return IsApxNddCompatibleInstruction(ins); } //------------------------------------------------------------------------ -// IsApxNFEncodableInstruction: Answer the question - does this instruction have Evex.nf supported +// IsApxNfEncodableInstruction: Answer the question - does this instruction have Evex.nf supported // // Arguments: // ins - The instruction to check. @@ -462,14 +481,14 @@ bool emitter::IsApxNDDEncodableInstruction(instruction ins) const // Returns: // `true` if ins is Evex.nf supported. 
// -bool emitter::IsApxNFEncodableInstruction(instruction ins) const +bool emitter::IsApxNfEncodableInstruction(instruction ins) const { if (!UsePromotedEVEXEncoding()) { return false; } - return HasApxNf(ins); + return IsApxNfCompatibleInstruction(ins); } //------------------------------------------------------------------------ @@ -489,23 +508,28 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const return false; } - if (HasApxNdd(ins) || HasApxNf(ins)) + if (IsApxNddCompatibleInstruction(ins)) { return true; } - if (ins == INS_crc32_apx || ins == INS_movbe_apx) + if (IsApxNfCompatibleInstruction(ins)) { - // With the new opcode, CRC32 is promoted to EVEX with APX. return true; } - if (IsSETZUccInstruction(ins)) + if (IsApxZuCompatibleInstruction(ins)) { // SETcc can use EVEX.ZU feature. return true; } + if (ins == INS_crc32_apx || ins == INS_movbe_apx) + { + // With the new opcode, CRC32 is promoted to EVEX with APX. + return true; + } + if (IsApxOnlyInstruction(ins)) { return true; @@ -922,7 +946,7 @@ bool emitter::DoJitUseApxNDD(instruction ins) const #if !defined(TARGET_AMD64) return false; #else - return JitConfig.EnableApxNDD() && IsApxNDDEncodableInstruction(ins); + return JitConfig.EnableApxNDD() && IsApxNddEncodableInstruction(ins); #endif } @@ -2012,22 +2036,22 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const return false; } - if (id->idIsEvexNdContextSet() && HasApxNdd(ins)) + if (IsApxNddCompatibleInstruction(ins) && id->idIsEvexNdContextSet()) { // The instruction uses APX-ND hint, and it requires EVEX. return true; } - if (id->idIsEvexNfContextSet() && HasApxNf(ins)) + if (IsApxNfCompatibleInstruction(ins) && id->idIsEvexNfContextSet()) { // The instruction uses APX-NF hint, and it requires EVEX. return true; } - if (IsSETZUccInstruction(ins)) + if (IsApxZuCompatibleInstruction(ins) && id->idIsEvexZuContextSet()) { // These are promoted forms of SETcc instruction with EVEX.ZU. 
- return id->idIsEvexZuContextSet(); + return true; } if (ins == INS_crc32_apx || ins == INS_movbe_apx) @@ -2138,18 +2162,18 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt // TODO-XArch-APX: // verify if it is actually safe to reuse the EVEX.ND with EVEX.B on instrDesc. - if (id->idIsEvexNdContextSet() && HasApxNdd(ins)) + if (IsApxNddCompatibleInstruction(ins) && id->idIsEvexNdContextSet()) { code |= ND_BIT_IN_BYTE_EVEX_PREFIX; } - if (id->idIsEvexZuContextSet()) + if (IsApxZuCompatibleInstruction(ins) && id->idIsEvexZuContextSet()) { // EVEX.ZU reuses the EVEX.ND bit for SETcc and IMUL. code |= ND_BIT_IN_BYTE_EVEX_PREFIX; } - if (id->idIsEvexNfContextSet()) + if (IsApxNfCompatibleInstruction(ins) && id->idIsEvexNfContextSet()) { code |= NF_BIT_IN_BYTE_EVEX_PREFIX; } @@ -3075,7 +3099,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co leadingBytes = check; assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || - (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())) && + (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX))) && (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); // Get rid of both sizePrefix and escape byte @@ -3149,7 +3173,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x04: { - assert((emitComp->compIsaSupportedDebugOnly(InstructionSet_APX) || emitComp->canUseApxEncoding())); + assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_APX)); evexPrefix |= (0x04 << 16); break; } @@ -3937,7 +3961,7 @@ inline emitter::insFormat emitter::emitInsModeFormat(instruction ins, insFormat #ifdef TARGET_AMD64 if (useNDD) { - assert(IsApxNDDEncodableInstruction(ins)); + assert(IsApxNddEncodableInstruction(ins)); if (ins == INS_rcl_N || ins == INS_rcr_N || ins == INS_rol_N || ins == INS_ror_N || ins == INS_shl_N || ins == INS_shr_N || ins == INS_sar_N) { @@ 
-6347,7 +6371,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G #else if (useNDD) { - assert(IsApxNDDEncodableInstruction(ins)); + assert(IsApxNddEncodableInstruction(ins)); // targetReg has to be an actual register if using NDD. assert(targetReg < REG_STK); // make sure target register is not either of the src registers. @@ -7978,7 +8002,7 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum } // Checking EVEX.ND and NDD compatibility together in case the ND slot is overridden by other features. - bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNDDEncodableInstruction(ins); + bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNddEncodableInstruction(ins); emitAttr size = EA_SIZE(attr); @@ -8025,7 +8049,7 @@ void emitter::emitIns_R_R_I( instrDesc* id = emitNewInstrSC(attr, ival); // Checking EVEX.ND and NDD compatibility together in case the ND slot is overridden by other features. - bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNDDEncodableInstruction(ins); + bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNddEncodableInstruction(ins); id->idIns(ins); id->idInsFmt(emitInsModeFormat(ins, IF_RRD_RRD_CNS, useNDD)); @@ -8423,7 +8447,7 @@ void emitter::emitIns_R_R_R( assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); // Checking EVEX.ND and NDD compatibility together in case the ND slot is overridden by other features. - bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNDDEncodableInstruction(ins); + bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNddEncodableInstruction(ins); instrDesc* id = emitNewInstr(attr); id->idIns(ins); @@ -8453,7 +8477,7 @@ void emitter::emitIns_R_R_S( instrDesc* id = emitNewInstr(attr); // Checking EVEX.ND and NDD compatibility together in case the ND slot is overridden by other features. 
- bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNDDEncodableInstruction(ins); + bool useNDD = ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) && IsApxNddEncodableInstruction(ins); id->idIns(ins); id->idInsFmt((ins == INS_mulx) ? IF_RWR_RWR_SRD : emitInsModeFormat(ins, IF_RRD_RRD_SRD, useNDD)); @@ -12793,7 +12817,7 @@ void emitter::emitDispIns( /* Display the instruction name */ #ifdef TARGET_AMD64 - if (IsApxNFEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet()) + if (IsApxNfEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet()) { // print the EVEX.NF indication in psudeo prefix style. printf("{nf} "); @@ -16874,7 +16898,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } unsigned regCode; - if (!id->idIsEvexNdContextSet() || !IsApxNDDEncodableInstruction(ins)) + if (!id->idIsEvexNdContextSet() || !IsApxNddEncodableInstruction(ins)) { regCode = insEncodeReg345(id, regFor345Bits, size, &code); regCode |= insEncodeReg012(id, regFor012Bits, size, &code); @@ -16946,7 +16970,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); } - else if (IsApxNDDEncodableInstruction(ins) && id->idIsEvexNdContextSet()) + else if (IsApxNddEncodableInstruction(ins) && id->idIsEvexNdContextSet()) { dst += emitOutputByte(dst, (code & 0xFF)); dst += emitOutputByte(dst, (0xC0 | regCode | (code >> 8))); @@ -19463,7 +19487,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); - if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + if (IsApxNddEncodableInstruction(ins) && id->idIsEvexNdContextSet()) { // EVEX.vvvv has different semantic for APX-EVEX NDD instructions. 
code = insCodeRM(ins); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 9bdbeef04da3c2..126d1a67a3593b 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -123,7 +123,6 @@ static bool IsAvx512OnlyInstruction(instruction ins); static bool IsKMOVInstruction(instruction ins); static bool IsAVXVNNIFamilyInstruction(instruction ins); static bool IsAVXVNNIINTInstruction(instruction ins); -static bool IsSETZUccInstruction(instruction ins); static bool Is3OpRmwInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); static bool IsKInstruction(instruction ins); @@ -133,13 +132,14 @@ static bool IsApxOnlyInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); static bool HasRex2Encoding(instruction ins); -static bool HasApxNdd(instruction ins); -static bool HasApxNf(instruction ins); +static bool IsApxNddCompatibleInstruction(instruction ins); +static bool IsApxNfCompatibleInstruction(instruction ins); +static bool IsApxZuCompatibleInstruction(instruction ins); bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; bool IsRex2EncodableInstruction(instruction ins) const; -bool IsApxNDDEncodableInstruction(instruction ins) const; -bool IsApxNFEncodableInstruction(instruction ins) const; +bool IsApxNddEncodableInstruction(instruction ins) const; +bool IsApxNfEncodableInstruction(instruction ins) const; bool IsApxExtendedEvexInstruction(instruction ins) const; bool IsShiftInstruction(instruction ins) const; bool IsLegacyMap1(code_t code) const; @@ -573,7 +573,7 @@ void SetEvexNdIfNeeded(instrDesc* id, insOpts instOptions) if ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) { assert(UsePromotedEVEXEncoding()); - assert(IsApxNDDEncodableInstruction(id->idIns())); + assert(IsApxNddEncodableInstruction(id->idIns())); id->idSetEvexNdContext(); } else @@ -594,7 +594,7 @@ void 
SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) if ((instOptions & INS_OPTS_EVEX_nf_MASK) != 0) { assert(UsePromotedEVEXEncoding()); - assert(IsApxNFEncodableInstruction(id->idIns())); + assert(IsApxNfEncodableInstruction(id->idIns())); id->idSetEvexNfContext(); } else @@ -617,7 +617,7 @@ void SetEvexZuIfNeeded(instrDesc* id, insOpts instOptions) assert(UsePromotedEVEXEncoding()); instruction ins = id->idIns(); #ifdef TARGET_AMD64 - assert(IsSETZUccInstruction(ins)); + assert(IsApxZuCompatibleInstruction(ins)); #else // This method is not expected to be used on 32-bit systems. unreached(); From 445e6cb58c832c7d0c1b26523857be1c678e89d1 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 17 Jul 2025 15:01:24 -0700 Subject: [PATCH 15/19] formatting. --- src/coreclr/jit/codegenxarch.cpp | 2 +- src/coreclr/jit/emitxarch.cpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3d6479ccbc896d..9ef4bf7a04c10b 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1814,7 +1814,7 @@ void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstRe if (needsMovzx && compiler->canUseApxEvexEncoding() && JitConfig.EnableApxZU()) { instOptions = INS_OPTS_EVEX_zu; - needsMovzx = false; + needsMovzx = false; } inst_SET(desc.jumpKind1, dstReg, instOptions); diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b2b9c2fd540dd6..9f7840b39c6ddc 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3097,10 +3097,9 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co // 1. An escape byte 0F (For isa before AVX10.2) // 2. 
A map number from 0 to 7 (For AVX10.2 and above) leadingBytes = check; - assert((leadingBytes == 0x0F) || - ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || - (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX))) && - (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); + assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || + (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX))) && + (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); // Get rid of both sizePrefix and escape byte code &= 0x0000FFFFLL; From a4d9cb934cdac84374e15f433ab35ca6b490f2dc Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 17 Jul 2025 18:59:29 -0700 Subject: [PATCH 16/19] resolve comments --- src/coreclr/jit/emitxarch.cpp | 41 +++++++++++++++-------------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 9f7840b39c6ddc..48279599b4c8cf 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -87,7 +87,23 @@ bool emitter::IsAvx512OnlyInstruction(instruction ins) bool emitter::IsApxOnlyInstruction(instruction ins) { - return (ins >= FIRST_APX_INSTRUCTION) && (ins <= LAST_APX_INSTRUCTION); +#ifdef TARGET_AMD64 + if (IsApxZuCompatibleInstruction(ins)) + { + return true; + } + + if (ins == INS_crc32_apx || ins == INS_movbe_apx) + { + return true; + } + + if (IsCCMP(ins)) + { + return true; + } +#endif // TARGET_AMD64 + return false; } bool emitter::IsAVXVNNIFamilyInstruction(instruction ins) @@ -518,18 +534,6 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const return true; } - if (IsApxZuCompatibleInstruction(ins)) - { - // SETcc can use EVEX.ZU feature. - return true; - } - - if (ins == INS_crc32_apx || ins == INS_movbe_apx) - { - // With the new opcode, CRC32 is promoted to EVEX with APX. 
- return true; - } - if (IsApxOnlyInstruction(ins)) { return true; @@ -2048,17 +2052,6 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const return true; } - if (IsApxZuCompatibleInstruction(ins) && id->idIsEvexZuContextSet()) - { - // These are promoted forms of SETcc instruction with EVEX.ZU. - return true; - } - - if (ins == INS_crc32_apx || ins == INS_movbe_apx) - { - return true; - } - #if defined(DEBUG) if (emitComp->DoJitStressPromotedEvexEncoding()) { From 737ef2c36c1412ae30d81aa5e0d2e7156e2d8895 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 18 Jul 2025 16:18:14 -0700 Subject: [PATCH 17/19] resolve comment --- src/coreclr/jit/emit.h | 3 ++- src/coreclr/jit/emitxarch.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 1374a6331e5caf..ec7fffec071bba 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1795,12 +1795,13 @@ class emitter bool idIsEvexNdContextSet() const { - assert(!IsApxZuCompatibleInstruction(_idIns)); + assert(IsApxNddCompatibleInstruction(_idIns)); return _idEvexNdContext != 0; } bool idIsEvexZuContextSet() const { + assert(IsApxZuCompatibleInstruction(_idIns)); return (_idEvexZuContext != 0); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 48279599b4c8cf..767e7005745b6f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -16890,7 +16890,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } unsigned regCode; - if (!id->idIsEvexNdContextSet() || !IsApxNddEncodableInstruction(ins)) + if (!IsApxNddEncodableInstruction(ins) || !id->idIsEvexNdContextSet()) { regCode = insEncodeReg345(id, regFor345Bits, size, &code); regCode |= insEncodeReg012(id, regFor012Bits, size, &code); @@ -19193,7 +19193,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); - if (id->idIsEvexNdContextSet() && TakesApxExtendedEvexPrefix(id)) 
+ if (TakesApxExtendedEvexPrefix(id) && id->idIsEvexNdContextSet()) { // TODO-XArch-APX: // I'm not sure why instructions on this path can be with instruction From 0755ae56b3d96b0a77fed0cbd7f94847dc82b67f Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 23 Jul 2025 15:23:04 -0700 Subject: [PATCH 18/19] improve tp --- src/coreclr/jit/emitxarch.cpp | 15 +---------- src/coreclr/jit/instr.cpp | 35 ++++++++++++------------- src/coreclr/jit/instrsxarch.h | 48 +++++++++++++++-------------------- 3 files changed, 39 insertions(+), 59 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 767e7005745b6f..5d1ad1bd1b2491 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -88,20 +88,7 @@ bool emitter::IsAvx512OnlyInstruction(instruction ins) bool emitter::IsApxOnlyInstruction(instruction ins) { #ifdef TARGET_AMD64 - if (IsApxZuCompatibleInstruction(ins)) - { - return true; - } - - if (ins == INS_crc32_apx || ins == INS_movbe_apx) - { - return true; - } - - if (IsCCMP(ins)) - { - return true; - } + return (ins >= FIRST_APX_INSTRUCTION) && (ins <= LAST_APX_INSTRUCTION); #endif // TARGET_AMD64 return false; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 1d01a703900fba..77707dce951bf3 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -675,23 +675,24 @@ void CodeGen::inst_SET(emitJumpKind condition, regNumber reg, insOpts instOption // If using ZU feature, we need to promote the SETcc to the new instruction. 
if ((instOptions & INS_OPTS_EVEX_zu_MASK) != 0) { - assert(INS_seto_apx == (INS_seto + 16)); - assert(INS_setno_apx == (INS_setno + 16)); - assert(INS_setb_apx == (INS_setb + 16)); - assert(INS_setae_apx == (INS_setae + 16)); - assert(INS_sete_apx == (INS_sete + 16)); - assert(INS_setne_apx == (INS_setne + 16)); - assert(INS_setbe_apx == (INS_setbe + 16)); - assert(INS_seta_apx == (INS_seta + 16)); - assert(INS_sets_apx == (INS_sets + 16)); - assert(INS_setns_apx == (INS_setns + 16)); - assert(INS_setp_apx == (INS_setp + 16)); - assert(INS_setnp_apx == (INS_setnp + 16)); - assert(INS_setl_apx == (INS_setl + 16)); - assert(INS_setge_apx == (INS_setge + 16)); - assert(INS_setle_apx == (INS_setle + 16)); - assert(INS_setg_apx == (INS_setg + 16)); - ins = (instruction)(ins + 16); + const int offset = (INS_seto - INS_seto_apx); + assert(INS_seto == (INS_seto_apx + offset)); + assert(INS_setno == (INS_setno_apx + offset)); + assert(INS_setb == (INS_setb_apx + offset)); + assert(INS_setae == (INS_setae_apx + offset)); + assert(INS_sete == (INS_sete_apx + offset)); + assert(INS_setne == (INS_setne_apx + offset)); + assert(INS_setbe == (INS_setbe_apx + offset)); + assert(INS_seta == (INS_seta_apx + offset)); + assert(INS_sets == (INS_sets_apx + offset)); + assert(INS_setns == (INS_setns_apx + offset)); + assert(INS_setp == (INS_setp_apx + offset)); + assert(INS_setnp == (INS_setnp_apx + offset)); + assert(INS_setl == (INS_setl_apx + offset)); + assert(INS_setge == (INS_setge_apx + offset)); + assert(INS_setle == (INS_setle_apx + offset)); + assert(INS_setg == (INS_setg_apx + offset)); + ins = (instruction)(ins + offset); } #endif diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 69cfcaf4bf124b..ebc130e370cd84 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -1154,13 +1154,29 @@ INST3(ccmpge, "ccmpge", IUM_RD, 0x000038, 0x0003880, 0x INST3(ccmple, "ccmple", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, 
ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) INST3(ccmpg, "ccmpg", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) #define LAST_CCMP_INSTRUCTION INS_ccmpg -#define LAST_APX_INSTRUCTION INS_ccmpg +INST3(crc32_apx, "crc32", IUM_RW, BAD_CODE, BAD_CODE, 0x0000F0, 3C, 1C, INS_TT_NONE, INS_FLAGS_None) +INST3(movbe_apx, "movbe", IUM_WR, 0x000061, BAD_CODE, 0x000060, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) + +INST3(seto_apx, "setzuo", IUM_WR, SSEDBLMAP(4, 0x40), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) +INST3(setno_apx, "setzuno", IUM_WR, SSEDBLMAP(4, 0x41), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) +INST3(setb_apx, "setzub", IUM_WR, SSEDBLMAP(4, 0x42), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) +INST3(setae_apx, "setzuae", IUM_WR, SSEDBLMAP(4, 0x43), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) +INST3(sete_apx, "setzue", IUM_WR, SSEDBLMAP(4, 0x44), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) +INST3(setne_apx, "setzune", IUM_WR, SSEDBLMAP(4, 0x45), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) +INST3(setbe_apx, "setzube", IUM_WR, SSEDBLMAP(4, 0x46), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(seta_apx, "setzua", IUM_WR, SSEDBLMAP(4, 0x47), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(sets_apx, "setzus", IUM_WR, SSEDBLMAP(4, 0x48), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) +INST3(setns_apx, "setzuns", IUM_WR, SSEDBLMAP(4, 0x49), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) +INST3(setp_apx, "setzup", IUM_WR, SSEDBLMAP(4, 0x4A), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) +INST3(setnp_apx, "setzunp", IUM_WR, SSEDBLMAP(4, 0x4B), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) +INST3(setl_apx, "setzul", 
IUM_WR, SSEDBLMAP(4, 0x4C), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(setge_apx, "setzuge", IUM_WR, SSEDBLMAP(4, 0x4D), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(setle_apx, "setzule", IUM_WR, SSEDBLMAP(4, 0x4E), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST3(setg_apx, "setzug", IUM_WR, SSEDBLMAP(4, 0x4F), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +#define LAST_APX_INSTRUCTION INS_setg_apx // Scalar instructions in SSE4.2 INST3(crc32, "crc32", IUM_RW, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), 3C, 1C, INS_TT_NONE, INS_FLAGS_None) -#ifdef TARGET_AMD64 -INST3(crc32_apx, "crc32", IUM_RW, BAD_CODE, BAD_CODE, 0x0000F0, 3C, 1C, INS_TT_NONE, INS_FLAGS_None) -#endif // BMI1 INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), 3C, 1C, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) // Count the Number of Trailing Zero Bits @@ -1176,9 +1192,6 @@ INST3(lzcnt_apx, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, // MOVBE INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) -#ifdef TARGET_AMD64 -INST3(movbe_apx, "movbe", IUM_WR, 0x000061, BAD_CODE, 0x000060, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) -#endif // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), 3C, 1C, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) @@ -1293,27 +1306,6 @@ INST1(setge, "setge", IUM_WR, 0x0F009D, INST1(setle, "setle", IUM_WR, 0x0F009E, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) INST1(setg, "setg", IUM_WR, 0x0F009F, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -#ifdef TARGET_AMD64 -// The following instructions shall always be next to SETcc instructions group, the offset between 
the original instruction and the ZU variant should be 16. -// No new instruction should be inserted from INS_seto to setzug. -INST1(seto_apx, "setzuo", IUM_WR, SSEDBLMAP(4, 0x40), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST1(setno_apx, "setzuno", IUM_WR, SSEDBLMAP(4, 0x41), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST1(setb_apx, "setzub", IUM_WR, SSEDBLMAP(4, 0x42), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST1(setae_apx, "setzuae", IUM_WR, SSEDBLMAP(4, 0x43), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST1(sete_apx, "setzue", IUM_WR, SSEDBLMAP(4, 0x44), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST1(setne_apx, "setzune", IUM_WR, SSEDBLMAP(4, 0x45), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST1(setbe_apx, "setzube", IUM_WR, SSEDBLMAP(4, 0x46), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(seta_apx, "setzua", IUM_WR, SSEDBLMAP(4, 0x47), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(sets_apx, "setzus", IUM_WR, SSEDBLMAP(4, 0x48), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST1(setns_apx, "setzuns", IUM_WR, SSEDBLMAP(4, 0x49), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST1(setp_apx, "setzup", IUM_WR, SSEDBLMAP(4, 0x4A), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST1(setnp_apx, "setzunp", IUM_WR, SSEDBLMAP(4, 0x4B), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST1(setl_apx, "setzul", IUM_WR, SSEDBLMAP(4, 0x4C), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setge_apx, "setzuge", IUM_WR, SSEDBLMAP(4, 0x4D), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setle_apx, "setzule", IUM_WR, SSEDBLMAP(4, 0x4E), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST1(setg_apx, "setzug", IUM_WR, SSEDBLMAP(4, 0x4F), ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -#endif - // Indirect jump used for tailcalls. We differentiate between func-internal // indirect jump (e.g. used for switch) and tailcall indirect jumps because the // x64 unwinder might require the latter to be rex.w prefixed. 
From 05e937be1ee9559cab3504af9890bef46d8ac3f5 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 23 Jul 2025 17:11:06 -0700 Subject: [PATCH 19/19] Fix NDD check in emitOutputInstr: use IsApxNddCompatibleInstruction(ins) instead of TakesApxExtendedEvexPrefix(id). --- src/coreclr/jit/emitxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 5d1ad1bd1b2491..f776d7defe4d1a 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -19180,7 +19180,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); - if (TakesApxExtendedEvexPrefix(id) && id->idIsEvexNdContextSet()) + if (IsApxNddCompatibleInstruction(ins) && id->idIsEvexNdContextSet()) { // TODO-XArch-APX: // I'm not sure why instructions on this path can be with instruction