From ec19c1294d52d45357ec6e2d7cfe8cc4222b39e7 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 09:57:01 -0800
Subject: [PATCH 01/15] Remove the dead emitOutputRexOrVexPrefixIfNeeded and
 rename emitOutputSimdPrefixIfNeeded to emitOutputRexOrSimdPrefixIfNeeded

---
 src/coreclr/jit/emitxarch.cpp | 300 ++++------------------------------
 src/coreclr/jit/emitxarch.h   |   3 +-
 2 files changed, 31 insertions(+), 272 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 19c0ea4b480871..7f03582ea4a7c3 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1280,7 +1280,7 @@ bool isPrefix(BYTE b)
 }
 
 //------------------------------------------------------------------------
-// emitOutputSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions),
+// emitOutputRexOrSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions),
 // VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise.
 //
 // Arguments:
@@ -1292,7 +1292,7 @@ bool isPrefix(BYTE b)
 // Return Value:
 //    Size of prefix.
 //
-unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code)
+unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code)
 {
     // TODO-XArch-AVX512: Remove redundant code and collapse into single pathway for EVEX and VEX if possible.
     if (hasEvexPrefix(code))
@@ -1634,246 +1634,6 @@ unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_
     return 0;
 }
 
-// Outputs VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise.
-unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code)
-{
-    if (hasVexPrefix(code))
-    {
-        // Only AVX instructions should have a VEX prefix
-        assert(UseVEXEncoding() && IsVexEncodedInstruction(ins));
-        code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
-        code &= 0x00000000FFFFFFFFLL;
-
-        WORD leadingBytes = 0;
-        BYTE check        = (code >> 24) & 0xFF;
-        if (check != 0)
-        {
-            // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
-            // 4-byte opcode: with the bytes ordered as 0x22114433
-            // check for a prefix in the 11 position
-            BYTE sizePrefix = (code >> 16) & 0xFF;
-            if ((sizePrefix != 0) && isPrefix(sizePrefix))
-            {
-                // 'pp' bits in byte2 of VEX prefix allows us to encode SIMD size prefixes as two bits
-                //
-                //   00  - None   (0F    - packed float)
-                //   01  - 66     (66 0F - packed double)
-                //   10  - F3     (F3 0F - scalar float
-                //   11  - F2     (F2 0F - scalar double)
-                switch (sizePrefix)
-                {
-                    case 0x66:
-                        if (IsBMIInstruction(ins))
-                        {
-                            switch (ins)
-                            {
-                                case INS_rorx:
-                                case INS_pdep:
-                                case INS_mulx:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
-                                case INS_shrx:
-#endif
-                                {
-                                    vexPrefix |= 0x03;
-                                    break;
-                                }
-
-                                case INS_pext:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
-                                case INS_sarx:
-#endif
-                                {
-                                    vexPrefix |= 0x02;
-                                    break;
-                                }
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
-                                case INS_shlx:
-                                {
-                                    vexPrefix |= 0x01;
-                                    break;
-                                }
-#endif
-                                default:
-                                {
-                                    vexPrefix |= 0x00;
-                                    break;
-                                }
-                            }
-                        }
-                        else
-                        {
-                            vexPrefix |= 0x01;
-                        }
-                        break;
-                    case 0xF3:
-                        vexPrefix |= 0x02;
-                        break;
-                    case 0xF2:
-                        vexPrefix |= 0x03;
-                        break;
-                    default:
-                        assert(!"unrecognized SIMD size prefix");
-                        unreached();
-                }
-
-                // Now the byte in the 22 position must be an escape byte 0F
-                leadingBytes = check;
-                assert(leadingBytes == 0x0F);
-
-                // Get rid of both sizePrefix and escape byte
-                code &= 0x0000FFFFLL;
-
-                // Check the byte in the 33 position to see if it is 3A or 38.
-                // In such a case escape bytes must be 0x0F3A or 0x0F38
-                check = code & 0xFF;
-                if (check == 0x3A || check == 0x38)
-                {
-                    leadingBytes = (leadingBytes << 8) | check;
-                    code &= 0x0000FF00LL;
-                }
-            }
-        }
-        else
-        {
-            // 2-byte opcode with the bytes ordered as 0x0011RM22
-            // the byte in position 11 must be an escape byte.
-            leadingBytes = (code >> 16) & 0xFF;
-            assert(leadingBytes == 0x0F || leadingBytes == 0x00);
-            code &= 0xFFFF;
-        }
-
-        // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38
-        // m-mmmmm bits in byte 1 of VEX prefix allows us to encode these
-        // implied leading bytes. 0x0F is supported by both the 2-byte and
-        // 3-byte encoding. While 0x0F3A and 0x0F38 are only supported by
-        // the 3-byte version.
-
-        switch (leadingBytes)
-        {
-            case 0x00:
-                // there is no leading byte
-                break;
-            case 0x0F:
-                vexPrefix |= 0x0100;
-                break;
-            case 0x0F38:
-                vexPrefix |= 0x0200;
-                break;
-            case 0x0F3A:
-                vexPrefix |= 0x0300;
-                break;
-            default:
-                assert(!"encountered unknown leading bytes");
-                unreached();
-        }
-
-        // At this point
-        //     VEX.2211RM33 got transformed as VEX.0000RM33
-        //     VEX.0011RM22 got transformed as VEX.0000RM22
-        //
-        // Now output VEX prefix leaving the 4-byte opcode
-
-        // The 2-byte VEX encoding, requires that the X and B-bits are set (these
-        // bits are inverted from the REX values so set means off), the W-bit is
-        // not set (this bit is not inverted), and that the m-mmmm bits are 0-0001
-        // (the 2-byte VEX encoding only supports the 0x0F leading byte). When these
-        // conditions are met, we can change byte-0 from 0xC4 to 0xC5 and then
-        // byte-1 is the logical-or of bit 7 from byte-1 and bits 0-6 from byte 2
-        // from the 3-byte VEX encoding.
-        //
-        // Given the above, the check can be reduced to a simple mask and comparison.
-        // * 0xFFFF7F80 is a mask that ignores any bits whose value we don't care about:
-        //   * R can be set or unset              (0x7F ignores bit 7)
-        //   * vvvv can be any value              (0x80 ignores bits 3-6)
-        //   * L can be set or unset              (0x80 ignores bit 2)
-        //   * pp can be any value                (0x80 ignores bits 0-1)
-        // * 0x00C46100 is a value that signifies the requirements listed above were met:
-        //   * We must be a three-byte VEX opcode (0x00C4)
-        //   * X and B must be set                (0x61 validates bits 5-6)
-        //   * m-mmmm must be 0-00001             (0x61 validates bits 0-4)
-        //   * W must be unset                    (0x00 validates bit 7)
-        if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
-        {
-            // Encoding optimization calculation is not done while estimating the instruction
-            // size and thus over-predict instruction size by 1 byte.
-            // If there are IGs that will be aligned, do not optimize encoding so the
-            // estimated alignment sizes are accurate.
-            if (emitCurIG->igNum > emitLastAlignedIgNum)
-            {
-                emitOutputByte(dst, 0xC5);
-                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
-                return 2;
-            }
-        }
-
-        emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));
-        emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0xFF));
-        emitOutputByte(dst + 2, vexPrefix & 0xFF);
-        return 3;
-    }
-
-#ifdef TARGET_AMD64
-    if (code > 0x00FFFFFFFFLL)
-    {
-        BYTE prefix = (code >> 32) & 0xFF;
-        noway_assert(prefix >= 0x40 && prefix <= 0x4F);
-        code &= 0x00000000FFFFFFFFLL;
-
-        // TODO-AMD64-Cleanup: when we remove the prefixes (just the SSE opcodes right now)
-        // we can remove this code as well
-
-        // The REX prefix is required to come after all other prefixes.
-        // Some of our 'opcodes' actually include some prefixes, if that
-        // is the case, shift them over and place the REX prefix after
-        // the other prefixes, and emit any prefix that got moved out.
-        BYTE check = (code >> 24) & 0xFF;
-        if (check == 0)
-        {
-            // 3-byte opcode: with the bytes ordered as 0x00113322
-            // check for a prefix in the 11 position
-            check = (code >> 16) & 0xFF;
-            if (check != 0 && isPrefix(check))
-            {
-                // Swap the rex prefix and whatever this prefix is
-                code = (((DWORD)prefix << 16) | (code & 0x0000FFFFLL));
-                // and then emit the other prefix
-                return emitOutputByte(dst, check);
-            }
-        }
-        else
-        {
-            // 4-byte opcode with the bytes ordered as 0x22114433
-            // first check for a prefix in the 11 position
-            BYTE check2 = (code >> 16) & 0xFF;
-            if (isPrefix(check2))
-            {
-                assert(!isPrefix(check)); // We currently don't use this, so it is untested
-                if (isPrefix(check))
-                {
-                    // 3 prefixes were rex = rr, check = c1, check2 = c2 encoded as 0xrrc1c2XXXX
-                    // Change to c2rrc1XXXX, and emit check2 now
-                    code = (((code_t)prefix << 24) | ((code_t)check << 16) | (code & 0x0000FFFFLL));
-                }
-                else
-                {
-                    // 2 prefixes were rex = rr, check2 = c2 encoded as 0xrrXXc2XXXX, (check is part of the opcode)
-                    // Change to c2XXrrXXXX, and emit check2 now
-                    code = (((code_t)check << 24) | ((code_t)prefix << 16) | (code & 0x0000FFFFLL));
-                }
-                return emitOutputByte(dst, check2);
-            }
-        }
-
-        return emitOutputByte(dst, prefix);
-    }
-#endif // TARGET_AMD64
-
-    return 0;
-}
-
 #ifdef TARGET_AMD64
 /*****************************************************************************
  * Is the last instruction emitted a call instruction?
@@ -11339,7 +11099,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         if (id->idIsCallRegPtr())
         {
             code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code);
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
             dst += emitOutputWord(dst, code);
             goto DONE;
         }
@@ -11365,7 +11125,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // And emit the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
 #endif // TARGET_AMD64
 
@@ -11527,7 +11287,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
         unsigned regcode = insEncodeReg345(ins, reg345, size, &code);
 
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         if (UseSimdEncoding() && (ins != INS_crc32))
         {
@@ -11555,7 +11315,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest word of the opcode
         // We need to check again as in case of AVX instructions leading opcode bytes are stripped off
@@ -11572,7 +11332,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(ins != INS_bt);
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest byte of the opcode
         if (code & 0x00FF0000)
@@ -11640,7 +11400,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Output the REX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     // Get the displacement value
     dsp = emitGetInsAmdAny(id);
@@ -12341,7 +12101,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
         unsigned regcode = insEncodeReg345(ins, reg345, size, &code);
 
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         if (UseSimdEncoding() && (ins != INS_crc32))
         {
@@ -12369,7 +12129,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest word of the opcode
         // We need to check again because in case of AVX instructions the leading
@@ -12386,7 +12146,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(ins != INS_bt);
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest byte of the opcode.
         // We need to check again because in case of AVX instructions the leading
@@ -12458,7 +12218,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Output the REX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     // Figure out the variable's frame position
     int varNum = id->idAddr()->iiaLclVar.lvaVarNum();
@@ -12841,7 +12601,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
         unsigned regcode = insEncodeReg345(ins, reg345, size, &code);
 
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         if (UseVEXEncoding() && (ins != INS_crc32))
         {
@@ -12871,7 +12631,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest word of the opcode.
         // Check again since AVX instructions encode leading opcode bytes as part of VEX prefix.
@@ -12884,7 +12644,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     else if (code & 0x00FF0000)
     {
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         // Check again as VEX prefix would have encoded leading opcode byte
         if (code & 0x00FF0000)
@@ -12941,7 +12701,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Output the REX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     if (code)
     {
@@ -13228,7 +12988,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
                 unsigned regcode = insEncodeReg012(ins, reg, size, &code);
 
                 // Output the REX prefix
-                dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+                dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
                 dst += emitOutputWord(dst, code | (regcode << 8));
             }
@@ -13255,7 +13015,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
             assert(!TakesRexWPrefix(ins, size));
 
             // Output the REX prefix
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
             dst += emitOutputByte(dst, code);
             break;
@@ -13280,7 +13040,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
             unsigned regcode = insEncodeReg012(ins, reg, size, &code);
 
             // Output the REX prefix
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
             dst += emitOutputWord(dst, code | (regcode << 8));
             break;
@@ -13309,7 +13069,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
             code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins));
 
             // Output the REX prefix
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
             // We expect this to always be a 'big' opcode
             assert(code & 0x00FF0000);
@@ -13354,7 +13114,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
             }
 
             // Output the REX prefix
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
             dst += emitOutputWord(dst, code);
             break;
@@ -13590,7 +13350,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     }
 
     // Output the REX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     if (code & 0xFF000000)
     {
@@ -13844,7 +13604,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
     code = insEncodeReg3456(ins, src1, size, code);
 
     // Output the REX/VEX/EVEX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     // Is this a 'big' opcode?
     if (code & 0xFF000000)
@@ -13948,7 +13708,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
         unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8;
 
         // Output the REX prefix
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         if (code & 0xFF000000)
         {
@@ -13984,7 +13744,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
             code = AddRexWPrefix(ins, code);
         }
 
-        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
         dst += emitOutputByte(dst, code);
         if (size == EA_4BYTE)
@@ -14113,7 +13873,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
     }
 
     // Output the REX prefix
-    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
     // Does the value fit in a sign-extended byte?
     // Important!  Only set the 's' bit when we have a size larger than EA_1BYTE.
@@ -14315,7 +14075,7 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id)
                 if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
                 {
                     code = AddRexWPrefix(ins, code);
-                    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+                    dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
                 }
 
                 dst += emitOutputByte(dst, code);
@@ -14941,7 +14701,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             {
                 code = AddRexWPrefix(ins, code);
             }
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 #endif
             // Is this a 'big' opcode?
             if (code & 0xFF000000)
@@ -15234,7 +14994,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 dst += emitOutputByte(dst, 0x66);
             }
 
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
             dst += emitOutputWord(dst, code);
             dst += emitOutputByte(dst, emitGetInsSC(id));
             sz = emitSizeOfInsDsc(id);
@@ -15340,7 +15100,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             regcode = (insEncodeReg345(ins, rReg, size, &code) | insEncodeReg012(ins, mReg, size, &code));
 
             // Output the REX prefix
-            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
 
             if (code & 0xFF000000)
             {
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 13f79bc3833cca..c5238685a6247e 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -65,8 +65,7 @@ BYTE* emitOutputRRR(BYTE* dst, instrDesc* id);
 
 BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id);
 
-unsigned emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
-unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
+unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
 unsigned emitGetRexPrefixSize(instruction ins);
 unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr);
 unsigned emitGetEvexPrefixSize(instruction ins);

From 4d75d052dc27ed33f7977c2fc72742bb510999d9 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 09:58:18 -0800
Subject: [PATCH 02/15] Remove the dead emitGetAdjustedSize and rename
 emitGetAdjustedSizeEvexAware to emitGetAdjustedSize

---
 src/coreclr/jit/emitxarch.cpp | 106 ++++------------------------------
 src/coreclr/jit/emitxarch.h   |   1 -
 2 files changed, 11 insertions(+), 96 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 7f03582ea4a7c3..1f74aab45221a9 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1678,7 +1678,6 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 
 //------------------------------------------------------------------------
 // emitGetEvexPrefixSize: Gets Size of Evex prefix in bytes
-// TODO-XArch-AVX512: Once evex encoding is supported fully, this will take the place of emitGetAdjustedSize()
 //
 // Arguments:
 //    ins   -- The instruction
@@ -1710,9 +1709,8 @@ unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
 }
 
 //------------------------------------------------------------------------
-// emitGetAdjustedSizeEvexAware: Determines any size adjustment needed for a given instruction based on the current
+// emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current
 // configuration.
-// TODO-XArch-AVX512: Once evex encoding is supported fully, this will take the place of emitGetAdjustedSize()
 //
 // Arguments:
 //    ins   -- The instruction being emitted
@@ -1722,7 +1720,7 @@ unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
 // Returns:
 //    Updated size.
 //
-unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code)
+unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code)
 {
     unsigned adjustedSize = 0;
 
@@ -1847,88 +1845,6 @@ unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, c
     return adjustedSize;
 }
 
-//------------------------------------------------------------------------
-// emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current
-// configuration.
-//
-// Arguments:
-//    ins   -- The instruction being emitted
-//    attr  -- The emit attribute
-//    code  -- The current opcode and any known prefixes
-unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code)
-{
-    unsigned adjustedSize = 0;
-
-    if (IsVexEncodedInstruction(ins))
-    {
-        // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces.
-        // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always
-        // overstimate.
-        // Instead this routine will adjust the size of VEX prefix based on the number of bytes of opcode it encodes so
-        // that
-        // instruction size estimate will be accurate.
-        // Basically this  will decrease the vexPrefixSize, so that opcodeSize + vexPrefixAdjustedSize will be the right
-        // size.
-        //
-        // rightOpcodeSize + vexPrefixSize
-        //  = (opcodeSize - ExtrabytesSize) + vexPrefixSize
-        //  = opcodeSize + (vexPrefixSize - ExtrabytesSize)
-        //  = opcodeSize + vexPrefixAdjustedSize
-
-        unsigned vexPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr);
-        assert(vexPrefixAdjustedSize == 3);
-
-        // In this case, opcode will contains escape prefix at least one byte,
-        // vexPrefixAdjustedSize should be minus one.
-        vexPrefixAdjustedSize -= 1;
-
-        // Get the fourth byte in Opcode.
-        // If this byte is non-zero, then we should check whether the opcode contains SIMD prefix or not.
-        BYTE check = (code >> 24) & 0xFF;
-        if (check != 0)
-        {
-            // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
-            // 4-byte opcode: with the bytes ordered as 0x22114433
-            // Simd prefix is at the first byte.
-            BYTE sizePrefix = (code >> 16) & 0xFF;
-            if (sizePrefix != 0 && isPrefix(sizePrefix))
-            {
-                vexPrefixAdjustedSize -= 1;
-            }
-
-            // If the opcode size is 4 bytes, then the second escape prefix is at fourth byte in opcode.
-            // But in this case the opcode has not counted R\M part.
-            // opcodeSize + VexPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize
-            //=opcodeSize + VexPrefixAdjustedSize -1 + 1
-            //=opcodeSize + VexPrefixAdjustedSize
-            // So although we may have second byte escape prefix, we won't decrease vexPrefixAdjustedSize.
-        }
-
-        adjustedSize = vexPrefixAdjustedSize;
-    }
-    else if (Is4ByteSSEInstruction(ins))
-    {
-        // The 4-Byte SSE instructions require one additional byte to hold the ModRM byte
-        adjustedSize++;
-    }
-    else
-    {
-        if (ins == INS_crc32)
-        {
-            // Adjust code size for CRC32 that has 4-byte opcode but does not use SSE38 or EES3A encoding.
-            adjustedSize++;
-        }
-
-        if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx))
-        {
-            // Most 16-bit operand instructions will need a 0x66 prefix.
-            adjustedSize++;
-        }
-    }
-
-    return adjustedSize;
-}
-
 //
 //------------------------------------------------------------------------
 // emitGetPrefixSize: Get size of rex or vex prefix emitted in code
@@ -2842,7 +2758,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code)
     instruction ins  = id->idIns();
     emitAttr    attr = id->idOpSize();
 
-    UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, attr, code);
+    UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, attr, code);
 
     bool includeRexPrefixSize = true;
     // REX prefix
@@ -2910,7 +2826,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re
     // This would probably be better expressed as a different format or something?
     code_t code = insCodeRM(ins);
 
-    UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, size, insCodeRM(ins));
+    UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, size, insCodeRM(ins));
 
     bool includeRexPrefixSize = true;
     // REX prefix
@@ -3132,7 +3048,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     assert(id->idIns() != INS_invalid);
     instruction    ins      = id->idIns();
     emitAttr       attrSize = id->idOpSize();
-    UNATIVE_OFFSET prefix   = emitGetAdjustedSizeEvexAware(ins, attrSize, code);
+    UNATIVE_OFFSET prefix   = emitGetAdjustedSize(ins, attrSize, code);
 
     // REX prefix
     if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) ||
@@ -3150,7 +3066,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     instruction    ins       = id->idIns();
     emitAttr       attrSize  = id->idOpSize();
     UNATIVE_OFFSET valSize   = EA_SIZE_IN_BYTES(attrSize);
-    UNATIVE_OFFSET prefix    = emitGetAdjustedSizeEvexAware(ins, attrSize, code);
+    UNATIVE_OFFSET prefix    = emitGetAdjustedSize(ins, attrSize, code);
     bool           valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test);
 
 #ifdef TARGET_AMD64
@@ -3283,7 +3199,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         size = 2;
     }
 
-    size += emitGetAdjustedSizeEvexAware(ins, attrSize, code);
+    size += emitGetAdjustedSize(ins, attrSize, code);
 
     if (hasRexPrefix(code))
     {
@@ -3501,7 +3417,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code)
     // can be reached via RIP-relative addressing.
     UNATIVE_OFFSET size = sizeof(INT32);
 
-    size += emitGetAdjustedSizeEvexAware(ins, attrSize, code);
+    size += emitGetAdjustedSize(ins, attrSize, code);
 
     bool includeRexPrefixSize = true;
 
@@ -3743,7 +3659,7 @@ void emitter::emitIns(instruction ins, emitAttr attr)
 
     insFormat fmt = IF_NONE;
 
-    sz += emitGetAdjustedSizeEvexAware(ins, attr, code);
+    sz += emitGetAdjustedSize(ins, attr, code);
     if (TakesRexWPrefix(ins, attr))
     {
         sz += emitGetRexPrefixSize(ins);
@@ -4888,7 +4804,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg)
     id->idReg1(reg);
 
     // Vex bytes
-    sz += emitGetAdjustedSizeEvexAware(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins)));
+    sz += emitGetAdjustedSize(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins)));
 
     // REX byte
     if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr))
@@ -5030,7 +4946,7 @@ void emitter::emitIns_R_I(instruction ins,
             break;
     }
 
-    sz += emitGetAdjustedSizeEvexAware(ins, attr, insCodeMI(ins));
+    sz += emitGetAdjustedSize(ins, attr, insCodeMI(ins));
 
     // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
     // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index c5238685a6247e..94174885110835 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -71,7 +71,6 @@ unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr);
 unsigned emitGetEvexPrefixSize(instruction ins);
 unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize);
 unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code);
-unsigned emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code);
 
 unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code);
 unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code);

From 0932ac1e378a6b9a7352383dbba69003cdfba49e Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 10:01:47 -0800
Subject: [PATCH 03/15] Simplify the emitGetEvexPrefixSize and
 emitGetVexPrefixSize methods

---
 src/coreclr/jit/emitxarch.cpp | 521 ++++++++++++++++++++--------------
 src/coreclr/jit/emitxarch.h   |   2 +-
 2 files changed, 309 insertions(+), 214 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 1f74aab45221a9..9d6a4a9cca196c 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1280,261 +1280,354 @@ bool isPrefix(BYTE b)
 }
 
 //------------------------------------------------------------------------
-// emitOutputRexOrSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions),
-// VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise.
+// emitExtractEvexPrefix: Extract the EVEX prefix
 //
 // Arguments:
-//    ins -- processor instruction to check.
-//    dst -- buffer to write prefix to.
-//    code -- opcode bits.
-//    attr -- operand size
+//    [In]      ins  -- processor instruction to check.
+//    [In, Out] code -- opcode bits which will no longer contain the evex prefix on return
 //
 // Return Value:
-//    Size of prefix.
+//    The extracted EVEX prefix.
 //
-unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code)
+emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code)
 {
-    // TODO-XArch-AVX512: Remove redundant code and collapse into single pathway for EVEX and VEX if possible.
-    if (hasEvexPrefix(code))
-    {
-        // Only AVX512 instructions should have an EVEX prefix
-        assert(IsEvexEncodedInstruction(ins));
+    assert(IsEvexEncodedInstruction(ins));
 
-        code_t evexPrefix = (code >> 32) & 0xFFFFFFFF;
-        code &= 0x00000000FFFFFFFFLL;
+    code_t evexPrefix = (code >> 32) & 0xFFFFFFFF;
+    code &= 0x00000000FFFFFFFFLL;
 
-        WORD leadingBytes = 0;
-        BYTE check        = (code >> 24) & 0xFF;
-        if (check != 0)
+    WORD leadingBytes = 0;
+    BYTE check        = (code >> 24) & 0xFF;
+
+    if (check != 0)
+    {
+        // check for a prefix in the 11 position
+        BYTE sizePrefix = (code >> 16) & 0xFF;
+
+        if ((sizePrefix != 0) && isPrefix(sizePrefix))
         {
-            // check for a prefix in the 11 position
-            BYTE sizePrefix = (code >> 16) & 0xFF;
-            if ((sizePrefix != 0) && isPrefix(sizePrefix))
+            // 'pp' bits in byte 1 of EVEX prefix allows us to encode SIMD size prefixes as two bits
+            //
+            //   00  - None   (0F    - packed float)
+            //   01  - 66     (66 0F - packed double)
+            //   10  - F3     (F3 0F - scalar float
+            //   11  - F2     (F2 0F - scalar double)
+            switch (sizePrefix)
             {
-                // 'pp' bits in byte 1 of EVEX prefix allows us to encode SIMD size prefixes as two bits
-                //
-                //   00  - None   (0F    - packed float)
-                //   01  - 66     (66 0F - packed double)
-                //   10  - F3     (F3 0F - scalar float
-                //   11  - F2     (F2 0F - scalar double)
-                switch (sizePrefix)
+                case 0x66:
                 {
-                    case 0x66:
-                        // None of the existing BMI instructions should be EVEX encoded.
-                        assert(!IsBMIInstruction(ins));
-                        evexPrefix |= (0x01 << 8);
-                        break;
-                    case 0xF3:
-                        evexPrefix |= (0x02 << 8);
-                        break;
-                    case 0xF2:
-                        evexPrefix |= (0x03 << 8);
-                        break;
-                    default:
-                        assert(!"unrecognized SIMD size prefix");
-                        unreached();
+                    // None of the existing BMI instructions should be EVEX encoded.
+                    assert(!IsBMIInstruction(ins));
+                    evexPrefix |= (0x01 << 8);
+                    break;
                 }
 
-                // Now the byte in the 22 position must be an escape byte 0F
-                leadingBytes = check;
-                assert(leadingBytes == 0x0F);
+                case 0xF3:
+                {
+                    evexPrefix |= (0x02 << 8);
+                    break;
+                }
 
-                // Get rid of both sizePrefix and escape byte
-                code &= 0x0000FFFFLL;
+                case 0xF2:
+                {
+                    evexPrefix |= (0x03 << 8);
+                    break;
+                }
 
-                // Check the byte in the 33 position to see if it is 3A or 38.
-                // In such a case escape bytes must be 0x0F3A or 0x0F38
-                check = code & 0xFF;
-                if (check == 0x3A || check == 0x38)
+                default:
                 {
-                    leadingBytes = (leadingBytes << 8) | check;
-                    code &= 0x0000FF00LL;
+                    assert(!"unrecognized SIMD size prefix");
+                    unreached();
                 }
             }
+
+            // Now the byte in the 22 position must be an escape byte 0F
+            leadingBytes = check;
+            assert(leadingBytes == 0x0F);
+
+            // Get rid of both sizePrefix and escape byte
+            code &= 0x0000FFFFLL;
+
+            // Check the byte in the 33 position to see if it is 3A or 38.
+            // In such a case escape bytes must be 0x0F3A or 0x0F38
+            check = code & 0xFF;
+
+            if ((check == 0x3A) || (check == 0x38))
+            {
+                leadingBytes = (leadingBytes << 8) | check;
+                code &= 0x0000FF00LL;
+            }
         }
-        else
+    }
+    else
+    {
+        // 2-byte opcode with the bytes ordered as 0x0011RM22
+        // the byte in position 11 must be an escape byte.
+        leadingBytes = (code >> 16) & 0xFF;
+        assert(leadingBytes == 0x0F || leadingBytes == 0x00);
+        code &= 0xFFFF;
+    }
+
+    // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38
+    // mm bits in byte 0 of EVEX prefix allows us to encode these
+    // implied leading bytes. They are identical to low two bits of VEX.mmmmm
+
+    switch (leadingBytes)
+    {
+        case 0x00:
         {
-            // 2-byte opcode with the bytes ordered as 0x0011RM22
-            // the byte in position 11 must be an escape byte.
-            leadingBytes = (code >> 16) & 0xFF;
-            assert(leadingBytes == 0x0F || leadingBytes == 0x00);
-            code &= 0xFFFF;
+            // there is no leading byte
+            break;
         }
 
-        // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38
-        // mm bits in byte 0 of EVEX prefix allows us to encode these
-        // implied leading bytes. They are identical to low two bits of VEX.mmmmm
+        case 0x0F:
+        {
+            evexPrefix |= (0x01 << 16);
+            break;
+        }
 
-        switch (leadingBytes)
+        case 0x0F38:
         {
-            case 0x00:
-                // there is no leading byte
-                break;
-            case 0x0F:
-                evexPrefix |= (0x01 << 16);
-                break;
-            case 0x0F38:
-                evexPrefix |= (0x02 << 16);
-                break;
-            case 0x0F3A:
-                evexPrefix |= (0x03 << 16);
-                break;
-            default:
-                assert(!"encountered unknown leading bytes");
-                unreached();
+            evexPrefix |= (0x02 << 16);
+            break;
         }
 
-        // At this point
-        //     EVEX.2211RM33 got transformed as EVEX.0000RM33
-        //     EVEX.0011RM22 got transformed as EVEX.0000RM22
-        //
-        // Now output EVEX prefix leaving the 4-byte opcode
-        // EVEX prefix is always 4 bytes
+        case 0x0F3A:
+        {
+            evexPrefix |= (0x03 << 16);
+            break;
+        }
 
-        emitOutputByte(dst, ((evexPrefix >> 24) & 0xFF));
-        emitOutputByte(dst + 1, ((evexPrefix >> 16) & 0xFF));
-        emitOutputByte(dst + 2, (evexPrefix >> 8) & 0xFF);
-        emitOutputByte(dst + 3, evexPrefix & 0xFF);
-        return 4;
+        default:
+        {
+            assert(!"encountered unknown leading bytes");
+            unreached();
+        }
     }
-    else if (hasVexPrefix(code))
+
+    // At this point
+    //     EVEX.2211RM33 got transformed as EVEX.0000RM33
+    //     EVEX.0011RM22 got transformed as EVEX.0000RM22
+    //
+    // Now output EVEX prefix leaving the 4-byte opcode
+    // EVEX prefix is always 4 bytes
+
+    return evexPrefix;
+}
+
+//------------------------------------------------------------------------
+// emitExtractVexPrefix: Extract the VEX prefix
+//
+// Arguments:
+//    [In]      ins  -- processor instruction to check.
+//    [In, Out] code -- opcode bits which will no longer contain the vex prefix on return
+//
+// Return Value:
+//    The extracted VEX prefix.
+//
+emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code)
+{
+    assert(IsVexEncodedInstruction(ins));
+
+    code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
+    code &= 0x00000000FFFFFFFFLL;
+
+    WORD leadingBytes = 0;
+    BYTE check        = (code >> 24) & 0xFF;
+
+    if (check != 0)
     {
-        // Only AVX instructions should have a VEX prefix
-        assert(UseVEXEncoding() && IsVexEncodedInstruction(ins));
-        code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
-        code &= 0x00000000FFFFFFFFLL;
+        // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
+        // 4-byte opcode: with the bytes ordered as 0x22114433
 
-        WORD leadingBytes = 0;
-        BYTE check        = (code >> 24) & 0xFF;
-        if (check != 0)
+        // check for a prefix in the 11 position
+        BYTE sizePrefix = (code >> 16) & 0xFF;
+
+        if ((sizePrefix != 0) && isPrefix(sizePrefix))
         {
-            // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
-            // 4-byte opcode: with the bytes ordered as 0x22114433
-            // check for a prefix in the 11 position
-            BYTE sizePrefix = (code >> 16) & 0xFF;
-            if ((sizePrefix != 0) && isPrefix(sizePrefix))
+            // 'pp' bits in byte2 of VEX prefix allows us to encode SIMD size prefixes as two bits
+            //
+            //   00  - None   (0F    - packed float)
+            //   01  - 66     (66 0F - packed double)
+            //   10  - F3     (F3 0F - scalar float
+            //   11  - F2     (F2 0F - scalar double)
+            switch (sizePrefix)
             {
-                // 'pp' bits in byte2 of VEX prefix allows us to encode SIMD size prefixes as two bits
-                //
-                //   00  - None   (0F    - packed float)
-                //   01  - 66     (66 0F - packed double)
-                //   10  - F3     (F3 0F - scalar float
-                //   11  - F2     (F2 0F - scalar double)
-                switch (sizePrefix)
+                case 0x66:
                 {
-                    case 0x66:
-                        if (IsBMIInstruction(ins))
+                    if (IsBMIInstruction(ins))
+                    {
+                        switch (ins)
                         {
-                            switch (ins)
-                            {
-                                case INS_rorx:
-                                case INS_pdep:
-                                case INS_mulx:
+                            case INS_rorx:
+                            case INS_pdep:
+                            case INS_mulx:
 // TODO: Unblock when enabled for x86
 #ifdef TARGET_AMD64
-                                case INS_shrx:
+                            case INS_shrx:
 #endif
-                                {
-                                    vexPrefix |= 0x03;
-                                    break;
-                                }
+                            {
+                                vexPrefix |= 0x03;
+                                break;
+                            }
 
-                                case INS_pext:
+                            case INS_pext:
 // TODO: Unblock when enabled for x86
 #ifdef TARGET_AMD64
-                                case INS_sarx:
+                            case INS_sarx:
 #endif
-                                {
-                                    vexPrefix |= 0x02;
-                                    break;
-                                }
+                            {
+                                vexPrefix |= 0x02;
+                                break;
+                            }
 // TODO: Unblock when enabled for x86
 #ifdef TARGET_AMD64
-                                case INS_shlx:
-                                {
-                                    vexPrefix |= 0x01;
-                                    break;
-                                }
+                            case INS_shlx:
+                            {
+                                vexPrefix |= 0x01;
+                                break;
+                            }
 #endif
-                                default:
-                                {
-                                    vexPrefix |= 0x00;
-                                    break;
-                                }
+                            default:
+                            {
+                                vexPrefix |= 0x00;
+                                break;
                             }
                         }
-                        else
-                        {
-                            vexPrefix |= 0x01;
-                        }
-                        break;
-                    case 0xF3:
-                        vexPrefix |= 0x02;
-                        break;
-                    case 0xF2:
-                        vexPrefix |= 0x03;
-                        break;
-                    default:
-                        assert(!"unrecognized SIMD size prefix");
-                        unreached();
+                    }
+                    else
+                    {
+                        vexPrefix |= 0x01;
+                    }
+                    break;
                 }
 
-                // Now the byte in the 22 position must be an escape byte 0F
-                leadingBytes = check;
-                assert(leadingBytes == 0x0F);
+                case 0xF3:
+                {
+                    vexPrefix |= 0x02;
+                    break;
+                }
 
-                // Get rid of both sizePrefix and escape byte
-                code &= 0x0000FFFFLL;
+                case 0xF2:
+                {
+                    vexPrefix |= 0x03;
+                    break;
+                }
 
-                // Check the byte in the 33 position to see if it is 3A or 38.
-                // In such a case escape bytes must be 0x0F3A or 0x0F38
-                check = code & 0xFF;
-                if (check == 0x3A || check == 0x38)
+                default:
                 {
-                    leadingBytes = (leadingBytes << 8) | check;
-                    code &= 0x0000FF00LL;
+                    assert(!"unrecognized SIMD size prefix");
+                    unreached();
                 }
             }
+
+            // Now the byte in the 22 position must be an escape byte 0F
+            leadingBytes = check;
+            assert(leadingBytes == 0x0F);
+
+            // Get rid of both sizePrefix and escape byte
+            code &= 0x0000FFFFLL;
+
+            // Check the byte in the 33 position to see if it is 3A or 38.
+            // In such a case escape bytes must be 0x0F3A or 0x0F38
+            check = code & 0xFF;
+
+            if ((check == 0x3A) || (check == 0x38))
+            {
+                leadingBytes = (leadingBytes << 8) | check;
+                code &= 0x0000FF00LL;
+            }
         }
-        else
+    }
+    else
+    {
+        // 2-byte opcode with the bytes ordered as 0x0011RM22
+        // the byte in position 11 must be an escape byte.
+        leadingBytes = (code >> 16) & 0xFF;
+        assert(leadingBytes == 0x0F || leadingBytes == 0x00);
+        code &= 0xFFFF;
+    }
+
+    // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38
+    // m-mmmmm bits in byte 1 of VEX prefix allows us to encode these
+    // implied leading bytes. 0x0F is supported by both the 2-byte and
+    // 3-byte encoding. While 0x0F3A and 0x0F38 are only supported by
+    // the 3-byte version.
+
+    switch (leadingBytes)
+    {
+        case 0x00:
         {
-            // 2-byte opcode with the bytes ordered as 0x0011RM22
-            // the byte in position 11 must be an escape byte.
-            leadingBytes = (code >> 16) & 0xFF;
-            assert(leadingBytes == 0x0F || leadingBytes == 0x00);
-            code &= 0xFFFF;
+            // there is no leading byte
+            break;
         }
 
-        // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38
-        // m-mmmmm bits in byte 1 of VEX prefix allows us to encode these
-        // implied leading bytes. 0x0F is supported by both the 2-byte and
-        // 3-byte encoding. While 0x0F3A and 0x0F38 are only supported by
-        // the 3-byte version.
+        case 0x0F:
+        {
+            vexPrefix |= 0x0100;
+            break;
+        }
 
-        switch (leadingBytes)
+        case 0x0F38:
         {
-            case 0x00:
-                // there is no leading byte
-                break;
-            case 0x0F:
-                vexPrefix |= 0x0100;
-                break;
-            case 0x0F38:
-                vexPrefix |= 0x0200;
-                break;
-            case 0x0F3A:
-                vexPrefix |= 0x0300;
-                break;
-            default:
-                assert(!"encountered unknown leading bytes");
-                unreached();
+            vexPrefix |= 0x0200;
+            break;
         }
 
-        // At this point
-        //     VEX.2211RM33 got transformed as VEX.0000RM33
-        //     VEX.0011RM22 got transformed as VEX.0000RM22
-        //
-        // Now output VEX prefix leaving the 4-byte opcode
+        case 0x0F3A:
+        {
+            vexPrefix |= 0x0300;
+            break;
+        }
+
+        default:
+        {
+            assert(!"encountered unknown leading bytes");
+            unreached();
+        }
+    }
+
+    // At this point
+    //     VEX.2211RM33 got transformed as VEX.0000RM33
+    //     VEX.0011RM22 got transformed as VEX.0000RM22
+    //
+    // Now output VEX prefix leaving the 4-byte opcode
+
+    return vexPrefix;
+}
+
+//------------------------------------------------------------------------
+// emitOutputRexOrSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions),
+// VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise.
+//
+// Arguments:
+//    ins -- processor instruction to check.
+//    dst -- buffer to write prefix to.
+//    code -- opcode bits.
+//    attr -- operand size
+//
+// Return Value:
+//    Size of prefix.
+//
+unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code)
+{
+    // TODO-XArch-AVX512: Remove redundant code and collapse into single pathway for EVEX and VEX if possible.
+    if (hasEvexPrefix(code))
+    {
+        code_t evexPrefix = emitExtractEvexPrefix(ins, code);
+        assert(evexPrefix != 0);
+
+        emitOutputByte(dst, ((evexPrefix >> 24) & 0xFF));
+        emitOutputByte(dst + 1, ((evexPrefix >> 16) & 0xFF));
+        emitOutputByte(dst + 2, (evexPrefix >> 8) & 0xFF);
+        emitOutputByte(dst + 3, evexPrefix & 0xFF);
+
+        return 4;
+    }
+    else if (hasVexPrefix(code))
+    {
+        code_t vexPrefix = emitExtractVexPrefix(ins, code);
+        assert(vexPrefix != 0);
 
         // The 2-byte VEX encoding, requires that the X and B-bits are set (these
         // bits are inverted from the REX values so set means off), the W-bit is
@@ -1555,16 +1648,19 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
         //   * X and B must be set                (0x61 validates bits 5-6)
         //   * m-mmmm must be 0-00001             (0x61 validates bits 0-4)
         //   * W must be unset                    (0x00 validates bit 7)
+
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
             // Encoding optimization calculation is not done while estimating the instruction
             // size and thus over-predict instruction size by 1 byte.
             // If there are IGs that will be aligned, do not optimize encoding so the
             // estimated alignment sizes are accurate.
+
             if (emitCurIG->igNum > emitLastAlignedIgNum)
             {
                 emitOutputByte(dst, 0xC5);
                 emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
+
                 return 2;
             }
         }
@@ -1572,6 +1668,7 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));
         emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0xFF));
         emitOutputByte(dst + 2, vexPrefix & 0xFF);
+
         return 3;
     }
 
@@ -1677,7 +1774,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 }
 
 //------------------------------------------------------------------------
-// emitGetEvexPrefixSize: Gets Size of Evex prefix in bytes
+// emitGetEvexPrefixSize: Gets Size of EVEX prefix in bytes
 //
 // Arguments:
 //    ins   -- The instruction
@@ -1687,25 +1784,23 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 //
 unsigned emitter::emitGetEvexPrefixSize(instruction ins)
 {
-    if (IsEvexEncodedInstruction(ins))
-    {
-        return 4;
-    }
-
-    // If not AVX512, then we don't need to encode prefix.
-    return 0;
+    assert(IsEvexEncodedInstruction(ins));
+    return 4;
 }
 
-// Size of vex prefix in bytes
-unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
+//------------------------------------------------------------------------
+// emitGetVexPrefixSize: Gets Size of VEX prefix in bytes
+//
+// Arguments:
+//    ins   -- The instruction
+//
+// Returns:
+//    Prefix size in bytes.
+//
+unsigned emitter::emitGetVexPrefixSize(instruction ins)
 {
-    if (IsVexEncodedInstruction(ins))
-    {
-        return 3;
-    }
-
-    // If not AVX, then we don't need to encode vex prefix.
-    return 0;
+    assert(IsVexEncodedInstruction(ins));
+    return 3;
 }
 
 //------------------------------------------------------------------------
@@ -1791,8 +1886,8 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
         //  = opcodeSize + (vexPrefixSize - ExtrabytesSize)
         //  = opcodeSize + vexPrefixAdjustedSize
 
-        unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr);
-        assert(simdPrefixAdjustedSize == 3);
+        unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(ins);
+        assert((simdPrefixAdjustedSize == 2) || (simdPrefixAdjustedSize == 3));
 
         // In this case, opcode will contains escape prefix at least one byte,
         // vexPrefixAdjustedSize should be minus one.
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 94174885110835..45ed8328688d9a 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -67,7 +67,7 @@ BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id);
 
 unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
 unsigned emitGetRexPrefixSize(instruction ins);
-unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr);
+unsigned emitGetVexPrefixSize(instruction ins);
 unsigned emitGetEvexPrefixSize(instruction ins);
 unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize);
 unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code);

From 7bd14bec72911408b19679a6ea496db6a3f4a9d7 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 10:09:55 -0800
Subject: [PATCH 04/15] Create helper emitExtractVexPrefix and
 emitExtractEvexPrefix functions

---
 src/coreclr/jit/emitxarch.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 45ed8328688d9a..70d60dbc95566f 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -72,6 +72,9 @@ unsigned emitGetEvexPrefixSize(instruction ins);
 unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize);
 unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code);
 
+code_t emitExtractVexPrefix(instruction ins, code_t& code);
+code_t emitExtractEvexPrefix(instruction ins, code_t& code);
+
 unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code);
 unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code);
 code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code);

From b20a6d5bb4feb0859dbe5c8567ae02a390f9f9a9 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 10:37:05 -0800
Subject: [PATCH 05/15] Update emitGetVexPrefixSize and emitGetEvexPrefixSize
 to take an instrDesc

---
 src/coreclr/jit/emitxarch.cpp | 88 +++++++++++++++++++++--------------
 src/coreclr/jit/emitxarch.h   |  8 ++--
 2 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 9d6a4a9cca196c..d13215efc2e949 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1777,13 +1777,14 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 // emitGetEvexPrefixSize: Gets Size of EVEX prefix in bytes
 //
 // Arguments:
-//    ins   -- The instruction
+//    ins   -- The instruction descriptor
 //
 // Returns:
 //    Prefix size in bytes.
 //
-unsigned emitter::emitGetEvexPrefixSize(instruction ins)
+unsigned emitter::emitGetEvexPrefixSize(instrDesc* id)
 {
+    instruction ins = id->idIns();
     assert(IsEvexEncodedInstruction(ins));
     return 4;
 }
@@ -1792,13 +1793,14 @@ unsigned emitter::emitGetEvexPrefixSize(instruction ins)
 // emitGetVexPrefixSize: Gets Size of VEX prefix in bytes
 //
 // Arguments:
-//    ins   -- The instruction
+//    id   -- The instruction descriptor
 //
 // Returns:
 //    Prefix size in bytes.
 //
-unsigned emitter::emitGetVexPrefixSize(instruction ins)
+unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
 {
+    instruction ins = id->idIns();
     assert(IsVexEncodedInstruction(ins));
     return 3;
 }
@@ -1808,15 +1810,17 @@ unsigned emitter::emitGetVexPrefixSize(instruction ins)
 // configuration.
 //
 // Arguments:
-//    ins   -- The instruction being emitted
-//    attr  -- The emit attribute
+//    id    -- The instruction descriptor being emitted
 //    code  -- The current opcode and any known prefixes
 //
 // Returns:
 //    Updated size.
 //
-unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code)
+unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code)
 {
+    instruction ins  = id->idIns();
+    emitAttr    attr = id->idOpSize();
+
     unsigned adjustedSize = 0;
 
     // TODO-XArch-AVX512: Remove redundant code and possiblly collapse EVEX and VEX into a single pathway
@@ -1839,7 +1843,7 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
         //  = opcodeSize + (evexPrefixSize - ExtrabytesSize)
         //  = opcodeSize + evexPrefixAdjustedSize
 
-        unsigned evexPrefixAdjustedSize = emitGetEvexPrefixSize(ins);
+        unsigned evexPrefixAdjustedSize = emitGetEvexPrefixSize(id);
         assert(evexPrefixAdjustedSize == 4);
 
         // In this case, opcode will contains escape prefix at least one byte,
@@ -1886,7 +1890,7 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
         //  = opcodeSize + (vexPrefixSize - ExtrabytesSize)
         //  = opcodeSize + vexPrefixAdjustedSize
 
-        unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(ins);
+        unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(id);
         assert((simdPrefixAdjustedSize == 2) || (simdPrefixAdjustedSize == 3));
 
         // In this case, opcode will contains escape prefix at least one byte,
@@ -2853,7 +2857,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code)
     instruction ins  = id->idIns();
     emitAttr    attr = id->idOpSize();
 
-    UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, attr, code);
+    UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code);
 
     bool includeRexPrefixSize = true;
     // REX prefix
@@ -2912,8 +2916,22 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val
     return valSize + emitInsSizeRR(id, code);
 }
 
-inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr)
+//------------------------------------------------------------------------
+// emitInsSizeRR: Determines the code size for an instruction encoding that does not have any addressing modes
+//
+// Arguments:
+//    id    -- The instruction descriptor being emitted
+//
+// TODO-Cleanup: This method should really be merged with emitInsSizeRR(instrDesc*, code_t). However, it has
+// existed for a long time and its not trivial to determine why it exists as a separate method.
+//
+inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
 {
+    instruction ins  = id->idIns();
+    regNumber   reg1 = id->idReg1();
+    regNumber   reg2 = id->idReg2();
+    emitAttr    attr = id->idOpSize();
+
     emitAttr size = EA_SIZE(attr);
 
     // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes.
@@ -2921,7 +2939,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re
     // This would probably be better expressed as a different format or something?
     code_t code = insCodeRM(ins);
 
-    UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, size, insCodeRM(ins));
+    UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code);
 
     bool includeRexPrefixSize = true;
     // REX prefix
@@ -3143,7 +3161,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     assert(id->idIns() != INS_invalid);
     instruction    ins      = id->idIns();
     emitAttr       attrSize = id->idOpSize();
-    UNATIVE_OFFSET prefix   = emitGetAdjustedSize(ins, attrSize, code);
+    UNATIVE_OFFSET prefix   = emitGetAdjustedSize(id, code);
 
     // REX prefix
     if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) ||
@@ -3161,7 +3179,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     instruction    ins       = id->idIns();
     emitAttr       attrSize  = id->idOpSize();
     UNATIVE_OFFSET valSize   = EA_SIZE_IN_BYTES(attrSize);
-    UNATIVE_OFFSET prefix    = emitGetAdjustedSize(ins, attrSize, code);
+    UNATIVE_OFFSET prefix    = emitGetAdjustedSize(id, code);
     bool           valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test);
 
 #ifdef TARGET_AMD64
@@ -3294,7 +3312,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         size = 2;
     }
 
-    size += emitGetAdjustedSize(ins, attrSize, code);
+    size += emitGetAdjustedSize(id, code);
 
     if (hasRexPrefix(code))
     {
@@ -3512,7 +3530,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code)
     // can be reached via RIP-relative addressing.
     UNATIVE_OFFSET size = sizeof(INT32);
 
-    size += emitGetAdjustedSize(ins, attrSize, code);
+    size += emitGetAdjustedSize(id, code);
 
     bool includeRexPrefixSize = true;
 
@@ -3754,14 +3772,15 @@ void emitter::emitIns(instruction ins, emitAttr attr)
 
     insFormat fmt = IF_NONE;
 
-    sz += emitGetAdjustedSize(ins, attr, code);
+    id->idIns(ins);
+    id->idInsFmt(fmt);
+
+    sz += emitGetAdjustedSize(id, code);
     if (TakesRexWPrefix(ins, attr))
     {
         sz += emitGetRexPrefixSize(ins);
     }
 
-    id->idIns(ins);
-    id->idInsFmt(fmt);
     id->idCodeSize(sz);
 
     dispIns(id);
@@ -4899,7 +4918,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg)
     id->idReg1(reg);
 
     // Vex bytes
-    sz += emitGetAdjustedSize(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins)));
+    sz += emitGetAdjustedSize(id, insEncodeMRreg(ins, reg, attr, insCodeMR(ins)));
 
     // REX byte
     if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr))
@@ -5041,7 +5060,17 @@ void emitter::emitIns_R_I(instruction ins,
             break;
     }
 
-    sz += emitGetAdjustedSize(ins, attr, insCodeMI(ins));
+    id = emitNewInstrSC(attr, val);
+    id->idIns(ins);
+    id->idInsFmt(fmt);
+    id->idReg1(reg);
+
+#ifdef DEBUG
+    id->idDebugOnlyInfo()->idFlags     = gtFlags;
+    id->idDebugOnlyInfo()->idMemCookie = targetHandle;
+#endif
+
+    sz += emitGetAdjustedSize(id, insCodeMI(ins));
 
     // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
     // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target
@@ -5051,17 +5080,8 @@ void emitter::emitIns_R_I(instruction ins,
         sz += emitGetRexPrefixSize(ins);
     }
 
-    id = emitNewInstrSC(attr, val);
-    id->idIns(ins);
-    id->idInsFmt(fmt);
-    id->idReg1(reg);
     id->idCodeSize(sz);
 
-#ifdef DEBUG
-    id->idDebugOnlyInfo()->idFlags     = gtFlags;
-    id->idDebugOnlyInfo()->idMemCookie = targetHandle;
-#endif
-
     dispIns(id);
     emitCurIGsize += sz;
 
@@ -5633,13 +5653,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN
         return;
     }
 
-    UNATIVE_OFFSET sz = emitInsSizeRR(ins, dstReg, srcReg, attr);
-
     instrDesc* id = emitNewInstrSmall(attr);
     id->idIns(ins);
     id->idInsFmt(fmt);
     id->idReg1(dstReg);
     id->idReg2(srcReg);
+
+    UNATIVE_OFFSET sz = emitInsSizeRR(id);
     id->idCodeSize(sz);
 
     dispIns(id);
@@ -5664,8 +5684,6 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
     assert(size <= EA_32BYTE);
     noway_assert(emitVerifyEncodable(ins, size, reg1, reg2));
 
-    UNATIVE_OFFSET sz = emitInsSizeRR(ins, reg1, reg2, attr);
-
     /* Special case: "XCHG" uses a different format */
     insFormat fmt = (ins == INS_xchg) ? IF_RRW_RRW : emitInsModeFormat(ins, IF_RRD_RRD);
 
@@ -5674,6 +5692,8 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
     id->idInsFmt(fmt);
     id->idReg1(reg1);
     id->idReg2(reg2);
+
+    UNATIVE_OFFSET sz = emitInsSizeRR(id);
     id->idCodeSize(sz);
 
     dispIns(id);
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 70d60dbc95566f..0beff1a7ce4b63 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -44,7 +44,7 @@ UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp);
 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val);
 UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code);
 UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code, int val);
-UNATIVE_OFFSET emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr);
+UNATIVE_OFFSET emitInsSizeRR(instrDesc* id);
 UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code);
 UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val);
 UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code);
@@ -67,10 +67,10 @@ BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id);
 
 unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
 unsigned emitGetRexPrefixSize(instruction ins);
-unsigned emitGetVexPrefixSize(instruction ins);
-unsigned emitGetEvexPrefixSize(instruction ins);
+unsigned emitGetVexPrefixSize(instrDesc* id);
+unsigned emitGetEvexPrefixSize(instrDesc* id);
 unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize);
-unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code);
+unsigned emitGetAdjustedSize(instrDesc* id, code_t code);
 
 code_t emitExtractVexPrefix(instruction ins, code_t& code);
 code_t emitExtractEvexPrefix(instruction ins, code_t& code);

From 4d94897f78259a5f79d9ceaac41cc13d0dee5421 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 10:56:40 -0800
Subject: [PATCH 06/15] Update emitGetPrefixSize to take the instrDesc so the
 VEX prefix size can be computed correctly

---
 src/coreclr/jit/emitxarch.cpp | 53 ++++++++++++++++++++++-------------
 src/coreclr/jit/emitxarch.h   |  4 +--
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index d13215efc2e949..3de335d8bf33a6 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1949,19 +1949,20 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code)
 // emitGetPrefixSize: Get size of rex or vex prefix emitted in code
 //
 // Arguments:
+//    id                    -- The instruction descriptor for which to get its prefix size
 //    code                  -- The current opcode and any known prefixes
 //    includeRexPrefixSize  -- If Rex Prefix size should be included or not
 //
-unsigned emitter::emitGetPrefixSize(code_t code, bool includeRexPrefixSize)
+unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexPrefixSize)
 {
     if (hasEvexPrefix(code))
     {
-        return 4;
+        return emitGetEvexPrefixSize(id);
     }
 
     if (hasVexPrefix(code))
     {
-        return 3;
+        return emitGetVexPrefixSize(id);
     }
 
     if (includeRexPrefixSize && hasRexPrefix(code))
@@ -2832,14 +2833,15 @@ bool emitter::emitVerifyEncodable(instruction ins, emitAttr size, regNumber reg1
 // emitInsSize: Estimate the size (in bytes of generated code) of the given instruction.
 //
 // Arguments:
+//    id    -- The instruction descriptor for which to estimate its size
 //    code  -- The current opcode and any known prefixes
 //    includeRexPrefixSize  -- If Rex Prefix size should be included or not
 //
-inline UNATIVE_OFFSET emitter::emitInsSize(code_t code, bool includeRexPrefixSize)
+inline UNATIVE_OFFSET emitter::emitInsSize(instrDesc* id, code_t code, bool includeRexPrefixSize)
 {
     UNATIVE_OFFSET size = (code & 0xFF000000) ? 4 : (code & 0x00FF0000) ? 3 : 2;
 #ifdef TARGET_AMD64
-    size += emitGetPrefixSize(code, includeRexPrefixSize);
+    size += emitGetPrefixSize(id, code, includeRexPrefixSize);
 #endif
     return size;
 }
@@ -2868,7 +2870,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code)
         includeRexPrefixSize = !IsVexEncodedInstruction(ins);
     }
 
-    sz += emitInsSize(code, includeRexPrefixSize);
+    sz += emitInsSize(id, code, includeRexPrefixSize);
 
     return sz;
 }
@@ -2955,11 +2957,11 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
 
     if ((code & 0xFF00) != 0)
     {
-        sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(code, includeRexPrefixSize) : 5;
+        sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(id, code, includeRexPrefixSize) : 5;
     }
     else
     {
-        sz += emitInsSize(insEncodeRMreg(ins, code), includeRexPrefixSize);
+        sz += emitInsSize(id, insEncodeRMreg(ins, code), includeRexPrefixSize);
     }
 
     return sz;
@@ -2979,7 +2981,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
 //
 inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp)
 {
-    UNATIVE_OFFSET size = emitInsSize(code, /* includeRexPrefixSize */ true);
+    UNATIVE_OFFSET size = emitInsSize(id, code, /* includeRexPrefixSize */ true);
     UNATIVE_OFFSET offs;
     bool           offsIsUpperBound = true;
     bool           EBPbased         = true;
@@ -3542,7 +3544,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code)
         includeRexPrefixSize = false;
     }
 
-    return size + emitInsSize(code, includeRexPrefixSize);
+    return size + emitInsSize(id, code, includeRexPrefixSize);
 }
 
 inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code, int val)
@@ -4966,6 +4968,11 @@ void emitter::emitIns_R_I(instruction ins,
     // (it is always encoded in a byte). Let's not complicate things until this is needed.
     assert(ins != INS_bt);
 
+    // TODO-Cleanup: This is used to track whether a call to emitInsSize is necessary. The call
+    // has existed for quite some time, historically, but it is likely problematic in practice.
+    // However, it is non-trivial to prove it is safe to remove at this time.
+    bool isSimdInsAndValInByte = false;
+
     // Figure out the size of the instruction
     switch (ins)
     {
@@ -5012,16 +5019,8 @@ void emitter::emitIns_R_I(instruction ins,
             {
                 if (IsAvx512OrPriorInstruction(ins))
                 {
-                    bool includeRexPrefixSize = true;
-                    // Do not get the RexSize() but just decide if it will be included down further and if yes,
-                    // do not include it again.
-                    if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, size) || instrIsExtendedReg3opImul(ins))
-                    {
-                        includeRexPrefixSize = false;
-                    }
-
-                    sz = emitInsSize(insCodeMI(ins), includeRexPrefixSize);
-                    sz += 1;
+                    sz                    = 1;
+                    isSimdInsAndValInByte = true;
                 }
                 else if (size == EA_1BYTE && reg == REG_EAX && !instrIs3opImul(ins))
                 {
@@ -5070,6 +5069,20 @@ void emitter::emitIns_R_I(instruction ins,
     id->idDebugOnlyInfo()->idMemCookie = targetHandle;
 #endif
 
+    if (isSimdInsAndValInByte)
+    {
+        bool includeRexPrefixSize = true;
+
+        // Do not get the RexSize() but just decide if it will be included down further and if yes,
+        // do not include it again.
+        if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, size) || instrIsExtendedReg3opImul(ins))
+        {
+            includeRexPrefixSize = false;
+        }
+
+        sz += emitInsSize(id, insCodeMI(ins), includeRexPrefixSize);
+    }
+
     sz += emitGetAdjustedSize(id, insCodeMI(ins));
 
     // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 0beff1a7ce4b63..30a42f301e1d1e 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -38,7 +38,7 @@ struct CnsVal
     bool    cnsReloc;
 };
 
-UNATIVE_OFFSET emitInsSize(code_t code, bool includeRexPrefixSize);
+UNATIVE_OFFSET emitInsSize(instrDesc* id, code_t code, bool includeRexPrefixSize);
 UNATIVE_OFFSET emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp);
 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp);
 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val);
@@ -69,7 +69,7 @@ unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& c
 unsigned emitGetRexPrefixSize(instruction ins);
 unsigned emitGetVexPrefixSize(instrDesc* id);
 unsigned emitGetEvexPrefixSize(instrDesc* id);
-unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize);
+unsigned emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexPrefixSize);
 unsigned emitGetAdjustedSize(instrDesc* id, code_t code);
 
 code_t emitExtractVexPrefix(instruction ins, code_t& code);

From ae744eb74682fec40d34e55e37445de0b5d936f6 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 13:08:14 -0800
Subject: [PATCH 07/15] Update emitGetVexPrefixSize to support returning 2 or 3

---
 src/coreclr/jit/emitxarch.cpp | 176 ++++++++++++++++++++++++++++------
 1 file changed, 149 insertions(+), 27 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 3de335d8bf33a6..822cc064a7ed8f 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1651,18 +1651,10 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
 
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
-            // Encoding optimization calculation is not done while estimating the instruction
-            // size and thus over-predict instruction size by 1 byte.
-            // If there are IGs that will be aligned, do not optimize encoding so the
-            // estimated alignment sizes are accurate.
+            emitOutputByte(dst, 0xC5);
+            emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
 
-            if (emitCurIG->igNum > emitLastAlignedIgNum)
-            {
-                emitOutputByte(dst, 0xC5);
-                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
-
-                return 2;
-            }
+            return 2;
         }
 
         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));
@@ -1789,22 +1781,6 @@ unsigned emitter::emitGetEvexPrefixSize(instrDesc* id)
     return 4;
 }
 
-//------------------------------------------------------------------------
-// emitGetVexPrefixSize: Gets Size of VEX prefix in bytes
-//
-// Arguments:
-//    id   -- The instruction descriptor
-//
-// Returns:
-//    Prefix size in bytes.
-//
-unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
-{
-    instruction ins = id->idIns();
-    assert(IsVexEncodedInstruction(ins));
-    return 3;
-}
-
 //------------------------------------------------------------------------
 // emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current
 // configuration.
@@ -2345,6 +2321,152 @@ inline bool hasCodeMR(instruction ins)
     return ((insCodesMR[ins] != BAD_CODE));
 }
 
+//------------------------------------------------------------------------
+// emitGetVexPrefixSize: Gets Size of VEX prefix in bytes
+//
+// Arguments:
+//    id   -- The instruction descriptor
+//
+// Returns:
+//    Prefix size in bytes.
+//
+unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
+{
+    instruction ins  = id->idIns();
+    emitAttr    size = id->idOpSize();
+
+    assert(IsVexEncodedInstruction(ins));
+
+    if (EncodedBySSE38orSSE3A(ins))
+    {
+        // When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding
+        return 3;
+    }
+
+    switch (ins)
+    {
+        case INS_crc32:
+#if defined(TARGET_AMD64)
+        case INS_sarx:
+        case INS_shrx:
+#endif // TARGET_AMD64
+        {
+            // When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding
+            // These are special cases where the pp-bit is 0xF2 or 0xF3 and not 0x66
+            return 3;
+        }
+
+        default:
+        {
+            break;
+        }
+    }
+
+    if (TakesRexWPrefix(ins, size))
+    {
+        // When the REX.W bit is present, we must use the 3-byte encoding
+        return 3;
+    }
+
+    regNumber regFor012Bits = REG_NA;
+    regNumber regForSibBits = REG_NA;
+
+    switch (id->idInsFmt())
+    {
+        case IF_ARD:
+        case IF_AWR_RRD:
+        case IF_RRD_ARD:
+        case IF_RRW_ARD:
+        case IF_RRW_ARD_CNS:
+        case IF_RWR_ARD:
+        case IF_RWR_ARD_CNS:
+        case IF_RWR_RRD_ARD:
+        case IF_RWR_RRD_ARD_CNS:
+        {
+            regFor012Bits = id->idAddr()->iiaAddrMode.amBaseReg;
+            regForSibBits = id->idAddr()->iiaAddrMode.amIndxReg;
+            break;
+        }
+
+        case IF_MRD:
+        case IF_MWR_RRD:
+        case IF_RRD_MRD:
+        case IF_RRD_SRD:
+        case IF_RRW_MRD:
+        case IF_RRW_MRD_CNS:
+        case IF_RRW_SRD:
+        case IF_RRW_SRD_CNS:
+        case IF_RWR_MRD:
+        case IF_RWR_MRD_CNS:
+        case IF_RWR_RRD_MRD:
+        case IF_RWR_RRD_MRD_CNS:
+        case IF_RWR_RRD_SRD:
+        case IF_RWR_RRD_SRD_CNS:
+        case IF_RWR_SRD:
+        case IF_RWR_SRD_CNS:
+        case IF_SRD:
+        case IF_SWR_RRD:
+        {
+            // Nothing is encoded in a way to prevent the 2-byte encoding
+            break;
+        }
+
+        case IF_RRD_RRD:
+        case IF_RRW_RRD:
+        case IF_RWR_RRD:
+        {
+            regFor012Bits = id->idReg2();
+
+            if ((ins == INS_movd) && isFloatReg(regFor012Bits))
+            {
+                regFor012Bits = id->idReg1();
+            }
+            break;
+        }
+
+        case IF_RRW_RRW_CNS:
+        {
+            if (hasCodeMR(ins))
+            {
+                regFor012Bits = id->idReg1();
+            }
+            else
+            {
+                regFor012Bits = id->idReg2();
+            }
+            break;
+        }
+
+        case IF_RWR_RRD_RRD:
+        case IF_RWR_RRD_RRD_CNS:
+        case IF_RWR_RRD_RRD_RRD:
+        {
+            regFor012Bits = id->idReg3();
+            break;
+        }
+
+        default:
+        {
+            assert(!"Unhandled insFmt for emitGetVexPrefixSize");
+            return 3;
+        }
+    }
+
+    if ((regForSibBits != REG_NA) && IsExtendedReg(regForSibBits))
+    {
+        // When the REX.X bit is present, we must use the 3-byte encoding
+        return 3;
+    }
+
+    if ((regFor012Bits != REG_NA) && IsExtendedReg(regFor012Bits))
+    {
+        // When the REX.B bit is present, we must use the 3-byte encoding
+        return 3;
+    }
+
+    return 2;
+}
+
 /*****************************************************************************
  *
  *  Returns the "[r/m], reg" or "[r/m]" encoding of the given CPU instruction.

From f2ff3c4c1bd2fd2702daf346c3dbfe23ed933ec7 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 17:38:57 -0800
Subject: [PATCH 08/15] Ensure IF_RWR_CNS is handled in emitGetVexPrefixSize

---
 src/coreclr/jit/emitxarch.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 822cc064a7ed8f..5e96383afa2085 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2411,6 +2411,14 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
             break;
         }
 
+        case IF_RRD_CNS:
+        case IF_RRW_CNS:
+        case IF_RWR_CNS:
+        {
+            regFor012Bits = id->idReg1();
+            break;
+        }
+
         case IF_RRD_RRD:
         case IF_RRW_RRD:
         case IF_RWR_RRD:

From e131b6be8e9f867e05588ff813534d4a78caa020 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 21:10:45 -0800
Subject: [PATCH 09/15] Don't try to estimate the 2-byte VEX prefix when
 optimizations are disabled

---
 src/coreclr/jit/emitxarch.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 5e96383afa2085..25557af82c124e 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2337,6 +2337,12 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
 
     assert(IsVexEncodedInstruction(ins));
 
+    if (!emitComp->opts.OptimizationEnabled())
+    {
+        // Don't worry about saving code size when optimizations are disabled
+        return 3;
+    }
+
     if (EncodedBySSE38orSSE3A(ins))
     {
         // When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding

From ea56d5f6dc34bc0d7d583ab9ddd7d931e309c97d Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Fri, 9 Dec 2022 21:45:58 -0800
Subject: [PATCH 10/15] Ensure we don't negatively impact estimated alignment
 sizes for debug code

---
 src/coreclr/jit/emitxarch.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 25557af82c124e..33c8b59e746887 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1651,10 +1651,20 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
 
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
-            emitOutputByte(dst, 0xC5);
-            emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
+            // Encoding optimization calculation when estimating the instruction size is
+            // only done when optimizations are enabled since its more expensive. So when
+            // optimizations are disabled we'll have over-predicted the instruction size
+            // by one byte. Due to how IG alignment is done, we only want to emit the 2-byte
+            // prefix if optimizations are enabled or we know we won't negatively impact the
+            // estimated alignment sizes.
 
-            return 2;
+            if (emitComp->opts.OptimizationEnabled() || (emitCurIG->igNum > emitLastAlignedIgNum))
+            {
+                emitOutputByte(dst, 0xC5);
+                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
+
+                return 2;
+            }
         }
 
         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));

From 501435595abf597f8e8e50717e8327385148dde0 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Sat, 10 Dec 2022 06:11:59 -0800
Subject: [PATCH 11/15] Just check MinOpts not OptimizationsDisabled

---
 src/coreclr/jit/emitxarch.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 33c8b59e746887..8b3d46eb683f1d 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1658,7 +1658,7 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
             // prefix if optimizations are enabled or we know we won't negatively impact the
             // estimated alignment sizes.
 
-            if (emitComp->opts.OptimizationEnabled() || (emitCurIG->igNum > emitLastAlignedIgNum))
+            if (!emitComp->opts.MinOpts() || (emitCurIG->igNum > emitLastAlignedIgNum))
             {
                 emitOutputByte(dst, 0xC5);
                 emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
@@ -2347,7 +2347,7 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
 
     assert(IsVexEncodedInstruction(ins));
 
-    if (!emitComp->opts.OptimizationEnabled())
+    if (emitComp->opts.MinOpts())
     {
         // Don't worry about saving code size when optimizations are disabled
         return 3;

From 989c1ec083aca977fdf4684f08bba51ca1dab193 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Sat, 10 Dec 2022 06:28:42 -0800
Subject: [PATCH 12/15] Mark some of the modified methods as const to indicate
 they don't mutate the emitter

---
 src/coreclr/jit/emitxarch.cpp | 14 +++++++-------
 src/coreclr/jit/emitxarch.h   | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 8b3d46eb683f1d..598dc262ddd5bc 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -747,7 +747,7 @@ static bool IsDstSrcImmAvxInstruction(instruction ins)
 // Note that this should be true for any of the instructions in instrsXArch.h
 // that use the SSE38 or SSE3A macro but returns false if the VEX encoding is
 // in use, since that encoding does not require an additional byte.
-bool emitter::Is4ByteSSEInstruction(instruction ins)
+bool emitter::Is4ByteSSEInstruction(instruction ins) const
 {
     return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins);
 }
@@ -1289,7 +1289,7 @@ bool isPrefix(BYTE b)
 // Return Value:
 //    The extracted EVEX prefix.
 //
-emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code)
+emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) const
 {
     assert(IsEvexEncodedInstruction(ins));
 
@@ -1425,7 +1425,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code)
 // Return Value:
 //    The extracted VEX prefix.
 //
-emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code)
+emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code) const
 {
     assert(IsVexEncodedInstruction(ins));
 
@@ -1784,7 +1784,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 // Returns:
 //    Prefix size in bytes.
 //
-unsigned emitter::emitGetEvexPrefixSize(instrDesc* id)
+unsigned emitter::emitGetEvexPrefixSize(instrDesc* id) const
 {
     instruction ins = id->idIns();
     assert(IsEvexEncodedInstruction(ins));
@@ -1802,7 +1802,7 @@ unsigned emitter::emitGetEvexPrefixSize(instrDesc* id)
 // Returns:
 //    Updated size.
 //
-unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code)
+unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
 {
     instruction ins  = id->idIns();
     emitAttr    attr = id->idOpSize();
@@ -2340,7 +2340,7 @@ inline bool hasCodeMR(instruction ins)
 // Returns:
 //    Prefix size in bytes.
 //
-unsigned emitter::emitGetVexPrefixSize(instrDesc* id)
+unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
 {
     instruction ins  = id->idIns();
     emitAttr    size = id->idOpSize();
@@ -2555,7 +2555,7 @@ inline insTupleType insTupleTypeInfo(instruction ins)
 }
 
 // Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
-bool emitter::EncodedBySSE38orSSE3A(instruction ins)
+bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
 {
     const size_t SSE38 = 0x0F660038;
     const size_t SSE3A = 0x0F66003A;
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 30a42f301e1d1e..dd4eec46dadb92 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -67,13 +67,13 @@ BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id);
 
 unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
 unsigned emitGetRexPrefixSize(instruction ins);
-unsigned emitGetVexPrefixSize(instrDesc* id);
-unsigned emitGetEvexPrefixSize(instrDesc* id);
+unsigned emitGetVexPrefixSize(instrDesc* id) const;
+unsigned emitGetEvexPrefixSize(instrDesc* id) const;
 unsigned emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexPrefixSize);
-unsigned emitGetAdjustedSize(instrDesc* id, code_t code);
+unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const;
 
-code_t emitExtractVexPrefix(instruction ins, code_t& code);
-code_t emitExtractEvexPrefix(instruction ins, code_t& code);
+code_t emitExtractVexPrefix(instruction ins, code_t& code) const;
+code_t emitExtractEvexPrefix(instruction ins, code_t& code) const;
 
 unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code);
 unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code);
@@ -111,8 +111,8 @@ code_t AddRexXPrefix(instruction ins, code_t code);
 code_t AddRexBPrefix(instruction ins, code_t code);
 code_t AddRexPrefix(instruction ins, code_t code);
 
-bool EncodedBySSE38orSSE3A(instruction ins);
-bool Is4ByteSSEInstruction(instruction ins);
+bool EncodedBySSE38orSSE3A(instruction ins) const;
+bool Is4ByteSSEInstruction(instruction ins) const;
 static bool IsMovInstruction(instruction ins);
 bool HasSideEffect(instruction ins, emitAttr size);
 bool IsRedundantMov(

From 0f2514d19baa54ba150ccd1f87fd9ced01f12ce5 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Sat, 10 Dec 2022 07:12:01 -0800
Subject: [PATCH 13/15] Resolve the throughput issue by not getting optSize
 untill necessary

---
 src/coreclr/jit/emitxarch.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 598dc262ddd5bc..b4b038733ca444 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1658,7 +1658,7 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
             // prefix if optimizations are enabled or we know we won't negatively impact the
             // estimated alignment sizes.
 
-            if (!emitComp->opts.MinOpts() || (emitCurIG->igNum > emitLastAlignedIgNum))
+            if (emitComp->opts.OptimizationEnabled() || (emitCurIG->igNum > emitLastAlignedIgNum))
             {
                 emitOutputByte(dst, 0xC5);
                 emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
@@ -2342,17 +2342,16 @@ inline bool hasCodeMR(instruction ins)
 //
 unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
 {
-    instruction ins  = id->idIns();
-    emitAttr    size = id->idOpSize();
-
-    assert(IsVexEncodedInstruction(ins));
+    assert(IsVexEncodedInstruction(id->idIns()));
 
-    if (emitComp->opts.MinOpts())
+    if (emitComp->opts.OptimizationDisabled())
     {
         // Don't worry about saving code size when optimizations are disabled
         return 3;
     }
 
+    instruction ins = id->idIns();
+
     if (EncodedBySSE38orSSE3A(ins))
     {
         // When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding
@@ -2378,6 +2377,8 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
         }
     }
 
+    emitAttr size = id->idOpSize();
+
     if (TakesRexWPrefix(ins, size))
     {
         // When the REX.W bit is present, we must use the 3-byte encoding

From 4d0c099c49af681c38bec08d0bf9d43e5c9ad087 Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Sat, 10 Dec 2022 09:44:01 -0800
Subject: [PATCH 14/15] Switch back to doing accurate vex prefix size
 estimation always

---
 src/coreclr/jit/emitxarch.cpp | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index b4b038733ca444..de4153c5a694ed 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1651,20 +1651,10 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst,
 
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
-            // Encoding optimization calculation when estimating the instruction size is
-            // only done when optimizations are enabled since its more expensive. So when
-            // optimizations are disabled we'll have over-predicted the instruction size
-            // by one byte. Due to how IG alignment is done, we only want to emit the 2-byte
-            // prefix if optimizations are enabled or we know we won't negatively impact the
-            // estimated alignment sizes.
+            emitOutputByte(dst, 0xC5);
+            emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
 
-            if (emitComp->opts.OptimizationEnabled() || (emitCurIG->igNum > emitLastAlignedIgNum))
-            {
-                emitOutputByte(dst, 0xC5);
-                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
-
-                return 2;
-            }
+            return 2;
         }
 
         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));
@@ -2344,12 +2334,6 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
 {
     assert(IsVexEncodedInstruction(id->idIns()));
 
-    if (emitComp->opts.OptimizationDisabled())
-    {
-        // Don't worry about saving code size when optimizations are disabled
-        return 3;
-    }
-
     instruction ins = id->idIns();
 
     if (EncodedBySSE38orSSE3A(ins))

From 57d572571676820ddbf495a1b1a723b38cb1c2db Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Tue, 13 Dec 2022 14:09:56 -0800
Subject: [PATCH 15/15] Small cleanup to reduce regression

---
 src/coreclr/jit/emitxarch.cpp | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index de4153c5a694ed..14ad6263e5ad22 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1776,8 +1776,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 //
 unsigned emitter::emitGetEvexPrefixSize(instrDesc* id) const
 {
-    instruction ins = id->idIns();
-    assert(IsEvexEncodedInstruction(ins));
+    assert(IsEvexEncodedInstruction(id->idIns()));
     return 4;
 }
 
@@ -1794,10 +1793,8 @@ unsigned emitter::emitGetEvexPrefixSize(instrDesc* id) const
 //
 unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
 {
-    instruction ins  = id->idIns();
-    emitAttr    attr = id->idOpSize();
-
-    unsigned adjustedSize = 0;
+    instruction ins          = id->idIns();
+    unsigned    adjustedSize = 0;
 
     // TODO-XArch-AVX512: Remove redundant code and possiblly collapse EVEX and VEX into a single pathway
     // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also which needs to be VEX encoded unless
@@ -1910,6 +1907,8 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
             adjustedSize++;
         }
 
+        emitAttr attr = id->idOpSize();
+
         if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx))
         {
             // Most 16-bit operand instructions will need a 0x66 prefix.
@@ -2322,7 +2321,7 @@ inline bool hasCodeMR(instruction ins)
 }
 
 //------------------------------------------------------------------------
-// emitGetVexPrefixSize: Gets Size of VEX prefix in bytes
+// emitGetVexPrefixSize: Gets size of VEX prefix in bytes
 //
 // Arguments:
 //    id   -- The instruction descriptor
@@ -2332,9 +2331,8 @@ inline bool hasCodeMR(instruction ins)
 //
 unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
 {
-    assert(IsVexEncodedInstruction(id->idIns()));
-
     instruction ins = id->idIns();
+    assert(IsVexEncodedInstruction(ins));
 
     if (EncodedBySSE38orSSE3A(ins))
     {
@@ -3060,12 +3058,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val
 //
 inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
 {
-    instruction ins  = id->idIns();
-    regNumber   reg1 = id->idReg1();
-    regNumber   reg2 = id->idReg2();
-    emitAttr    attr = id->idOpSize();
-
-    emitAttr size = EA_SIZE(attr);
+    instruction ins = id->idIns();
 
     // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes.
     // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes.
@@ -3078,6 +3071,11 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
     // REX prefix
     if (!hasRexPrefix(code))
     {
+        regNumber reg1 = id->idReg1();
+        regNumber reg2 = id->idReg2();
+        emitAttr  attr = id->idOpSize();
+        emitAttr  size = EA_SIZE(attr);
+
         if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) ||
             IsExtendedReg(reg2, attr))
         {