From cc169eac6736693c4bdbd9f61ae821146252e4cb Mon Sep 17 00:00:00 2001 From: Li Tian Date: Sun, 11 Dec 2016 19:13:28 -0800 Subject: [PATCH 1/9] Remove AVX/SSE transition penalties There are two kinds of transition penalties: 1.Transition from 256-bit AVX code to 128-bit legacy SSE code. 2.Transition from 128-bit legacy SSE code to either 128 or 256-bit AVX code. This only happens if there was a preceding AVX256->legacy SSE transition penalty. The primary goal is to remove the #1 AVX to SSE transition penalty. Added two emitter flags: contains256bitAVXInstruction indicates whether the JIT method contains 256-bit AVX code, containsAVXInstruction indicates whether the method contains 128-bit or 256-bit AVX code. Issue VZEROUPPER in prolog if the method contains 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty; this could happen in the reverse pinvoke situation. Issue VZEROUPPER in epilog if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty. To limit code size increase impact, we only issue VZEROUPPER before PInvoke call on user defined function if the JIT method contains 256-bit AVX code, assuming user defined function contains legacy SSE code. No need to issue VZEROUPPER after PInvoke call because #2 SSE to AVX transition penalty won't happen since #1 AVX to SSE transition has been taken care of before the PInvoke call. We measured ~3% to 1% performance gain on TechEmPower plaintext and verified those VTune AVX/SSE events: OTHER_ASSISTS.AVX_TO_SSE and OTHER_ASSISTS.SSE_TO_AVX have been reduced to 0. 
Fix #7240 move setContainsAVX flags to lower, refactor to a smaller method refactor, fix typo in comments fix format error --- src/jit/codegen.h | 2 ++ src/jit/codegencommon.cpp | 74 ++++++++++++++++++++++++++++----------- src/jit/codegenxarch.cpp | 14 ++++++++ src/jit/compiler.cpp | 3 ++ src/jit/emitxarch.h | 28 +++++++++++++++ src/jit/lower.h | 1 + src/jit/lowerxarch.cpp | 34 +++++++++++++++++- 7 files changed, 134 insertions(+), 22 deletions(-) diff --git a/src/jit/codegen.h b/src/jit/codegen.h index c6e38ab6af60..15abbbf31c3d 100755 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -390,6 +390,8 @@ class CodeGen : public CodeGenInterface // Save/Restore callee saved float regs to stack void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize); void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize); + // Generate VZeroupper instruction to avoid AVX/SSE transition penalty + bool genVzeroupperIfNeeded(bool check256bitOnly = true); #endif // _TARGET_XARCH_ && FEATURE_STACK_FP_X87 diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 240911523f4d..000051cf91e5 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10583,7 +10583,8 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree // funclet frames: this will be FuncletInfo.fiSpDelta. 
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) { - regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; + bool bVzeroupperIssued = genVzeroupperIfNeeded(false); + regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; // Only callee saved floating point registers should be in regMask assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask); @@ -10611,6 +10612,17 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) regMaskTP regBit = genRegMask(reg); if ((regBit & regMask) != 0) { +#ifdef FEATURE_AVX_SUPPORT + // when we reach here, function does not contain AVX instruction so far, however, since copyIns can + // be an AVX instruction such as vmovupd, we should check and issue vzeroupper before the copyIns to + // avoid Legacy SSE code (from native code such as Reverse PInvoke) to AVX transition penalty + if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX && !bVzeroupperIssued && + getEmitter()->IsAVXInstruction(copyIns)) + { + instGen(INS_vzeroupper); + bVzeroupperIssued = true; + } +#endif // ABI requires us to preserve lower 128-bits of YMM register. getEmitter()->emitIns_AR_R(copyIns, EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be @@ -10621,16 +10633,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) offset -= XMM_REGSIZE_BYTES; } } - -#ifdef FEATURE_AVX_SUPPORT - // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs. - // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is - // using SSE2. 
- if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) - { - instGen(INS_vzeroupper); - } -#endif } // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working @@ -10651,6 +10653,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) // fast path return if (regMask == RBM_NONE) { + genVzeroupperIfNeeded(); return; } @@ -10682,16 +10685,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) assert((offset % 16) == 0); #endif // _TARGET_AMD64_ -#ifdef FEATURE_AVX_SUPPORT - // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs. - // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is - // using SSE2. - if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) - { - instGen(INS_vzeroupper); - } -#endif - for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg)) { regMaskTP regBit = genRegMask(reg); @@ -10706,7 +10699,46 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) offset -= XMM_REGSIZE_BYTES; } } + genVzeroupperIfNeeded(); +} + +// Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the +// AVX/Legacy SSE transition penalties can be avoided +// +// Params +// check256bitOnly - Flag to check if the function contains 256-bit AVX instruction and generate Vzeroupper +// instruction, otherwise check if the function contains AVX instruciton (either 128-bit or 256-bit). +// +// Return Value: +// true if Vzeroupper instruction is issued, false otherwise. 
+// +bool CodeGen::genVzeroupperIfNeeded(bool check256bitOnly) +{ + bool bVzeroupperIssued = false; +#ifdef FEATURE_AVX_SUPPORT + if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) + { + if (check256bitOnly) + { + if (getEmitter()->Contains256bitAVX()) + { + instGen(INS_vzeroupper); + bVzeroupperIssued = true; + } + } + else + { + if (getEmitter()->ContainsAVX()) + { + instGen(INS_vzeroupper); + bVzeroupperIssued = true; + } + } + } +#endif + return bVzeroupperIssued; } + #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 //----------------------------------------------------------------------------------- diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 8e0af48799ab..495bc7e1b915 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5001,6 +5001,20 @@ void CodeGen::genCallInstruction(GenTreePtr node) #endif // defined(_TARGET_X86_) +#ifdef FEATURE_AVX_SUPPORT + // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here + // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE + // transition penalty, assuming the user function contains legacy SSE instruction + if (call->IsPInvoke() && call->gtCallType == CT_USER_FUNC && + compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) + { + if (getEmitter()->Contains256bitAVX()) + { + instGen(INS_vzeroupper); + } + } +#endif + if (target != nullptr) { #ifdef _TARGET_X86_ diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index 30eccc3ce742..47d3c352c44a 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -2310,6 +2310,9 @@ void Compiler::compSetProcessor() if (opts.compCanUseAVX) { codeGen->getEmitter()->SetUseAVX(true); + // Assume each JITted method does not contain AVX instruction at first + codeGen->getEmitter()->SetContainsAVX(false); + codeGen->getEmitter()->SetContains256bitAVX(false); } else #endif // FEATURE_AVX_SUPPORT diff --git a/src/jit/emitxarch.h 
b/src/jit/emitxarch.h index 98256cdaa707..4753c1435456 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -150,6 +150,26 @@ void SetUseAVX(bool value) useAVXEncodings = value; } +bool containsAVXInstruction; +bool ContainsAVX() +{ + return containsAVXInstruction; +} +void SetContainsAVX(bool value) +{ + containsAVXInstruction = value; +} + +bool contains256bitAVXInstruction; +bool Contains256bitAVX() +{ + return contains256bitAVXInstruction; +} +void SetContains256bitAVX(bool value) +{ + contains256bitAVXInstruction = value; +} + bool IsThreeOperandBinaryAVXInstruction(instruction ins); bool IsThreeOperandMoveAVXInstruction(instruction ins); bool IsThreeOperandAVXInstruction(instruction ins) @@ -162,6 +182,14 @@ bool UseAVX() { return false; } +bool ContainsAVX() +{ + return false; +} +bool Contains256bitAVX() +{ + return false; +} bool hasVexPrefix(code_t code) { return false; diff --git a/src/jit/lower.h b/src/jit/lower.h index 555b9e26c666..da47cf9041bd 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -235,6 +235,7 @@ class Lowering : public Phase #if defined(_TARGET_XARCH_) void SetMulOpCounts(GenTreePtr tree); + void SetContainsAVXFlags(bool isFloatingType = true, unsigned sizeOfSIMDVector = 0); #endif // defined(_TARGET_XARCH_) #if !CPU_LOAD_STORE_ARCH diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index bf5d29c596fa..b9df5cd0c651 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -166,7 +166,8 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) Compiler* compiler = comp; TreeNodeInfo* info = &(tree->gtLsraInfo); - + // floating type generates AVX instruction (vmovss etc.), set the flag + SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet())); switch (tree->OperGet()) { GenTree* op1; @@ -1773,6 +1774,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) { MakeSrcContained(blkNode, source); } + // use XMM register to fill with constants, it's AVX instruction and set the flag + SetContainsAVXFlags(); } 
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; @@ -1954,6 +1957,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // series of 16-byte loads and stores. blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); + // use XMM register for load and store, need set the flag for AVX instruction + SetContainsAVXFlags(); } // If src or dst are on stack, we don't have to generate the address into a register @@ -2732,6 +2737,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) TreeNodeInfo* info = &(tree->gtLsraInfo); LinearScan* lsra = m_lsra; info->dstCount = 1; + SetContainsAVXFlags(true, simdTree->gtSIMDSize); switch (simdTree->gtSIMDIntrinsicID) { GenTree* op1; @@ -4572,6 +4578,32 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) } } +//------------------------------------------------------------------------------ +// SetContainsAVXFlags: default value of isFloatingType is true, we set the +// ContainsAVX flag when floating type value is true, when SIMD vector size is +// 32 bytes, it is 256bit AVX instruction and we set Contains256bitAVX flag too +// +// Arguments: +// isFloatingType - is floating type +// sizeOfSIMDVector - SIMD Vector size +// +void Lowering::SetContainsAVXFlags(bool isFloatingType, unsigned sizeOfSIMDVector) +{ +#ifdef FEATURE_AVX_SUPPORT + if (comp->getSIMDInstructionSet() == InstructionSet_AVX) + { + if (isFloatingType) + { + comp->getEmitter()->SetContainsAVX(true); + if (sizeOfSIMDVector == 32) + { + comp->codeGen->getEmitter()->SetContains256bitAVX(true); + } + } + } +#endif +} + //------------------------------------------------------------------------------ // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format // From 933257ec88bae2722618c39c0a55ffd2e50c0c48 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Tue, 10 Jan 2017 13:42:22 -0800 Subject: [PATCH 2/9] remove unnecessary check in CalleeSavedFltRegs --- src/jit/codegen.h | 2 
+- src/jit/codegencommon.cpp | 26 ++++---------------------- src/jit/lowerxarch.cpp | 11 +++++------ 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/src/jit/codegen.h b/src/jit/codegen.h index 15abbbf31c3d..090283ee50e8 100755 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -391,7 +391,7 @@ class CodeGen : public CodeGenInterface void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize); void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize); // Generate VZeroupper instruction to avoid AVX/SSE transition penalty - bool genVzeroupperIfNeeded(bool check256bitOnly = true); + void genVzeroupperIfNeeded(bool check256bitOnly = true); #endif // _TARGET_XARCH_ && FEATURE_STACK_FP_X87 diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 000051cf91e5..a64e63dfacfe 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10583,7 +10583,7 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree // funclet frames: this will be FuncletInfo.fiSpDelta. 
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) { - bool bVzeroupperIssued = genVzeroupperIfNeeded(false); + genVzeroupperIfNeeded(false); regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; // Only callee saved floating point registers should be in regMask @@ -10612,17 +10612,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) regMaskTP regBit = genRegMask(reg); if ((regBit & regMask) != 0) { -#ifdef FEATURE_AVX_SUPPORT - // when we reach here, function does not contain AVX instruction so far, however, since copyIns can - // be an AVX instruction such as vmovupd, we should check and issue vzeroupper before the copyIns to - // avoid Legacy SSE code (from native code such as Reverse PInvoke) to AVX transition penalty - if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX && !bVzeroupperIssued && - getEmitter()->IsAVXInstruction(copyIns)) - { - instGen(INS_vzeroupper); - bVzeroupperIssued = true; - } -#endif // ABI requires us to preserve lower 128-bits of YMM register. getEmitter()->emitIns_AR_R(copyIns, EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be @@ -10706,15 +10695,11 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) // AVX/Legacy SSE transition penalties can be avoided // // Params -// check256bitOnly - Flag to check if the function contains 256-bit AVX instruction and generate Vzeroupper -// instruction, otherwise check if the function contains AVX instruciton (either 128-bit or 256-bit). -// -// Return Value: -// true if Vzeroupper instruction is issued, false otherwise. +// check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper +// instruction, false to check if the function contains AVX instruciton (either 128-bit or 256-bit). 
// -bool CodeGen::genVzeroupperIfNeeded(bool check256bitOnly) +void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) { - bool bVzeroupperIssued = false; #ifdef FEATURE_AVX_SUPPORT if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) { @@ -10723,7 +10708,6 @@ bool CodeGen::genVzeroupperIfNeeded(bool check256bitOnly) if (getEmitter()->Contains256bitAVX()) { instGen(INS_vzeroupper); - bVzeroupperIssued = true; } } else @@ -10731,12 +10715,10 @@ bool CodeGen::genVzeroupperIfNeeded(bool check256bitOnly) if (getEmitter()->ContainsAVX()) { instGen(INS_vzeroupper); - bVzeroupperIssued = true; } } } #endif - return bVzeroupperIssued; } #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index b9df5cd0c651..ea87c9c71507 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -1957,7 +1957,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // series of 16-byte loads and stores. 
blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); - // use XMM register for load and store, need set the flag for AVX instruction + // Uses XMM reg for load and store and hence check to see whether AVX instructions are used for codegen SetContainsAVXFlags(); } @@ -4579,15 +4579,14 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) } //------------------------------------------------------------------------------ -// SetContainsAVXFlags: default value of isFloatingType is true, we set the -// ContainsAVX flag when floating type value is true, when SIMD vector size is -// 32 bytes, it is 256bit AVX instruction and we set Contains256bitAVX flag too +// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set +// Contains256bitAVX flag when SIMD vector size is 32 bytes // // Arguments: -// isFloatingType - is floating type +// isFloatingType - true if it is floating type // sizeOfSIMDVector - SIMD Vector size // -void Lowering::SetContainsAVXFlags(bool isFloatingType, unsigned sizeOfSIMDVector) +void Lowering::SetContainsAVXFlags(bool isFloatingType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) { #ifdef FEATURE_AVX_SUPPORT if (comp->getSIMDInstructionSet() == InstructionSet_AVX) From 3fdc10a616d6e80024531a66597c27181916e58d Mon Sep 17 00:00:00 2001 From: Li Tian Date: Tue, 10 Jan 2017 14:07:52 -0800 Subject: [PATCH 3/9] fix format error --- src/jit/codegencommon.cpp | 2 +- src/jit/lowerxarch.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index a64e63dfacfe..55fc9d5ce135 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10584,7 +10584,7 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) { genVzeroupperIfNeeded(false); - regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; 
+ regMaskTP regMask = compiler->compCalleeFPRegsSavedMask; // Only callee saved floating point registers should be in regMask assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask); diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index ea87c9c71507..39d298023ede 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -1957,7 +1957,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // series of 16-byte loads and stores. blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); - // Uses XMM reg for load and store and hence check to see whether AVX instructions are used for codegen + // Uses XMM reg for load and store and hence check to see whether AVX instructions are used for + // codegen SetContainsAVXFlags(); } @@ -4579,7 +4580,7 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) } //------------------------------------------------------------------------------ -// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set +// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set // Contains256bitAVX flag when SIMD vector size is 32 bytes // // Arguments: From c0627f858f2dd28f89374bfc079034fc3d15d954 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Tue, 10 Jan 2017 16:52:34 -0800 Subject: [PATCH 4/9] rename, use getSIMDInstructionSet() --- src/jit/codegencommon.cpp | 2 +- src/jit/codegenxarch.cpp | 9 +++------ src/jit/lower.h | 2 +- src/jit/lowerxarch.cpp | 18 +++++++++--------- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 55fc9d5ce135..b0f4dc2ed8d8 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10701,7 +10701,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) { #ifdef FEATURE_AVX_SUPPORT - if (compiler->getFloatingPointInstructionSet() == 
InstructionSet_AVX) + if (compiler->getSIMDInstructionSet() == InstructionSet_AVX) { if (check256bitOnly) { diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 495bc7e1b915..b65e27060560 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5005,13 +5005,10 @@ void CodeGen::genCallInstruction(GenTreePtr node) // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE // transition penalty, assuming the user function contains legacy SSE instruction - if (call->IsPInvoke() && call->gtCallType == CT_USER_FUNC && - compiler->getFloatingPointInstructionSet() == InstructionSet_AVX) + if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX()) { - if (getEmitter()->Contains256bitAVX()) - { - instGen(INS_vzeroupper); - } + assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX); + instGen(INS_vzeroupper); } #endif diff --git a/src/jit/lower.h b/src/jit/lower.h index da47cf9041bd..eecc6606ca89 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -235,7 +235,7 @@ class Lowering : public Phase #if defined(_TARGET_XARCH_) void SetMulOpCounts(GenTreePtr tree); - void SetContainsAVXFlags(bool isFloatingType = true, unsigned sizeOfSIMDVector = 0); + void SetContainsAVXFlags(bool isFloatingPointType = true, unsigned sizeOfSIMDVector = 0); #endif // defined(_TARGET_XARCH_) #if !CPU_LOAD_STORE_ARCH diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 39d298023ede..e299a02f7f1d 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -4584,21 +4584,21 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) // Contains256bitAVX flag when SIMD vector size is 32 bytes // // Arguments: -// isFloatingType - true if it is floating type -// sizeOfSIMDVector - SIMD Vector size +// isFloatingPointType - true if it is floating point type +// sizeOfSIMDVector - SIMD Vector 
size // -void Lowering::SetContainsAVXFlags(bool isFloatingType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) +void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) { #ifdef FEATURE_AVX_SUPPORT - if (comp->getSIMDInstructionSet() == InstructionSet_AVX) + if (isFloatingPointType) { - if (isFloatingType) + if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX) { comp->getEmitter()->SetContainsAVX(true); - if (sizeOfSIMDVector == 32) - { - comp->codeGen->getEmitter()->SetContains256bitAVX(true); - } + } + if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX) + { + comp->getEmitter()->SetContains256bitAVX(true); } } #endif From ec2f1deecb4f567de2c0028b40e4988b4e6ddf3a Mon Sep 17 00:00:00 2001 From: Li Tian Date: Tue, 10 Jan 2017 22:09:47 -0800 Subject: [PATCH 5/9] fix comments, assertion failure in crossgen mscorlib --- src/jit/emitxarch.h | 4 ++-- src/jit/lowerxarch.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 4753c1435456..40f22ed52627 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -150,7 +150,7 @@ void SetUseAVX(bool value) useAVXEncodings = value; } -bool containsAVXInstruction; +bool containsAVXInstruction = false; bool ContainsAVX() { return containsAVXInstruction; @@ -160,7 +160,7 @@ void SetContainsAVX(bool value) containsAVXInstruction = value; } -bool contains256bitAVXInstruction; +bool contains256bitAVXInstruction = false; bool Contains256bitAVX() { return contains256bitAVXInstruction; diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index e299a02f7f1d..735a47f638f7 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -1957,8 +1957,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // series of 16-byte loads and stores. 
blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); - // Uses XMM reg for load and store and hence check to see whether AVX instructions are used for - // codegen + // Uses XMM reg for load and store and hence check to see whether AVX instructions + // are used for codegen, set ContainsAVX flag SetContainsAVXFlags(); } From eecbbbf769a1f2f35f4994da4b72f2f57703a510 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Tue, 10 Jan 2017 23:32:57 -0800 Subject: [PATCH 6/9] fix format error --- src/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 735a47f638f7..3381060f09af 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -1957,7 +1957,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // series of 16-byte loads and stores. blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); - // Uses XMM reg for load and store and hence check to see whether AVX instructions + // Uses XMM reg for load and store and hence check to see whether AVX instructions // are used for codegen, set ContainsAVX flag SetContainsAVXFlags(); } From 3d5e08f057970ee07327a86fea34ecce95574f97 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 11 Jan 2017 14:48:20 -0800 Subject: [PATCH 7/9] use assert insteaf of if statement --- src/jit/codegencommon.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index b0f4dc2ed8d8..1e022809368b 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10701,22 +10701,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) { #ifdef FEATURE_AVX_SUPPORT - if (compiler->getSIMDInstructionSet() == InstructionSet_AVX) + 
bool emitVzeroUpper = false; + if (check256bitOnly) { - if (check256bitOnly) - { - if (getEmitter()->Contains256bitAVX()) - { - instGen(INS_vzeroupper); - } - } - else - { - if (getEmitter()->ContainsAVX()) - { - instGen(INS_vzeroupper); - } - } + emitVzeroUpper = getEmitter()->Contains256bitAVX(); + } + else + { + emitVzeroUpper = getEmitter()->ContainsAVX(); + } + + if (emitVzeroUpper) + { + assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX); + instGen(INS_vzeroupper); } #endif } From 0981bb41916937f60b5c9b1b0c5691556f630fe1 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 11 Jan 2017 20:47:48 -0800 Subject: [PATCH 8/9] Add more comments to document remove AVX/SSE transition penalty logic --- src/jit/codegencommon.cpp | 8 ++++++-- src/jit/codegenxarch.cpp | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 1e022809368b..af6e6185f43b 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10692,8 +10692,12 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) } // Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the -// AVX/Legacy SSE transition penalties can be avoided -// +// AVX/Legacy SSE transition penalties can be avoided. This function is been used in genPreserveCalleeSavedFltRegs +// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains +// 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native +// code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog +// if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty. 
+// // Params // check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper // instruction, false to check if the function contains AVX instruciton (either 128-bit or 256-bit). diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index b65e27060560..11ee9eb91809 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5004,7 +5004,10 @@ void CodeGen::genCallInstruction(GenTreePtr node) #ifdef FEATURE_AVX_SUPPORT // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE - // transition penalty, assuming the user function contains legacy SSE instruction + // transition penalty, assuming the user function contains legacy SSE instruction. + // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue + // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens + // when there's preceding 256-bit AVX to legacy SSE transition penalty. if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX()) { assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX); From b6d69f5da791efceedbda0fc7d001900efe0f326 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 11 Jan 2017 21:08:11 -0800 Subject: [PATCH 9/9] fix format --- src/jit/codegencommon.cpp | 6 +++--- src/jit/codegenxarch.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index af6e6185f43b..f42103ebce71 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -10692,12 +10692,12 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) } // Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the -// AVX/Legacy SSE transition penalties can be avoided. 
This function is been used in genPreserveCalleeSavedFltRegs -// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains +// AVX/Legacy SSE transition penalties can be avoided. This function is been used in genPreserveCalleeSavedFltRegs +// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains // 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native // code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog // if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty. -// +// // Params // check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper // instruction, false to check if the function contains AVX instruciton (either 128-bit or 256-bit). diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 11ee9eb91809..3241c8833872 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5004,10 +5004,10 @@ void CodeGen::genCallInstruction(GenTreePtr node) #ifdef FEATURE_AVX_SUPPORT // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE - // transition penalty, assuming the user function contains legacy SSE instruction. - // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue - // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens - // when there's preceding 256-bit AVX to legacy SSE transition penalty. + // transition penalty, assuming the user function contains legacy SSE instruction. 
+ // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue + // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens + // when there's preceding 256-bit AVX to legacy SSE transition penalty. if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX()) { assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);