From 47a78d3d32f2dd4743e6e773ae9eae3118396ac7 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 22 Sep 2022 16:02:37 +0200
Subject: [PATCH 01/14] JIT: Smarter ordering of late args based on register
 uses

---
 src/coreclr/jit/morph.cpp | 175 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 172 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index e015597e1d8b81..68b62724e1f163 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1256,6 +1256,50 @@ void CallArgs::ArgsComplete(Compiler* comp, GenTreeCall* call)
     m_argsComplete = true;
 }
 
+// Estimate registers used by a tree.
+static regMaskTP EstimateRegisterUses(Compiler* comp, GenTree* tree)
+{
+    struct RegisterUsesVisitor : GenTreeVisitor<RegisterUsesVisitor>
+    {
+        enum
+        {
+            DoPreOrder    = true,
+            DoLclVarsOnly = true,
+        };
+
+        regMaskTP Registers = RBM_NONE;
+
+        RegisterUsesVisitor(Compiler* comp) : GenTreeVisitor(comp)
+        {
+        }
+
+        fgWalkResult PreOrderVisit(GenTree** use, GenTree* parent)
+        {
+            GenTreeLclVarCommon* node = (*use)->AsLclVarCommon();
+            LclVarDsc*           desc = m_compiler->lvaGetDesc(node);
+            if (!desc->lvDoNotEnregister && desc->lvIsRegArg)
+            {
+                if (desc->GetArgReg() != REG_NA)
+                {
+                    Registers |= genRegMask(desc->GetArgReg());
+                }
+#if FEATURE_MULTIREG_ARGS
+                if (desc->GetOtherArgReg() != REG_NA)
+                {
+                    Registers |= genRegMask(desc->GetOtherArgReg());
+                }
+#endif
+            }
+
+            return fgWalkResult::WALK_CONTINUE;
+        }
+    };
+
+    RegisterUsesVisitor visitor(comp);
+    visitor.WalkTree(&tree, nullptr);
+    return visitor.Registers;
+}
+
 //------------------------------------------------------------------------
 // SortArgs: Sort arguments into a better passing order.
 //
@@ -1275,9 +1319,8 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
     // idea is to move all "simple" arguments like constants and local vars to
     // the end, and move the complex arguments towards the beginning. This will
     // help prevent registers from being spilled by allowing us to evaluate the
-    // more complex arguments before the simpler arguments. We use the late
-    // list to keep the sorted result at this point, and the ordering ends up
-    // looking like:
+    // more complex arguments before the simpler arguments. The ordering ends
+    // up looking like:
     //     +------------------------------------+  <--- end of sortedArgs
     //     |          constants                 |
     //     +------------------------------------+
@@ -1338,6 +1381,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                 // place curArgTabEntry at the endTab position by performing a swap
                 //
+                JITDUMP("  [%06u] -> #%d (is constant)\n", argx->gtTreeID, endTab);
                 if (curInx != endTab)
                 {
                     sortedArgs[curInx] = sortedArgs[endTab];
@@ -1374,6 +1418,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     // place curArgTabEntry at the begTab position by performing a swap
                     //
+                    JITDUMP("  [%06u] -> #%d (is call)\n", argx->gtTreeID, begTab);
                     if (curInx != begTab)
                     {
                         sortedArgs[curInx] = sortedArgs[begTab];
@@ -1387,6 +1432,48 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         }
     }
 
+    //// Finally place arguments such that arguments nodes that use parameters
+    //// are placed before arguments that would clobber the registers those
+    //// parameters may be in.
+    // while (argsRemaining > 0)
+    //{
+    //    // Find arg that clobbers least amount of registers used by other args.
+    //    unsigned bestNumClobbered = UINT_MAX;
+    //    unsigned bestIndex = 0;
+    //    for (unsigned i = begTab; i <= endTab; i++)
+    //    {
+    //        assert(!sortedArgs[i]->m_processed);
+    //        regMaskTP clobbered = 0;
+    //        for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
+    //        {
+    //            clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
+    //        }
+
+    //        assert(clobbered != 0);
+    //        unsigned clobberedLaterArgs = 0;
+    //        for (unsigned j = begTab; j <= endTab; j++)
+    //        {
+    //            if (i == j)
+    //                continue;
+
+    //            regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
+    //            if ((uses & clobbered) != 0)
+    //                clobberedLaterArgs++;
+    //        }
+
+    //        if (clobberedLaterArgs < bestNumClobbered)
+    //        {
+    //            bestNumClobbered = clobberedLaterArgs;
+    //            bestIndex = i;
+    //        }
+    //    }
+
+    //    sortedArgs[bestIndex]->m_processed = true;
+    //    std::swap(sortedArgs[begTab], sortedArgs[bestIndex]);
+    //    begTab++;
+    //    argsRemaining--;
+    //}
+
     if (argsRemaining > 0)
     {
         // Next take care arguments that are temps.
@@ -1410,6 +1497,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     // place curArgTabEntry at the begTab position by performing a swap
                     //
+                    JITDUMP("  [%06u] -> #%d (is temp)\n", arg->GetNode()->gtTreeID, begTab);
                     if (curInx != begTab)
                     {
                         sortedArgs[curInx] = sortedArgs[begTab];
@@ -1452,6 +1540,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     // place curArgTabEntry at the endTab position by performing a swap
                     //
+                    JITDUMP("  [%06u] -> #%d (non-struct local)\n", arg->GetNode()->gtTreeID, endTab);
                     if (curInx != endTab)
                     {
                         sortedArgs[curInx] = sortedArgs[endTab];
@@ -1465,6 +1554,27 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         } while (curInx > begTab);
     }
 
+    if (argsRemaining > 0)
+    {
+        // Place all arguments that do not clobber registers.
+        for (unsigned curIndex = begTab; curIndex <= endTab; curIndex++)
+        {
+            CallArg* arg = sortedArgs[curIndex];
+            if (arg->m_processed)
+                continue;
+
+            if (arg->AbiInfo.NumRegs <= 0)
+            {
+                arg->m_processed = true;
+                memmove(&sortedArgs[begTab + 1], &sortedArgs[begTab], (curIndex - begTab) * sizeof(CallArg*));
+                JITDUMP("  [%06u] -> #%d (stack arg)\n", arg->GetNode()->gtTreeID, begTab);
+                sortedArgs[begTab] = arg;
+                begTab++;
+                argsRemaining--;
+            }
+        }
+    }
+
     // Finally, take care of all the remaining arguments.
     // Note that we fill in one arg at a time using a while loop.
     bool costsPrepared = false; // Only prepare tree costs once, the first time through this loop
@@ -1475,6 +1585,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         CallArg* expensiveArg      = nullptr;
         unsigned expensiveArgIndex = UINT_MAX;
         unsigned expensiveArgCost  = 0;
+        unsigned expensiveArgNum   = 0;
 
         // [We use a forward iterator pattern]
         //
@@ -1494,6 +1605,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
                        !argx->OperIs(GT_CNS_INT));
 
                 // This arg should either have no persistent side effects or be the last one in our table
+
                 // assert(((argx->gtFlags & GTF_PERSISTENT_SIDE_EFFECTS) == 0) || (curInx == (argCount-1)));
 
                 if (argsRemaining == 1)
@@ -1501,6 +1613,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
                     // This is the last arg to place
                     expensiveArgIndex = curInx;
                     expensiveArg      = arg;
+                    expensiveArgNum   = 1;
                     assert(begTab == endTab);
                     break;
                 }
@@ -1518,6 +1631,11 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
                         expensiveArgCost  = argx->GetCostEx();
                         expensiveArgIndex = curInx;
                         expensiveArg      = arg;
+                        expensiveArgNum   = 1;
+                    }
+                    else if (argx->GetCostEx() == expensiveArgCost)
+                    {
+                        expensiveArgNum++;
                     }
                 }
             }
@@ -1525,12 +1643,63 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
         noway_assert(expensiveArgIndex != UINT_MAX);
 
+        if (expensiveArgNum > 1)
+        {
+            // If there are multiple args with the same cost then place arg
+            // that clobbers least amount of registers used by other args.
+            // Find arg that clobbers least amount of registers used by other args.
+            unsigned bestNumClobbered = UINT_MAX;
+            unsigned bestIndex        = 0;
+            for (unsigned i = begTab; i <= endTab; i++)
+            {
+                if (sortedArgs[i]->m_processed)
+                    continue;
+
+                if (sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
+                    continue;
+
+                regMaskTP clobbered = 0;
+                for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
+                {
+                    clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
+                }
+
+                assert(clobbered != 0);
+                unsigned clobberedLaterArgs = 0;
+                for (unsigned j = begTab; j <= endTab; j++)
+                {
+                    if (i == j || sortedArgs[i]->m_processed ||
+                        sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
+                        continue;
+
+                    regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
+                    if ((uses & clobbered) != 0)
+                        clobberedLaterArgs++;
+                }
+
+                if (clobberedLaterArgs < bestNumClobbered)
+                {
+                    bestNumClobbered = clobberedLaterArgs;
+                    bestIndex        = i;
+                }
+            }
+
+            expensiveArg      = sortedArgs[bestIndex];
+            expensiveArgIndex = bestIndex;
+        }
+
         // put the most expensive arg towards the beginning of the table
 
         expensiveArg->m_processed = true;
 
         // place expensiveArgTabEntry at the begTab position by performing a swap
         //
+        JITDUMP("  [%06u] -> #%d", expensiveArg->GetNode()->gtTreeID, begTab);
+        if (argsRemaining == 1)
+            JITDUMP(" (last arg)\n");
+        else
+            JITDUMP(" (cost %u)\n", expensiveArgCost);
+
         if (expensiveArgIndex != begTab)
         {
             sortedArgs[expensiveArgIndex] = sortedArgs[begTab];

From ff2f75a6cf34cabe5a2421f8b8dd0320e9c1b092 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 22 Sep 2022 16:33:15 +0200
Subject: [PATCH 02/14] Fix build

---
 src/coreclr/jit/morph.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 68b62724e1f163..0c581b78a60c2c 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1696,9 +1696,13 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         //
         JITDUMP("  [%06u] -> #%d", expensiveArg->GetNode()->gtTreeID, begTab);
         if (argsRemaining == 1)
+        {
             JITDUMP(" (last arg)\n");
+        }
         else
+        {
             JITDUMP(" (cost %u)\n", expensiveArgCost);
+        }
 
         if (expensiveArgIndex != begTab)
         {

From 014f4b7e327e78a6d436a6611160f57d747ef6d3 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 22 Sep 2022 23:09:13 +0200
Subject: [PATCH 03/14] Fix arg reg check

---
 src/coreclr/jit/morph.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 0c581b78a60c2c..6631e76648a6de 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1279,12 +1279,12 @@ static regMaskTP EstimateRegisterUses(Compiler* comp, GenTree* tree)
             LclVarDsc*           desc = m_compiler->lvaGetDesc(node);
             if (!desc->lvDoNotEnregister && desc->lvIsRegArg)
             {
-                if (desc->GetArgReg() != REG_NA)
+                if ((desc->GetArgReg() != REG_NA) && (desc->GetArgReg() != REG_STK))
                 {
                     Registers |= genRegMask(desc->GetArgReg());
                 }
 #if FEATURE_MULTIREG_ARGS
-                if (desc->GetOtherArgReg() != REG_NA)
+                if ((desc->GetOtherArgReg() != REG_NA) && (desc->GetOtherArgReg() != REG_STK))
                 {
                     Registers |= genRegMask(desc->GetOtherArgReg());
                 }

From 9d0f1df92848e938709dc9b6cbf1a264cb9d0474 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Mon, 26 Sep 2022 19:02:21 +0200
Subject: [PATCH 04/14] Generalize

---
 src/coreclr/jit/morph.cpp | 197 +++++++++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 65 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 6631e76648a6de..81efcca26d05d5 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1340,6 +1340,11 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         sortedArgs[argCount++] = &arg;
     }
 
+    if (argCount <= 1)
+    {
+        return;
+    }
+
     // Set the beginning and end for the new argument table
     unsigned curInx;
     int      regCount      = 0;
@@ -1390,10 +1395,18 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                 endTab--;
                 argsRemaining--;
+                JITDUMP("   ");
+                for (unsigned i = 0; i < argCount; i++)
+                    JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+                JITDUMP("\n");
             }
         }
     } while (curInx > 0);
 
+    unsigned beforeFirstConstArgIndex = endTab;
+
+    unsigned firstCallArgIndex = begTab;
+
     if (argsRemaining > 0)
     {
         // Next take care of arguments that are calls.
@@ -1427,11 +1440,17 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     begTab++;
                     argsRemaining--;
+                    JITDUMP("   ");
+                    for (unsigned i = 0; i < argCount; i++)
+                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+                    JITDUMP("\n");
                 }
             }
         }
     }
 
+    unsigned afterCallIndex = begTab;
+
     //// Finally place arguments such that arguments nodes that use parameters
     //// are placed before arguments that would clobber the registers those
     //// parameters may be in.
@@ -1506,6 +1525,10 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     begTab++;
                     argsRemaining--;
+                    JITDUMP("   ");
+                    for (unsigned i = 0; i < argCount; i++)
+                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+                    JITDUMP("\n");
                 }
             }
         }
@@ -1549,32 +1572,15 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
                     endTab--;
                     argsRemaining--;
+                    JITDUMP("   ");
+                    for (unsigned i = 0; i < argCount; i++)
+                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+                    JITDUMP("\n");
                 }
             }
         } while (curInx > begTab);
     }
 
-    if (argsRemaining > 0)
-    {
-        // Place all arguments that do not clobber registers.
-        for (unsigned curIndex = begTab; curIndex <= endTab; curIndex++)
-        {
-            CallArg* arg = sortedArgs[curIndex];
-            if (arg->m_processed)
-                continue;
-
-            if (arg->AbiInfo.NumRegs <= 0)
-            {
-                arg->m_processed = true;
-                memmove(&sortedArgs[begTab + 1], &sortedArgs[begTab], (curIndex - begTab) * sizeof(CallArg*));
-                JITDUMP("  [%06u] -> #%d (stack arg)\n", arg->GetNode()->gtTreeID, begTab);
-                sortedArgs[begTab] = arg;
-                begTab++;
-                argsRemaining--;
-            }
-        }
-    }
-
     // Finally, take care of all the remaining arguments.
     // Note that we fill in one arg at a time using a while loop.
     bool costsPrepared = false; // Only prepare tree costs once, the first time through this loop
@@ -1643,50 +1649,50 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
         noway_assert(expensiveArgIndex != UINT_MAX);
 
-        if (expensiveArgNum > 1)
-        {
-            // If there are multiple args with the same cost then place arg
-            // that clobbers least amount of registers used by other args.
-            // Find arg that clobbers least amount of registers used by other args.
-            unsigned bestNumClobbered = UINT_MAX;
-            unsigned bestIndex        = 0;
-            for (unsigned i = begTab; i <= endTab; i++)
-            {
-                if (sortedArgs[i]->m_processed)
-                    continue;
-
-                if (sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
-                    continue;
-
-                regMaskTP clobbered = 0;
-                for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
-                {
-                    clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
-                }
-
-                assert(clobbered != 0);
-                unsigned clobberedLaterArgs = 0;
-                for (unsigned j = begTab; j <= endTab; j++)
-                {
-                    if (i == j || sortedArgs[i]->m_processed ||
-                        sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
-                        continue;
-
-                    regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
-                    if ((uses & clobbered) != 0)
-                        clobberedLaterArgs++;
-                }
-
-                if (clobberedLaterArgs < bestNumClobbered)
-                {
-                    bestNumClobbered = clobberedLaterArgs;
-                    bestIndex        = i;
-                }
-            }
-
-            expensiveArg      = sortedArgs[bestIndex];
-            expensiveArgIndex = bestIndex;
-        }
+        // if (expensiveArgNum > 1)
+        //{
+        //    // If there are multiple args with the same cost then place arg
+        //    // that clobbers least amount of registers used by other args.
+        //    // Find arg that clobbers least amount of registers used by other args.
+        //    unsigned bestNumClobbered = UINT_MAX;
+        //    unsigned bestIndex        = 0;
+        //    for (unsigned i = begTab; i <= endTab; i++)
+        //    {
+        //        if (sortedArgs[i]->m_processed)
+        //            continue;
+
+        //        if (sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
+        //            continue;
+
+        //        regMaskTP clobbered = 0;
+        //        for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
+        //        {
+        //            clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
+        //        }
+
+        //        assert(clobbered != 0);
+        //        unsigned clobberedLaterArgs = 0;
+        //        for (unsigned j = begTab; j <= endTab; j++)
+        //        {
+        //            if (i == j || sortedArgs[i]->m_processed ||
+        //                sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
+        //                continue;
+
+        //            regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
+        //            if ((uses & clobbered) != 0)
+        //                clobberedLaterArgs++;
+        //        }
+
+        //        if (clobberedLaterArgs < bestNumClobbered)
+        //        {
+        //            bestNumClobbered = clobberedLaterArgs;
+        //            bestIndex        = i;
+        //        }
+        //    }
+
+        //    expensiveArg      = sortedArgs[bestIndex];
+        //    expensiveArgIndex = bestIndex;
+        //}
 
         // put the most expensive arg towards the beginning of the table
 
@@ -1714,6 +1720,67 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         argsRemaining--;
 
         costsPrepared = true; // If we have more expensive arguments, don't re-evaluate the tree cost on the next loop
+        JITDUMP("   ");
+        for (unsigned i = 0; i < argCount; i++)
+            JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+        JITDUMP("\n");
+    }
+
+    unsigned insertionIndex = afterCallIndex;
+    // Move all arguments that do not clobber registers back to happen right after GTF_CALL args.
+    for (unsigned i = afterCallIndex; i != endTab + 1; i++)
+    {
+        CallArg* arg = sortedArgs[i];
+        if (arg->AbiInfo.NumRegs <= 0)
+        {
+            memmove(&sortedArgs[insertionIndex + 1], &sortedArgs[insertionIndex],
+                    (i - insertionIndex) * sizeof(CallArg*));
+            JITDUMP("  [%06u] -> #%d (stack arg)\n", arg->GetNode()->gtTreeID, insertionIndex);
+            sortedArgs[insertionIndex] = arg;
+            JITDUMP("   ");
+            for (unsigned i = 0; i < argCount; i++)
+                JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+            JITDUMP("\n");
+
+            insertionIndex++;
+        }
+    }
+
+    // Now move args whose placement will potentially clobber a later use to the end of the list.
+    unsigned max      = beforeFirstConstArgIndex - firstCallArgIndex + 1;
+    unsigned curIndex = firstCallArgIndex;
+    for (unsigned count = 0; count < max; count++)
+    {
+        CallArg* arg = sortedArgs[curIndex];
+        if (arg->AbiInfo.NumRegs <= 0)
+        {
+            curIndex++;
+            continue;
+        }
+
+        regMaskTP clobbered = 0;
+        for (unsigned j = 0; j < arg->AbiInfo.NumRegs; j++)
+        {
+            clobbered |= genRegMask(arg->AbiInfo.GetRegNum(j));
+        }
+
+        unsigned nextIndex = curIndex + 1;
+
+        for (unsigned i = curIndex + 1; i != beforeFirstConstArgIndex + 1; i++)
+        {
+            regMaskTP usedRegs = EstimateRegisterUses(comp, sortedArgs[i]->GetNode());
+            if ((clobbered & usedRegs) != 0)
+            {
+                memmove(&sortedArgs[curIndex], &sortedArgs[curIndex + 1],
+                        (beforeFirstConstArgIndex - curIndex) * sizeof(CallArg*));
+                assert(sortedArgs[beforeFirstConstArgIndex - 1] == sortedArgs[beforeFirstConstArgIndex]);
+                sortedArgs[beforeFirstConstArgIndex] = arg;
+                nextIndex--;
+                break;
+            }
+        }
+
+        curIndex = nextIndex;
     }
 
     // The table should now be completely filled and thus begTab should now be adjacent to endTab

From 9c4df8454434793965e1939ee80ac3b2aa007d10 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Tue, 27 Sep 2022 11:17:56 +0200
Subject: [PATCH 05/14] SPMI: Improve speed significantly for large diffs

This starts communicating more information about diffs back from
superpmi and starts using it in the driver. The current information is
the base size, diff size, base instructions, diff instructions and
context size.

The driver uses the base size/diff size to pick a small number of
interesting contexts and only creates disassembly for these contexts.
jit-analyze is then also invoked on only these contexts, but the
contexts are picked so that they overlap with what jit-analyze would
display. Additionally, we also pick the top 20 smallest contexts (in
terms of context size) -- I frequently use the smallest contexts with
diffs to investigate my change.

The new behavior is only enabled when no custom metrics are specified.
If custom metrics are specified we fall back to producing all
disassembly files and leave it up to jit-analyze to extract and analyze
the metrics specified.

Also remove --retainOnlyTopFiles. This is no longer necessary since we
avoid creating these disassembly files in the first place. Another
benefit is that all contexts mentioned by jit-analyze output will now be
part of the artifacts.

The net result is that we can now get SPMI diffs for changes with even
hundreds of thousands of diffs in the same time as it takes to get diffs
for a small change.

Fix #76178
---
 src/coreclr/inc/clr_std/vector                |  27 ++++
 src/coreclr/scripts/superpmi.py               | 114 +++++++++++----
 src/coreclr/scripts/superpmi_diffs.py         |   3 +-
 .../superpmi/superpmi-shared/standardpch.h    |   2 +
 .../tools/superpmi/superpmi/CMakeLists.txt    |   1 +
 .../tools/superpmi/superpmi/commandline.cpp   |   8 +-
 .../tools/superpmi/superpmi/commandline.h     |   4 +-
 .../tools/superpmi/superpmi/fileio.cpp        | 115 +++++++++++++++
 src/coreclr/tools/superpmi/superpmi/fileio.h  | 133 ++++++++++++++++++
 .../superpmi/superpmi/metricssummary.cpp      |  95 ++-----------
 .../tools/superpmi/superpmi/metricssummary.h  |   2 +-
 .../superpmi/superpmi/parallelsuperpmi.cpp    |  64 +++++++--
 .../tools/superpmi/superpmi/superpmi.cpp      | 128 +++++++++++------
 13 files changed, 518 insertions(+), 178 deletions(-)
 create mode 100644 src/coreclr/tools/superpmi/superpmi/fileio.cpp
 create mode 100644 src/coreclr/tools/superpmi/superpmi/fileio.h

diff --git a/src/coreclr/inc/clr_std/vector b/src/coreclr/inc/clr_std/vector
index c10ecf6ba5a92c..c2d1caba890aaf 100644
--- a/src/coreclr/inc/clr_std/vector
+++ b/src/coreclr/inc/clr_std/vector
@@ -215,6 +215,33 @@ namespace std
                 }
             }
 
+            vector(const vector<T>&) = delete;
+            vector<T>& operator=(const vector<T>&) = delete;
+
+            vector(vector<T>&& v) noexcept
+                : m_size(v.m_size)
+                , m_capacity(v.m_capacity)
+                , m_pelements(v.m_pelements)
+                , m_isBufferOwner(v.m_isBufferOwner)
+            {
+                v.m_isBufferOwner = false;
+            }
+
+            vector<T>& operator=(vector<T>&& v) noexcept
+            {
+                if (m_isBufferOwner)
+                {
+                    erase(m_pelements, 0, m_size);
+                    delete [] (BYTE*)m_pelements;
+                }
+
+                m_size = v.m_size;
+                m_capacity = v.m_capacity;
+                m_pelements = v.m_pelements;
+                m_isBufferOwner = v.m_isBufferOwner;
+                v.m_isBufferOwner = false;
+                return *this;
+            }
 
             size_t size() const
             {
diff --git a/src/coreclr/scripts/superpmi.py b/src/coreclr/scripts/superpmi.py
index 45dfcfe8746e51..ae45443620cd6c 100644
--- a/src/coreclr/scripts/superpmi.py
+++ b/src/coreclr/scripts/superpmi.py
@@ -342,7 +342,6 @@ def add_core_root_arguments(parser, build_type_default, build_type_help):
 asm_diff_parser.add_argument("--debuginfo", action="store_true", help="Include debug info after disassembly (sets COMPlus_JitDebugDump).")
 asm_diff_parser.add_argument("-tag", help="Specify a word to add to the directory name where the asm diffs will be placed")
 asm_diff_parser.add_argument("-metrics", action="append", help="Metrics option to pass to jit-analyze. Can be specified multiple times, or pass comma-separated values.")
-asm_diff_parser.add_argument("-retainOnlyTopFiles", action="store_true", help="Retain only top .dasm files with largest improvements or regressions and delete remaining files.")
 asm_diff_parser.add_argument("--diff_with_release", action="store_true", help="Specify if this is asmdiff using release binaries.")
 asm_diff_parser.add_argument("--git_diff", action="store_true", help="Produce a '.diff' file from 'base' and 'diff' folders if there were any differences.")
 
@@ -501,6 +500,11 @@ def read_csv_metrics(path):
 
     return dict
 
+def read_csv_diffs(path):
+    with open(path) as csv_file:
+        reader = csv.DictReader(csv_file)
+        return list(reader)
+
 def determine_clrjit_compiler_version(clrjit_path):
     """ Obtain the version of the compiler that was used to compile the clrjit at the specified path.
 
@@ -1568,7 +1572,7 @@ def replay_with_asm_diffs(self):
                 logging.info("Running asm diffs of %s", mch_file)
 
                 fail_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_fail.mcl")
-                diff_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff.mcl")
+                diffs_info = os.path.join(temp_location, os.path.basename(mch_file) + "_diffs.csv")
 
                 base_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_base_metrics.csv")
                 diff_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff_metrics.csv")
@@ -1577,7 +1581,7 @@ def replay_with_asm_diffs(self):
                     "-a",  # Asm diffs
                     "-v", "ewmi",  # display errors, warnings, missing, jit info
                     "-f", fail_mcl_file,  # Failing mc List
-                    "-diffMCList", diff_mcl_file,  # Create all of the diffs in an mcl file
+                    "-diffsInfo", diffs_info,  # Information about diffs
                     "-r", os.path.join(temp_location, "repro"),  # Repro name, create .mc repro files
                     "-baseMetricsSummary", base_metrics_summary_file, # Create summary of metrics we can use to get total code size impact
                     "-diffMetricsSummary", diff_metrics_summary_file,
@@ -1633,8 +1637,10 @@ def replay_with_asm_diffs(self):
                             repro_base_command_line = "{} {} {}".format(self.superpmi_path, " ".join(altjit_asm_diffs_flags), self.diff_jit_path)
                             save_repro_mc_files(temp_location, self.coreclr_args, artifacts_base_name, repro_base_command_line)
 
+                diffs = read_csv_diffs(diffs_info)
+
                 # This file had asm diffs; keep track of that.
-                has_diffs = is_nonzero_length_file(diff_mcl_file)
+                has_diffs = len(diffs) > 0
                 if has_diffs:
                     files_with_asm_diffs.append(mch_file)
 
@@ -1649,12 +1655,6 @@ def replay_with_asm_diffs(self):
                     if return_code == 0:
                         logging.warning("Warning: SuperPMI returned a zero exit code, but generated a non-zero-sized mcl file")
 
-                    self.diff_mcl_contents = None
-                    with open(diff_mcl_file) as file_handle:
-                        mcl_lines = file_handle.readlines()
-                        mcl_lines = [item.strip() for item in mcl_lines]
-                        self.diff_mcl_contents = mcl_lines
-
                     asm_root_dir = create_unique_directory_name(self.coreclr_args.spmi_location, "asm.{}".format(artifacts_base_name))
                     base_asm_location = os.path.join(asm_root_dir, "base")
                     diff_asm_location = os.path.join(asm_root_dir, "diff")
@@ -1672,13 +1672,13 @@ def replay_with_asm_diffs(self):
                     text_differences = queue.Queue()
                     jit_dump_differences = queue.Queue()
 
-                    async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
+                    async def create_replay_artifacts(print_prefix, context_index, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
                         """ Run superpmi over an MC to create JIT asm or JIT dumps for the method.
                         """
                         # Setup flags to call SuperPMI for both the diff jit and the base jit
 
                         flags = [
-                            "-c", item,
+                            "-c", str(context_index),
                             "-v", "q"  # only log from the jit.
                         ]
                         flags += altjit_replay_flags
@@ -1690,7 +1690,7 @@ async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_d
 
                             async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 command = [self.superpmi_path] + flags + [jit_path, mch_file]
-                                item_path = os.path.join(location, "{}{}".format(item, extension))
+                                item_path = os.path.join(location, "{}{}".format(context_index, extension))
                                 modified_env = env.copy()
                                 modified_env['COMPlus_JitStdOutFile'] = item_path
                                 logging.debug("%sGenerating %s", print_prefix, item_path)
@@ -1706,22 +1706,20 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                             diff_txt = await create_one_artifact(self.diff_jit_path, diff_location, flags + diff_option_flags_for_diff_artifact)
 
                             if base_txt != diff_txt:
-                                jit_differences_queue.put_nowait(item)
+                                jit_differences_queue.put_nowait(context_index)
                     ################################################################################################ end of create_replay_artifacts()
 
-                    diff_items = []
-                    for item in self.diff_mcl_contents:
-                        diff_items.append(item)
-
                     logging.info("Creating dasm files: %s %s", base_asm_location, diff_asm_location)
-                    subproc_helper = AsyncSubprocessHelper(diff_items, verbose=True)
+                    dasm_contexts = self.pick_contexts_to_disassemble(diffs)
+                    subproc_helper = AsyncSubprocessHelper(dasm_contexts, verbose=True)
                     subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, asm_complus_vars_full_env, text_differences, base_asm_location, diff_asm_location, ".dasm")
 
                     if self.coreclr_args.diff_jit_dump:
                         logging.info("Creating JitDump files: %s %s", base_dump_location, diff_dump_location)
                         subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, jit_dump_complus_vars_full_env, jit_dump_differences, base_dump_location, diff_dump_location, ".txt")
 
-                    logging.info("Differences found. To replay SuperPMI use:")
+                    logging.info("Differences found. To replay SuperPMI use:".format(len(diffs)))
+
                     logging.info("")
                     for var, value in asm_complus_vars.items():
                         print_platform_specific_environment_vars(logging.INFO, self.coreclr_args, var, value)
@@ -1736,9 +1734,10 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                         logging.info("%s %s -c ### %s %s", self.superpmi_path, " ".join(altjit_replay_flags), self.diff_jit_path, mch_file)
                         logging.info("")
 
-                    logging.debug("Method numbers with binary differences:")
-                    for item in self.diff_mcl_contents:
-                        logging.debug(item)
+                    smallest = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
+                    logging.debug("Smallest {} contexts with binary differences:".format(len(smallest)))
+                    for diff in smallest:
+                        logging.debug(diff["Context"])
                     logging.debug("")
 
                     if base_metrics is not None and diff_metrics is not None:
@@ -1769,13 +1768,22 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 # It appears we have a built jit-analyze on the path, so try to run it.
                                 jit_analyze_summary_file = os.path.join(asm_root_dir, "summary.md")
                                 command = [ jit_analyze_path, "--md", jit_analyze_summary_file, "-r", "--base", base_asm_location, "--diff", diff_asm_location ]
-                                if self.coreclr_args.retainOnlyTopFiles:
-                                    command += [ "--retainOnlyTopFiles" ]
                                 if self.coreclr_args.metrics:
                                     command += [ "--metrics", ",".join(self.coreclr_args.metrics) ]
                                 elif base_bytes is not None and diff_bytes is not None:
                                     command += [ "--override-total-base-metric", str(base_bytes), "--override-total-diff-metric", str(diff_bytes) ]
 
+                                if len(dasm_contexts) < len(diffs):
+                                    # Avoid producing analysis output that assumes these are all contexts
+                                    # with diffs. When no custom metrics are specified we pick only a subset
+                                    # of interesting contexts to disassemble, to avoid spending too long.
+                                    # See pick_contexts_to_disassemble.
+                                    command += [ "--is-subset-of-diffs" ]
+                                else:
+                                    # Ask jit-analyze to avoid producing analysis output that assumes these are
+                                    # all contexts (we never produce .dasm files for contexts without diffs).
+                                    command += [ "--is-diffs-only" ]
+
                                 run_and_log(command, logging.INFO)
                                 ran_jit_analyze = True
 
@@ -1800,6 +1808,14 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                     else:
                         logging.warning("No textual differences. Is this an issue with coredistools?")
 
+                    # If we are not specifying custom metrics then print a summary here, otherwise leave the summarization up to jit-analyze.
+                    if self.coreclr_args.metrics is None:
+                        num_improvements = sum(1 for r in diffs if int(r["Diff size"]) < int(r["Base size"]))
+                        num_regressions = sum(1 for r in diffs if int(r["Diff size"]) > int(r["Base size"]))
+                        logging.info("{} contexts with differences found ({} improvements, {} regressions)".format(len(diffs), num_improvements, num_regressions))
+                        logging.info("")
+                        logging.info("")
+
                     if self.coreclr_args.diff_jit_dump:
                         try:
                             current_jit_dump_diff = jit_dump_differences.get_nowait()
@@ -1994,6 +2010,49 @@ def write_pivot_section(row):
         return result
         ################################################################################################ end of replay_with_asm_diffs()
 
+    def pick_contexts_to_disassemble(self, diffs):
+        """ Given information about diffs, pick the context numbers to create .dasm files for
+
+        Returns:
+            List of method context numbers to create disassembly for.
+        """
+
+        # If there are non-default metrics then we need to disassemble
+        # everything so that jit-analyze can handle those.
+        if self.coreclr_args.metrics is not None:
+            contexts = diffs
+        else:
+            # In the default case we have size improvements/regressions
+            # available without needing to disassemble all, so pick a subset of
+            # interesting diffs to pass to jit-analyze.
+
+            # 20 smallest method contexts with diffs
+            smallest_contexts = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
+            # Order by byte-wise improvement, largest improvements first
+            by_diff_size = sorted(diffs, key=lambda r: int(r["Diff size"]) - int(r["Base size"]))
+            # 20 top improvements, byte-wise
+            top_improvements_bytes = by_diff_size[:20]
+            # 20 top regressions, byte-wise
+            top_regressions_bytes = by_diff_size[-20:]
+
+            # Order by percentage-wise size improvement, largest improvements first
+            def diff_pct(r):
+                base = int(r["Base size"])
+                if base == 0:
+                    return 0
+                diff = int(r["Diff size"])
+                return (diff - base) / base
+
+            by_diff_size_pct = sorted(diffs, key=diff_pct)
+            top_improvements_pct = by_diff_size_pct[:20]
+            top_regressions_pct = by_diff_size_pct[-20:]
+
+            contexts = smallest_contexts + top_improvements_bytes + top_regressions_bytes + top_improvements_pct + top_regressions_pct
+
+        final_contexts_indices = list(set(int(r["Context"]) for r in contexts))
+        final_contexts_indices.sort()
+        return final_contexts_indices
+
 ################################################################################
 # SuperPMI Replay/TP diff
 ################################################################################
@@ -3908,11 +3967,6 @@ def verify_base_diff_args():
                             lambda unused: True,
                             "Unable to set metrics.")
 
-        coreclr_args.verify(args,
-                            "retainOnlyTopFiles",
-                            lambda unused: True,
-                            "Unable to set retainOnlyTopFiles.")
-
         coreclr_args.verify(args,
                             "diff_with_release",
                             lambda unused: True,
diff --git a/src/coreclr/scripts/superpmi_diffs.py b/src/coreclr/scripts/superpmi_diffs.py
index 47243ad43d3a23..b7af1427fc481c 100644
--- a/src/coreclr/scripts/superpmi_diffs.py
+++ b/src/coreclr/scripts/superpmi_diffs.py
@@ -216,8 +216,7 @@ def main(main_args):
         "-spmi_location", spmi_location,
         "-error_limit", "100",
         "-log_level", "debug",
-        "-log_file", log_file,
-        "-retainOnlyTopFiles"])
+        "-log_file", log_file])
 
     print("Running superpmi.py tpdiff")
 
diff --git a/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h b/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
index dc4ce235ffbf2a..e0e83c41d03ab6 100644
--- a/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
+++ b/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
@@ -62,6 +62,7 @@
 
 // Getting STL to work with PAL is difficult, so reimplement STL functionality to not require it.
 #ifdef TARGET_UNIX
+#include "clr_std/utility"
 #include "clr_std/string"
 #include "clr_std/algorithm"
 #include "clr_std/vector"
@@ -69,6 +70,7 @@
 #ifndef USE_STL
 #define USE_STL
 #endif // USE_STL
+#include <utility>
 #include <string>
 #include <algorithm>
 #include <vector>
diff --git a/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt b/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
index 63237450898e3b..76ab73ffdabfe2 100644
--- a/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
+++ b/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
@@ -26,6 +26,7 @@ set(SUPERPMI_SOURCES
     neardiffer.cpp
     parallelsuperpmi.cpp
     superpmi.cpp
+    fileio.cpp
     jithost.cpp
     ../superpmi-shared/callutils.cpp
     ../superpmi-shared/compileresult.cpp
diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.cpp b/src/coreclr/tools/superpmi/superpmi/commandline.cpp
index 563d8cb16248c0..ca407889e87235 100644
--- a/src/coreclr/tools/superpmi/superpmi/commandline.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/commandline.cpp
@@ -318,7 +318,7 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
 
                 o->mclFilename = argv[i];
             }
-            else if ((_strnicmp(&argv[i][1], "diffMCList", 10) == 0))
+            else if ((_strnicmp(&argv[i][1], "diffsInfo", 9) == 0))
             {
                 if (++i >= argc)
                 {
@@ -326,7 +326,7 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
                     return false;
                 }
 
-                o->diffMCLFilename = argv[i];
+                o->diffsInfo = argv[i];
             }
             else if ((_strnicmp(&argv[i][1], "target", 6) == 0))
             {
@@ -641,9 +641,9 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
         DumpHelp(argv[0]);
         return false;
     }
-    if (o->diffMCLFilename != nullptr && !o->applyDiff)
+    if (o->diffsInfo != nullptr && !o->applyDiff)
     {
-        LogError("-diffMCList specified without -applyDiff.");
+        LogError("-diffsInfo specified without -applyDiff.");
         DumpHelp(argv[0]);
         return false;
     }
diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.h b/src/coreclr/tools/superpmi/superpmi/commandline.h
index 055adf00295cda..e801f8cb4f752d 100644
--- a/src/coreclr/tools/superpmi/superpmi/commandline.h
+++ b/src/coreclr/tools/superpmi/superpmi/commandline.h
@@ -39,7 +39,7 @@ class CommandLine
             , baseMetricsSummaryFile(nullptr)
             , diffMetricsSummaryFile(nullptr)
             , mclFilename(nullptr)
-            , diffMCLFilename(nullptr)
+            , diffsInfo(nullptr)
             , targetArchitecture(nullptr)
             , compileList(nullptr)
             , offset(-1)
@@ -72,7 +72,7 @@ class CommandLine
         char* baseMetricsSummaryFile;
         char* diffMetricsSummaryFile;
         char* mclFilename;
-        char* diffMCLFilename;
+        char* diffsInfo;
         char* targetArchitecture;
         char* compileList;
         int   offset;
diff --git a/src/coreclr/tools/superpmi/superpmi/fileio.cpp b/src/coreclr/tools/superpmi/superpmi/fileio.cpp
new file mode 100644
index 00000000000000..ed16485a038e8a
--- /dev/null
+++ b/src/coreclr/tools/superpmi/superpmi/fileio.cpp
@@ -0,0 +1,115 @@
+#include "standardpch.h"
+#include "fileio.h"
+
+bool FileWriter::Printf(const char* fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+
+    char stackBuffer[512];
+    size_t bufferSize = sizeof(stackBuffer);
+    char* pBuffer = stackBuffer;
+    while (true)
+    {
+        va_list argsCopy;
+        va_copy(argsCopy, args);
+        int printed = _vsnprintf_s(pBuffer, bufferSize, _TRUNCATE, fmt, argsCopy);
+        va_end(argsCopy);
+
+        if (printed < 0)
+        {
+            // buffer too small
+            if (pBuffer != stackBuffer)
+                delete[] pBuffer;
+
+            bufferSize *= 2;
+            pBuffer = new char[bufferSize];
+        }
+        else
+        {
+            DWORD numWritten;
+            bool result =
+                WriteFile(m_file.Get(), pBuffer, static_cast<DWORD>(printed), &numWritten, nullptr) &&
+                (numWritten == static_cast<DWORD>(printed));
+
+            if (pBuffer != stackBuffer)
+                delete[] pBuffer;
+
+            va_end(args);
+            return result;
+        }
+    }
+}
+
+bool FileWriter::CreateNew(const char* path, FileWriter* fw)
+{
+    FileHandle handle(CreateFile(path, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr));
+    if (!handle.IsValid())
+    {
+        return false;
+    }
+
+    *fw = FileWriter(std::move(handle));
+    return true;
+}
+
+bool FileLineReader::Open(const char* path, FileLineReader* fr)
+{
+    FileHandle file(CreateFile(path, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr));
+    if (!file.IsValid())
+    {
+        return false;
+    }
+
+    LARGE_INTEGER size;
+    if (!GetFileSizeEx(file.Get(), &size))
+    {
+        return false;
+    }
+
+    if (static_cast<ULONGLONG>(size.QuadPart) > SIZE_MAX)
+    {
+        return false;
+    }
+
+    FileMappingHandle mapping(CreateFileMapping(file.Get(), nullptr, PAGE_READONLY, size.u.HighPart, size.u.LowPart, nullptr));
+    if (!mapping.IsValid())
+    {
+        return false;
+    }
+
+    FileViewHandle view(MapViewOfFile(mapping.Get(), FILE_MAP_READ, 0, 0, 0));
+    if (!view.IsValid())
+    {
+        return false;
+    }
+
+    *fr = FileLineReader(std::move(file), std::move(mapping), std::move(view), static_cast<size_t>(size.QuadPart));
+    return true;
+}
+
+bool FileLineReader::AdvanceLine()
+{
+    if (m_cur >= m_end)
+    {
+        return false;
+    }
+
+    char* end = m_cur;
+    while (end < m_end && *end != '\r' && *end != '\n')
+    {
+        end++;
+    }
+
+    m_currentLine.resize(end - m_cur + 1);
+    memcpy(m_currentLine.data(), m_cur, end - m_cur);
+    m_currentLine[end - m_cur] = '\0';
+
+    m_cur = end;
+    if (m_cur < m_end && *m_cur == '\r')
+        m_cur++;
+    if (m_cur < m_end && *m_cur == '\n')
+        m_cur++;
+
+    return true;
+}
diff --git a/src/coreclr/tools/superpmi/superpmi/fileio.h b/src/coreclr/tools/superpmi/superpmi/fileio.h
new file mode 100644
index 00000000000000..a88e74d6ee00c5
--- /dev/null
+++ b/src/coreclr/tools/superpmi/superpmi/fileio.h
@@ -0,0 +1,133 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#ifndef _FileIO
+#define _FileIO
+
+template<typename HandleSpec>
+struct HandleWrapper
+{
+    using HandleType = typename HandleSpec::Type;
+
+    HandleWrapper()
+        : m_handle(HandleSpec::Invalid())
+    {
+    }
+
+    explicit HandleWrapper(HandleType handle)
+        : m_handle(handle)
+    {
+    }
+
+    ~HandleWrapper()
+    {
+        if (m_handle != HandleSpec::Invalid())
+        {
+            HandleSpec::Close(m_handle);
+            m_handle = HandleSpec::Invalid();
+        }
+    }
+
+    HandleWrapper(const HandleWrapper&) = delete;
+    HandleWrapper& operator=(HandleWrapper&) = delete;
+
+    HandleWrapper(HandleWrapper&& hw) noexcept
+        : m_handle(hw.m_handle)
+    {
+        hw.m_handle = HandleSpec::Invalid();
+    }
+
+    HandleWrapper& operator=(HandleWrapper&& hw) noexcept
+    {
+        if (m_handle != HandleSpec::Invalid())
+            HandleSpec::Close(m_handle);
+
+        m_handle = hw.m_handle;
+        hw.m_handle = HandleSpec::Invalid();
+        return *this;
+    }
+
+    bool IsValid() { return m_handle != HandleSpec::Invalid(); }
+    HandleType Get() { return m_handle; }
+
+private:
+    HandleType m_handle;
+};
+
+struct FileHandleSpec
+{
+    using Type = HANDLE;
+    static HANDLE Invalid() { return INVALID_HANDLE_VALUE; }
+    static void Close(HANDLE h) { CloseHandle(h); }
+};
+
+struct FileMappingHandleSpec
+{
+    using Type = HANDLE;
+    static HANDLE Invalid() { return nullptr; }
+    static void Close(HANDLE h) { CloseHandle(h); }
+};
+
+struct FileViewHandleSpec
+{
+    using Type = LPVOID;
+    static LPVOID Invalid() { return nullptr; }
+    static void Close(LPVOID p) { UnmapViewOfFile(p); }
+};
+
+typedef HandleWrapper<FileHandleSpec> FileHandle;
+typedef HandleWrapper<FileMappingHandleSpec> FileMappingHandle;
+typedef HandleWrapper<FileViewHandleSpec> FileViewHandle;
+
+class FileWriter
+{
+    FileHandle m_file;
+
+    FileWriter(FileHandle file)
+        : m_file(std::move(file))
+    {
+    }
+
+public:
+    FileWriter()
+    {
+    }
+
+    bool Printf(const char* fmt, ...);
+
+    static bool CreateNew(const char* path, FileWriter* fw);
+};
+
+class FileLineReader
+{
+    FileHandle m_file;
+    FileMappingHandle m_fileMapping;
+    FileViewHandle m_view;
+
+    char* m_cur;
+    char* m_end;
+    std::vector<char> m_currentLine;
+
+    FileLineReader(FileHandle file, FileMappingHandle fileMapping, FileViewHandle view, size_t size)
+        : m_file(std::move(file))
+        , m_fileMapping(std::move(fileMapping))
+        , m_view(std::move(view))
+        , m_cur(static_cast<char*>(m_view.Get()))
+        , m_end(static_cast<char*>(m_view.Get()) + size)
+    {
+    }
+
+public:
+    FileLineReader()
+        : m_cur(nullptr)
+        , m_end(nullptr)
+    {
+    }
+
+    bool AdvanceLine();
+    const char* GetCurrentLine() { return m_currentLine.data(); }
+
+    static bool Open(const char* path, FileLineReader* fr);
+};
+
+#endif
diff --git a/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp b/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
index cb9c908ee72d80..7967cf90293fb4 100644
--- a/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
@@ -4,6 +4,7 @@
 #include "standardpch.h"
 #include "metricssummary.h"
 #include "logging.h"
+#include "fileio.h"
 
 void MetricsSummary::AggregateFrom(const MetricsSummary& other)
 {
@@ -24,50 +25,15 @@ void MetricsSummaries::AggregateFrom(const MetricsSummaries& other)
     FullOpts.AggregateFrom(other.FullOpts);
 }
 
-struct FileHandleWrapper
-{
-    FileHandleWrapper(HANDLE hFile)
-        : hFile(hFile)
-    {
-    }
-
-    ~FileHandleWrapper()
-    {
-        CloseHandle(hFile);
-    }
-
-    HANDLE get() { return hFile; }
-
-private:
-    HANDLE hFile;
-};
-
-static bool FilePrintf(HANDLE hFile, const char* fmt, ...)
-{
-    va_list args;
-    va_start(args, fmt);
-
-    char buffer[4096];
-    int len = vsprintf_s(buffer, ARRAY_SIZE(buffer), fmt, args);
-    DWORD numWritten;
-    bool result =
-        WriteFile(hFile, buffer, static_cast<DWORD>(len), &numWritten, nullptr) && (numWritten == static_cast<DWORD>(len));
-
-    va_end(args);
-
-    return result;
-}
-
 bool MetricsSummaries::SaveToFile(const char* path)
 {
-    FileHandleWrapper file(CreateFile(path, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr));
-    if (file.get() == INVALID_HANDLE_VALUE)
+    FileWriter file;
+    if (!FileWriter::CreateNew(path, &file))
     {
         return false;
     }
 
-    if (!FilePrintf(
-        file.get(),
+    if (!file.Printf(
         "Successful compiles,Failing compiles,Missing compiles,Contexts with diffs,"
         "Code bytes,Diffed code bytes,Executed instructions,Diff executed instructions,Name\n"))
     {
@@ -75,16 +41,15 @@ bool MetricsSummaries::SaveToFile(const char* path)
     }
 
     return
-        WriteRow(file.get(), "Overall", Overall) &&
-        WriteRow(file.get(), "MinOpts", MinOpts) &&
-        WriteRow(file.get(), "FullOpts", FullOpts);
+        WriteRow(file, "Overall", Overall) &&
+        WriteRow(file, "MinOpts", MinOpts) &&
+        WriteRow(file, "FullOpts", FullOpts);
 }
 
-bool MetricsSummaries::WriteRow(HANDLE hFile, const char* name, const MetricsSummary& summary)
+bool MetricsSummaries::WriteRow(FileWriter& fw, const char* name, const MetricsSummary& summary)
 {
     return
-        FilePrintf(
-            hFile,
+        fw.Printf(
             "%d,%d,%d,%d,%lld,%lld,%lld,%lld,%s\n",
             summary.SuccessfulCompiles,
             summary.FailingCompiles,
@@ -99,59 +64,27 @@ bool MetricsSummaries::WriteRow(HANDLE hFile, const char* name, const MetricsSum
 
 bool MetricsSummaries::LoadFromFile(const char* path, MetricsSummaries* metrics)
 {
-    FileHandleWrapper file(CreateFile(path, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr));
-    if (file.get() == INVALID_HANDLE_VALUE)
-    {
-        return false;
-    }
-
-    LARGE_INTEGER len;
-    if (!GetFileSizeEx(file.get(), &len))
+    FileLineReader reader;
+    if (!FileLineReader::Open(path, &reader))
     {
         return false;
     }
 
-    DWORD stringLen = static_cast<DWORD>(len.QuadPart);
-    std::vector<char> content(stringLen);
-    DWORD numRead;
-    if (!ReadFile(file.get(), content.data(), stringLen, &numRead, nullptr) || (numRead != stringLen))
+    if (!reader.AdvanceLine())
     {
         return false;
     }
 
-    std::vector<char> line;
-    size_t index = 0;
-    auto nextLine = [&line, &content, &index]()
-    {
-        size_t end = index;
-        while ((end < content.size()) && (content[end] != '\r') && (content[end] != '\n'))
-        {
-            end++;
-        }
-
-        line.resize(end - index + 1);
-        memcpy(line.data(), &content[index], end - index);
-        line[end - index] = '\0';
-
-        index = end;
-        if ((index < content.size()) && (content[index] == '\r'))
-            index++;
-        if ((index < content.size()) && (content[index] == '\n'))
-            index++;
-    };
-
     *metrics = MetricsSummaries();
-    nextLine();
     bool result = true;
-    while (index < content.size())
+    while (reader.AdvanceLine())
     {
-        nextLine();
         MetricsSummary summary;
 
         char name[32];
         int scanResult =
             sscanf_s(
-                line.data(),
+                reader.GetCurrentLine(),
                 "%d,%d,%d,%d,%lld,%lld,%lld,%lld,%s",
                 &summary.SuccessfulCompiles,
                 &summary.FailingCompiles,
diff --git a/src/coreclr/tools/superpmi/superpmi/metricssummary.h b/src/coreclr/tools/superpmi/superpmi/metricssummary.h
index 118d6aac224006..14e364cd9fcf3e 100644
--- a/src/coreclr/tools/superpmi/superpmi/metricssummary.h
+++ b/src/coreclr/tools/superpmi/superpmi/metricssummary.h
@@ -39,7 +39,7 @@ class MetricsSummaries
     bool SaveToFile(const char* path);
     static bool LoadFromFile(const char* path, MetricsSummaries* metrics);
 private:
-    static bool WriteRow(HANDLE hFile, const char* name, const MetricsSummary& summary);
+    static bool WriteRow(class FileWriter& fw, const char* name, const MetricsSummary& summary);
 };
 
 #endif
diff --git a/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp b/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
index dfca5adcc376a6..610ccdf732b73e 100644
--- a/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
@@ -9,6 +9,7 @@
 #include "commandline.h"
 #include "errorhandling.h"
 #include "metricssummary.h"
+#include "fileio.h"
 
 #define MAX_LOG_LINE_SIZE 0x1000 // 4 KB
 
@@ -349,7 +350,7 @@ struct PerWorkerData
     HANDLE hStdError;
 
     char* failingMCListPath;
-    char* diffMCListPath;
+    char* diffsInfoPath;
     char* stdOutputPath;
     char* stdErrorPath;
     char* baseMetricsSummaryPath;
@@ -359,7 +360,7 @@ struct PerWorkerData
         : hStdOutput(INVALID_HANDLE_VALUE)
         , hStdError(INVALID_HANDLE_VALUE)
         , failingMCListPath(nullptr)
-        , diffMCListPath(nullptr)
+        , diffsInfoPath(nullptr)
         , stdOutputPath(nullptr)
         , stdErrorPath(nullptr)
         , baseMetricsSummaryPath(nullptr)
@@ -368,7 +369,7 @@ struct PerWorkerData
     }
 };
 
-void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::*mclPath)
+static void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::*mclPath)
 {
     int **MCL = new int *[workerCount], *MCLCount = new int[workerCount], totalCount = 0;
 
@@ -396,6 +397,39 @@ void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int workerCou
         LogError("Unable to write to MCL file %s.", mclFilename);
 }
 
+static void MergeWorkerCsvs(char* csvFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::* csvPath)
+{
+    FileWriter fw;
+    if (!FileWriter::CreateNew(csvFilename, &fw))
+    {
+        LogError("Could not create file %s", csvFilename);
+        return;
+    }
+
+    bool hasHeader = false;
+    for (int i = 0; i < workerCount; i++)
+    {
+        FileLineReader reader;
+        if (!FileLineReader::Open(workerData[i].*csvPath, &reader))
+        {
+            LogError("Could not open child CSV file %s", workerData[i].*csvPath);
+            continue;
+        }
+
+        if (hasHeader && !reader.AdvanceLine())
+        {
+            continue;
+        }
+
+        while (reader.AdvanceLine())
+        {
+             fw.Printf("%s\n", reader.GetCurrentLine());
+        }
+
+        hasHeader = true;
+    }
+}
+
 #define MAX_CMDLINE_SIZE 0x1000 // 4 KB
 
 //-------------------------------------------------------------
@@ -528,8 +562,8 @@ int doParallelSuperPMI(CommandLine::Options& o)
     LogVerbose("Using child (%s) with args (%s)", spmiFilename, spmiArgs);
     if (o.mclFilename != nullptr)
         LogVerbose(" failingMCList=%s", o.mclFilename);
-    if (o.diffMCLFilename != nullptr)
-        LogVerbose(" diffMCLFilename=%s", o.diffMCLFilename);
+    if (o.diffsInfo != nullptr)
+        LogVerbose(" diffsInfo=%s", o.diffsInfo);
     LogVerbose(" workerCount=%d, skipCleanup=%d.", o.workerCount, o.skipCleanup);
 
     PerWorkerData* perWorkerData = new PerWorkerData[o.workerCount];
@@ -551,10 +585,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
             sprintf_s(wd.failingMCListPath, MAX_PATH, "%sParallelSuperPMI-%u-%d.mcl", tempPath, randNumber, i);
         }
 
-        if (o.diffMCLFilename != nullptr)
+        if (o.diffsInfo != nullptr)
         {
-            wd.diffMCListPath = new char[MAX_PATH];
-            sprintf_s(wd.diffMCListPath, MAX_PATH, "%sParallelSuperPMI-Diff-%u-%d.mcl", tempPath, randNumber, i);
+            wd.diffsInfoPath = new char[MAX_PATH];
+            sprintf_s(wd.diffsInfoPath, MAX_PATH, "%sParallelSuperPMI-Diff-%u-%d.mcl", tempPath, randNumber, i);
         }
 
         if (o.baseMetricsSummaryFile != nullptr)
@@ -593,10 +627,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
                                       wd.failingMCListPath);
         }
 
-        if (wd.diffMCListPath != nullptr)
+        if (wd.diffsInfoPath != nullptr)
         {
-            bytesWritten += sprintf_s(cmdLine + bytesWritten, MAX_CMDLINE_SIZE - bytesWritten, " -diffMCList %s",
-                                      wd.diffMCListPath);
+            bytesWritten += sprintf_s(cmdLine + bytesWritten, MAX_CMDLINE_SIZE - bytesWritten, " -diffsInfo %s",
+                                      wd.diffsInfoPath);
         }
 
         if (wd.baseMetricsSummaryPath != nullptr)
@@ -719,10 +753,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
             MergeWorkerMCLs(o.mclFilename, perWorkerData, o.workerCount, &PerWorkerData::failingMCListPath);
         }
 
-        if (o.diffMCLFilename != nullptr && !usageError)
+        if (o.diffsInfo != nullptr && !usageError)
         {
             // Concat the resulting diff .mcl files
-            MergeWorkerMCLs(o.diffMCLFilename, perWorkerData, o.workerCount, &PerWorkerData::diffMCListPath);
+            MergeWorkerCsvs(o.diffsInfo, perWorkerData, o.workerCount, &PerWorkerData::diffsInfoPath);
         }
 
         if (o.baseMetricsSummaryFile != nullptr && !usageError)
@@ -761,9 +795,9 @@ int doParallelSuperPMI(CommandLine::Options& o)
             {
                 DeleteFile(wd.failingMCListPath);
             }
-            if (wd.diffMCListPath != nullptr)
+            if (wd.diffsInfoPath != nullptr)
             {
-                DeleteFile(wd.diffMCListPath);
+                DeleteFile(wd.diffsInfoPath);
             }
             if (wd.baseMetricsSummaryPath != nullptr)
             {
diff --git a/src/coreclr/tools/superpmi/superpmi/superpmi.cpp b/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
index ddf595281691bd..7875d4ca111199 100644
--- a/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
@@ -19,6 +19,7 @@
 #include "methodstatsemitter.h"
 #include "spmiutil.h"
 #include "metricssummary.h"
+#include "fileio.h"
 
 extern int doParallelSuperPMI(CommandLine::Options& o);
 
@@ -64,54 +65,47 @@ void SetSuperPmiTargetArchitecture(const char* targetArchitecture)
     }
 }
 
+enum class NearDifferResult
+{
+    SuccessWithoutDiff,
+    SuccessWithDiff,
+    Failure,
+};
+
 // This function uses PAL_TRY, so it can't be in the a function that requires object unwinding. Extracting it out here
 // avoids compiler error.
 //
-static void InvokeNearDiffer(NearDiffer*           nearDiffer,
-                             CommandLine::Options* o,
-                             MethodContext**       mc,
-                             CompileResult**       crl,
-                             bool*                 hasDiff,
-                             MethodContextReader** reader,
-                             MCList*               failingMCL,
-                             MCList*               diffMCL)
+static NearDifferResult InvokeNearDiffer(NearDiffer*           nearDiffer,
+                                   MethodContext**       mc,
+                                   CompileResult**       crl,
+                                   MethodContextReader** reader)
 {
+
     struct Param : FilterSuperPMIExceptionsParam_CaptureException
     {
         NearDiffer*           nearDiffer;
-        CommandLine::Options* o;
         MethodContext**       mc;
         CompileResult**       crl;
-        bool*                 hasDiff;
         MethodContextReader** reader;
-        MCList*               failingMCL;
-        MCList*               diffMCL;
+        NearDifferResult      result;
     } param;
     param.nearDiffer = nearDiffer;
-    param.o          = o;
     param.mc         = mc;
     param.crl        = crl;
-    param.hasDiff    = hasDiff;
     param.reader     = reader;
-    param.failingMCL = failingMCL;
-    param.diffMCL    = diffMCL;
-    *hasDiff = false;
+    param.result     = NearDifferResult::Failure;
 
     PAL_TRY(Param*, pParam, &param)
     {
         if (!pParam->nearDiffer->compare(*pParam->mc, *pParam->crl, (*pParam->mc)->cr))
         {
-            *pParam->hasDiff = true;
+            pParam->result = NearDifferResult::SuccessWithDiff;
             LogIssue(ISSUE_ASM_DIFF, "main method %d of size %d differs", (*pParam->reader)->GetMethodContextIndex(),
                      (*pParam->mc)->methodSize);
-
-            // This is a difference in ASM outputs from Jit1 & Jit2 and not a playback failure
-            // We will add this MC to the diffMCList if one is requested
-            // Otherwise this will end up in failingMCList
-            if ((*pParam->o).diffMCLFilename != nullptr)
-                (*pParam->diffMCL).AddMethodToMCL((*pParam->reader)->GetMethodContextIndex());
-            else if ((*pParam->o).mclFilename != nullptr)
-                (*pParam->failingMCL).AddMethodToMCL((*pParam->reader)->GetMethodContextIndex());
+        }
+        else
+        {
+            pParam->result = NearDifferResult::SuccessWithoutDiff;
         }
     }
     PAL_EXCEPT_FILTER(FilterSuperPMIExceptions_CaptureExceptionAndStop)
@@ -121,10 +115,26 @@ static void InvokeNearDiffer(NearDiffer*           nearDiffer,
         LogError("main method %d of size %d failed to load and compile correctly.", (*reader)->GetMethodContextIndex(),
                  (*mc)->methodSize);
         e.ShowAndDeleteMessage();
-        if ((*o).mclFilename != nullptr)
-            (*failingMCL).AddMethodToMCL((*reader)->GetMethodContextIndex());
+        param.result = NearDifferResult::Failure;
     }
     PAL_ENDTRY
+
+    return param.result;
+}
+
+static bool PrintDiffsCsvHeader(FileWriter& fw)
+{
+    return fw.Printf("Context,Context size,Base size,Diff size,Base instructions,Diff instructions\n");
+}
+
+static bool PrintDiffsCsvRow(
+    FileWriter& fw,
+    int context,
+    uint32_t contextSize,
+    long long baseSize, long long diffSize,
+    long long baseInstructions, long long diffInstructions)
+{
+    return fw.Printf("%d,%u,%lld,%lld,%lld,%lld\n", context, contextSize, baseSize, diffSize, baseInstructions, diffInstructions);
 }
 
 // Run superpmi. The return value is as follows:
@@ -171,7 +181,8 @@ int __cdecl main(int argc, char* argv[])
 #endif
 
     bool   collectThroughput = false;
-    MCList failingToReplayMCL, diffMCL;
+    MCList failingToReplayMCL;
+    FileWriter diffCsv;
 
     CommandLine::Options o;
     if (!CommandLine::Parse(argc, argv, &o))
@@ -230,9 +241,13 @@ int __cdecl main(int argc, char* argv[])
     {
         failingToReplayMCL.InitializeMCL(o.mclFilename);
     }
-    if (o.diffMCLFilename != nullptr)
+    if (o.diffsInfo != nullptr)
     {
-        diffMCL.InitializeMCL(o.diffMCLFilename);
+        if (!FileWriter::CreateNew(o.diffsInfo, &diffCsv))
+        {
+            LogError("Could not create file %s", o.diffsInfo);
+            return (int)SpmiResult::GeneralFailure;
+        }
     }
 
     SetDebugDumpVariables();
@@ -266,6 +281,11 @@ int __cdecl main(int argc, char* argv[])
         }
     }
 
+    if (o.diffsInfo != nullptr)
+    {
+        PrintDiffsCsvHeader(diffCsv);
+    }
+
     MetricsSummaries totalBaseMetrics;
     MetricsSummaries totalDiffMetrics;
 
@@ -552,16 +572,42 @@ int __cdecl main(int argc, char* argv[])
                 }
                 else
                 {
-                    bool hasDiff;
-                    InvokeNearDiffer(&nearDiffer, &o, &mc, &crl, &hasDiff, &reader, &failingToReplayMCL, &diffMCL);
+                    NearDifferResult result = InvokeNearDiffer(&nearDiffer, &mc, &crl, &reader);
 
-                    if (hasDiff)
+                    switch (result)
                     {
-                        totalBaseMetrics.Overall.NumContextsWithDiffs++;
-                        totalDiffMetrics.Overall.NumContextsWithDiffs++;
+                        case NearDifferResult::SuccessWithDiff:
+                            totalBaseMetrics.Overall.NumContextsWithDiffs++;
+                            totalDiffMetrics.Overall.NumContextsWithDiffs++;
+
+                            totalBaseMetricsOpts.NumContextsWithDiffs++;
+                            totalDiffMetricsOpts.NumContextsWithDiffs++;
+
+                            // This is a difference in ASM outputs from Jit1 & Jit2 and not a playback failure
+                            // We will add this MC to the diffs info if there is one.
+                            // Otherwise this will end up in failingMCList
+                            if (o.diffsInfo != nullptr)
+                            {
+                                PrintDiffsCsvRow(
+                                    diffCsv,
+                                    reader->GetMethodContextIndex(),
+                                    mcb.size,
+                                    baseMetrics.NumCodeBytes, diffMetrics.NumCodeBytes,
+                                    baseMetrics.NumExecutedInstructions, diffMetrics.NumExecutedInstructions);
+                            }
+                            else if (o.mclFilename != nullptr)
+                            {
+                                failingToReplayMCL.AddMethodToMCL(reader->GetMethodContextIndex());
+                            }
 
-                        totalBaseMetricsOpts.NumContextsWithDiffs++;
-                        totalDiffMetricsOpts.NumContextsWithDiffs++;
+                            break;
+                        case NearDifferResult::SuccessWithoutDiff:
+                            break;
+                        case NearDifferResult::Failure:
+                            if (o.mclFilename != nullptr)
+                                failingToReplayMCL.AddMethodToMCL(reader->GetMethodContextIndex());
+
+                            break;
                     }
 
                     totalBaseMetrics.Overall.NumDiffedCodeBytes += baseMetrics.NumCodeBytes;
@@ -625,7 +671,7 @@ int __cdecl main(int argc, char* argv[])
                 if (o.breakOnError)
                 {
                     if (o.indexCount == -1)
-                        LogInfo("HINT: to repro add '/c %d' to cmdline", reader->GetMethodContextIndex());
+                        LogInfo("HINT: to repro add '-c %d' to cmdline", reader->GetMethodContextIndex());
                     __debugbreak();
                 }
             }
@@ -674,10 +720,6 @@ int __cdecl main(int argc, char* argv[])
     {
         failingToReplayMCL.CloseMCL();
     }
-    if (o.diffMCLFilename != nullptr)
-    {
-        diffMCL.CloseMCL();
-    }
     Logger::Shutdown();
 
     SpmiResult result = SpmiResult::Success;

From 16ac63288265eac51003a031f960717757e96d27 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Wed, 28 Sep 2022 14:20:40 +0200
Subject: [PATCH 06/14] Move after last conflicting arg instead of to the end

Since there can be multiple conflicting pairs this strategy works
better.
---
 src/coreclr/jit/morph.cpp | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index bd563d958f0a07..dd4b4866dc0e6c 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1746,7 +1746,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         }
     }
 
-    // Now move args whose placement will potentially clobber a later use to the end of the list.
+    // Now move args whose placement will potentially clobber a later arg to after that arg.
     unsigned max      = beforeFirstConstArgIndex - firstCallArgIndex + 1;
     unsigned curIndex = firstCallArgIndex;
     for (unsigned count = 0; count < max; count++)
@@ -1764,23 +1764,36 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
             clobbered |= genRegMask(arg->AbiInfo.GetRegNum(j));
         }
 
-        unsigned nextIndex = curIndex + 1;
+        unsigned lastClobbered = UINT_MAX;
+        regMaskTP lastClobberedRegs = RBM_NONE;
 
         for (unsigned i = curIndex + 1; i != beforeFirstConstArgIndex + 1; i++)
         {
             regMaskTP usedRegs = EstimateRegisterUses(comp, sortedArgs[i]->GetNode());
             if ((clobbered & usedRegs) != 0)
             {
-                memmove(&sortedArgs[curIndex], &sortedArgs[curIndex + 1],
-                        (beforeFirstConstArgIndex - curIndex) * sizeof(CallArg*));
-                assert(sortedArgs[beforeFirstConstArgIndex - 1] == sortedArgs[beforeFirstConstArgIndex]);
-                sortedArgs[beforeFirstConstArgIndex] = arg;
-                nextIndex--;
-                break;
+                lastClobbered = i;
+                lastClobberedRegs = clobbered & usedRegs;
             }
         }
 
-        curIndex = nextIndex;
+        if (lastClobbered != UINT_MAX)
+        {
+            JITDUMP("  [%06u] -> #%u (clobbers %s used by [%06u])\n", arg->GetNode()->gtTreeID, lastClobbered, getRegName(genRegNumFromMask(genFindLowestReg(lastClobberedRegs))), sortedArgs[lastClobbered]->GetNode()->gtTreeID);
+            memmove(&sortedArgs[curIndex], &sortedArgs[curIndex + 1],
+                (lastClobbered - curIndex) * sizeof(CallArg*));
+            assert(sortedArgs[lastClobbered - 1] == sortedArgs[lastClobbered]);
+            sortedArgs[lastClobbered] = arg;
+
+            JITDUMP("   ");
+            for (unsigned i = 0; i < argCount; i++)
+                JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
+            JITDUMP("\n");
+        }
+        else
+        {
+            curIndex++;
+        }
     }
 
     // The table should now be completely filled and thus begTab should now be adjacent to endTab

From 3760ba384612fc48d9e781d7192effae0b3907ea Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 29 Sep 2022 15:14:46 +0200
Subject: [PATCH 07/14] Tarjan's algorithm, version F

---
 src/coreclr/jit/gentree.h |   3 +
 src/coreclr/jit/morph.cpp | 716 ++++++++++++++------------------------
 2 files changed, 258 insertions(+), 461 deletions(-)

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 0658db6409138d..5f51656307bfed 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -4603,11 +4603,14 @@ class CallArg
         , m_needPlace(false)
         , m_isTmp(false)
         , m_processed(false)
+        , Tag(0)
     {
     }
 
 public:
     CallArgABIInformation AbiInfo;
+    // Tag that can be used for any purpose during transformations.
+    int Tag;
 
     CallArg(const NewCallArg& arg) : CallArg()
     {
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index dd4b4866dc0e6c..3f38d4eae42e30 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -16,6 +16,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif
 
 #include "allocacheck.h" // for alloca
+#include "algorithm.h"
 
 //-------------------------------------------------------------
 // fgMorphInit: prepare for running the morph phases
@@ -1256,550 +1257,343 @@ void CallArgs::ArgsComplete(Compiler* comp, GenTreeCall* call)
     m_argsComplete = true;
 }
 
-// Estimate registers used by a tree.
-static regMaskTP EstimateRegisterUses(Compiler* comp, GenTree* tree)
+struct ArgInterferenceGraphNode
 {
-    struct RegisterUsesVisitor : GenTreeVisitor<RegisterUsesVisitor>
-    {
-        enum
-        {
-            DoPreOrder    = true,
-            DoLclVarsOnly = true,
-        };
-
-        regMaskTP Registers = RBM_NONE;
-
-        RegisterUsesVisitor(Compiler* comp) : GenTreeVisitor(comp)
-        {
-        }
-
-        fgWalkResult PreOrderVisit(GenTree** use, GenTree* parent)
-        {
-            GenTreeLclVarCommon* node = (*use)->AsLclVarCommon();
-            LclVarDsc*           desc = m_compiler->lvaGetDesc(node);
-            if (!desc->lvDoNotEnregister && desc->lvIsRegArg)
-            {
-                if ((desc->GetArgReg() != REG_NA) && (desc->GetArgReg() != REG_STK))
-                {
-                    Registers |= genRegMask(desc->GetArgReg());
-                }
-#if FEATURE_MULTIREG_ARGS
-                if ((desc->GetOtherArgReg() != REG_NA) && (desc->GetOtherArgReg() != REG_STK))
-                {
-                    Registers |= genRegMask(desc->GetOtherArgReg());
-                }
-#endif
-            }
-
-            return fgWalkResult::WALK_CONTINUE;
-        }
-    };
-
-    RegisterUsesVisitor visitor(comp);
-    visitor.WalkTree(&tree, nullptr);
-    return visitor.Registers;
-}
-
-//------------------------------------------------------------------------
-// SortArgs: Sort arguments into a better passing order.
-//
-// Parameters:
-//   comp       - The compiler object.
-//   call       - The call that contains this CallArgs instance.
-//   sortedArgs - A table of at least `CountArgs()` entries where the sorted
-//                arguments are written into.
-//
-void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
+    CallArg* Arg;
+    // Registers clobbered by placing this argument.
+    regMaskTP Clobbers;
+    // Registers that may be used by the argument (guess).
+    regMaskTP Uses;
+    // Bit mask of edges. There is an edge (i, j) if argument i clobbers a register used by argument j.
+    uint32_t Edges;
+
+    int  Index;
+    int  LowLink;
+    bool OnStack;
+
+    // Index of next node in the SCC.
+    int SccNext;
+};
+
+class ArgInterferenceGraph
 {
-    assert(m_argsComplete);
+    Compiler* m_comp;
+    int       m_index = 0;
 
-    JITDUMP("\nSorting the arguments:\n");
+    ArrayStack<ArgInterferenceGraphNode> m_nodes;
+    ArrayStack<int>                      m_stack;
+    ArrayStack<int>                      m_sccs;
 
-    // Shuffle the arguments around before we build the late args list. The
-    // idea is to move all "simple" arguments like constants and local vars to
-    // the end, and move the complex arguments towards the beginning. This will
-    // help prevent registers from being spilled by allowing us to evaluate the
-    // more complex arguments before the simpler arguments. The ordering ends
-    // up looking like:
-    //     +------------------------------------+  <--- end of sortedArgs
-    //     |          constants                 |
-    //     +------------------------------------+
-    //     |    local var / local field         |
-    //     +------------------------------------+
-    //     | remaining arguments sorted by cost |
-    //     +------------------------------------+
-    //     | temps (CallArg::m_needTmp == true) |
-    //     +------------------------------------+
-    //     |  args with calls (GTF_CALL)        |
-    //     +------------------------------------+  <--- start of sortedArgs
-    //
+public:
+    ArgInterferenceGraph(Compiler* comp, unsigned argCount)
+        : m_comp(comp)
+        , m_nodes(comp->getAllocator(CMK_CallArgs), static_cast<int>(argCount))
+        , m_stack(comp->getAllocator(CMK_CallArgs))
+        , m_sccs(comp->getAllocator(CMK_CallArgs))
 
-    unsigned argCount = 0;
-    for (CallArg& arg : Args())
     {
-        sortedArgs[argCount++] = &arg;
     }
 
-    if (argCount <= 1)
+    int NumSccs()
     {
-        return;
+        return m_sccs.Height();
     }
 
-    // Set the beginning and end for the new argument table
-    unsigned curInx;
-    int      regCount      = 0;
-    unsigned begTab        = 0;
-    unsigned endTab        = argCount - 1;
-    unsigned argsRemaining = argCount;
-
-    // First take care of arguments that are constants.
-    // [We use a backward iterator pattern]
-    //
-    curInx = argCount;
-    do
+    int FirstSccNode(int sccIndex)
     {
-        curInx--;
-
-        CallArg* arg = sortedArgs[curInx];
+        return m_sccs.Bottom(sccIndex);
+    }
+    ArgInterferenceGraphNode& GetNode(int index)
+    {
+        return m_nodes.BottomRef(index);
+    }
 
-        if (arg->AbiInfo.GetRegNum() != REG_STK)
+    void AddNode(CallArg* arg)
+    {
+        regMaskTP clobbers = RBM_NONE;
+        for (unsigned i = 0; i < arg->AbiInfo.NumRegs; i++)
         {
-            regCount++;
+            clobbers |= genRegMask(arg->AbiInfo.GetRegNum(i));
         }
 
-        assert(arg->GetLateNode() == nullptr);
+        arg->Tag = m_nodes.Height();
+        ArgInterferenceGraphNode node;
+        node.Arg      = arg;
+        node.Clobbers = clobbers;
+        node.Uses     = EstimateRegisterUses(arg->GetNode());
+        node.Edges    = 0;
+        node.Index    = -1;
+        node.LowLink  = -1;
+        node.OnStack  = false;
+        node.SccNext  = UINT_MAX;
+        m_nodes.Push(node);
+    }
 
-        // Skip any already processed args
-        //
-        if (!arg->m_processed)
+    void FindSccs()
+    {
+        // Create edges
+        for (int i = 0; i < m_nodes.Height(); i++)
         {
-            GenTree* argx = arg->GetEarlyNode();
-
-            assert(argx != nullptr);
-            // put constants at the end of the table
-            //
-            if (argx->gtOper == GT_CNS_INT)
+            for (int j = 0; j < m_nodes.Height(); j++)
             {
-                noway_assert(curInx <= endTab);
-
-                arg->m_processed = true;
-
-                // place curArgTabEntry at the endTab position by performing a swap
-                //
-                JITDUMP("  [%06u] -> #%d (is constant)\n", argx->gtTreeID, endTab);
-                if (curInx != endTab)
+                if (i == j)
                 {
-                    sortedArgs[curInx] = sortedArgs[endTab];
-                    sortedArgs[endTab] = arg;
+                    continue;
                 }
 
-                endTab--;
-                argsRemaining--;
-                JITDUMP("   ");
-                for (unsigned i = 0; i < argCount; i++)
-                    JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-                JITDUMP("\n");
+                ArgInterferenceGraphNode& ni = m_nodes.BottomRef(i);
+                ArgInterferenceGraphNode& nj = m_nodes.BottomRef(j);
+
+                if ((ni.Clobbers & nj.Uses) != 0)
+                    ni.Edges |= 1 << j;
             }
         }
-    } while (curInx > 0);
 
-    unsigned beforeFirstConstArgIndex = endTab;
-
-    unsigned firstCallArgIndex = begTab;
-
-    if (argsRemaining > 0)
-    {
-        // Next take care of arguments that are calls.
-        // [We use a forward iterator pattern]
-        //
-        for (curInx = begTab; curInx <= endTab; curInx++)
+        for (int i = 0; i < m_nodes.Height(); i++)
         {
-            CallArg* arg = sortedArgs[curInx];
-
-            // Skip any already processed args
-            //
-            if (!arg->m_processed)
+            if (m_nodes.BottomRef(i).Index == -1)
             {
-                GenTree* argx = arg->GetEarlyNode();
-                assert(argx != nullptr);
-
-                // put calls at the beginning of the table
-                //
-                if (argx->gtFlags & GTF_CALL)
-                {
-                    arg->m_processed = true;
-
-                    // place curArgTabEntry at the begTab position by performing a swap
-                    //
-                    JITDUMP("  [%06u] -> #%d (is call)\n", argx->gtTreeID, begTab);
-                    if (curInx != begTab)
-                    {
-                        sortedArgs[curInx] = sortedArgs[begTab];
-                        sortedArgs[begTab] = arg;
-                    }
-
-                    begTab++;
-                    argsRemaining--;
-                    JITDUMP("   ");
-                    for (unsigned i = 0; i < argCount; i++)
-                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-                    JITDUMP("\n");
-                }
+                FindScc(i);
             }
         }
     }
 
-    unsigned afterCallIndex = begTab;
-
-    //// Finally place arguments such that arguments nodes that use parameters
-    //// are placed before arguments that would clobber the registers those
-    //// parameters may be in.
-    // while (argsRemaining > 0)
-    //{
-    //    // Find arg that clobbers least amount of registers used by other args.
-    //    unsigned bestNumClobbered = UINT_MAX;
-    //    unsigned bestIndex = 0;
-    //    for (unsigned i = begTab; i <= endTab; i++)
-    //    {
-    //        assert(!sortedArgs[i]->m_processed);
-    //        regMaskTP clobbered = 0;
-    //        for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
-    //        {
-    //            clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
-    //        }
-
-    //        assert(clobbered != 0);
-    //        unsigned clobberedLaterArgs = 0;
-    //        for (unsigned j = begTab; j <= endTab; j++)
-    //        {
-    //            if (i == j)
-    //                continue;
-
-    //            regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
-    //            if ((uses & clobbered) != 0)
-    //                clobberedLaterArgs++;
-    //        }
-
-    //        if (clobberedLaterArgs < bestNumClobbered)
-    //        {
-    //            bestNumClobbered = clobberedLaterArgs;
-    //            bestIndex = i;
-    //        }
-    //    }
-
-    //    sortedArgs[bestIndex]->m_processed = true;
-    //    std::swap(sortedArgs[begTab], sortedArgs[bestIndex]);
-    //    begTab++;
-    //    argsRemaining--;
-    //}
-
-    if (argsRemaining > 0)
-    {
-        // Next take care arguments that are temps.
-        // These temps come before the arguments that are
-        // ordinary local vars or local fields
-        // since this will give them a better chance to become
-        // enregistered into their actual argument register.
-        // [We use a forward iterator pattern]
-        //
-        for (curInx = begTab; curInx <= endTab; curInx++)
-        {
-            CallArg* arg = sortedArgs[curInx];
+private:
+    // Implementation of Tarjan's algorithm
+    void FindScc(int index)
+    {
+        ArgInterferenceGraphNode& node = m_nodes.BottomRef(index);
+        node.Index                     = m_index;
+        node.LowLink                   = m_index;
+        m_index++;
 
-            // Skip any already processed args
-            //
-            if (!arg->m_processed)
-            {
-                if (arg->m_needTmp)
-                {
-                    arg->m_processed = true;
+        uint32_t nodeStackIndex = m_stack.Height();
+        m_stack.Push(index);
+        node.OnStack = true;
 
-                    // place curArgTabEntry at the begTab position by performing a swap
-                    //
-                    JITDUMP("  [%06u] -> #%d (is temp)\n", arg->GetNode()->gtTreeID, begTab);
-                    if (curInx != begTab)
-                    {
-                        sortedArgs[curInx] = sortedArgs[begTab];
-                        sortedArgs[begTab] = arg;
-                    }
+        uint32_t neighbors = node.Edges;
+        while (neighbors != 0)
+        {
+            uint32_t neighborIndexMask = genFindLowestBit(neighbors);
+            neighbors &= ~neighborIndexMask;
+            int neighborIndex = static_cast<int>(genLog2(neighborIndexMask));
 
-                    begTab++;
-                    argsRemaining--;
-                    JITDUMP("   ");
-                    for (unsigned i = 0; i < argCount; i++)
-                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-                    JITDUMP("\n");
-                }
+            assert(neighborIndex < m_nodes.Height());
+            ArgInterferenceGraphNode& neighbor = m_nodes.BottomRef(neighborIndex);
+
+            if (neighbor.Index == -1)
+            {
+                FindScc(neighborIndex);
+                node.LowLink = min(node.LowLink, neighbor.LowLink);
+            }
+            else if (neighbor.OnStack)
+            {
+                node.LowLink = min(node.LowLink, neighbor.Index);
             }
         }
-    }
 
-    if (argsRemaining > 0)
-    {
-        // Next take care of local var and local field arguments.
-        // These are moved towards the end of the argument evaluation.
-        // [We use a backward iterator pattern]
-        //
-        curInx = endTab + 1;
-        do
+        if (node.LowLink == node.Index)
         {
-            curInx--;
-
-            CallArg* arg = sortedArgs[curInx];
-
-            // Skip any already processed args
-            //
-            if (!arg->m_processed)
+            // Pop and link SCC.
+            for (int j = m_stack.Height() - 1, i = nodeStackIndex; i < m_stack.Height(); j = i++)
             {
-                GenTree* argx = arg->GetEarlyNode();
-                assert(argx != nullptr);
-
-                // As a CQ heuristic, sort TYP_STRUCT args using the cost estimation below.
-                if (!argx->TypeIs(TYP_STRUCT) && argx->OperIs(GT_LCL_VAR, GT_LCL_FLD))
-                {
-                    noway_assert(curInx <= endTab);
-
-                    arg->m_processed = true;
-
-                    // place curArgTabEntry at the endTab position by performing a swap
-                    //
-                    JITDUMP("  [%06u] -> #%d (non-struct local)\n", arg->GetNode()->gtTreeID, endTab);
-                    if (curInx != endTab)
-                    {
-                        sortedArgs[curInx] = sortedArgs[endTab];
-                        sortedArgs[endTab] = arg;
-                    }
-
-                    endTab--;
-                    argsRemaining--;
-                    JITDUMP("   ");
-                    for (unsigned i = 0; i < argCount; i++)
-                        JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-                    JITDUMP("\n");
-                }
+                int                       ni   = m_stack.Bottom(i);
+                int                       nj   = m_stack.Bottom(j);
+                ArgInterferenceGraphNode& node = m_nodes.BottomRef(ni);
+                node.SccNext                   = nj;
+                node.OnStack                   = false;
             }
-        } while (curInx > begTab);
+
+            m_stack.Pop(m_stack.Height() - nodeStackIndex);
+            m_sccs.Push(index);
+        }
     }
 
-    // Finally, take care of all the remaining arguments.
-    // Note that we fill in one arg at a time using a while loop.
-    bool costsPrepared = false; // Only prepare tree costs once, the first time through this loop
-    while (argsRemaining > 0)
+    // Estimate registers used by a tree.
+    regMaskTP EstimateRegisterUses(GenTree* tree)
     {
-        /* Find the most expensive arg remaining and evaluate it next */
-
-        CallArg* expensiveArg      = nullptr;
-        unsigned expensiveArgIndex = UINT_MAX;
-        unsigned expensiveArgCost  = 0;
-        unsigned expensiveArgNum   = 0;
-
-        // [We use a forward iterator pattern]
-        //
-        for (curInx = begTab; curInx <= endTab; curInx++)
+        struct RegisterUsesVisitor : GenTreeVisitor<RegisterUsesVisitor>
         {
-            CallArg* arg = sortedArgs[curInx];
-
-            // Skip any already processed args
-            //
-            if (!arg->m_processed)
+            enum
             {
-                GenTree* argx = arg->GetEarlyNode();
-                assert(argx != nullptr);
-
-                // We should have already handled these kinds of args
-                assert((!argx->OperIs(GT_LCL_VAR, GT_LCL_FLD) || argx->TypeIs(TYP_STRUCT)) &&
-                       !argx->OperIs(GT_CNS_INT));
+                DoPreOrder    = true,
+                DoLclVarsOnly = true,
+            };
 
-                // This arg should either have no persistent side effects or be the last one in our table
+            regMaskTP Registers = RBM_NONE;
 
-                // assert(((argx->gtFlags & GTF_PERSISTENT_SIDE_EFFECTS) == 0) || (curInx == (argCount-1)));
+            RegisterUsesVisitor(Compiler* comp) : GenTreeVisitor(comp)
+            {
+            }
 
-                if (argsRemaining == 1)
-                {
-                    // This is the last arg to place
-                    expensiveArgIndex = curInx;
-                    expensiveArg      = arg;
-                    expensiveArgNum   = 1;
-                    assert(begTab == endTab);
-                    break;
-                }
-                else
+            fgWalkResult PreOrderVisit(GenTree** use, GenTree* parent)
+            {
+                GenTreeLclVarCommon* node = (*use)->AsLclVarCommon();
+                LclVarDsc*           desc = m_compiler->lvaGetDesc(node);
+                if (!desc->lvDoNotEnregister && desc->lvIsRegArg)
                 {
-                    if (!costsPrepared)
-                    {
-                        /* We call gtPrepareCost to measure the cost of evaluating this tree */
-                        comp->gtPrepareCost(argx);
-                    }
-
-                    if (argx->GetCostEx() > expensiveArgCost)
+                    if ((desc->GetArgReg() != REG_NA) && (desc->GetArgReg() != REG_STK))
                     {
-                        // Remember this arg as the most expensive one that we have yet seen
-                        expensiveArgCost  = argx->GetCostEx();
-                        expensiveArgIndex = curInx;
-                        expensiveArg      = arg;
-                        expensiveArgNum   = 1;
+                        Registers |= genRegMask(desc->GetArgReg());
                     }
-                    else if (argx->GetCostEx() == expensiveArgCost)
+#if FEATURE_MULTIREG_ARGS
+                    if ((desc->GetOtherArgReg() != REG_NA) && (desc->GetOtherArgReg() != REG_STK))
                     {
-                        expensiveArgNum++;
+                        Registers |= genRegMask(desc->GetOtherArgReg());
                     }
+#endif
                 }
-            }
-        }
-
-        noway_assert(expensiveArgIndex != UINT_MAX);
-
-        // if (expensiveArgNum > 1)
-        //{
-        //    // If there are multiple args with the same cost then place arg
-        //    // that clobbers least amount of registers used by other args.
-        //    // Find arg that clobbers least amount of registers used by other args.
-        //    unsigned bestNumClobbered = UINT_MAX;
-        //    unsigned bestIndex        = 0;
-        //    for (unsigned i = begTab; i <= endTab; i++)
-        //    {
-        //        if (sortedArgs[i]->m_processed)
-        //            continue;
 
-        //        if (sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
-        //            continue;
-
-        //        regMaskTP clobbered = 0;
-        //        for (unsigned j = 0; j < sortedArgs[i]->AbiInfo.NumRegs; j++)
-        //        {
-        //            clobbered |= genRegMask(sortedArgs[i]->AbiInfo.GetRegNum(j));
-        //        }
-
-        //        assert(clobbered != 0);
-        //        unsigned clobberedLaterArgs = 0;
-        //        for (unsigned j = begTab; j <= endTab; j++)
-        //        {
-        //            if (i == j || sortedArgs[i]->m_processed ||
-        //                sortedArgs[i]->GetNode()->GetCostEx() != expensiveArgCost)
-        //                continue;
+                return fgWalkResult::WALK_CONTINUE;
+            }
+        };
 
-        //            regMaskTP uses = EstimateRegisterUses(comp, sortedArgs[j]->GetNode());
-        //            if ((uses & clobbered) != 0)
-        //                clobberedLaterArgs++;
-        //        }
+        RegisterUsesVisitor visitor(m_comp);
+        visitor.WalkTree(&tree, nullptr);
+        return visitor.Registers;
+    }
+};
 
-        //        if (clobberedLaterArgs < bestNumClobbered)
-        //        {
-        //            bestNumClobbered = clobberedLaterArgs;
-        //            bestIndex        = i;
-        //        }
-        //    }
+//------------------------------------------------------------------------
+// SortArgs: Sort arguments into a better passing order.
+//
+// Parameters:
+//   comp       - The compiler object.
+//   call       - The call that contains this CallArgs instance.
+//   sortedArgs - A table of at least `CountArgs()` entries where the sorted
+//                arguments are written into.
+//
+// Remarks:
+//   It is expected that arguments that interfere (in terms of side effects)
+//   have been marked as being evaluated into temps and that this function is
+//   thus free to reorder arguments freely. For arguments evaluated into temp,
+//   the result affects when the temp is placed.
+//
+void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
+{
+    assert(m_argsComplete);
 
-        //    expensiveArg      = sortedArgs[bestIndex];
-        //    expensiveArgIndex = bestIndex;
-        //}
+    JITDUMP("\nSorting the arguments:\n");
 
-        // put the most expensive arg towards the beginning of the table
+    unsigned argCount = 0;
+    for (CallArg& arg : Args())
+    {
+        sortedArgs[argCount] = &arg;
+        arg.Tag              = static_cast<int>(argCount);
+        argCount++;
+    }
 
-        expensiveArg->m_processed = true;
+    if ((argCount <= 1) || (argCount >= 64))
+    {
+        return;
+    }
 
-        // place expensiveArgTabEntry at the begTab position by performing a swap
-        //
-        JITDUMP("  [%06u] -> #%d", expensiveArg->GetNode()->gtTreeID, begTab);
-        if (argsRemaining == 1)
+    // First sort the arguments according to two heuristics:
+    // 1. Put constants at the end of the table. They cannot conflict with other arguments, so placing them last is
+    // always beneficial.
+    // 2. Put calls at the beginning of the table.
+    jitstd::sort(sortedArgs, sortedArgs + argCount, [](CallArg* l, CallArg* r) {
+        GenTree* lNode = l->GetNode();
+        GenTree* rNode = r->GetNode();
+        // Put constants at the end, they do not conflict with anything.
+        if (lNode->OperIsConst() != rNode->OperIsConst())
         {
-            JITDUMP(" (last arg)\n");
+            return rNode->OperIsConst();
         }
-        else
+
+        // Put calls at the beginning.
+        if (lNode->IsCall() != rNode->IsCall())
         {
-            JITDUMP(" (cost %u)\n", expensiveArgCost);
+            return lNode->IsCall();
         }
 
-        if (expensiveArgIndex != begTab)
+        return l->Tag < r->Tag;
+    });
+
+    if (comp->opts.OptimizationEnabled())
+    {
+        // When optimizing, also resolve conflicts due to arguments using
+        // parameters that may be enregistered. For example: if an argument
+        // uses a register 'rcx', try to ensure that 'rcx' is placed _after_
+        // that argument. We create an interference graph and use Tarjan's
+        // algorithm, which will both find the SCCs (cycles) and reverse
+        // topologically sort the arguments, ensuring the above. Tarjan's
+        // algorithm here is also implemented in a stable manner such that
+        // arguments are not needlessly reordered when there are no conflicts.
+        ArgInterferenceGraph graph(comp, argCount);
+
+        for (unsigned i = 0; i < argCount; i++)
         {
-            sortedArgs[expensiveArgIndex] = sortedArgs[begTab];
-            sortedArgs[begTab]            = expensiveArg;
+            graph.AddNode(sortedArgs[i]);
         }
 
-        begTab++;
-        argsRemaining--;
+        graph.FindSccs();
 
-        costsPrepared = true; // If we have more expensive arguments, don't re-evaluate the tree cost on the next loop
-        JITDUMP("   ");
-        for (unsigned i = 0; i < argCount; i++)
-            JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-        JITDUMP("\n");
-    }
+        JITDUMP("Argument SCCs and order:\n");
 
-    unsigned insertionIndex = afterCallIndex;
-    // Move all arguments that do not clobber registers back to happen right after GTF_CALL args.
-    for (unsigned i = afterCallIndex; i != endTab + 1; i++)
-    {
-        CallArg* arg = sortedArgs[i];
-        if (arg->AbiInfo.NumRegs <= 0)
+        unsigned curIndex = 0;
+        for (int scc = 0; scc < graph.NumSccs(); scc++)
         {
-            memmove(&sortedArgs[insertionIndex + 1], &sortedArgs[insertionIndex],
-                    (i - insertionIndex) * sizeof(CallArg*));
-            JITDUMP("  [%06u] -> #%d (stack arg)\n", arg->GetNode()->gtTreeID, insertionIndex);
-            sortedArgs[insertionIndex] = arg;
-            JITDUMP("   ");
-            for (unsigned i = 0; i < argCount; i++)
-                JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-            JITDUMP("\n");
+            int firstNodeIndex = graph.FirstSccNode(scc);
 
-            insertionIndex++;
-        }
-    }
+            unsigned sccSize   = 0;
+            int      nodeIndex = firstNodeIndex;
 
-    // Now move args whose placement will potentially clobber a later arg to after that arg.
-    unsigned max      = beforeFirstConstArgIndex - firstCallArgIndex + 1;
-    unsigned curIndex = firstCallArgIndex;
-    for (unsigned count = 0; count < max; count++)
-    {
-        CallArg* arg = sortedArgs[curIndex];
-        if (arg->AbiInfo.NumRegs <= 0)
-        {
-            curIndex++;
-            continue;
-        }
+            int lclVarIndex = -1;
+            do
+            {
+                ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
+                if (node.Arg->GetNode()->OperIs(GT_LCL_VAR))
+                {
+                    lclVarIndex = nodeIndex;
+                }
 
-        regMaskTP clobbered = 0;
-        for (unsigned j = 0; j < arg->AbiInfo.NumRegs; j++)
-        {
-            clobbered |= genRegMask(arg->AbiInfo.GetRegNum(j));
-        }
+                assert(curIndex < argCount);
+                sortedArgs[curIndex + sccSize] = node.Arg;
 
-        unsigned lastClobbered = UINT_MAX;
-        regMaskTP lastClobberedRegs = RBM_NONE;
+                nodeIndex = node.SccNext;
 
-        for (unsigned i = curIndex + 1; i != beforeFirstConstArgIndex + 1; i++)
-        {
-            regMaskTP usedRegs = EstimateRegisterUses(comp, sortedArgs[i]->GetNode());
-            if ((clobbered & usedRegs) != 0)
+                sccSize++;
+
+            } while (nodeIndex != firstNodeIndex);
+
+            // LSRA is able to break cycles by rehoming LCL_VAR parameters into
+            // another register, if it sees the LCL_VAR last. If we notice that
+            // the last arg we placed is not a LCL_VAR, and we had a LCL_VAR,
+            // then repeat the loop but ensure that we place a LCL_VAR node
+            // last.
+            if ((lclVarIndex != -1) && !sortedArgs[curIndex + sccSize - 1]->GetNode()->OperIs(GT_LCL_VAR))
             {
-                lastClobbered = i;
-                lastClobberedRegs = clobbered & usedRegs;
+                // Refill the sorted args, this time placing a LCL_VAR last.
+                firstNodeIndex = graph.GetNode(lclVarIndex).SccNext;
+
+                sccSize   = 0;
+                nodeIndex = firstNodeIndex;
+
+                do
+                {
+                    ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
+                    sortedArgs[curIndex + sccSize] = node.Arg;
+                    nodeIndex                      = node.SccNext;
+                    sccSize++;
+                } while (nodeIndex != firstNodeIndex);
             }
-        }
 
-        if (lastClobbered != UINT_MAX)
-        {
-            JITDUMP("  [%06u] -> #%u (clobbers %s used by [%06u])\n", arg->GetNode()->gtTreeID, lastClobbered, getRegName(genRegNumFromMask(genFindLowestReg(lastClobberedRegs))), sortedArgs[lastClobbered]->GetNode()->gtTreeID);
-            memmove(&sortedArgs[curIndex], &sortedArgs[curIndex + 1],
-                (lastClobbered - curIndex) * sizeof(CallArg*));
-            assert(sortedArgs[lastClobbered - 1] == sortedArgs[lastClobbered]);
-            sortedArgs[lastClobbered] = arg;
+#ifdef DEBUG
+            if (comp->verbose)
+            {
+                printf("  [");
+                for (unsigned j = 0; j < sccSize; j++)
+                    printf(" [%06u]", sortedArgs[curIndex + j]->GetNode()->gtTreeID);
+                printf(" ]\n");
+            }
+#endif
 
-            JITDUMP("   ");
-            for (unsigned i = 0; i < argCount; i++)
-                JITDUMP(" [%06u]", sortedArgs[i]->GetNode()->gtTreeID);
-            JITDUMP("\n");
-        }
-        else
-        {
-            curIndex++;
+            curIndex += sccSize;
         }
-    }
 
-    // The table should now be completely filled and thus begTab should now be adjacent to endTab
-    // and regArgsRemaining should be zero
-    assert(begTab == (endTab + 1));
-    assert(argsRemaining == 0);
+        assert(curIndex == argCount);
+    }
 }
 
 //------------------------------------------------------------------------------

From 27ee382ab653fcb6416ca0ba2a8edf89314abde1 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 29 Sep 2022 15:26:23 +0200
Subject: [PATCH 08/14] Clarify comment

---
 src/coreclr/jit/morph.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 3f38d4eae42e30..32229cd29c81ab 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1558,10 +1558,10 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
             } while (nodeIndex != firstNodeIndex);
 
             // LSRA is able to break cycles by rehoming LCL_VAR parameters into
-            // another register, if it sees the LCL_VAR last. If we notice that
-            // the last arg we placed is not a LCL_VAR, and we had a LCL_VAR,
-            // then repeat the loop but ensure that we place a LCL_VAR node
-            // last.
+            // another register. It does this better when it sees the LCL_VAR
+            // last. If we notice that the last arg we placed is not a LCL_VAR,
+            // and we had a LCL_VAR, then repeat the loop but ensure that we
+            // place a LCL_VAR node last.
             if ((lclVarIndex != -1) && !sortedArgs[curIndex + sccSize - 1]->GetNode()->OperIs(GT_LCL_VAR))
             {
                 // Refill the sorted args, this time placing a LCL_VAR last.

From d316a05c4177248fca4534677f0030b8b635a0ec Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 29 Sep 2022 17:16:30 +0200
Subject: [PATCH 09/14] Switch edges to list, optimize TP by a lot

---
 src/coreclr/jit/morph.cpp | 201 ++++++++++++++++++++------------------
 1 file changed, 105 insertions(+), 96 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 32229cd29c81ab..9f5fd20d2efb9a 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1264,8 +1264,6 @@ struct ArgInterferenceGraphNode
     regMaskTP Clobbers;
     // Registers that may be used by the argument (guess).
     regMaskTP Uses;
-    // Bit mask of edges. There is an edge (i, j) if argument i clobbers a register used by argument j.
-    uint32_t Edges;
 
     int  Index;
     int  LowLink;
@@ -1281,16 +1279,22 @@ class ArgInterferenceGraph
     int       m_index = 0;
 
     ArrayStack<ArgInterferenceGraphNode> m_nodes;
+    ArrayStack<int>                      m_edges;
     ArrayStack<int>                      m_stack;
     ArrayStack<int>                      m_sccs;
+    // Registers that are used by an argument that does not also clobber that register.
+    regMaskTP                            m_regDependencies;
+    regMaskTP                            m_allClobbers;
 
 public:
     ArgInterferenceGraph(Compiler* comp, unsigned argCount)
         : m_comp(comp)
         , m_nodes(comp->getAllocator(CMK_CallArgs), static_cast<int>(argCount))
+        , m_edges(comp->getAllocator(CMK_CallArgs))
         , m_stack(comp->getAllocator(CMK_CallArgs))
-        , m_sccs(comp->getAllocator(CMK_CallArgs))
-
+        , m_sccs(comp->getAllocator(CMK_CallArgs), static_cast<int>(argCount))
+        , m_regDependencies(RBM_NONE)
+        , m_allClobbers(RBM_NONE)
     {
     }
 
@@ -1316,39 +1320,22 @@ class ArgInterferenceGraph
             clobbers |= genRegMask(arg->AbiInfo.GetRegNum(i));
         }
 
-        arg->Tag = m_nodes.Height();
         ArgInterferenceGraphNode node;
         node.Arg      = arg;
         node.Clobbers = clobbers;
         node.Uses     = EstimateRegisterUses(arg->GetNode());
-        node.Edges    = 0;
         node.Index    = -1;
         node.LowLink  = -1;
         node.OnStack  = false;
         node.SccNext  = UINT_MAX;
         m_nodes.Push(node);
+
+        m_regDependencies |= node.Uses & ~clobbers;
+        m_allClobbers |= clobbers;
     }
 
     void FindSccs()
     {
-        // Create edges
-        for (int i = 0; i < m_nodes.Height(); i++)
-        {
-            for (int j = 0; j < m_nodes.Height(); j++)
-            {
-                if (i == j)
-                {
-                    continue;
-                }
-
-                ArgInterferenceGraphNode& ni = m_nodes.BottomRef(i);
-                ArgInterferenceGraphNode& nj = m_nodes.BottomRef(j);
-
-                if ((ni.Clobbers & nj.Uses) != 0)
-                    ni.Edges |= 1 << j;
-            }
-        }
-
         for (int i = 0; i < m_nodes.Height(); i++)
         {
             if (m_nodes.BottomRef(i).Index == -1)
@@ -1358,6 +1345,8 @@ class ArgInterferenceGraph
         }
     }
 
+    bool HasInterference() { return (m_allClobbers & m_regDependencies) != RBM_NONE; }
+
 private:
     // Implementation of Tarjan's algorithm
     void FindScc(int index)
@@ -1367,23 +1356,34 @@ class ArgInterferenceGraph
         node.LowLink                   = m_index;
         m_index++;
 
+        // Early exit for args that do not clobber any other argument.
+        if ((m_regDependencies & node.Clobbers) == RBM_NONE)
+        {
+            node.SccNext = index;
+            m_sccs.Push(index);
+            return;
+        }
+
         uint32_t nodeStackIndex = m_stack.Height();
         m_stack.Push(index);
         node.OnStack = true;
 
-        uint32_t neighbors = node.Edges;
-        while (neighbors != 0)
+        for (int i = 0; i < m_nodes.Height(); i++)
         {
-            uint32_t neighborIndexMask = genFindLowestBit(neighbors);
-            neighbors &= ~neighborIndexMask;
-            int neighborIndex = static_cast<int>(genLog2(neighborIndexMask));
+            if (i == index)
+            {
+                continue;
+            }
 
-            assert(neighborIndex < m_nodes.Height());
-            ArgInterferenceGraphNode& neighbor = m_nodes.BottomRef(neighborIndex);
+            ArgInterferenceGraphNode& neighbor = m_nodes.BottomRef(i);
+            if ((neighbor.Uses & node.Clobbers) == 0)
+            {
+                continue;
+            }
 
             if (neighbor.Index == -1)
             {
-                FindScc(neighborIndex);
+                FindScc(i);
                 node.LowLink = min(node.LowLink, neighbor.LowLink);
             }
             else if (neighbor.OnStack)
@@ -1485,6 +1485,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
 
     if ((argCount <= 1) || (argCount >= 64))
     {
+        JITDUMP("  Placed arguments in order (%u arguments).\n", argCount);
         return;
     }
 
@@ -1510,90 +1511,98 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
         return l->Tag < r->Tag;
     });
 
-    if (comp->opts.OptimizationEnabled())
+    if (comp->opts.OptimizationDisabled())
     {
-        // When optimizing, also resolve conflicts due to arguments using
-        // parameters that may be enregistered. For example: if an argument
-        // uses a register 'rcx', try to ensure that 'rcx' is placed _after_
-        // that argument. We create an interference graph and use Tarjan's
-        // algorithm, which will both find the SCCs (cycles) and reverse
-        // topologically sort the arguments, ensuring the above. Tarjan's
-        // algorithm here is also implemented in a stable manner such that
-        // arguments are not needlessly reordered when there are no conflicts.
-        ArgInterferenceGraph graph(comp, argCount);
+        return;
+    }
 
-        for (unsigned i = 0; i < argCount; i++)
-        {
-            graph.AddNode(sortedArgs[i]);
-        }
+    // When optimizing also resolve conflicts due to arguments using parameters
+    // that may be enregistered. For example: if an argument uses a register
+    // 'rcx', try to ensure that 'rcx' is placed _after_ that argument. We
+    // create an interference graph and use Tarjan's algorithm, which will both
+    // find the SCCs (cycles) and reverse topologically sort the arguments,
+    // ensuring the above. The implementation of Tarjan's here also iterates
+    // neighbors in order such that arguments are not needlessly reordered when
+    // there are no conflicts.
+    ArgInterferenceGraph graph(comp, argCount);
+
+    for (unsigned i = 0; i < argCount; i++)
+    {
+        graph.AddNode(sortedArgs[i]);
+    }
 
-        graph.FindSccs();
+    if (!graph.HasInterference())
+    {
+        JITDUMP("  No interference found between arguments.\n");
+        return;
+    }
 
-        JITDUMP("Argument SCCs and order:\n");
+    graph.FindSccs();
 
-        unsigned curIndex = 0;
-        for (int scc = 0; scc < graph.NumSccs(); scc++)
-        {
-            int firstNodeIndex = graph.FirstSccNode(scc);
+    JITDUMP("  Arguments have register interference. Argument order and SCCs:\n");
 
-            unsigned sccSize   = 0;
-            int      nodeIndex = firstNodeIndex;
+    unsigned curIndex = 0;
+    for (int scc = 0; scc < graph.NumSccs(); scc++)
+    {
+        int firstNodeIndex = graph.FirstSccNode(scc);
 
-            int lclVarIndex = -1;
-            do
-            {
-                ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
-                if (node.Arg->GetNode()->OperIs(GT_LCL_VAR))
-                {
-                    lclVarIndex = nodeIndex;
-                }
+        unsigned sccSize   = 0;
+        int      nodeIndex = firstNodeIndex;
 
-                assert(curIndex < argCount);
-                sortedArgs[curIndex + sccSize] = node.Arg;
+        int lclVarIndex = -1;
+        do
+        {
+            ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
+            if (node.Arg->GetNode()->OperIs(GT_LCL_VAR))
+            {
+                lclVarIndex = nodeIndex;
+            }
 
-                nodeIndex = node.SccNext;
+            assert(curIndex < argCount);
+            sortedArgs[curIndex + sccSize] = node.Arg;
 
-                sccSize++;
+            nodeIndex = node.SccNext;
 
-            } while (nodeIndex != firstNodeIndex);
+            sccSize++;
 
-            // LSRA is able to break cycles by rehoming LCL_VAR parameters into
-            // another register. It does this better when it sees the LCL_VAR
-            // last. If we notice that the last arg we placed is not a LCL_VAR,
-            // and we had a LCL_VAR, then repeat the loop but ensure that we
-            // place a LCL_VAR node last.
-            if ((lclVarIndex != -1) && !sortedArgs[curIndex + sccSize - 1]->GetNode()->OperIs(GT_LCL_VAR))
-            {
-                // Refill the sorted args, this time placing a LCL_VAR last.
-                firstNodeIndex = graph.GetNode(lclVarIndex).SccNext;
+        } while (nodeIndex != firstNodeIndex);
 
-                sccSize   = 0;
-                nodeIndex = firstNodeIndex;
+        // LSRA is able to break cycles by rehoming LCL_VAR parameters into
+        // another register. It does this better when it sees the LCL_VAR
+        // last. If we notice that the last arg we placed is not a LCL_VAR,
+        // and we had a LCL_VAR, then repeat the loop but ensure that we
+        // place a LCL_VAR node last.
+        if ((lclVarIndex != -1) && !sortedArgs[curIndex + sccSize - 1]->GetNode()->OperIs(GT_LCL_VAR))
+        {
+            // Refill the sorted args, this time placing a LCL_VAR last.
+            firstNodeIndex = graph.GetNode(lclVarIndex).SccNext;
 
-                do
-                {
-                    ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
-                    sortedArgs[curIndex + sccSize] = node.Arg;
-                    nodeIndex                      = node.SccNext;
-                    sccSize++;
-                } while (nodeIndex != firstNodeIndex);
-            }
+            sccSize   = 0;
+            nodeIndex = firstNodeIndex;
 
-#ifdef DEBUG
-            if (comp->verbose)
+            do
             {
-                printf("  [");
-                for (unsigned j = 0; j < sccSize; j++)
-                    printf(" [%06u]", sortedArgs[curIndex + j]->GetNode()->gtTreeID);
-                printf(" ]\n");
-            }
-#endif
+                ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex);
+                sortedArgs[curIndex + sccSize] = node.Arg;
+                nodeIndex                      = node.SccNext;
+                sccSize++;
+            } while (nodeIndex != firstNodeIndex);
+        }
 
-            curIndex += sccSize;
+#ifdef DEBUG
+        if (comp->verbose)
+        {
+            printf("  [");
+            for (unsigned j = 0; j < sccSize; j++)
+                printf(" [%06u]", sortedArgs[curIndex + j]->GetNode()->gtTreeID);
+            printf(" ]\n");
         }
+#endif
 
-        assert(curIndex == argCount);
+        curIndex += sccSize;
     }
+
+    assert(curIndex == argCount);
 }
 
 //------------------------------------------------------------------------------

From 1d5e10e6ac9d4d400c326f842cb9893c967f8fdb Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 29 Sep 2022 17:17:04 +0200
Subject: [PATCH 10/14] Run jit-format

---
 src/coreclr/jit/morph.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 9f5fd20d2efb9a..ae6b5d5a513c0e 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1283,8 +1283,8 @@ class ArgInterferenceGraph
     ArrayStack<int>                      m_stack;
     ArrayStack<int>                      m_sccs;
     // Registers that are used by an argument that does not also clobber that register.
-    regMaskTP                            m_regDependencies;
-    regMaskTP                            m_allClobbers;
+    regMaskTP m_regDependencies;
+    regMaskTP m_allClobbers;
 
 public:
     ArgInterferenceGraph(Compiler* comp, unsigned argCount)
@@ -1345,7 +1345,10 @@ class ArgInterferenceGraph
         }
     }
 
-    bool HasInterference() { return (m_allClobbers & m_regDependencies) != RBM_NONE; }
+    bool HasInterference()
+    {
+        return (m_allClobbers & m_regDependencies) != RBM_NONE;
+    }
 
 private:
     // Implementation of Tarjan's algorithm

From b324b4b67c76d797cdf12b75f3d44f59e8d8c7a9 Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Thu, 29 Sep 2022 19:32:25 +0200
Subject: [PATCH 11/14] Revert "SPMI: Improve speed significantly for large
 diffs"

This reverts commit 9c4df8454434793965e1939ee80ac3b2aa007d10.
---
 src/coreclr/inc/clr_std/vector                |  27 ----
 src/coreclr/scripts/superpmi.py               | 114 ++++-----------
 src/coreclr/scripts/superpmi_diffs.py         |   3 +-
 .../superpmi/superpmi-shared/standardpch.h    |   2 -
 .../tools/superpmi/superpmi/CMakeLists.txt    |   1 -
 .../tools/superpmi/superpmi/commandline.cpp   |   8 +-
 .../tools/superpmi/superpmi/commandline.h     |   4 +-
 .../tools/superpmi/superpmi/fileio.cpp        | 115 ---------------
 src/coreclr/tools/superpmi/superpmi/fileio.h  | 133 ------------------
 .../superpmi/superpmi/metricssummary.cpp      |  95 +++++++++++--
 .../tools/superpmi/superpmi/metricssummary.h  |   2 +-
 .../superpmi/superpmi/parallelsuperpmi.cpp    |  64 ++-------
 .../tools/superpmi/superpmi/superpmi.cpp      | 128 ++++++-----------
 13 files changed, 178 insertions(+), 518 deletions(-)
 delete mode 100644 src/coreclr/tools/superpmi/superpmi/fileio.cpp
 delete mode 100644 src/coreclr/tools/superpmi/superpmi/fileio.h

diff --git a/src/coreclr/inc/clr_std/vector b/src/coreclr/inc/clr_std/vector
index c2d1caba890aaf..c10ecf6ba5a92c 100644
--- a/src/coreclr/inc/clr_std/vector
+++ b/src/coreclr/inc/clr_std/vector
@@ -215,33 +215,6 @@ namespace std
                 }
             }
 
-            vector(const vector<T>&) = delete;
-            vector<T>& operator=(const vector<T>&) = delete;
-
-            vector(vector<T>&& v) noexcept
-                : m_size(v.m_size)
-                , m_capacity(v.m_capacity)
-                , m_pelements(v.m_pelements)
-                , m_isBufferOwner(v.m_isBufferOwner)
-            {
-                v.m_isBufferOwner = false;
-            }
-
-            vector<T>& operator=(vector<T>&& v) noexcept
-            {
-                if (m_isBufferOwner)
-                {
-                    erase(m_pelements, 0, m_size);
-                    delete [] (BYTE*)m_pelements;
-                }
-
-                m_size = v.m_size;
-                m_capacity = v.m_capacity;
-                m_pelements = v.m_pelements;
-                m_isBufferOwner = v.m_isBufferOwner;
-                v.m_isBufferOwner = false;
-                return *this;
-            }
 
             size_t size() const
             {
diff --git a/src/coreclr/scripts/superpmi.py b/src/coreclr/scripts/superpmi.py
index ae45443620cd6c..45dfcfe8746e51 100644
--- a/src/coreclr/scripts/superpmi.py
+++ b/src/coreclr/scripts/superpmi.py
@@ -342,6 +342,7 @@ def add_core_root_arguments(parser, build_type_default, build_type_help):
 asm_diff_parser.add_argument("--debuginfo", action="store_true", help="Include debug info after disassembly (sets COMPlus_JitDebugDump).")
 asm_diff_parser.add_argument("-tag", help="Specify a word to add to the directory name where the asm diffs will be placed")
 asm_diff_parser.add_argument("-metrics", action="append", help="Metrics option to pass to jit-analyze. Can be specified multiple times, or pass comma-separated values.")
+asm_diff_parser.add_argument("-retainOnlyTopFiles", action="store_true", help="Retain only top .dasm files with largest improvements or regressions and delete remaining files.")
 asm_diff_parser.add_argument("--diff_with_release", action="store_true", help="Specify if this is asmdiff using release binaries.")
 asm_diff_parser.add_argument("--git_diff", action="store_true", help="Produce a '.diff' file from 'base' and 'diff' folders if there were any differences.")
 
@@ -500,11 +501,6 @@ def read_csv_metrics(path):
 
     return dict
 
-def read_csv_diffs(path):
-    with open(path) as csv_file:
-        reader = csv.DictReader(csv_file)
-        return list(reader)
-
 def determine_clrjit_compiler_version(clrjit_path):
     """ Obtain the version of the compiler that was used to compile the clrjit at the specified path.
 
@@ -1572,7 +1568,7 @@ def replay_with_asm_diffs(self):
                 logging.info("Running asm diffs of %s", mch_file)
 
                 fail_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_fail.mcl")
-                diffs_info = os.path.join(temp_location, os.path.basename(mch_file) + "_diffs.csv")
+                diff_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff.mcl")
 
                 base_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_base_metrics.csv")
                 diff_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff_metrics.csv")
@@ -1581,7 +1577,7 @@ def replay_with_asm_diffs(self):
                     "-a",  # Asm diffs
                     "-v", "ewmi",  # display errors, warnings, missing, jit info
                     "-f", fail_mcl_file,  # Failing mc List
-                    "-diffsInfo", diffs_info,  # Information about diffs
+                    "-diffMCList", diff_mcl_file,  # Create all of the diffs in an mcl file
                     "-r", os.path.join(temp_location, "repro"),  # Repro name, create .mc repro files
                     "-baseMetricsSummary", base_metrics_summary_file, # Create summary of metrics we can use to get total code size impact
                     "-diffMetricsSummary", diff_metrics_summary_file,
@@ -1637,10 +1633,8 @@ def replay_with_asm_diffs(self):
                             repro_base_command_line = "{} {} {}".format(self.superpmi_path, " ".join(altjit_asm_diffs_flags), self.diff_jit_path)
                             save_repro_mc_files(temp_location, self.coreclr_args, artifacts_base_name, repro_base_command_line)
 
-                diffs = read_csv_diffs(diffs_info)
-
                 # This file had asm diffs; keep track of that.
-                has_diffs = len(diffs) > 0
+                has_diffs = is_nonzero_length_file(diff_mcl_file)
                 if has_diffs:
                     files_with_asm_diffs.append(mch_file)
 
@@ -1655,6 +1649,12 @@ def replay_with_asm_diffs(self):
                     if return_code == 0:
                         logging.warning("Warning: SuperPMI returned a zero exit code, but generated a non-zero-sized mcl file")
 
+                    self.diff_mcl_contents = None
+                    with open(diff_mcl_file) as file_handle:
+                        mcl_lines = file_handle.readlines()
+                        mcl_lines = [item.strip() for item in mcl_lines]
+                        self.diff_mcl_contents = mcl_lines
+
                     asm_root_dir = create_unique_directory_name(self.coreclr_args.spmi_location, "asm.{}".format(artifacts_base_name))
                     base_asm_location = os.path.join(asm_root_dir, "base")
                     diff_asm_location = os.path.join(asm_root_dir, "diff")
@@ -1672,13 +1672,13 @@ def replay_with_asm_diffs(self):
                     text_differences = queue.Queue()
                     jit_dump_differences = queue.Queue()
 
-                    async def create_replay_artifacts(print_prefix, context_index, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
+                    async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
                         """ Run superpmi over an MC to create JIT asm or JIT dumps for the method.
                         """
                         # Setup flags to call SuperPMI for both the diff jit and the base jit
 
                         flags = [
-                            "-c", str(context_index),
+                            "-c", item,
                             "-v", "q"  # only log from the jit.
                         ]
                         flags += altjit_replay_flags
@@ -1690,7 +1690,7 @@ async def create_replay_artifacts(print_prefix, context_index, self, mch_file, e
 
                             async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 command = [self.superpmi_path] + flags + [jit_path, mch_file]
-                                item_path = os.path.join(location, "{}{}".format(context_index, extension))
+                                item_path = os.path.join(location, "{}{}".format(item, extension))
                                 modified_env = env.copy()
                                 modified_env['COMPlus_JitStdOutFile'] = item_path
                                 logging.debug("%sGenerating %s", print_prefix, item_path)
@@ -1706,20 +1706,22 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                             diff_txt = await create_one_artifact(self.diff_jit_path, diff_location, flags + diff_option_flags_for_diff_artifact)
 
                             if base_txt != diff_txt:
-                                jit_differences_queue.put_nowait(context_index)
+                                jit_differences_queue.put_nowait(item)
                     ################################################################################################ end of create_replay_artifacts()
 
+                    diff_items = []
+                    for item in self.diff_mcl_contents:
+                        diff_items.append(item)
+
                     logging.info("Creating dasm files: %s %s", base_asm_location, diff_asm_location)
-                    dasm_contexts = self.pick_contexts_to_disassemble(diffs)
-                    subproc_helper = AsyncSubprocessHelper(dasm_contexts, verbose=True)
+                    subproc_helper = AsyncSubprocessHelper(diff_items, verbose=True)
                     subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, asm_complus_vars_full_env, text_differences, base_asm_location, diff_asm_location, ".dasm")
 
                     if self.coreclr_args.diff_jit_dump:
                         logging.info("Creating JitDump files: %s %s", base_dump_location, diff_dump_location)
                         subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, jit_dump_complus_vars_full_env, jit_dump_differences, base_dump_location, diff_dump_location, ".txt")
 
-                    logging.info("Differences found. To replay SuperPMI use:".format(len(diffs)))
-
+                    logging.info("Differences found. To replay SuperPMI use:")
                     logging.info("")
                     for var, value in asm_complus_vars.items():
                         print_platform_specific_environment_vars(logging.INFO, self.coreclr_args, var, value)
@@ -1734,10 +1736,9 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                         logging.info("%s %s -c ### %s %s", self.superpmi_path, " ".join(altjit_replay_flags), self.diff_jit_path, mch_file)
                         logging.info("")
 
-                    smallest = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
-                    logging.debug("Smallest {} contexts with binary differences:".format(len(smallest)))
-                    for diff in smallest:
-                        logging.debug(diff["Context"])
+                    logging.debug("Method numbers with binary differences:")
+                    for item in self.diff_mcl_contents:
+                        logging.debug(item)
                     logging.debug("")
 
                     if base_metrics is not None and diff_metrics is not None:
@@ -1768,22 +1769,13 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                                 # It appears we have a built jit-analyze on the path, so try to run it.
                                 jit_analyze_summary_file = os.path.join(asm_root_dir, "summary.md")
                                 command = [ jit_analyze_path, "--md", jit_analyze_summary_file, "-r", "--base", base_asm_location, "--diff", diff_asm_location ]
+                                if self.coreclr_args.retainOnlyTopFiles:
+                                    command += [ "--retainOnlyTopFiles" ]
                                 if self.coreclr_args.metrics:
                                     command += [ "--metrics", ",".join(self.coreclr_args.metrics) ]
                                 elif base_bytes is not None and diff_bytes is not None:
                                     command += [ "--override-total-base-metric", str(base_bytes), "--override-total-diff-metric", str(diff_bytes) ]
 
-                                if len(dasm_contexts) < len(diffs):
-                                    # Avoid producing analysis output that assumes these are all contexts
-                                    # with diffs. When no custom metrics are specified we pick only a subset
-                                    # of interesting contexts to disassemble, to avoid spending too long.
-                                    # See pick_contexts_to_disassemble.
-                                    command += [ "--is-subset-of-diffs" ]
-                                else:
-                                    # Ask jit-analyze to avoid producing analysis output that assumes these are
-                                    # all contexts (we never produce .dasm files for contexts without diffs).
-                                    command += [ "--is-diffs-only" ]
-
                                 run_and_log(command, logging.INFO)
                                 ran_jit_analyze = True
 
@@ -1808,14 +1800,6 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
                     else:
                         logging.warning("No textual differences. Is this an issue with coredistools?")
 
-                    # If we are not specifying custom metrics then print a summary here, otherwise leave the summarization up to jit-analyze.
-                    if self.coreclr_args.metrics is None:
-                        num_improvements = sum(1 for r in diffs if int(r["Diff size"]) < int(r["Base size"]))
-                        num_regressions = sum(1 for r in diffs if int(r["Diff size"]) > int(r["Base size"]))
-                        logging.info("{} contexts with differences found ({} improvements, {} regressions)".format(len(diffs), num_improvements, num_regressions))
-                        logging.info("")
-                        logging.info("")
-
                     if self.coreclr_args.diff_jit_dump:
                         try:
                             current_jit_dump_diff = jit_dump_differences.get_nowait()
@@ -2010,49 +1994,6 @@ def write_pivot_section(row):
         return result
         ################################################################################################ end of replay_with_asm_diffs()
 
-    def pick_contexts_to_disassemble(self, diffs):
-        """ Given information about diffs, pick the context numbers to create .dasm files for
-
-        Returns:
-            List of method context numbers to create disassembly for.
-        """
-
-        # If there are non-default metrics then we need to disassemble
-        # everything so that jit-analyze can handle those.
-        if self.coreclr_args.metrics is not None:
-            contexts = diffs
-        else:
-            # In the default case we have size improvements/regressions
-            # available without needing to disassemble all, so pick a subset of
-            # interesting diffs to pass to jit-analyze.
-
-            # 20 smallest method contexts with diffs
-            smallest_contexts = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
-            # Order by byte-wise improvement, largest improvements first
-            by_diff_size = sorted(diffs, key=lambda r: int(r["Diff size"]) - int(r["Base size"]))
-            # 20 top improvements, byte-wise
-            top_improvements_bytes = by_diff_size[:20]
-            # 20 top regressions, byte-wise
-            top_regressions_bytes = by_diff_size[-20:]
-
-            # Order by percentage-wise size improvement, largest improvements first
-            def diff_pct(r):
-                base = int(r["Base size"])
-                if base == 0:
-                    return 0
-                diff = int(r["Diff size"])
-                return (diff - base) / base
-
-            by_diff_size_pct = sorted(diffs, key=diff_pct)
-            top_improvements_pct = by_diff_size_pct[:20]
-            top_regressions_pct = by_diff_size_pct[-20:]
-
-            contexts = smallest_contexts + top_improvements_bytes + top_regressions_bytes + top_improvements_pct + top_regressions_pct
-
-        final_contexts_indices = list(set(int(r["Context"]) for r in contexts))
-        final_contexts_indices.sort()
-        return final_contexts_indices
-
 ################################################################################
 # SuperPMI Replay/TP diff
 ################################################################################
@@ -3967,6 +3908,11 @@ def verify_base_diff_args():
                             lambda unused: True,
                             "Unable to set metrics.")
 
+        coreclr_args.verify(args,
+                            "retainOnlyTopFiles",
+                            lambda unused: True,
+                            "Unable to set retainOnlyTopFiles.")
+
         coreclr_args.verify(args,
                             "diff_with_release",
                             lambda unused: True,
diff --git a/src/coreclr/scripts/superpmi_diffs.py b/src/coreclr/scripts/superpmi_diffs.py
index b7af1427fc481c..47243ad43d3a23 100644
--- a/src/coreclr/scripts/superpmi_diffs.py
+++ b/src/coreclr/scripts/superpmi_diffs.py
@@ -216,7 +216,8 @@ def main(main_args):
         "-spmi_location", spmi_location,
         "-error_limit", "100",
         "-log_level", "debug",
-        "-log_file", log_file])
+        "-log_file", log_file,
+        "-retainOnlyTopFiles"])
 
     print("Running superpmi.py tpdiff")
 
diff --git a/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h b/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
index e0e83c41d03ab6..dc4ce235ffbf2a 100644
--- a/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
+++ b/src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
@@ -62,7 +62,6 @@
 
 // Getting STL to work with PAL is difficult, so reimplement STL functionality to not require it.
 #ifdef TARGET_UNIX
-#include "clr_std/utility"
 #include "clr_std/string"
 #include "clr_std/algorithm"
 #include "clr_std/vector"
@@ -70,7 +69,6 @@
 #ifndef USE_STL
 #define USE_STL
 #endif // USE_STL
-#include <utility>
 #include <string>
 #include <algorithm>
 #include <vector>
diff --git a/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt b/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
index 76ab73ffdabfe2..63237450898e3b 100644
--- a/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
+++ b/src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
@@ -26,7 +26,6 @@ set(SUPERPMI_SOURCES
     neardiffer.cpp
     parallelsuperpmi.cpp
     superpmi.cpp
-    fileio.cpp
     jithost.cpp
     ../superpmi-shared/callutils.cpp
     ../superpmi-shared/compileresult.cpp
diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.cpp b/src/coreclr/tools/superpmi/superpmi/commandline.cpp
index ca407889e87235..563d8cb16248c0 100644
--- a/src/coreclr/tools/superpmi/superpmi/commandline.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/commandline.cpp
@@ -318,7 +318,7 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
 
                 o->mclFilename = argv[i];
             }
-            else if ((_strnicmp(&argv[i][1], "diffsInfo", 9) == 0))
+            else if ((_strnicmp(&argv[i][1], "diffMCList", 10) == 0))
             {
                 if (++i >= argc)
                 {
@@ -326,7 +326,7 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
                     return false;
                 }
 
-                o->diffsInfo = argv[i];
+                o->diffMCLFilename = argv[i];
             }
             else if ((_strnicmp(&argv[i][1], "target", 6) == 0))
             {
@@ -641,9 +641,9 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
         DumpHelp(argv[0]);
         return false;
     }
-    if (o->diffsInfo != nullptr && !o->applyDiff)
+    if (o->diffMCLFilename != nullptr && !o->applyDiff)
     {
-        LogError("-diffsInfo specified without -applyDiff.");
+        LogError("-diffMCList specified without -applyDiff.");
         DumpHelp(argv[0]);
         return false;
     }
diff --git a/src/coreclr/tools/superpmi/superpmi/commandline.h b/src/coreclr/tools/superpmi/superpmi/commandline.h
index e801f8cb4f752d..055adf00295cda 100644
--- a/src/coreclr/tools/superpmi/superpmi/commandline.h
+++ b/src/coreclr/tools/superpmi/superpmi/commandline.h
@@ -39,7 +39,7 @@ class CommandLine
             , baseMetricsSummaryFile(nullptr)
             , diffMetricsSummaryFile(nullptr)
             , mclFilename(nullptr)
-            , diffsInfo(nullptr)
+            , diffMCLFilename(nullptr)
             , targetArchitecture(nullptr)
             , compileList(nullptr)
             , offset(-1)
@@ -72,7 +72,7 @@ class CommandLine
         char* baseMetricsSummaryFile;
         char* diffMetricsSummaryFile;
         char* mclFilename;
-        char* diffsInfo;
+        char* diffMCLFilename;
         char* targetArchitecture;
         char* compileList;
         int   offset;
diff --git a/src/coreclr/tools/superpmi/superpmi/fileio.cpp b/src/coreclr/tools/superpmi/superpmi/fileio.cpp
deleted file mode 100644
index ed16485a038e8a..00000000000000
--- a/src/coreclr/tools/superpmi/superpmi/fileio.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "standardpch.h"
-#include "fileio.h"
-
-bool FileWriter::Printf(const char* fmt, ...)
-{
-    va_list args;
-    va_start(args, fmt);
-
-    char stackBuffer[512];
-    size_t bufferSize = sizeof(stackBuffer);
-    char* pBuffer = stackBuffer;
-    while (true)
-    {
-        va_list argsCopy;
-        va_copy(argsCopy, args);
-        int printed = _vsnprintf_s(pBuffer, bufferSize, _TRUNCATE, fmt, argsCopy);
-        va_end(argsCopy);
-
-        if (printed < 0)
-        {
-            // buffer too small
-            if (pBuffer != stackBuffer)
-                delete[] pBuffer;
-
-            bufferSize *= 2;
-            pBuffer = new char[bufferSize];
-        }
-        else
-        {
-            DWORD numWritten;
-            bool result =
-                WriteFile(m_file.Get(), pBuffer, static_cast<DWORD>(printed), &numWritten, nullptr) &&
-                (numWritten == static_cast<DWORD>(printed));
-
-            if (pBuffer != stackBuffer)
-                delete[] pBuffer;
-
-            va_end(args);
-            return result;
-        }
-    }
-}
-
-bool FileWriter::CreateNew(const char* path, FileWriter* fw)
-{
-    FileHandle handle(CreateFile(path, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr));
-    if (!handle.IsValid())
-    {
-        return false;
-    }
-
-    *fw = FileWriter(std::move(handle));
-    return true;
-}
-
-bool FileLineReader::Open(const char* path, FileLineReader* fr)
-{
-    FileHandle file(CreateFile(path, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr));
-    if (!file.IsValid())
-    {
-        return false;
-    }
-
-    LARGE_INTEGER size;
-    if (!GetFileSizeEx(file.Get(), &size))
-    {
-        return false;
-    }
-
-    if (static_cast<ULONGLONG>(size.QuadPart) > SIZE_MAX)
-    {
-        return false;
-    }
-
-    FileMappingHandle mapping(CreateFileMapping(file.Get(), nullptr, PAGE_READONLY, size.u.HighPart, size.u.LowPart, nullptr));
-    if (!mapping.IsValid())
-    {
-        return false;
-    }
-
-    FileViewHandle view(MapViewOfFile(mapping.Get(), FILE_MAP_READ, 0, 0, 0));
-    if (!view.IsValid())
-    {
-        return false;
-    }
-
-    *fr = FileLineReader(std::move(file), std::move(mapping), std::move(view), static_cast<size_t>(size.QuadPart));
-    return true;
-}
-
-bool FileLineReader::AdvanceLine()
-{
-    if (m_cur >= m_end)
-    {
-        return false;
-    }
-
-    char* end = m_cur;
-    while (end < m_end && *end != '\r' && *end != '\n')
-    {
-        end++;
-    }
-
-    m_currentLine.resize(end - m_cur + 1);
-    memcpy(m_currentLine.data(), m_cur, end - m_cur);
-    m_currentLine[end - m_cur] = '\0';
-
-    m_cur = end;
-    if (m_cur < m_end && *m_cur == '\r')
-        m_cur++;
-    if (m_cur < m_end && *m_cur == '\n')
-        m_cur++;
-
-    return true;
-}
diff --git a/src/coreclr/tools/superpmi/superpmi/fileio.h b/src/coreclr/tools/superpmi/superpmi/fileio.h
deleted file mode 100644
index a88e74d6ee00c5..00000000000000
--- a/src/coreclr/tools/superpmi/superpmi/fileio.h
+++ /dev/null
@@ -1,133 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-#ifndef _FileIO
-#define _FileIO
-
-template<typename HandleSpec>
-struct HandleWrapper
-{
-    using HandleType = typename HandleSpec::Type;
-
-    HandleWrapper()
-        : m_handle(HandleSpec::Invalid())
-    {
-    }
-
-    explicit HandleWrapper(HandleType handle)
-        : m_handle(handle)
-    {
-    }
-
-    ~HandleWrapper()
-    {
-        if (m_handle != HandleSpec::Invalid())
-        {
-            HandleSpec::Close(m_handle);
-            m_handle = HandleSpec::Invalid();
-        }
-    }
-
-    HandleWrapper(const HandleWrapper&) = delete;
-    HandleWrapper& operator=(HandleWrapper&) = delete;
-
-    HandleWrapper(HandleWrapper&& hw) noexcept
-        : m_handle(hw.m_handle)
-    {
-        hw.m_handle = HandleSpec::Invalid();
-    }
-
-    HandleWrapper& operator=(HandleWrapper&& hw) noexcept
-    {
-        if (m_handle != HandleSpec::Invalid())
-            HandleSpec::Close(m_handle);
-
-        m_handle = hw.m_handle;
-        hw.m_handle = HandleSpec::Invalid();
-        return *this;
-    }
-
-    bool IsValid() { return m_handle != HandleSpec::Invalid(); }
-    HandleType Get() { return m_handle; }
-
-private:
-    HandleType m_handle;
-};
-
-struct FileHandleSpec
-{
-    using Type = HANDLE;
-    static HANDLE Invalid() { return INVALID_HANDLE_VALUE; }
-    static void Close(HANDLE h) { CloseHandle(h); }
-};
-
-struct FileMappingHandleSpec
-{
-    using Type = HANDLE;
-    static HANDLE Invalid() { return nullptr; }
-    static void Close(HANDLE h) { CloseHandle(h); }
-};
-
-struct FileViewHandleSpec
-{
-    using Type = LPVOID;
-    static LPVOID Invalid() { return nullptr; }
-    static void Close(LPVOID p) { UnmapViewOfFile(p); }
-};
-
-typedef HandleWrapper<FileHandleSpec> FileHandle;
-typedef HandleWrapper<FileMappingHandleSpec> FileMappingHandle;
-typedef HandleWrapper<FileViewHandleSpec> FileViewHandle;
-
-class FileWriter
-{
-    FileHandle m_file;
-
-    FileWriter(FileHandle file)
-        : m_file(std::move(file))
-    {
-    }
-
-public:
-    FileWriter()
-    {
-    }
-
-    bool Printf(const char* fmt, ...);
-
-    static bool CreateNew(const char* path, FileWriter* fw);
-};
-
-class FileLineReader
-{
-    FileHandle m_file;
-    FileMappingHandle m_fileMapping;
-    FileViewHandle m_view;
-
-    char* m_cur;
-    char* m_end;
-    std::vector<char> m_currentLine;
-
-    FileLineReader(FileHandle file, FileMappingHandle fileMapping, FileViewHandle view, size_t size)
-        : m_file(std::move(file))
-        , m_fileMapping(std::move(fileMapping))
-        , m_view(std::move(view))
-        , m_cur(static_cast<char*>(m_view.Get()))
-        , m_end(static_cast<char*>(m_view.Get()) + size)
-    {
-    }
-
-public:
-    FileLineReader()
-        : m_cur(nullptr)
-        , m_end(nullptr)
-    {
-    }
-
-    bool AdvanceLine();
-    const char* GetCurrentLine() { return m_currentLine.data(); }
-
-    static bool Open(const char* path, FileLineReader* fr);
-};
-
-#endif
diff --git a/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp b/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
index 7967cf90293fb4..cb9c908ee72d80 100644
--- a/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/metricssummary.cpp
@@ -4,7 +4,6 @@
 #include "standardpch.h"
 #include "metricssummary.h"
 #include "logging.h"
-#include "fileio.h"
 
 void MetricsSummary::AggregateFrom(const MetricsSummary& other)
 {
@@ -25,15 +24,50 @@ void MetricsSummaries::AggregateFrom(const MetricsSummaries& other)
     FullOpts.AggregateFrom(other.FullOpts);
 }
 
+struct FileHandleWrapper
+{
+    FileHandleWrapper(HANDLE hFile)
+        : hFile(hFile)
+    {
+    }
+
+    ~FileHandleWrapper()
+    {
+        CloseHandle(hFile);
+    }
+
+    HANDLE get() { return hFile; }
+
+private:
+    HANDLE hFile;
+};
+
+static bool FilePrintf(HANDLE hFile, const char* fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+
+    char buffer[4096];
+    int len = vsprintf_s(buffer, ARRAY_SIZE(buffer), fmt, args);
+    DWORD numWritten;
+    bool result =
+        WriteFile(hFile, buffer, static_cast<DWORD>(len), &numWritten, nullptr) && (numWritten == static_cast<DWORD>(len));
+
+    va_end(args);
+
+    return result;
+}
+
 bool MetricsSummaries::SaveToFile(const char* path)
 {
-    FileWriter file;
-    if (!FileWriter::CreateNew(path, &file))
+    FileHandleWrapper file(CreateFile(path, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr));
+    if (file.get() == INVALID_HANDLE_VALUE)
     {
         return false;
     }
 
-    if (!file.Printf(
+    if (!FilePrintf(
+        file.get(),
         "Successful compiles,Failing compiles,Missing compiles,Contexts with diffs,"
         "Code bytes,Diffed code bytes,Executed instructions,Diff executed instructions,Name\n"))
     {
@@ -41,15 +75,16 @@ bool MetricsSummaries::SaveToFile(const char* path)
     }
 
     return
-        WriteRow(file, "Overall", Overall) &&
-        WriteRow(file, "MinOpts", MinOpts) &&
-        WriteRow(file, "FullOpts", FullOpts);
+        WriteRow(file.get(), "Overall", Overall) &&
+        WriteRow(file.get(), "MinOpts", MinOpts) &&
+        WriteRow(file.get(), "FullOpts", FullOpts);
 }
 
-bool MetricsSummaries::WriteRow(FileWriter& fw, const char* name, const MetricsSummary& summary)
+bool MetricsSummaries::WriteRow(HANDLE hFile, const char* name, const MetricsSummary& summary)
 {
     return
-        fw.Printf(
+        FilePrintf(
+            hFile,
             "%d,%d,%d,%d,%lld,%lld,%lld,%lld,%s\n",
             summary.SuccessfulCompiles,
             summary.FailingCompiles,
@@ -64,27 +99,59 @@ bool MetricsSummaries::WriteRow(FileWriter& fw, const char* name, const MetricsS
 
 bool MetricsSummaries::LoadFromFile(const char* path, MetricsSummaries* metrics)
 {
-    FileLineReader reader;
-    if (!FileLineReader::Open(path, &reader))
+    FileHandleWrapper file(CreateFile(path, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr));
+    if (file.get() == INVALID_HANDLE_VALUE)
+    {
+        return false;
+    }
+
+    LARGE_INTEGER len;
+    if (!GetFileSizeEx(file.get(), &len))
     {
         return false;
     }
 
-    if (!reader.AdvanceLine())
+    DWORD stringLen = static_cast<DWORD>(len.QuadPart);
+    std::vector<char> content(stringLen);
+    DWORD numRead;
+    if (!ReadFile(file.get(), content.data(), stringLen, &numRead, nullptr) || (numRead != stringLen))
     {
         return false;
     }
 
+    std::vector<char> line;
+    size_t index = 0;
+    auto nextLine = [&line, &content, &index]()
+    {
+        size_t end = index;
+        while ((end < content.size()) && (content[end] != '\r') && (content[end] != '\n'))
+        {
+            end++;
+        }
+
+        line.resize(end - index + 1);
+        memcpy(line.data(), &content[index], end - index);
+        line[end - index] = '\0';
+
+        index = end;
+        if ((index < content.size()) && (content[index] == '\r'))
+            index++;
+        if ((index < content.size()) && (content[index] == '\n'))
+            index++;
+    };
+
     *metrics = MetricsSummaries();
+    nextLine();
     bool result = true;
-    while (reader.AdvanceLine())
+    while (index < content.size())
     {
+        nextLine();
         MetricsSummary summary;
 
         char name[32];
         int scanResult =
             sscanf_s(
-                reader.GetCurrentLine(),
+                line.data(),
                 "%d,%d,%d,%d,%lld,%lld,%lld,%lld,%s",
                 &summary.SuccessfulCompiles,
                 &summary.FailingCompiles,
diff --git a/src/coreclr/tools/superpmi/superpmi/metricssummary.h b/src/coreclr/tools/superpmi/superpmi/metricssummary.h
index 14e364cd9fcf3e..118d6aac224006 100644
--- a/src/coreclr/tools/superpmi/superpmi/metricssummary.h
+++ b/src/coreclr/tools/superpmi/superpmi/metricssummary.h
@@ -39,7 +39,7 @@ class MetricsSummaries
     bool SaveToFile(const char* path);
     static bool LoadFromFile(const char* path, MetricsSummaries* metrics);
 private:
-    static bool WriteRow(class FileWriter& fw, const char* name, const MetricsSummary& summary);
+    static bool WriteRow(HANDLE hFile, const char* name, const MetricsSummary& summary);
 };
 
 #endif
diff --git a/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp b/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
index 610ccdf732b73e..dfca5adcc376a6 100644
--- a/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/parallelsuperpmi.cpp
@@ -9,7 +9,6 @@
 #include "commandline.h"
 #include "errorhandling.h"
 #include "metricssummary.h"
-#include "fileio.h"
 
 #define MAX_LOG_LINE_SIZE 0x1000 // 4 KB
 
@@ -350,7 +349,7 @@ struct PerWorkerData
     HANDLE hStdError;
 
     char* failingMCListPath;
-    char* diffsInfoPath;
+    char* diffMCListPath;
     char* stdOutputPath;
     char* stdErrorPath;
     char* baseMetricsSummaryPath;
@@ -360,7 +359,7 @@ struct PerWorkerData
         : hStdOutput(INVALID_HANDLE_VALUE)
         , hStdError(INVALID_HANDLE_VALUE)
         , failingMCListPath(nullptr)
-        , diffsInfoPath(nullptr)
+        , diffMCListPath(nullptr)
         , stdOutputPath(nullptr)
         , stdErrorPath(nullptr)
         , baseMetricsSummaryPath(nullptr)
@@ -369,7 +368,7 @@ struct PerWorkerData
     }
 };
 
-static void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::*mclPath)
+void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::*mclPath)
 {
     int **MCL = new int *[workerCount], *MCLCount = new int[workerCount], totalCount = 0;
 
@@ -397,39 +396,6 @@ static void MergeWorkerMCLs(char* mclFilename, PerWorkerData* workerData, int wo
         LogError("Unable to write to MCL file %s.", mclFilename);
 }
 
-static void MergeWorkerCsvs(char* csvFilename, PerWorkerData* workerData, int workerCount, char* PerWorkerData::* csvPath)
-{
-    FileWriter fw;
-    if (!FileWriter::CreateNew(csvFilename, &fw))
-    {
-        LogError("Could not create file %s", csvFilename);
-        return;
-    }
-
-    bool hasHeader = false;
-    for (int i = 0; i < workerCount; i++)
-    {
-        FileLineReader reader;
-        if (!FileLineReader::Open(workerData[i].*csvPath, &reader))
-        {
-            LogError("Could not open child CSV file %s", workerData[i].*csvPath);
-            continue;
-        }
-
-        if (hasHeader && !reader.AdvanceLine())
-        {
-            continue;
-        }
-
-        while (reader.AdvanceLine())
-        {
-             fw.Printf("%s\n", reader.GetCurrentLine());
-        }
-
-        hasHeader = true;
-    }
-}
-
 #define MAX_CMDLINE_SIZE 0x1000 // 4 KB
 
 //-------------------------------------------------------------
@@ -562,8 +528,8 @@ int doParallelSuperPMI(CommandLine::Options& o)
     LogVerbose("Using child (%s) with args (%s)", spmiFilename, spmiArgs);
     if (o.mclFilename != nullptr)
         LogVerbose(" failingMCList=%s", o.mclFilename);
-    if (o.diffsInfo != nullptr)
-        LogVerbose(" diffsInfo=%s", o.diffsInfo);
+    if (o.diffMCLFilename != nullptr)
+        LogVerbose(" diffMCLFilename=%s", o.diffMCLFilename);
     LogVerbose(" workerCount=%d, skipCleanup=%d.", o.workerCount, o.skipCleanup);
 
     PerWorkerData* perWorkerData = new PerWorkerData[o.workerCount];
@@ -585,10 +551,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
             sprintf_s(wd.failingMCListPath, MAX_PATH, "%sParallelSuperPMI-%u-%d.mcl", tempPath, randNumber, i);
         }
 
-        if (o.diffsInfo != nullptr)
+        if (o.diffMCLFilename != nullptr)
         {
-            wd.diffsInfoPath = new char[MAX_PATH];
-            sprintf_s(wd.diffsInfoPath, MAX_PATH, "%sParallelSuperPMI-Diff-%u-%d.mcl", tempPath, randNumber, i);
+            wd.diffMCListPath = new char[MAX_PATH];
+            sprintf_s(wd.diffMCListPath, MAX_PATH, "%sParallelSuperPMI-Diff-%u-%d.mcl", tempPath, randNumber, i);
         }
 
         if (o.baseMetricsSummaryFile != nullptr)
@@ -627,10 +593,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
                                       wd.failingMCListPath);
         }
 
-        if (wd.diffsInfoPath != nullptr)
+        if (wd.diffMCListPath != nullptr)
         {
-            bytesWritten += sprintf_s(cmdLine + bytesWritten, MAX_CMDLINE_SIZE - bytesWritten, " -diffsInfo %s",
-                                      wd.diffsInfoPath);
+            bytesWritten += sprintf_s(cmdLine + bytesWritten, MAX_CMDLINE_SIZE - bytesWritten, " -diffMCList %s",
+                                      wd.diffMCListPath);
         }
 
         if (wd.baseMetricsSummaryPath != nullptr)
@@ -753,10 +719,10 @@ int doParallelSuperPMI(CommandLine::Options& o)
             MergeWorkerMCLs(o.mclFilename, perWorkerData, o.workerCount, &PerWorkerData::failingMCListPath);
         }
 
-        if (o.diffsInfo != nullptr && !usageError)
+        if (o.diffMCLFilename != nullptr && !usageError)
         {
             // Concat the resulting diff .mcl files
-            MergeWorkerCsvs(o.diffsInfo, perWorkerData, o.workerCount, &PerWorkerData::diffsInfoPath);
+            MergeWorkerMCLs(o.diffMCLFilename, perWorkerData, o.workerCount, &PerWorkerData::diffMCListPath);
         }
 
         if (o.baseMetricsSummaryFile != nullptr && !usageError)
@@ -795,9 +761,9 @@ int doParallelSuperPMI(CommandLine::Options& o)
             {
                 DeleteFile(wd.failingMCListPath);
             }
-            if (wd.diffsInfoPath != nullptr)
+            if (wd.diffMCListPath != nullptr)
             {
-                DeleteFile(wd.diffsInfoPath);
+                DeleteFile(wd.diffMCListPath);
             }
             if (wd.baseMetricsSummaryPath != nullptr)
             {
diff --git a/src/coreclr/tools/superpmi/superpmi/superpmi.cpp b/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
index 7875d4ca111199..ddf595281691bd 100644
--- a/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/superpmi.cpp
@@ -19,7 +19,6 @@
 #include "methodstatsemitter.h"
 #include "spmiutil.h"
 #include "metricssummary.h"
-#include "fileio.h"
 
 extern int doParallelSuperPMI(CommandLine::Options& o);
 
@@ -65,47 +64,54 @@ void SetSuperPmiTargetArchitecture(const char* targetArchitecture)
     }
 }
 
-enum class NearDifferResult
-{
-    SuccessWithoutDiff,
-    SuccessWithDiff,
-    Failure,
-};
-
 // This function uses PAL_TRY, so it can't be in the a function that requires object unwinding. Extracting it out here
 // avoids compiler error.
 //
-static NearDifferResult InvokeNearDiffer(NearDiffer*           nearDiffer,
-                                   MethodContext**       mc,
-                                   CompileResult**       crl,
-                                   MethodContextReader** reader)
+static void InvokeNearDiffer(NearDiffer*           nearDiffer,
+                             CommandLine::Options* o,
+                             MethodContext**       mc,
+                             CompileResult**       crl,
+                             bool*                 hasDiff,
+                             MethodContextReader** reader,
+                             MCList*               failingMCL,
+                             MCList*               diffMCL)
 {
-
     struct Param : FilterSuperPMIExceptionsParam_CaptureException
     {
         NearDiffer*           nearDiffer;
+        CommandLine::Options* o;
         MethodContext**       mc;
         CompileResult**       crl;
+        bool*                 hasDiff;
         MethodContextReader** reader;
-        NearDifferResult      result;
+        MCList*               failingMCL;
+        MCList*               diffMCL;
     } param;
     param.nearDiffer = nearDiffer;
+    param.o          = o;
     param.mc         = mc;
     param.crl        = crl;
+    param.hasDiff    = hasDiff;
     param.reader     = reader;
-    param.result     = NearDifferResult::Failure;
+    param.failingMCL = failingMCL;
+    param.diffMCL    = diffMCL;
+    *hasDiff = false;
 
     PAL_TRY(Param*, pParam, &param)
     {
         if (!pParam->nearDiffer->compare(*pParam->mc, *pParam->crl, (*pParam->mc)->cr))
         {
-            pParam->result = NearDifferResult::SuccessWithDiff;
+            *pParam->hasDiff = true;
             LogIssue(ISSUE_ASM_DIFF, "main method %d of size %d differs", (*pParam->reader)->GetMethodContextIndex(),
                      (*pParam->mc)->methodSize);
-        }
-        else
-        {
-            pParam->result = NearDifferResult::SuccessWithoutDiff;
+
+            // This is a difference in ASM outputs from Jit1 & Jit2 and not a playback failure
+            // We will add this MC to the diffMCList if one is requested
+            // Otherwise this will end up in failingMCList
+            if ((*pParam->o).diffMCLFilename != nullptr)
+                (*pParam->diffMCL).AddMethodToMCL((*pParam->reader)->GetMethodContextIndex());
+            else if ((*pParam->o).mclFilename != nullptr)
+                (*pParam->failingMCL).AddMethodToMCL((*pParam->reader)->GetMethodContextIndex());
         }
     }
     PAL_EXCEPT_FILTER(FilterSuperPMIExceptions_CaptureExceptionAndStop)
@@ -115,26 +121,10 @@ static NearDifferResult InvokeNearDiffer(NearDiffer*           nearDiffer,
         LogError("main method %d of size %d failed to load and compile correctly.", (*reader)->GetMethodContextIndex(),
                  (*mc)->methodSize);
         e.ShowAndDeleteMessage();
-        param.result = NearDifferResult::Failure;
+        if ((*o).mclFilename != nullptr)
+            (*failingMCL).AddMethodToMCL((*reader)->GetMethodContextIndex());
     }
     PAL_ENDTRY
-
-    return param.result;
-}
-
-static bool PrintDiffsCsvHeader(FileWriter& fw)
-{
-    return fw.Printf("Context,Context size,Base size,Diff size,Base instructions,Diff instructions\n");
-}
-
-static bool PrintDiffsCsvRow(
-    FileWriter& fw,
-    int context,
-    uint32_t contextSize,
-    long long baseSize, long long diffSize,
-    long long baseInstructions, long long diffInstructions)
-{
-    return fw.Printf("%d,%u,%lld,%lld,%lld,%lld\n", context, contextSize, baseSize, diffSize, baseInstructions, diffInstructions);
 }
 
 // Run superpmi. The return value is as follows:
@@ -181,8 +171,7 @@ int __cdecl main(int argc, char* argv[])
 #endif
 
     bool   collectThroughput = false;
-    MCList failingToReplayMCL;
-    FileWriter diffCsv;
+    MCList failingToReplayMCL, diffMCL;
 
     CommandLine::Options o;
     if (!CommandLine::Parse(argc, argv, &o))
@@ -241,13 +230,9 @@ int __cdecl main(int argc, char* argv[])
     {
         failingToReplayMCL.InitializeMCL(o.mclFilename);
     }
-    if (o.diffsInfo != nullptr)
+    if (o.diffMCLFilename != nullptr)
     {
-        if (!FileWriter::CreateNew(o.diffsInfo, &diffCsv))
-        {
-            LogError("Could not create file %s", o.diffsInfo);
-            return (int)SpmiResult::GeneralFailure;
-        }
+        diffMCL.InitializeMCL(o.diffMCLFilename);
     }
 
     SetDebugDumpVariables();
@@ -281,11 +266,6 @@ int __cdecl main(int argc, char* argv[])
         }
     }
 
-    if (o.diffsInfo != nullptr)
-    {
-        PrintDiffsCsvHeader(diffCsv);
-    }
-
     MetricsSummaries totalBaseMetrics;
     MetricsSummaries totalDiffMetrics;
 
@@ -572,42 +552,16 @@ int __cdecl main(int argc, char* argv[])
                 }
                 else
                 {
-                    NearDifferResult result = InvokeNearDiffer(&nearDiffer, &mc, &crl, &reader);
+                    bool hasDiff;
+                    InvokeNearDiffer(&nearDiffer, &o, &mc, &crl, &hasDiff, &reader, &failingToReplayMCL, &diffMCL);
 
-                    switch (result)
+                    if (hasDiff)
                     {
-                        case NearDifferResult::SuccessWithDiff:
-                            totalBaseMetrics.Overall.NumContextsWithDiffs++;
-                            totalDiffMetrics.Overall.NumContextsWithDiffs++;
-
-                            totalBaseMetricsOpts.NumContextsWithDiffs++;
-                            totalDiffMetricsOpts.NumContextsWithDiffs++;
-
-                            // This is a difference in ASM outputs from Jit1 & Jit2 and not a playback failure
-                            // We will add this MC to the diffs info if there is one.
-                            // Otherwise this will end up in failingMCList
-                            if (o.diffsInfo != nullptr)
-                            {
-                                PrintDiffsCsvRow(
-                                    diffCsv,
-                                    reader->GetMethodContextIndex(),
-                                    mcb.size,
-                                    baseMetrics.NumCodeBytes, diffMetrics.NumCodeBytes,
-                                    baseMetrics.NumExecutedInstructions, diffMetrics.NumExecutedInstructions);
-                            }
-                            else if (o.mclFilename != nullptr)
-                            {
-                                failingToReplayMCL.AddMethodToMCL(reader->GetMethodContextIndex());
-                            }
+                        totalBaseMetrics.Overall.NumContextsWithDiffs++;
+                        totalDiffMetrics.Overall.NumContextsWithDiffs++;
 
-                            break;
-                        case NearDifferResult::SuccessWithoutDiff:
-                            break;
-                        case NearDifferResult::Failure:
-                            if (o.mclFilename != nullptr)
-                                failingToReplayMCL.AddMethodToMCL(reader->GetMethodContextIndex());
-
-                            break;
+                        totalBaseMetricsOpts.NumContextsWithDiffs++;
+                        totalDiffMetricsOpts.NumContextsWithDiffs++;
                     }
 
                     totalBaseMetrics.Overall.NumDiffedCodeBytes += baseMetrics.NumCodeBytes;
@@ -671,7 +625,7 @@ int __cdecl main(int argc, char* argv[])
                 if (o.breakOnError)
                 {
                     if (o.indexCount == -1)
-                        LogInfo("HINT: to repro add '-c %d' to cmdline", reader->GetMethodContextIndex());
+                        LogInfo("HINT: to repro add '/c %d' to cmdline", reader->GetMethodContextIndex());
                     __debugbreak();
                 }
             }
@@ -720,6 +674,10 @@ int __cdecl main(int argc, char* argv[])
     {
         failingToReplayMCL.CloseMCL();
     }
+    if (o.diffMCLFilename != nullptr)
+    {
+        diffMCL.CloseMCL();
+    }
     Logger::Shutdown();
 
     SpmiResult result = SpmiResult::Success;

From 24fee4949f231636c715965a5662d687b656b20b Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Fri, 30 Sep 2022 13:54:06 +0200
Subject: [PATCH 12/14] Nits

---
 src/coreclr/jit/morph.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index ee15267a316863..f40f3a7a8f29ee 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -1283,8 +1283,9 @@ class ArgInterferenceGraph
     ArrayStack<int>                      m_stack;
     ArrayStack<int>                      m_sccs;
     // Registers that are used by an argument that does not also clobber that register.
-    regMaskTP m_regDependencies;
-    regMaskTP m_allClobbers;
+    regMaskTP m_regDependencies = RBM_NONE;
+    // Mask of registers clobbered by placing arguments.
+    regMaskTP m_allClobbers = RBM_NONE;
 
 public:
     ArgInterferenceGraph(Compiler* comp, unsigned argCount)
@@ -1293,8 +1294,6 @@ class ArgInterferenceGraph
         , m_edges(comp->getAllocator(CMK_CallArgs))
         , m_stack(comp->getAllocator(CMK_CallArgs))
         , m_sccs(comp->getAllocator(CMK_CallArgs), static_cast<int>(argCount))
-        , m_regDependencies(RBM_NONE)
-        , m_allClobbers(RBM_NONE)
     {
     }
 
@@ -1561,7 +1560,7 @@ void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs)
                 lclVarIndex = nodeIndex;
             }
 
-            assert(curIndex < argCount);
+            assert(curIndex + sccSize < argCount);
             sortedArgs[curIndex + sccSize] = node.Arg;
 
             nodeIndex = node.SccNext;

From a54c11305f507fb1aea4139a5be52c8ed9f7ec8a Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Fri, 30 Sep 2022 13:54:59 +0200
Subject: [PATCH 13/14] JIT: Avoid internal register allocation when possible
 for PUTARG_SPLIT

We can almost always avoid allocating an internal register, which may be
expensive if lr is picked since we cannot use thumb encoding for it.

The only case where we need an internal register is when we have a
source that is in a register, and we have a single taget register to
place that conflicts with that source register. The to-stack copies then
need an intermediate scratch register to avoid clobbering the source
register.

Nit

Nit 2
---
 src/coreclr/jit/codegenarmarch.cpp | 91 ++++++++++++++++++++++--------
 src/coreclr/jit/lsraarmarch.cpp    | 12 +++-
 2 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index f7bb7a91f14e23..57dd84208ae3ff 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -1205,7 +1205,15 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
         var_types targetType = source->TypeGet();
         assert(source->isContained() && varTypeIsStruct(targetType));
 
-        regNumber    baseReg      = treeNode->ExtractTempReg();
+        // We need a register to store intermediate values that we are loading
+        // from the source into. We can usually use one of the target registers
+        // that will be overridden anyway. The exception is when the source is
+        // in a register and that register is the unique target register we are
+        // placing. LSRA will always allocate an internal register when there
+        // is just one target register to handle this situation.
+
+        int          firstRegToPlace;
+        regNumber    valueReg     = REG_NA;
         unsigned     srcLclNum    = BAD_VAR_NUM;
         unsigned     srcLclOffset = 0;
         regNumber    addrReg      = REG_NA;
@@ -1221,6 +1229,10 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
 
             // This struct must live on the stack frame.
             assert(varDsc->lvOnFrame && !varDsc->lvRegister);
+
+            // No possible conflicts, just use the first register as the value register.
+            firstRegToPlace = 0;
+            valueReg        = treeNode->GetRegNumByIdx(0);
         }
         else // we must have a GT_OBJ
         {
@@ -1228,10 +1240,40 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             addrReg  = genConsumeReg(source->AsObj()->Addr());
             addrType = source->AsObj()->Addr()->TypeGet();
 
-            // If addrReg equal to baseReg, we use the last target register as alternative baseReg.
-            // Because the candidate mask for the internal baseReg does not include any of the target register,
-            // we can ensure that baseReg, addrReg, and the last target register are not all same.
-            assert(baseReg != addrReg);
+            regNumber allocatedValueReg = REG_NA;
+            if (treeNode->gtNumRegs == 1)
+            {
+                allocatedValueReg = treeNode->ExtractTempReg();
+            }
+
+            // Pick a register to store intermediate values in for the to-stack
+            // copy. It must not conflict with addrReg. We try to prefer an
+            // argument register since those can always use thumb encoding.
+            valueReg = treeNode->GetRegNumByIdx(0);
+            if (valueReg == addrReg)
+            {
+                if (treeNode->gtNumRegs == 1)
+                {
+                    valueReg = allocatedValueReg;
+                }
+                else
+                {
+                    // Prefer argument register that can always use thumb encoding.
+                    valueReg = treeNode->GetRegNumByIdx(1);
+                }
+            }
+
+            // Find first register to place. If we are placing addrReg, then
+            // make sure we place it last to avoid clobbering its value.
+            firstRegToPlace = 0;
+            for (unsigned i = 0; i < treeNode->gtNumRegs; i++)
+            {
+                if (treeNode->GetRegNumByIdx(i) == addrReg)
+                {
+                    firstRegToPlace = i + 1;
+                    break;
+                }
+            }
         }
 
         // Put on stack first
@@ -1268,19 +1310,18 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             if (srcLclNum != BAD_VAR_NUM)
             {
                 // Load from our local source
-                emit->emitIns_R_S(loadIns, attr, baseReg, srcLclNum, srcLclOffset + structOffset);
+                emit->emitIns_R_S(loadIns, attr, valueReg, srcLclNum, srcLclOffset + structOffset);
             }
             else
             {
-                // check for case of destroying the addrRegister while we still need it
-                assert(baseReg != addrReg);
+                assert(valueReg != addrReg);
 
                 // Load from our address expression source
-                emit->emitIns_R_R_I(loadIns, attr, baseReg, addrReg, structOffset);
+                emit->emitIns_R_R_I(loadIns, attr, valueReg, addrReg, structOffset);
             }
 
             // Emit the instruction to store the register into the outgoing argument area
-            emit->emitIns_S_R(ins_Store(type), attr, baseReg, varNumOut, argOffsetOut);
+            emit->emitIns_S_R(ins_Store(type), attr, valueReg, varNumOut, argOffsetOut);
             argOffsetOut += moveSize;
             assert(argOffsetOut <= argOffsetMax);
 
@@ -1288,13 +1329,21 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             structOffset += moveSize;
         }
 
-        // We set up the registers in order, so that we assign the last target register `baseReg` is no longer in use,
-        // in case we had to reuse the last target register for it.
-        structOffset = 0;
-        for (unsigned idx = 0; idx < treeNode->gtNumRegs; idx++)
+        // Place registers starting from firstRegToPlace. It should ensure we
+        // place addrReg last (if we place it at all).
+        structOffset         = static_cast<unsigned>(firstRegToPlace) * TARGET_POINTER_SIZE;
+        unsigned curRegIndex = firstRegToPlace;
+
+        for (unsigned regsPlaced = 0; regsPlaced < treeNode->gtNumRegs; regsPlaced++)
         {
-            regNumber targetReg = treeNode->GetRegNumByIdx(idx);
-            var_types type      = treeNode->GetRegType(idx);
+            if (curRegIndex == treeNode->gtNumRegs)
+            {
+                curRegIndex  = 0;
+                structOffset = 0;
+            }
+
+            regNumber targetReg = treeNode->GetRegNumByIdx(curRegIndex);
+            var_types type      = treeNode->GetRegType(curRegIndex);
 
             if (srcLclNum != BAD_VAR_NUM)
             {
@@ -1303,17 +1352,13 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             }
             else
             {
-                // check for case of destroying the addrRegister while we still need it
-                if (targetReg == addrReg && idx != treeNode->gtNumRegs - 1)
-                {
-                    assert(targetReg != baseReg);
-                    emit->emitIns_Mov(INS_mov, emitActualTypeSize(addrType), baseReg, addrReg, /* canSkip */ false);
-                    addrReg = baseReg;
-                }
+                assert((addrReg != targetReg) || (regsPlaced == treeNode->gtNumRegs - 1));
 
                 // Load from our address expression source
                 emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type), targetReg, addrReg, structOffset);
             }
+
+            curRegIndex++;
             structOffset += TARGET_POINTER_SIZE;
         }
     }
diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp
index 2c42d385758bfd..d41eef6ad2ed59 100644
--- a/src/coreclr/jit/lsraarmarch.cpp
+++ b/src/coreclr/jit/lsraarmarch.cpp
@@ -554,11 +554,17 @@ int LinearScan::BuildPutArgSplit(GenTreePutArgSplit* argNode)
     {
         assert(src->TypeIs(TYP_STRUCT) && src->isContained());
 
-        // We can use a ldr/str sequence so we need an internal register
-        buildInternalIntRegisterDefForNode(argNode, allRegs(TYP_INT) & ~argMask);
-
         if (src->OperIs(GT_OBJ))
         {
+            // If the PUTARG_SPLIT clobbers only one register we may need an
+            // extra internal register in case there is a conflict between the
+            // source address register and target register.
+            if (argNode->gtNumRegs == 1)
+            {
+                // We can use a ldr/str sequence so we need an internal register
+                buildInternalIntRegisterDefForNode(argNode, allRegs(TYP_INT) & ~argMask);
+            }
+
             // We will generate code that loads from the OBJ's address, which must be in a register.
             srcCount = BuildOperandUses(src->AsObj()->Addr());
         }

From 991690171c01126d187853c111d52489cb6768cf Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <jakob.botsch.nielsen@gmail.com>
Date: Sat, 1 Oct 2022 21:03:14 +0200
Subject: [PATCH 14/14] Revert "JIT: Avoid internal register allocation when
 possible for PUTARG_SPLIT"

This reverts commit a54c11305f507fb1aea4139a5be52c8ed9f7ec8a.
---
 src/coreclr/jit/codegenarmarch.cpp | 91 ++++++++----------------------
 src/coreclr/jit/lsraarmarch.cpp    | 12 +---
 2 files changed, 26 insertions(+), 77 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 57dd84208ae3ff..f7bb7a91f14e23 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -1205,15 +1205,7 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
         var_types targetType = source->TypeGet();
         assert(source->isContained() && varTypeIsStruct(targetType));
 
-        // We need a register to store intermediate values that we are loading
-        // from the source into. We can usually use one of the target registers
-        // that will be overridden anyway. The exception is when the source is
-        // in a register and that register is the unique target register we are
-        // placing. LSRA will always allocate an internal register when there
-        // is just one target register to handle this situation.
-
-        int          firstRegToPlace;
-        regNumber    valueReg     = REG_NA;
+        regNumber    baseReg      = treeNode->ExtractTempReg();
         unsigned     srcLclNum    = BAD_VAR_NUM;
         unsigned     srcLclOffset = 0;
         regNumber    addrReg      = REG_NA;
@@ -1229,10 +1221,6 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
 
             // This struct must live on the stack frame.
             assert(varDsc->lvOnFrame && !varDsc->lvRegister);
-
-            // No possible conflicts, just use the first register as the value register.
-            firstRegToPlace = 0;
-            valueReg        = treeNode->GetRegNumByIdx(0);
         }
         else // we must have a GT_OBJ
         {
@@ -1240,40 +1228,10 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             addrReg  = genConsumeReg(source->AsObj()->Addr());
             addrType = source->AsObj()->Addr()->TypeGet();
 
-            regNumber allocatedValueReg = REG_NA;
-            if (treeNode->gtNumRegs == 1)
-            {
-                allocatedValueReg = treeNode->ExtractTempReg();
-            }
-
-            // Pick a register to store intermediate values in for the to-stack
-            // copy. It must not conflict with addrReg. We try to prefer an
-            // argument register since those can always use thumb encoding.
-            valueReg = treeNode->GetRegNumByIdx(0);
-            if (valueReg == addrReg)
-            {
-                if (treeNode->gtNumRegs == 1)
-                {
-                    valueReg = allocatedValueReg;
-                }
-                else
-                {
-                    // Prefer argument register that can always use thumb encoding.
-                    valueReg = treeNode->GetRegNumByIdx(1);
-                }
-            }
-
-            // Find first register to place. If we are placing addrReg, then
-            // make sure we place it last to avoid clobbering its value.
-            firstRegToPlace = 0;
-            for (unsigned i = 0; i < treeNode->gtNumRegs; i++)
-            {
-                if (treeNode->GetRegNumByIdx(i) == addrReg)
-                {
-                    firstRegToPlace = i + 1;
-                    break;
-                }
-            }
+            // If addrReg equal to baseReg, we use the last target register as alternative baseReg.
+            // Because the candidate mask for the internal baseReg does not include any of the target register,
+            // we can ensure that baseReg, addrReg, and the last target register are not all same.
+            assert(baseReg != addrReg);
         }
 
         // Put on stack first
@@ -1310,18 +1268,19 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             if (srcLclNum != BAD_VAR_NUM)
             {
                 // Load from our local source
-                emit->emitIns_R_S(loadIns, attr, valueReg, srcLclNum, srcLclOffset + structOffset);
+                emit->emitIns_R_S(loadIns, attr, baseReg, srcLclNum, srcLclOffset + structOffset);
             }
             else
             {
-                assert(valueReg != addrReg);
+                // check for case of destroying the addrRegister while we still need it
+                assert(baseReg != addrReg);
 
                 // Load from our address expression source
-                emit->emitIns_R_R_I(loadIns, attr, valueReg, addrReg, structOffset);
+                emit->emitIns_R_R_I(loadIns, attr, baseReg, addrReg, structOffset);
             }
 
             // Emit the instruction to store the register into the outgoing argument area
-            emit->emitIns_S_R(ins_Store(type), attr, valueReg, varNumOut, argOffsetOut);
+            emit->emitIns_S_R(ins_Store(type), attr, baseReg, varNumOut, argOffsetOut);
             argOffsetOut += moveSize;
             assert(argOffsetOut <= argOffsetMax);
 
@@ -1329,21 +1288,13 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             structOffset += moveSize;
         }
 
-        // Place registers starting from firstRegToPlace. It should ensure we
-        // place addrReg last (if we place it at all).
-        structOffset         = static_cast<unsigned>(firstRegToPlace) * TARGET_POINTER_SIZE;
-        unsigned curRegIndex = firstRegToPlace;
-
-        for (unsigned regsPlaced = 0; regsPlaced < treeNode->gtNumRegs; regsPlaced++)
+        // We set up the registers in order, so that we assign the last target register `baseReg` is no longer in use,
+        // in case we had to reuse the last target register for it.
+        structOffset = 0;
+        for (unsigned idx = 0; idx < treeNode->gtNumRegs; idx++)
         {
-            if (curRegIndex == treeNode->gtNumRegs)
-            {
-                curRegIndex  = 0;
-                structOffset = 0;
-            }
-
-            regNumber targetReg = treeNode->GetRegNumByIdx(curRegIndex);
-            var_types type      = treeNode->GetRegType(curRegIndex);
+            regNumber targetReg = treeNode->GetRegNumByIdx(idx);
+            var_types type      = treeNode->GetRegType(idx);
 
             if (srcLclNum != BAD_VAR_NUM)
             {
@@ -1352,13 +1303,17 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode)
             }
             else
             {
-                assert((addrReg != targetReg) || (regsPlaced == treeNode->gtNumRegs - 1));
+                // check for case of destroying the addrRegister while we still need it
+                if (targetReg == addrReg && idx != treeNode->gtNumRegs - 1)
+                {
+                    assert(targetReg != baseReg);
+                    emit->emitIns_Mov(INS_mov, emitActualTypeSize(addrType), baseReg, addrReg, /* canSkip */ false);
+                    addrReg = baseReg;
+                }
 
                 // Load from our address expression source
                 emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type), targetReg, addrReg, structOffset);
             }
-
-            curRegIndex++;
             structOffset += TARGET_POINTER_SIZE;
         }
     }
diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp
index d41eef6ad2ed59..2c42d385758bfd 100644
--- a/src/coreclr/jit/lsraarmarch.cpp
+++ b/src/coreclr/jit/lsraarmarch.cpp
@@ -554,17 +554,11 @@ int LinearScan::BuildPutArgSplit(GenTreePutArgSplit* argNode)
     {
         assert(src->TypeIs(TYP_STRUCT) && src->isContained());
 
+        // We can use a ldr/str sequence so we need an internal register
+        buildInternalIntRegisterDefForNode(argNode, allRegs(TYP_INT) & ~argMask);
+
         if (src->OperIs(GT_OBJ))
         {
-            // If the PUTARG_SPLIT clobbers only one register we may need an
-            // extra internal register in case there is a conflict between the
-            // source address register and target register.
-            if (argNode->gtNumRegs == 1)
-            {
-                // We can use a ldr/str sequence so we need an internal register
-                buildInternalIntRegisterDefForNode(argNode, allRegs(TYP_INT) & ~argMask);
-            }
-
             // We will generate code that loads from the OBJ's address, which must be in a register.
             srcCount = BuildOperandUses(src->AsObj()->Addr());
         }