diff --git a/.gitignore b/.gitignore
index 354f05b9d0d..d3bbf343ae3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -337,3 +337,4 @@ ASALocalRun/
 .mfractor/
 /src/Simulation/Simulators.Tests/TestProjects/QsharpExe/built
 /src/Simulation/Simulators.Tests/TestProjects/TargetedExe/built
+dbw_test
diff --git a/AdvantageBenchmark/privateBuild/.editorconfig b/AdvantageBenchmark/privateBuild/.editorconfig
new file mode 100644
index 00000000000..6872d54293d
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/.editorconfig
@@ -0,0 +1,4 @@
+﻿[*.cs]
+
+# SA1025: Code should not contain multiple whitespace in a row
+dotnet_diagnostic.SA1025.severity = none
diff --git a/AdvantageBenchmark/privateBuild/Program.cs b/AdvantageBenchmark/privateBuild/Program.cs
new file mode 100644
index 00000000000..5306ff2a683
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/Program.cs
@@ -0,0 +1,71 @@
+namespace quantum
+{
+    using System;
+    using System.Diagnostics;
+    using Microsoft.Quantum.Simulation.Simulators;
+
+    /// <summary>Benchmark host: times the Q# test circuits and prints one CSV row per timed run.</summary>
+    class Program
+    {
+        /// <summary>Runs tests tstMin..tstMax, loopCnt times each; args are [tstMin [tstMax [loopCnt]]].</summary>
+        /// <exception cref="ArgumentOutOfRangeException">A requested test id is not 0-3.</exception>
+        public static void Main(string[] args)
+        {
+            Console.WriteLine("CSV,test,loop,secs,gates,THREADS,FUSESPAN,FUSEDEPTH,Gates/sec");
+            var envThr = Environment.GetEnvironmentVariable("OMP_NUM_THREADS");
+            var envFus = Environment.GetEnvironmentVariable("QDK_SIM_FUSESPAN");
+            var envDep = Environment.GetEnvironmentVariable("QDK_SIM_FUSEDEPTH");
+            if (string.IsNullOrEmpty(envThr)) envThr = "Default";
+            if (string.IsNullOrEmpty(envFus)) envFus = "Default";
+            if (string.IsNullOrEmpty(envDep)) envDep = "99";
+
+            int tstMin  = 0;
+            int tstMax  = 3;
+            int loopCnt = 10;
+            if (args.Length > 0) tstMin  = Convert.ToInt32(args[0]);
+            if (args.Length > 1) tstMax  = Convert.ToInt32(args[1]);
+            if (args.Length > 2) loopCnt = Convert.ToInt32(args[2]);
+
+            using (var sim = new QuantumSimulator())
+            {
+                long gates = 1;
+                string tstName = "";
+                var stopWatch = new Stopwatch();
+
+                for (int tst = tstMin; tst <= tstMax; tst++)
+                {
+                    for (int loop = 0; loop < loopCnt; loop++)
+                    {
+                        stopWatch.Restart();
+                        switch (tst)
+                        {
+                            case 0:
+                                gates   = Dummy.Run(sim).Result;
+                                tstName = "Dummy";
+                                break;
+                            case 1:
+                                gates   = Advantage44.Run(sim).Result;
+                                tstName = "4x4";
+                                break;
+                            case 2:
+                                gates   = Advantage55.Run(sim).Result;
+                                tstName = "5x5";
+                                break;
+                            case 3:
+                                gates   = Advantage56.Run(sim).Result;
+                                tstName = "5x6";
+                                break;
+                            default: // fail fast instead of printing a row with a stale name and gate count
+                                throw new ArgumentOutOfRangeException(nameof(args), tst, "Unknown test id; valid ids are 0-3.");
+                        }
+                        stopWatch.Stop();
+                        double tSecs = stopWatch.Elapsed.TotalSeconds;
+                        double gps   = gates / tSecs;
+                        // Invariant culture keeps '.' as the decimal separator so the comma-separated row stays parseable.
+                        Console.WriteLine(FormattableString.Invariant($"CSV,{tstName},{loop:D2},{tSecs:F2},{gates:E2},{envThr},{envFus},{envDep},{gps:E2}"));
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/AdvantageBenchmark/privateBuild/Quantum.qs b/AdvantageBenchmark/privateBuild/Quantum.qs
new file mode 100644
index 00000000000..c34c15a047f
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/Quantum.qs
@@ -0,0 +1,826 @@
+﻿namespace quantum {
+    open Microsoft.Quantum.Canon;
+    open Microsoft.Quantum.Intrinsic;
+    open Microsoft.Quantum.Diagnostics;
+    open Microsoft.Quantum.Measurement;
+    
+    operation CZ (a : Qubit, b : Qubit) : Unit // Applies H(b), CNOT(a, b), H(b): three gate calls per CZ call.
+    {
+        body (...)
+        {
+            H(b);
+            CNOT(a, b);
+            H(b);
+        }
+        
+        adjoint self; // declares the operation to be its own adjoint
+    }
+
+    operation Dummy(): Int { // Minimal test: a single CZ call on two freshly allocated qubits.
+        using (q = Qubit[2]) {
+            CZ(q[0],q[1]);
+            ResetAll(q);
+        }
+        return(1); // fixed value; the host stores the returned Int as the gate count
+    }
+
+    operation Advantage44() : Int { // 16-qubit circuit: 9 layers of H on every qubit interleaved with 8 layers of CZ calls; returns the gate count.
+        let loops = 200; // number of passes over the circuit on the same qubits
+        let gateCnt = (171+27*2) * loops; // per pass: 144 H + 27 CZ calls = 171, plus the 2 H inside each CZ call; M and ResetAll are not counted
+        using (q = Qubit[16]) {
+            for (loop in 0..(loops-1)) {
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[2],q[3]);
+                CZ(q[10],q[11]);
+                CZ(q[4],q[5]);
+                CZ(q[12],q[13]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[0],q[1]);
+                CZ(q[8],q[9]);
+                CZ(q[6],q[7]);
+                CZ(q[14],q[15]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[5],q[9]);
+                CZ(q[7],q[11]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[4],q[8]);
+                CZ(q[6],q[10]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[3],q[4]);
+                CZ(q[11],q[12]);
+                CZ(q[5],q[6]);
+                CZ(q[13],q[14]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[1],q[2]);
+                CZ(q[9],q[10]);
+                CZ(q[7],q[8]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[0],q[4]);
+                CZ(q[2],q[6]);
+                CZ(q[9],q[13]);
+                CZ(q[11],q[15]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                CZ(q[8],q[12]);
+                CZ(q[10],q[14]);
+                CZ(q[1],q[5]);
+                CZ(q[3],q[7]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                for (q1 in q) { let _ = M(q1); } // measure every qubit at the end of each pass, discarding the results
+            }
+        ResetAll(q); // runs once, after the last pass
+        }
+    return(gateCnt);
+    }
+
+    operation Advantage55() : Int { // 25-qubit circuit: 9 layers of H on every qubit interleaved with 8 layers of CZ calls; returns the gate count.
+        let loops = 1; // number of passes over the circuit on the same qubits
+        let gateCnt = (269+44*2) * loops; // per pass: 225 H + 44 CZ calls = 269, plus the 2 H inside each CZ call; M and ResetAll are not counted
+        using (q = Qubit[25]) {
+            for (loop in 0..(loops-1)) {
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[2],q[3]);
+                CZ(q[12],q[13]);
+                CZ(q[22],q[23]);
+                CZ(q[5],q[6]);
+                CZ(q[9],q[10]);
+                CZ(q[15],q[16]);
+                CZ(q[19],q[20]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[0],q[1]);
+                CZ(q[4],q[5]);
+                CZ(q[10],q[11]);
+                CZ(q[14],q[15]);
+                CZ(q[20],q[21]);
+                CZ(q[7],q[8]);
+                CZ(q[17],q[18]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[15],q[20]);
+                CZ(q[17],q[22]);
+                CZ(q[19],q[24]);
+                CZ(q[6],q[11]);
+                CZ(q[8],q[13]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[5],q[10]);
+                CZ(q[7],q[12]);
+                CZ(q[9],q[14]);
+                CZ(q[16],q[21]);
+                CZ(q[18],q[23]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[3],q[4]);
+                CZ(q[13],q[14]);
+                CZ(q[23],q[24]);
+                CZ(q[6],q[7]);
+                CZ(q[16],q[17]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[1],q[2]);
+                CZ(q[11],q[12]);
+                CZ(q[21],q[22]);
+                CZ(q[8],q[9]);
+                CZ(q[18],q[19]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[0],q[5]);
+                CZ(q[2],q[7]);
+                CZ(q[4],q[9]);
+                CZ(q[11],q[16]);
+                CZ(q[13],q[18]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                CZ(q[10],q[15]);
+                CZ(q[12],q[17]);
+                CZ(q[14],q[19]);
+                CZ(q[1],q[6]);
+                CZ(q[3],q[8]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                for (q1 in q) { let _ = M(q1); } // measure every qubit at the end of each pass, discarding the results
+            }
+        ResetAll(q); // runs once, after the last pass
+        }
+    return(gateCnt);
+    }
+
+    operation Advantage56() : Int { // 30-qubit circuit: 9 layers of H on every qubit interleaved with 8 layers of CZ calls; returns the gate count.
+        let loops = 1; // number of passes over the circuit on the same qubits
+        let gateCnt = (323+53*2) * loops; // per pass: 270 H + 53 CZ calls = 323, plus the 2 H inside each CZ call; M and ResetAll are not counted
+        using (q = Qubit[30]) {
+            for (loop in 0..(loops-1)) {
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[2],q[3]);
+                CZ(q[14],q[15]);
+                CZ(q[26],q[27]);
+                CZ(q[6],q[7]);
+                CZ(q[10],q[11]);
+                CZ(q[18],q[19]);
+                CZ(q[22],q[23]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[0],q[1]);
+                CZ(q[4],q[5]);
+                CZ(q[12],q[13]);
+                CZ(q[16],q[17]);
+                CZ(q[24],q[25]);
+                CZ(q[28],q[29]);
+                CZ(q[8],q[9]);
+                CZ(q[20],q[21]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[18],q[24]);
+                CZ(q[20],q[26]);
+                CZ(q[22],q[28]);
+                CZ(q[7],q[13]);
+                CZ(q[9],q[15]);
+                CZ(q[11],q[17]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[6],q[12]);
+                CZ(q[8],q[14]);
+                CZ(q[10],q[16]);
+                CZ(q[19],q[25]);
+                CZ(q[21],q[27]);
+                CZ(q[23],q[29]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[3],q[4]);
+                CZ(q[15],q[16]);
+                CZ(q[27],q[28]);
+                CZ(q[7],q[8]);
+                CZ(q[11],q[12]);
+                CZ(q[19],q[20]);
+                CZ(q[23],q[24]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[1],q[2]);
+                CZ(q[5],q[6]);
+                CZ(q[13],q[14]);
+                CZ(q[17],q[18]);
+                CZ(q[25],q[26]);
+                CZ(q[9],q[10]);
+                CZ(q[21],q[22]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[0],q[6]);
+                CZ(q[2],q[8]);
+                CZ(q[4],q[10]);
+                CZ(q[13],q[19]);
+                CZ(q[15],q[21]);
+                CZ(q[17],q[23]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                CZ(q[12],q[18]);
+                CZ(q[14],q[20]);
+                CZ(q[16],q[22]);
+                CZ(q[1],q[7]);
+                CZ(q[3],q[9]);
+                CZ(q[5],q[11]);
+                H(q[0]);
+                H(q[1]);
+                H(q[2]);
+                H(q[3]);
+                H(q[4]);
+                H(q[5]);
+                H(q[6]);
+                H(q[7]);
+                H(q[8]);
+                H(q[9]);
+                H(q[10]);
+                H(q[11]);
+                H(q[12]);
+                H(q[13]);
+                H(q[14]);
+                H(q[15]);
+                H(q[16]);
+                H(q[17]);
+                H(q[18]);
+                H(q[19]);
+                H(q[20]);
+                H(q[21]);
+                H(q[22]);
+                H(q[23]);
+                H(q[24]);
+                H(q[25]);
+                H(q[26]);
+                H(q[27]);
+                H(q[28]);
+                H(q[29]);
+                for (q1 in q) { let _ = M(q1); } // measure every qubit at the end of each pass, discarding the results
+            }
+        ResetAll(q); // runs once, after the last pass
+        }
+    return(gateCnt);
+    }
+
+}
diff --git a/AdvantageBenchmark/privateBuild/advantage.sln b/AdvantageBenchmark/privateBuild/advantage.sln
new file mode 100644
index 00000000000..91b39bc2b87
Binary files /dev/null and b/AdvantageBenchmark/privateBuild/advantage.sln differ
diff --git a/AdvantageBenchmark/privateBuild/host.csproj b/AdvantageBenchmark/privateBuild/host.csproj
new file mode 100644
index 00000000000..73723279d53
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/host.csproj
@@ -0,0 +1,11 @@
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
+  <!-- Add project references to Simulator and other required packages: -->
+  <Import Project="..\..\src\Simulation\Common\Simulators.Dev.props" />
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+    <IncludeQsharpCorePackages>false</IncludeQsharpCorePackages> <!-- otherwise the standard library is included by the Sdk -->
+  </PropertyGroup>
+
+</Project>
diff --git a/AdvantageBenchmark/privateBuild/parseLog.py b/AdvantageBenchmark/privateBuild/parseLog.py
new file mode 100644
index 00000000000..fbf1f395fe0
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/parseLog.py
@@ -0,0 +1,50 @@
+import re   
+import sys
+import numpy as np
+from collections import namedtuple
+
+info    = namedtuple('Info','test loop secs gates threads span depth gps')
+logName = sys.argv[1]
+reHead  = re.compile(r"^CSV,test,")
+reInfo  = re.compile(r'^CSV,([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,\s]+)')
+fp      = open(logName,'r')
+infos   = []
+
+print('test,secs,gates,threads,span,depth,gps')
+
+def dumpGpss():
+    global infos
+    if len(infos) > 0:
+        gpss    = [float(i.gps) for i in infos]
+        gpsMed  = np.median(gpss)
+        cnt     = 0.0
+        tot     = 0.0
+        #for gps in gpss:
+        #    if gps > gpsMed/2.0 and gps < gpsMed*1.5:
+        #        cnt += 1.0
+        #        tot += gps
+        #if cnt > 0: gps = tot/cnt
+        #else:       gps = np.average(gpss)
+        gps     = np.max(gpss)
+
+        idx     = int(len(infos)/2)
+        itm     = infos[idx]
+        print(f"{itm.test},{itm.secs},{itm.gates},{itm.threads},{itm.span},{itm.depth},{gps:.1f}")
+        infos = []
+
+while True:
+    inp = fp.readline()
+    if inp == "": 
+        dumpGpss()
+        break
+    found   = reHead.search(inp)
+    if found:
+        dumpGpss()
+        continue
+    found   = reInfo.search(inp)
+    if found:
+        infos.append(info(found.group(1),found.group(2),found.group(3),found.group(4),
+                        found.group(5),found.group(6),found.group(7),found.group(8)))
+        continue
+
+fp.close()
diff --git a/AdvantageBenchmark/privateBuild/runTest.ps1 b/AdvantageBenchmark/privateBuild/runTest.ps1
new file mode 100644
index 00000000000..12fe3870497
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/runTest.ps1
@@ -0,0 +1,9 @@
+foreach ($testId in 1..2) {  # Sweep tests 1-2 over 4..1 threads and fuse spans 4..0, 5 loops per combination.
+    foreach ($threads in 4..1) {
+        $env:OMP_NUM_THREADS = $threads
+        foreach ($fuseSpan in 4..0) {
+            $env:QDK_SIM_FUSESPAN = $fuseSpan
+            .\bin\Release\netcoreapp3.1\host.exe $testId $testId 5
+        }
+    }
+}
diff --git a/AdvantageBenchmark/privateBuild/runTest.sh b/AdvantageBenchmark/privateBuild/runTest.sh
new file mode 100755
index 00000000000..c8dc2155faa
--- /dev/null
+++ b/AdvantageBenchmark/privateBuild/runTest.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+for tst in {1..3}
+do
+    for thrd in {20..2..-2}
+    do
+        for span in {7..0..-1}
+        do
+            export OMP_NUM_THREADS=$thrd
+            export QDK_SIM_FUSESPAN=$span
+            ./bin/Release/netcoreapp3.1/host $tst $tst 5
+        done
+    done
+done
diff --git a/AdvantageBenchmark/readme.md b/AdvantageBenchmark/readme.md
new file mode 100644
index 00000000000..33242bd4fec
--- /dev/null
+++ b/AdvantageBenchmark/readme.md
@@ -0,0 +1,15 @@
+# Advantage Benchmark
+
+## Purpose
+
+This benchmark is intended to provide an easy way to verify the performance characteristics of a given release build of the QDK simulator vs the current tree. The releasedBuild folder contains projects that will build the quantum advantage Q# program with a QDK from a nuget source and verify the gates-per-second execution of that program. The privateBuild folder compiles the same Q# program with the runtime in the current source tree instead.
+
+## Executing the benchmark
+
+To execute the benchmark, compile each version of advantage.sln using `dotnet build .\advantage.sln -c Release` from their respective folders. Then the executable to run will be either `bin\Release\netcoreapp3.1\host.exe` in the privateBuild folder or `host\bin\Release\netcoreapp3.1\host.exe` in the releasedBuild folder. This executable takes parameters describing which test circuits to execute and how many loops to perform as integer arguments, such that `host.exe 1 1 5` will run 5 loops of test 1 and `host.exe 0 3 100` will run 100 loops of tests 0 through 3. Check the contents of `privateBuild\Program.cs` to see the tests that correspond to each identifier; for most machines, test 1 aka advantage 4x4 circuit is the best choice for benchmarking.
+
+The benchmark can also be run via runTest.ps1 or runTest.sh, each of which performs a sweep across configured environment variables that adjust the number of threads used and gates fused in simulating the circuit. See the definition of the script used on your platform to understand how it configures the `OMP_NUM_THREADS` and `QDK_SIM_FUSESPAN` environment variables.
+
+## Collecting results
+
+The output of `host.exe` is a table showing the gates-per-second along with other identifying information for the run, output at intervals during the looped execution. When driven via runTest.ps1/.sh, the output will be a larger table of all the results for the various combinations of threads and fusion spans. To help collect these results into a meaningful table, the parseLog.py script will convert the output from a runTest execution into CSV rows (printed to standard output) with the single highest gates-per-second observed for a given thread/fuse-span combination. This can then be saved to a file and loaded into a spreadsheet program for easier graphing or other visualization.
diff --git a/AdvantageBenchmark/releasedBuild/advantage.sln b/AdvantageBenchmark/releasedBuild/advantage.sln
new file mode 100644
index 00000000000..4571a900f07
--- /dev/null
+++ b/AdvantageBenchmark/releasedBuild/advantage.sln
@@ -0,0 +1,48 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.26124.0
+MinimumVisualStudioVersion = 15.0.26124.0
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "quantum", "quantum\quantum.csproj", "{576A1AEE-9051-458D-B3E8-EFE3F64235B0}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "host", "host\host.csproj", "{642AEC30-F51D-4547-A1FD-A8AC759A75A5}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Debug|x64 = Debug|x64
+		Debug|x86 = Debug|x86
+		Release|Any CPU = Release|Any CPU
+		Release|x64 = Release|x64
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x64.Build.0 = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x86.Build.0 = Debug|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|Any CPU.Build.0 = Release|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x64.ActiveCfg = Release|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x64.Build.0 = Release|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x86.ActiveCfg = Release|Any CPU
+		{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x86.Build.0 = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x64.Build.0 = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x86.Build.0 = Debug|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|Any CPU.Build.0 = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x64.ActiveCfg = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x64.Build.0 = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x86.ActiveCfg = Release|Any CPU
+		{642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x86.Build.0 = Release|Any CPU
+	EndGlobalSection
+EndGlobal
diff --git a/AdvantageBenchmark/releasedBuild/host/host.csproj b/AdvantageBenchmark/releasedBuild/host/host.csproj
new file mode 100644
index 00000000000..8b145ec5eef
--- /dev/null
+++ b/AdvantageBenchmark/releasedBuild/host/host.csproj
@@ -0,0 +1,16 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <!-- The host program is shared with the private build: the single source file
+       is linked in from ..\..\privateBuild rather than copied. The file name
+       must match the on-disk casing (Program.cs) so that the project also
+       builds on case-sensitive file systems. -->
+  <ItemGroup>
+    <Compile Include="..\..\privateBuild\Program.cs" Link="Program.cs" />
+  </ItemGroup>
+
+  <!-- Q# operations are compiled by the sibling quantum project. -->
+  <ItemGroup>
+    <ProjectReference Include="..\quantum\quantum.csproj" />
+  </ItemGroup>
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+  </PropertyGroup>
+
+</Project>
diff --git a/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj b/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj
new file mode 100644
index 00000000000..18667e533f6
--- /dev/null
+++ b/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj
@@ -0,0 +1,11 @@
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
+
+  <ItemGroup>
+    <QsharpCompile Include="..\..\privateBuild\quantum.qs" />
+  </ItemGroup>
+
+  <PropertyGroup>
+    <TargetFramework>netstandard2.1</TargetFramework>
+  </PropertyGroup>
+
+</Project>
diff --git a/build/test.ps1 b/build/test.ps1
index 09e0e49fb9f..e88962d2a17 100644
--- a/build/test.ps1
+++ b/build/test.ps1
@@ -8,7 +8,8 @@ if ($Env:ENABLE_NATIVE -ne "false") {
     Write-Host "##[info]Test Native simulator"
     pushd (Join-Path $PSScriptRoot "../src/Simulation/Native/build")
     cmake --build . --config $Env:BUILD_CONFIGURATION
-    ctest -C $Env:BUILD_CONFIGURATION
+    cp ../advantage_44_4.log .
+    ctest -C $Env:BUILD_CONFIGURATION --verbose
     if ($LastExitCode -ne 0) {
         Write-Host "##vso[task.logissue type=error;]Failed to test Native Simulator"
         $script:all_ok = $False
diff --git a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj
index 75496178ba5..b47f0af63be 100644
--- a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj
+++ b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj
@@ -21,7 +21,7 @@
 
   <ItemGroup>
     <PackageReference Update="FSharp.Core" Version="4.7.0" />
-    <PackageReference Include="Microsoft.Quantum.Compiler" Version="0.12.2008.2604-alpha" />
+    <PackageReference Include="Microsoft.Quantum.Compiler" Version="0.12.20082705-beta" />
   </ItemGroup>
 
   <ItemGroup>
diff --git a/src/Simulation/Native/.gitignore b/src/Simulation/Native/.gitignore
index 06c6a304f20..8f1fc6e9a01 100644
--- a/src/Simulation/Native/.gitignore
+++ b/src/Simulation/Native/.gitignore
@@ -2,3 +2,11 @@
 build
 /.vs
 /vs2017
+*.csv
+foo*
+*.filters
+*.cmake
+*.vcxproj
+CMakeFiles/
+CMakeCache.txt
+*.so
diff --git a/src/Simulation/Native/CMakeLists.txt b/src/Simulation/Native/CMakeLists.txt
index b9c22c575f7..c56b6f95a3e 100644
--- a/src/Simulation/Native/CMakeLists.txt
+++ b/src/Simulation/Native/CMakeLists.txt
@@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 ADD_DEFINITIONS(-D_SCL_SECURE_NO_WARNINGS)
 # Configuration options (choose one to turn on)
-option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
+option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 option(ENABLE_OPENMP  "Enable OpenMP Parallelization" ON)
 option(USE_SINGLE_PRECISION "Use single-precision floating point operations" OFF)
 option(HAVE_INTRINSICS "Have AVX intrinsics" OFF)
diff --git a/src/Simulation/Native/CMakeSettings.json b/src/Simulation/Native/CMakeSettings.json
new file mode 100644
index 00000000000..ee45e8257c1
--- /dev/null
+++ b/src/Simulation/Native/CMakeSettings.json
@@ -0,0 +1,28 @@
+﻿{
+  "configurations": [
+    {
+      "name": "x64-Debug",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "variables": []
+    },
+    {
+      "name": "x64-Release",
+      "generator": "Ninja",
+      "configurationType": "RelWithDebInfo",
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/src/Simulation/Native/Makefile b/src/Simulation/Native/Makefile
new file mode 100644
index 00000000000..bedf9d3e28f
--- /dev/null
+++ b/src/Simulation/Native/Makefile
@@ -0,0 +1,364 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.16
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target test
+test:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
+	/usr/bin/ctest --force-new-ctest-process $(ARGS)
+.PHONY : test
+
+# Special rule for the target test
+test/fast: test
+
+.PHONY : test/fast
+
+# The main all target
+all: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles/progress.marks
+	$(MAKE) -f CMakeFiles/Makefile2 all
+	$(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	$(MAKE) -f CMakeFiles/Makefile2 clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	$(MAKE) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	$(MAKE) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+#=============================================================================
+# Target rules for targets named Microsoft.Quantum.Simulator.Runtime
+
+# Build rule for target.
+Microsoft.Quantum.Simulator.Runtime: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 Microsoft.Quantum.Simulator.Runtime
+.PHONY : Microsoft.Quantum.Simulator.Runtime
+
+# fast build rule for target.
+Microsoft.Quantum.Simulator.Runtime/fast:
+	$(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build
+.PHONY : Microsoft.Quantum.Simulator.Runtime/fast
+
+#=============================================================================
+# Target rules for targets named tinymatrix_test
+
+# Build rule for target.
+tinymatrix_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 tinymatrix_test
+.PHONY : tinymatrix_test
+
+# fast build rule for target.
+tinymatrix_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/build
+.PHONY : tinymatrix_test/fast
+
+#=============================================================================
+# Target rules for targets named bititerator_test
+
+# Build rule for target.
+bititerator_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 bititerator_test
+.PHONY : bititerator_test
+
+# fast build rule for target.
+bititerator_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/build
+.PHONY : bititerator_test/fast
+
+#=============================================================================
+# Target rules for targets named bitops_test
+
+# Build rule for target.
+bitops_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 bitops_test
+.PHONY : bitops_test
+
+# fast build rule for target.
+bitops_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/build
+.PHONY : bitops_test/fast
+
+#=============================================================================
+# Target rules for targets named openmp_test
+
+# Build rule for target.
+openmp_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 openmp_test
+.PHONY : openmp_test
+
+# fast build rule for target.
+openmp_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/build
+.PHONY : openmp_test/fast
+
+#=============================================================================
+# Target rules for targets named cpuid_test
+
+# Build rule for target.
+cpuid_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 cpuid_test
+.PHONY : cpuid_test
+
+# fast build rule for target.
+cpuid_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/build
+.PHONY : cpuid_test/fast
+
+#=============================================================================
+# Target rules for targets named argmaxnrm2_test
+
+# Build rule for target.
+argmaxnrm2_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 argmaxnrm2_test
+.PHONY : argmaxnrm2_test
+
+# fast build rule for target.
+argmaxnrm2_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/build
+.PHONY : argmaxnrm2_test/fast
+
+#=============================================================================
+# Target rules for targets named diagmatrix_test
+
+# Build rule for target.
+diagmatrix_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 diagmatrix_test
+.PHONY : diagmatrix_test
+
+# fast build rule for target.
+diagmatrix_test/fast:
+	$(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/build
+.PHONY : diagmatrix_test/fast
+
+#=============================================================================
+# Target rules for targets named dbw_test
+
+# Build rule for target.
+dbw_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 dbw_test
+.PHONY : dbw_test
+
+# fast build rule for target.
+dbw_test/fast:
+	$(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/build
+.PHONY : dbw_test/fast
+
+#=============================================================================
+# Target rules for targets named capi_test
+
+# Build rule for target.
+capi_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 capi_test
+.PHONY : capi_test
+
+# fast build rule for target.
+capi_test/fast:
+	$(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/build
+.PHONY : capi_test/fast
+
+#=============================================================================
+# Target rules for targets named factory_test
+
+# Build rule for target.
+factory_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 factory_test
+.PHONY : factory_test
+
+# fast build rule for target.
+factory_test/fast:
+	$(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/build
+.PHONY : factory_test/fast
+
+#=============================================================================
+# Target rules for targets named local_test
+
+# Build rule for target.
+local_test: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 local_test
+.PHONY : local_test
+
+# fast build rule for target.
+local_test/fast:
+	$(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/build
+.PHONY : local_test/fast
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... install/strip"
+	@echo "... install/local"
+	@echo "... install"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... edit_cache"
+	@echo "... test"
+	@echo "... Microsoft.Quantum.Simulator.Runtime"
+	@echo "... tinymatrix_test"
+	@echo "... bititerator_test"
+	@echo "... bitops_test"
+	@echo "... openmp_test"
+	@echo "... cpuid_test"
+	@echo "... argmaxnrm2_test"
+	@echo "... diagmatrix_test"
+	@echo "... dbw_test"
+	@echo "... capi_test"
+	@echo "... factory_test"
+	@echo "... local_test"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/Simulation/Native/advantage_44_4.log b/src/Simulation/Native/advantage_44_4.log
new file mode 100644
index 00000000000..fc883d684bd
--- /dev/null
+++ b/src/Simulation/Native/advantage_44_4.log
@@ -0,0 +1,354 @@
+=== Original:
+ 0: H[0]
+ 1: H[1]
+ 2: H[2]
+ 3: H[3]
+ 4: H[4]
+ 5: H[5]
+ 6: H[6]
+ 7: H[7]
+ 8: H[8]
+ 9: H[9]
+10: H[10]
+11: H[11]
+12: H[12]
+13: H[13]
+14: H[14]
+15: H[15]
+16: CZ[2, 3]
+17: CZ[10, 11]
+18: CZ[4, 5]
+19: CZ[12, 13]
+20: H[0]
+21: H[1]
+22: H[2]
+23: H[3]
+24: H[4]
+25: H[5]
+26: H[6]
+27: H[7]
+28: H[8]
+29: H[9]
+30: H[10]
+31: H[11]
+32: H[12]
+33: H[13]
+34: H[14]
+35: H[15]
+36: CZ[0, 1]
+37: CZ[8, 9]
+38: CZ[6, 7]
+39: CZ[14, 15]
+40: H[0]
+41: H[1]
+42: H[2]
+43: H[3]
+44: H[4]
+45: H[5]
+46: H[6]
+47: H[7]
+48: H[8]
+49: H[9]
+50: H[10]
+51: H[11]
+52: H[12]
+53: H[13]
+54: H[14]
+55: H[15]
+56: CZ[5, 9]
+57: CZ[7, 11]
+58: H[0]
+59: H[1]
+60: H[2]
+61: H[3]
+62: H[4]
+63: H[5]
+64: H[6]
+65: H[7]
+66: H[8]
+67: H[9]
+68: H[10]
+69: H[11]
+70: H[12]
+71: H[13]
+72: H[14]
+73: H[15]
+74: CZ[4, 8]
+75: CZ[6, 10]
+76: H[0]
+77: H[1]
+78: H[2]
+79: H[3]
+80: H[4]
+81: H[5]
+82: H[6]
+83: H[7]
+84: H[8]
+85: H[9]
+86: H[10]
+87: H[11]
+88: H[12]
+89: H[13]
+90: H[14]
+91: H[15]
+92: CZ[3, 4]
+93: CZ[11, 12]
+94: CZ[5, 6]
+95: CZ[13, 14]
+96: H[0]
+97: H[1]
+98: H[2]
+99: H[3]
+100: H[4]
+101: H[5]
+102: H[6]
+103: H[7]
+104: H[8]
+105: H[9]
+106: H[10]
+107: H[11]
+108: H[12]
+109: H[13]
+110: H[14]
+111: H[15]
+112: CZ[1, 2]
+113: CZ[9, 10]
+114: CZ[7, 8]
+115: H[0]
+116: H[1]
+117: H[2]
+118: H[3]
+119: H[4]
+120: H[5]
+121: H[6]
+122: H[7]
+123: H[8]
+124: H[9]
+125: H[10]
+126: H[11]
+127: H[12]
+128: H[13]
+129: H[14]
+130: H[15]
+131: CZ[0, 4]
+132: CZ[2, 6]
+133: CZ[9, 13]
+134: CZ[11, 15]
+135: H[0]
+136: H[1]
+137: H[2]
+138: H[3]
+139: H[4]
+140: H[5]
+141: H[6]
+142: H[7]
+143: H[8]
+144: H[9]
+145: H[10]
+146: H[11]
+147: H[12]
+148: H[13]
+149: H[14]
+150: H[15]
+151: CZ[8, 12]
+152: CZ[10, 14]
+153: CZ[1, 5]
+154: CZ[3, 7]
+155: H[0]
+156: H[1]
+157: H[2]
+158: H[3]
+159: H[4]
+160: H[5]
+161: H[6]
+162: H[7]
+163: H[8]
+164: H[9]
+165: H[10]
+166: H[11]
+167: H[12]
+168: H[13]
+169: H[14]
+170: H[15]
+=== Clusters (cost=  5.034):
+==== cluster[ 0]: depth=26 width=4
+ 0: H[8]
+ 1: H[9]
+ 2: H[4]
+ 3: H[5]
+ 4: H[8]
+ 5: H[9]
+ 6: CZ[4, 5]
+ 7: H[5]
+ 8: CZ[8, 9]
+ 9: H[9]
+10: H[4]
+11: H[5]
+12: H[8]
+13: CZ[5, 9]
+14: H[4]
+15: H[5]
+16: H[8]
+17: H[9]
+18: H[4]
+19: H[5]
+20: CZ[4, 8]
+21: H[9]
+22: H[4]
+23: H[8]
+24: H[9]
+25: H[8]
+==== cluster[ 1]: depth=26 width=4
+26: H[10]
+27: H[11]
+28: H[6]
+29: H[7]
+30: CZ[10, 11]
+31: H[11]
+32: H[6]
+33: H[7]
+34: H[10]
+35: H[11]
+36: CZ[6, 7]
+37: H[7]
+38: H[10]
+39: CZ[7, 11]
+40: H[6]
+41: H[7]
+42: H[10]
+43: H[11]
+44: H[6]
+45: H[7]
+46: CZ[6, 10]
+47: H[11]
+48: H[6]
+49: H[7]
+50: H[10]
+51: H[10]
+==== cluster[ 2]: depth=30 width=4
+52: H[0]
+53: H[1]
+54: H[2]
+55: H[3]
+56: H[0]
+57: H[1]
+58: CZ[2, 3]
+59: H[3]
+60: CZ[0, 1]
+61: H[1]
+62: H[2]
+63: H[3]
+64: H[0]
+65: H[1]
+66: H[2]
+67: H[3]
+68: H[0]
+69: H[1]
+70: H[2]
+71: H[3]
+72: H[0]
+73: H[1]
+74: H[2]
+75: H[0]
+76: H[2]
+77: H[0]
+78: CZ[1, 2]
+79: H[2]
+80: H[1]
+81: H[1]
+==== cluster[ 3]: depth=30 width=4
+82: H[12]
+83: H[13]
+84: H[14]
+85: H[15]
+86: CZ[12, 13]
+87: H[13]
+88: H[14]
+89: H[15]
+90: H[12]
+91: H[13]
+92: CZ[14, 15]
+93: H[15]
+94: H[12]
+95: H[13]
+96: H[14]
+97: H[15]
+98: H[12]
+99: H[13]
+100: H[14]
+101: H[15]
+102: H[12]
+103: H[14]
+104: H[15]
+105: CZ[13, 14]
+106: H[14]
+107: H[15]
+108: H[13]
+109: H[14]
+110: H[13]
+111: H[14]
+==== cluster[ 4]: depth=14 width=4
+112: CZ[7, 8]
+113: CZ[3, 4]
+114: H[4]
+115: H[7]
+116: H[8]
+117: H[3]
+118: H[4]
+119: H[7]
+120: H[8]
+121: H[3]
+122: H[3]
+123: CZ[3, 7]
+124: H[3]
+125: H[7]
+==== cluster[ 5]: depth=11 width=3
+126: CZ[11, 12]
+127: H[12]
+128: H[11]
+129: H[12]
+130: H[11]
+131: H[12]
+132: CZ[11, 15]
+133: H[11]
+134: H[11]
+135: H[15]
+136: H[15]
+==== cluster[ 6]: depth=11 width=3
+137: CZ[5, 6]
+138: H[6]
+139: H[5]
+140: H[6]
+141: CZ[2, 6]
+142: H[5]
+143: H[6]
+144: H[2]
+145: H[5]
+146: H[6]
+147: H[2]
+==== cluster[ 7]: depth= 7 width=3
+148: CZ[9, 10]
+149: H[10]
+150: H[9]
+151: H[10]
+152: CZ[10, 14]
+153: H[10]
+154: H[14]
+==== cluster[ 8]: depth= 8 width=4
+155: CZ[0, 4]
+156: CZ[8, 12]
+157: H[4]
+158: H[12]
+159: H[0]
+160: H[8]
+161: H[4]
+162: H[0]
+==== cluster[ 9]: depth= 8 width=4
+163: CZ[9, 13]
+164: CZ[1, 5]
+165: H[13]
+166: H[1]
+167: H[9]
+168: H[5]
+169: H[13]
+170: H[9]
diff --git a/src/Simulation/Native/argmaxnrm2_test b/src/Simulation/Native/argmaxnrm2_test
new file mode 100644
index 00000000000..caad0d10c76
Binary files /dev/null and b/src/Simulation/Native/argmaxnrm2_test differ
diff --git a/src/Simulation/Native/bititerator_test b/src/Simulation/Native/bititerator_test
new file mode 100644
index 00000000000..a506e00ca46
Binary files /dev/null and b/src/Simulation/Native/bititerator_test differ
diff --git a/src/Simulation/Native/bitops_test b/src/Simulation/Native/bitops_test
new file mode 100644
index 00000000000..7b1a9213aa9
Binary files /dev/null and b/src/Simulation/Native/bitops_test differ
diff --git a/src/Simulation/Native/capi_test b/src/Simulation/Native/capi_test
new file mode 100644
index 00000000000..ab5ec3c9ce2
Binary files /dev/null and b/src/Simulation/Native/capi_test differ
diff --git a/src/Simulation/Native/codegen/codegen_fma.py b/src/Simulation/Native/codegen/codegen_fma.py
new file mode 100644
index 00000000000..6e523aa1498
--- /dev/null
+++ b/src/Simulation/Native/codegen/codegen_fma.py
@@ -0,0 +1,434 @@
+#!/usr/bin/env python3
+# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+# Code generator for n-qubit gate
+
+import sys
+
+
+def avx_type(complex_avx_len):
+  """Return the C++ type name written into the generated kernel source.
+
+  Maps complex_avx_len 2 -> "__m256d", 4 -> "__m512d" and
+  1 -> "std::complex<double>" (the non-vectorised case).
+
+  Raises:
+    Exception: for any other value of complex_avx_len.
+  """
+  if complex_avx_len == 2:
+    return "__m256d"
+  elif complex_avx_len == 4:
+    return "__m512d"
+  elif complex_avx_len == 1:
+    return "std::complex<double>"
+  else:
+    raise Exception("Unknown avx type.")
+
+
+def avx_prefix(complex_avx_len):
+  """Return the intrinsic-name prefix used when emitting vector calls.
+
+  Maps complex_avx_len 2 -> "_mm256" and 4 -> "_mm512". Unlike avx_type,
+  the value 1 is not accepted here; callers in this file only ask for a
+  prefix when avx_len > 1.
+
+  Raises:
+    Exception: for any value other than 2 or 4.
+  """
+  if complex_avx_len == 2:
+    return "_mm256"
+  elif complex_avx_len == 4:
+    return "_mm512"
+  else:
+    raise Exception("Unknown avx type.")
+
+
+def generate_kernel_core(N, n, kernelarray, blocks, only_one_matrix, unroll_loops, avx_len):
+  """Append the C++ source text of the kernel_core template to kernelarray.
+
+  The emitted function has the signature
+  kernel_core(V& psi, std::size_t I, std::size_t d0, ..., M const& m[, M const& mt]).
+
+  Args:
+    N: number of psi entries touched per call; generate_kernel passes 1 << n.
+    n: number of d<i> offset parameters emitted in the signature.
+    kernelarray: list of strings that the generated text is appended to.
+    blocks: number of chunks the N inputs are processed in; each chunk loads
+      N/blocks values into v[] and accumulates into tmp[].
+    only_one_matrix: when False, an extra "mt" parameter is emitted and passed
+      to every fma(...) call.
+    unroll_loops: when True every tmp[r] update is written out explicitly;
+      otherwise a C++ for loop over i is emitted.
+    avx_len: selects the vector type via avx_type (1, 2 or 4).
+
+  Returns:
+    None; the result is the mutation of kernelarray.
+  """
+  # NOTE(review): indent is assigned but never read in this function.
+  indent = 1
+  
+  # Function header: copyright line, template line and the fixed leading parameters.
+  kernelarray.append("// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger\n\ntemplate <class V, class M>\ninline void kernel_core(V& psi, std::size_t I")
+  for i in range(n):
+    kernelarray.append(", std::size_t d" + str(i))
+  
+  kernelarray.append(", M const& m")
+  if not only_one_matrix:
+    kernelarray.append(", M const& mt")
+  kernelarray.append(")\n{\n")
+  
+  # indices[num] is the C++ index expression for entry num:
+  # "I" plus " + d<b>" for every bit b that is set in num.
+  indices = [""]*N
+  for num in range(N):
+    tmp = "I"
+    for b in range(n):
+      if (num>>b) & 1:
+        tmp = tmp + " + d"+str(b)
+    indices[num] = tmp
+
+  # Declaration of v[], sized for one block (N/blocks entries).
+  add = ["\t" + avx_type(avx_len) + " v[" + str(int(N/blocks)) + "];\n"]
+  for b in range(blocks):
+    # The load helper is named load1x4 when avx_len == 4, load1 otherwise.
+    if avx_len == 4:
+      x4 = "x4"
+    else:
+      x4 = ""
+    # Load this block's entries of psi into v[0 .. N/blocks-1].
+    for num in range(int(N/blocks)*b, int(N/blocks)*(b+1)):
+      add.append("\n\tv[" + str(int(num % (N/blocks))) + "] = ")
+      if avx_len > 1:
+        add.append("load1" + x4 + "(&")
+      add.append("psi[" + indices[num] + "]")
+      if avx_len > 1:
+        add.append(")")
+      add.append(";")
+    add.append("\n")
+    # The accumulator array tmp[] is declared (zero-initialised) once, in the first block.
+    if b == 0:
+      add.append("\n\t" + avx_type(avx_len) + " tmp[" + str(int(N/avx_len)) + "] = {")
+      for i in range(int(N/avx_len)):
+        if avx_len > 1:
+          add.append(avx_prefix(avx_len) + "_setzero_pd(), ")
+        else:
+          add.append("0., ")
+      # Strip the trailing ", " of the last initialiser and close the brace list.
+      add[-1] = add[-1][:-2] + "};\n"
+    
+    if unroll_loops:
+      # inline_FMAs is hard-coded to False, so every branch below that requires
+      # it to be True is currently never taken.
+      inline_FMAs = False
+      # Rows of tmp[] are emitted in groups ("miniblocks"): N/avx_len/4 groups,
+      # but at least one, each covering N/avx_len/miniblocks rows.
+      miniblocks = N/avx_len/4
+      miniblocks = max(miniblocks, 1)
+      for mb in range(int(miniblocks)):
+        for rb in range(int(N/avx_len/miniblocks)):
+          r = int(mb*N/avx_len/miniblocks) + rb
+          add.append("\n\ttmp[" + str(r) + "] = ")
+        
+          # Nested calls: one "fma(v[i], m[...], [mt[...],] " opener per input,
+          # innermost argument tmp[r], then N/blocks closing parentheses.
+          for i in range(int(N/blocks)):
+            if not inline_FMAs or avx_len == 1:
+              add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ")
+              if not only_one_matrix:
+                add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)+i+r*int(N/blocks)) +"], ")
+            else:
+              add.append(avx_prefix(avx_len) + "_fmadd_pd(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ")
+          add.append("tmp[" + str(r) + "]")
+          add.append(")"*int(N/blocks)+";")
+        
+        # Second pass using the permuted v[i] and mt; only reachable with inline_FMAs.
+        if inline_FMAs and not only_one_matrix and not avx_len == 1:
+          for rb in range(int(N/avx_len/miniblocks)):
+            r = int(mb*N/avx_len/miniblocks) + rb
+            add.append("\n\ttmp[" + str(r) + "] = ")
+            for i in range(int(N/blocks)):
+              add.append(avx_prefix(avx_len) + "_fmadd_pd(" + avx_prefix(avx_len) + "_permute_pd(v[" + str(i) + "], 5), mt[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ")
+            add.append("tmp[" + str(r) + "]")
+            add.append(")"*int(N/blocks)+";")
+        
+        if inline_FMAs and only_one_matrix and avx_len > 1:
+          raise Exception("Not implemented yet!")
+        
+        # Write-back is emitted only after the last block has been accumulated.
+        for rb in range(int(N/avx_len/miniblocks)):
+          r = int(mb*N/avx_len/miniblocks) + rb
+          if b == blocks-1:
+            add.append("\n\t")
+            if avx_len > 1:
+              add.append("store(")
+            # Destination entries are listed in descending index order within
+            # each group of avx_len.
+            for i in range(avx_len):
+              if avx_len > 1:
+                add.append("(double*)&")
+              add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ")
+            # Scalar case: rewrite "psi[...], " to "psi[...] = " and, below,
+            # "tmp[r]);" to "tmp[r];" so that a plain assignment is emitted.
+            if avx_len == 1:
+              add[-1] = add[-1][:-2] + " = "
+            add.append("tmp[" + str(r) + "]);")
+            if avx_len == 1:
+              add[-1] = add[-1][:-2] + ";"
+    else:
+      # Rolled variant: emit a C++ for loop over the rows of tmp[]; the matrix
+      # index is written as "<block offset> + i * <N/blocks> + <column>".
+      add.append("\tfor (unsigned i = 0; i < " + str(int(N/avx_len)) + "; ++i){\n\t\ttmp[i] = ")
+      for i in range(int(N/blocks)):
+        add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * "+str(int(N/blocks)) + " + " + str(i) +"], ")
+        if not only_one_matrix:
+          add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * " + str(int(N/blocks)) + " + " + str(i) +"], ")
+      add.append("tmp[i]")
+      add.append(")"*int(N/blocks)+";")
+      add.append("\n\t}\n")
+      # The write-back is still emitted one statement per row, after the last block.
+      if b == blocks-1:
+        for r in range(int(N/avx_len)):
+          add.append("\n\t")
+          if avx_len > 1:
+            add.append("store(")
+          for i in range(avx_len):
+            if avx_len > 1:
+              add.append("(double*)&")
+            add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ")
+          # Same scalar rewrite as in the unrolled branch.
+          if avx_len == 1:
+            add[-1] = add[-1][:-2] + " = "
+          add.append("tmp[" + str(r) + "]);")
+          if avx_len == 1:
+            add[-1] = add[-1][:-2] + ";"
+    
+    # Flush this block's text into kernelarray and start a fresh buffer.
+    add.append("\n")
+    kernelarray.append("".join(add))
+    add=[""]
+  kernelarray.append("".join(add))
+  # Close the body of the emitted kernel_core function.
+  kernelarray.append("\n}\n\n")
+
+def generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx_len):
+  kernel = ""
+  # Returns, as one string, the C++ source for an n-qubit gate: kernel_core (from generate_kernel_core) followed by the kernel() template.
+  N = 1<<n
+  idx = list(range(0,n))
+  # N = 2**n is the gate-matrix dimension; avx_len is the vector length used to size and index mm/mmt.
+  kernelarray = []
+  generate_kernel_core(N,n,kernelarray,blocks,only_one_matrix,unroll_loops, avx_len)
+  kernel = "".join([kernel, "".join(kernelarray)])
+  # Second part: the kernel() template. The emitted code computes d_i = 1 << id_i, calls
+  # permute_qubits_and_matrix, copies the matrix into the flat array mm (and mmt unless only_one_matrix),
+  # then contains nested loops (ctrlmask == 0 / != 0) for non-MSVC compilers and a flat masked loop for _MSC_VER.
+  kernelarray = []
+  kernel = kernel + "// bit indices id[.] are given from high to low (e.g. control first for CNOT)\ntemplate <class V, class M>\n"
+  kernel = kernel + "void kernel(V& psi"
+  for i in range(n-1,-1,-1):
+    kernel = kernel + ", unsigned id"+str(i)
+  kernel = kernel + ", M const& matrix, std::size_t ctrlmask)\n{\n     std::size_t n = psi.size();\n"
+
+  for i in idx:
+    kernel = kernel + "\tstd::size_t d"+str(i)+" = 1ULL << id"+str(i)+";\n"
+  
+  kernel += ("\tauto m = matrix;\n"
+            "\tstd::size_t dsorted[] = {")
+  add =  ["d0"]
+  for i in range(1,n):
+    add.append(", d" + str(i))
+  add.append("};\n")
+  add.append("\tpermute_qubits_and_matrix(dsorted, " + str(n) + ", m);\n")
+  kernel += "".join(add)
+  # The "if False" branch is a disabled variant that would emit mm as a fully unrolled initializer list.
+  if False:
+    add = ["\n\t" + avx_type(avx_len) + " mm[] = {"]
+    for b in range(blocks):
+      for r in range(int(N/avx_len)):
+        for c in range(int(N/blocks)):
+          add.append("loada")
+          if only_one_matrix:
+            add[-1] = add[-1]+"b"
+          add.append("(")
+          for i in range(avx_len):
+            add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ")
+          add[-1] = add[-1][:-2] + "), "
+    add[-1] = add[-1][:-2] + "};\n"
+  else:
+    add = ["\n\t" + avx_type(avx_len) + " mm[" + str(N*int(N/avx_len)) + "];"]
+    add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){"
+               "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){"
+               "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){"
+               "\n\t\t\t\tmm[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]"
+               " = ")
+    if avx_len > 1:
+      add.append("loada")
+      if only_one_matrix:
+        add[-1] = add[-1]+"b"
+      add.append("(")
+      for i in range(avx_len):
+        add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ")
+      add[-1] = add[-1][:-2] + ");"
+    else:
+      add.append("m[r][c+b*"+str(int(N/blocks))+"];")
+    add.append("\n\t\t\t}\n\t\t}\n\t}\n")
+  kernelarray.append("".join(add))
+  # Same layout for mmt (filled with loadbm); its text is discarded below when only_one_matrix is set.
+  if False:
+    add = ["\n\t" + avx_type(avx_len) + " mmt[] = {"]
+    for b in range(blocks):
+      for r in range(int(N/avx_len)):
+        for c in range(int(N/blocks)):
+          add.append("loadbm")
+          add.append("(")
+          for i in range(avx_len):
+            add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ")
+          add[-1] = add[-1][:-2] + "), "
+    add[-1] = add[-1][:-2] + "};\n"
+  else:
+    add = ["\n\t" + avx_type(avx_len) + " mmt[" + str(N*int(N/avx_len)) + "];"]
+    add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){"
+               "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){"
+               "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){"
+               "\n\t\t\t\tmmt[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]"
+               " = loadbm(")
+    for i in range(avx_len):
+      add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ")
+    add[-1] = add[-1][:-2] + ");\n\t\t\t}\n\t\t}\n\t}\n"
+  
+  if only_one_matrix:  # single-matrix kernels do not get an mmt array
+    add = []
+  
+  add.append("\n\n")
+  kernelarray.append("".join(add))
+  # Non-MSVC path: n+1 nested loops (i0 .. i<n>) stepping through dsorted; the collapse depth is the LOOP_COLLAPSE<n> macro.
+  nc = len(idx)-1
+  add = []
+  indent = 1
+  kernelarray.append("#ifndef _MSC_VER\n")
+  kernelarray.append("\t"*indent + "if (ctrlmask == 0){\n")
+  indent += 1
+  kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static) proc_bind(spread)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n")
+  indent = indent + 1
+  for i in range(1,nc+1):
+    kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n")
+    indent = indent + 1
+
+  kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n")
+  indent = indent + 1
+
+  # inner-most loop: call kernel core
+
+
+  kernelarray.append("\t"*indent + "kernel_core(psi, i0")
+  add = []
+  for i in range(n):
+    add.append(" + i"+str(i+1))
+  kernelarray.append("".join(add))
+  for i in range(n):
+    kernelarray.append(", dsorted[" + str(n-1-i) + "]")
+
+  if only_one_matrix:
+    kernelarray.append(", mm);\n")
+  else:
+    kernelarray.append(", mm, mmt);\n")
+
+  #end for(s) and if
+  add = [""]*indent
+  for i in range(indent-1,0,-1):  # closing braces from the deepest level out to indentation level 1
+    add[indent-1-i] = "\t"*i+"}\n"
+  kernelarray.append("".join(add))
+
+  # if controlmask != 0
+  indent = 1
+  kernelarray.append("\t"*indent + "else{\n")
+  indent += 1
+  kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n")
+  indent = indent + 1
+  for i in range(1,nc+1):
+    kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n")
+    indent = indent + 1
+
+  kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n")
+  indent = indent + 1
+
+  # inner-most loop: call kernel core
+
+  kernelarray.append("\t"*indent + "if (((i0")
+  add = []
+  for i in range(n):
+    add.append(" + i"+str(i+1))
+  kernelarray.append("".join(add))
+  kernelarray.append(")&ctrlmask) == ctrlmask)\n")
+  kernelarray.append("\t"*(indent+1) + "kernel_core(psi, i0")
+  add = []
+  for i in range(n):
+    add.append(" + i"+str(i+1))
+  kernelarray.append("".join(add))
+  for i in range(n):
+    kernelarray.append(", dsorted[" + str(n-1-i) + "]")
+
+  if only_one_matrix:
+    kernelarray.append(", mm);\n")
+  else:
+    kernelarray.append(", mm, mmt);\n")
+
+  #end for(s) and if
+  add = [""]*indent
+  for i in range(indent-1,0,-1):
+    add[indent-1-i] = "\t"*i+"}\n"
+  kernelarray.append("".join(add))
+
+
+################ Start of _MSC_VER code block ##################
+  kernelarray.append("#else\n")
+  kernelarray.append("    std::intptr_t zero = 0;\n")
+  kernelarray.append("    std::intptr_t dmask = dsorted[0]");
+  for i in range(n-1):        kernelarray.append(" + dsorted["+str(i+1)+"]")
+  kernelarray.append(         ";\n")
+  kernelarray.append("\n");
+  kernelarray.append("    if (ctrlmask == 0){\n")
+  kernelarray.append("        #pragma omp parallel for schedule(static)\n")
+  kernelarray.append("        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)\n")
+  kernelarray.append("            if ((i & dmask) == zero)\n")
+  kernelarray.append("                kernel_core(psi, i")
+  for i in range(n):                    kernelarray.append(", dsorted[" + str(n-1-i) + "]")
+  if only_one_matrix:                   kernelarray.append(", mm);\n")
+  else:                                 kernelarray.append(", mm, mmt);\n")
+  # if controlmask != 0
+  kernelarray.append("     } else {\n")
+  kernelarray.append("        #pragma omp parallel for schedule(static)\n")
+  kernelarray.append("        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)\n")
+  kernelarray.append("            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)\n")
+  kernelarray.append("                kernel_core(psi, i")
+  for i in range(n):                    kernelarray.append(", dsorted[" + str(n-1-i) + "]")
+  if only_one_matrix:                   kernelarray.append(", mm);\n")
+  else:                                 kernelarray.append(", mm, mmt);\n")
+  kernelarray.append("     }\n")
+  kernelarray.append("#endif\n")
+
+  kernelarray.append("}\n")
+  kernel = "".join([kernel,"".join(kernelarray)])
+  return kernel
+
+def generate_includes(N):
+  # Preamble of the stand-alone test program: the #include lines, then LOOP_COLLAPSE<N> defined as N+1.
+  headers = ["<cassert>", "<iostream>", "<vector>", "<complex>", "<cstdlib>", "<omp.h>",
+             '"alignedallocator.hpp"', '"timing.hpp"', '"cintrin.hpp"', "<algorithm>", "<functional>"]
+  lines = ["#include " + h for h in headers] + ["", '#include "util/par_for.hpp"', "using namespace std;"]
+  return "\n".join(lines) + "\n#define LOOP_COLLAPSE" + str(N) + " " + str(N+1) + "\n"
+
+def generate_main(n):
+  """Return C++ source of a benchmark main() that times the n-qubit kernel for 1..24 OpenMP threads.
+
+  The generated program expects argv[1] = log2 of the state-vector length followed by n qubit
+  indices. kernel() is called once per thread count from serial code, because the generated
+  kernel() opens its own parallel region, and it receives exactly psi, the n indices, m and 0.
+  """
+  N = str(1 << n)
+  ids = ["i" + str(i) for i in range(n)]
+  text = "using rowtype = vector<complex<double>,aligned_allocator<complex<double>,64>>;\nusing matrixtype = vector<rowtype>;\n\nint main(int argc, char *argv[]){"
+  text += "\n\tassert(argc > "+str(1+n)+");"
+  text += "\n\tsize_t N = 1ULL << atoi(argv[1]);"
+  for i in range(n):
+    text += "\n\tunsigned " + ids[i] + " = atoi(argv[" + str(i+2) + "]);"
+  text += "\n\tmatrixtype m("+N+", rowtype("+N+"));"
+  text += "\n\tfor (unsigned i = 0; i < "+N+"; ++i)\n\t\tfor (unsigned j = 0; j < "+N+"; ++j)\n\t\t\tm[i][j] = drand48();\n"
+  text += "\n\tTimer t;\n\tfor (unsigned threads = 1; threads <= 24; ++threads){"
+  text += "\n\t\tomp_set_num_threads(threads);"
+  text += "\n\t\trowtype psi(N);\n\t\t#pragma omp parallel for schedule(static)\n\t\tfor (size_t i = 0; i < psi.size(); ++i)\n\t\t\tpsi[i] = drand48();\n"
+  # kernel() has no size parameter: its arguments are psi, one index per qubit, the matrix and the control mask.
+  text += "\n\t\tt.start();\n\t\tkernel(psi" + "".join(", " + q for q in ids) + ", m, 0);"
+  text += "\n\t\tcout << \"threads: \" << threads << \", time:\" << t.stop()*1.e-6 << \"\\n\";"
+  text += "\n\t}\n\n}" # end for, end main
+  return text
+
+
+#####################################################
+# MAIN                                              #
+#####################################################
+
+if len(sys.argv) < 2:
+  print("Generates the code for an n-qubit gate.\nUsage:\n./codegen_fma.py [n_qubits] {n_blocks} {only one matrix?} {unroll loops?} {none|avx2|avx512}\n\n")
+  sys.exit()
+
+n = int(sys.argv[1]) # number of qubits
+
+try: # number of blocks
+  blocks = int(sys.argv[2])
+except Exception:
+  blocks = 1
+if blocks < 1: # zero or negative counts would otherwise end in a ZeroDivisionError below
+  raise RuntimeError("Number of blocks must be >= 1: {}".format(blocks))
+
+try: # 0/1 flag; missing or non-numeric means False
+  only_one_matrix = int(sys.argv[3])
+except Exception:
+  only_one_matrix = False
+
+try: # 0/1 flag forwarded to the kernel-core generator; missing or non-numeric means False
+  unroll_loops = int(sys.argv[4])
+except Exception:
+  unroll_loops = False
+
+try: # instruction set name -> vector length handed to the generators (defaults to 2)
+  avx = str(sys.argv[5])
+  if avx == "avx512":
+    avx = 4
+  elif avx == "avx2" or avx == "avx":
+    avx = 2
+  elif avx == "none":
+    avx = 1
+    only_one_matrix = True
+  else:
+    raise RuntimeError("Unknown avx type: {}".format(avx))
+except IndexError:
+  avx = 2
+
+while blocks > (1 << n): # at most 2**n blocks for an n-qubit gate
+  blocks //= 2
+while avx > (1 << n): # the vector length may not exceed the matrix dimension either
+  avx //= 2
+
+kernel = generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx) # generate code for n-qubit gate
+
+# if user wants a main (for testing) generate as well:
+if "gen_main" in sys.argv: # wrap once, however often the flag is repeated
+  kernel = generate_includes(n) + kernel + generate_main(n)
+
+print(kernel)
diff --git a/src/Simulation/Native/codegen/codegen_test.cpp b/src/Simulation/Native/codegen/codegen_test.cpp
new file mode 100644
index 00000000000..1ff778ce96d
--- /dev/null
+++ b/src/Simulation/Native/codegen/codegen_test.cpp
@@ -0,0 +1,127 @@
+#include <cassert>
+#include <iostream>
+#include <vector>
+#include <complex>
+#include <cstdlib>
+#include <omp.h>
+#include "alignedalloc.hpp"
+//#include "timing.hpp"
+#include "cintrin.hpp"
+#include <algorithm>
+#include <functional>
+
+#include "util/par_for.hpp"
+using namespace std;
+#define LOOP_COLLAPSE1 2
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+// Updates the amplitude pair psi[I], psi[I + d0] in place from the four matrix entries m[0..3].
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m)
+{
+	const std::size_t lo = I;
+	const std::size_t hi = I + d0;
+	// Both inputs are read before either output is written.
+	std::complex<double> a = psi[lo];
+	std::complex<double> b = psi[hi];
+	std::complex<double> zero[2] = {0., 0.};
+	// Output k is fma(a, m[2k], fma(b, m[2k+1], 0)).
+	std::complex<double> out0 = fma(a, m[0], fma(b, m[1], zero[0]));
+	std::complex<double> out1 = fma(a, m[2], fma(b, m[3], zero[1]));
+	psi[lo] = out0;
+	psi[hi] = out1;
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+// Runs kernel_core for the single qubit id0 on every index of psi whose id0 bit is clear and
+// whose ctrlmask bits are all set (ctrlmask == 0: no control condition).
+template <class V, class M>
+void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size();
+	std::size_t d0 = 1ULL << id0;
+	auto m = matrix;
+	std::size_t dsorted[] = {d0};
+	permute_qubits_and_matrix(dsorted, 1, m);
+
+	// Flatten the 2x2 matrix row by row; kernel_core indexes it as m[0..3].
+	std::complex<double> mm[4];
+	for (unsigned b = 0; b < 1; ++b){
+		for (unsigned r = 0; r < 2; ++r){
+			for (unsigned c = 0; c < 2; ++c){
+				mm[b*4+r*2+c] = m[r][c+b*2];
+			}
+		}
+	}
+
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+				kernel_core(psi, i0 + i1, dsorted[0], mm);
+			}
+		}
+	}
+	else{
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+				if (((i0 + i1)&ctrlmask) == ctrlmask)
+					kernel_core(psi, i0 + i1, dsorted[0], mm);
+			}
+		}
+	}
+#else
+	// MSVC path: one flat loop over all indices; kernel_core only runs where the target bit is clear.
+	const std::size_t dmask = dsorted[0];
+
+	// The closures go to async_par_for directly and capture kernel()'s own locals by reference,
+	// so none of them refers to a parameter of an already-returned helper lambda. This assumes
+	// pl::async_par_for returns only after every iteration has run - TODO confirm.
+	// mm (the flattened matrix) is passed on, as in the kernel_core calls of the non-MSVC branch.
+	if (ctrlmask == 0){
+		pl::async_par_for(0, n, [&](unsigned i) {
+			if ((i & dmask) == 0)
+				kernel_core(psi, i, dsorted[0], mm);
+		});
+	} else {
+		pl::async_par_for(0, n, [&](unsigned i) {
+			if ((i & ctrlmask) == ctrlmask && (i & dmask) == 0)
+				kernel_core(psi, i, dsorted[0], mm);
+		});
+	}
+#endif
+}
+using rowtype = vector<complex<double>,AlignedAlloc<complex<double>,64>>;
+using matrixtype = vector<rowtype>;
+
+// Benchmark driver: argv[1] = log2 of the state-vector length, argv[2] = target qubit index.
+// Times one kernel() call with a random 2x2 matrix for 1..24 OpenMP threads.
+int main(int argc, char *argv[]){
+	assert(argc > 2);
+	size_t N = 1ULL << atoi(argv[1]);
+	unsigned i0 = atoi(argv[2]);
+	matrixtype m(2, rowtype(2));
+	for (unsigned i = 0; i < 2; ++i)
+		for (unsigned j = 0; j < 2; ++j)
+			m[i][j] = drand48();
+
+	for (unsigned threads = 1; threads <= 24; ++threads){
+		omp_set_num_threads(threads);
+		rowtype psi(N);
+		#pragma omp parallel for schedule(static)
+		for (size_t i = 0; i < psi.size(); ++i)
+			psi[i] = drand48();
+
+		// kernel() parallelises internally, so it is called once from serial code with the
+		// arguments of its signature (psi, qubit, matrix, ctrlmask). Timing uses omp_get_wtime()
+		// from <omp.h>, which returns seconds; timing.hpp (Timer) is not included in this file.
+		const double start = omp_get_wtime();
+		kernel(psi, i0, m, 0);
+		const double secs = omp_get_wtime() - start;
+		cout << "threads: " << threads << ", time:" << secs << "\n";
+	}
+
+	return 0;
+}
diff --git a/src/Simulation/Native/codegen/generate.ps1 b/src/Simulation/Native/codegen/generate.ps1
new file mode 100644
index 00000000000..1cc506d187b
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.ps1
@@ -0,0 +1,21 @@
+# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel
+# instead of using two matrices (which allows reducing the number of operations
+# by pre-computation)
+$onematrix=(0,0,0,0,0,0,1,1) # g++ best
+
+# unroll[i] determines whether to unroll loops
+$unroll=(1,1,1,1,1,1,0,0) # g++ best
+
+# register length: every variant (none, avx, avx2, avx512) is generated by the loop below
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+$b=(0,2,4,8,16,16,16,32) # gcc & icc best
+
+foreach ($i in 1..7) {
+    # Inside a double-quoted string an array element needs $(...); a bare $b[$i] expands
+    # the whole array followed by a literal "[<i>]".
+    "Generating $i kernel with $($b[$i]) blocks."
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] none > nointrin/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx > avx/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx2 > avx2/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx512 > avx512/kernel$i.hpp
+}
\ No newline at end of file
diff --git a/src/Simulation/Native/codegen/generate.sh b/src/Simulation/Native/codegen/generate.sh
new file mode 100644
index 00000000000..2ec0557d571
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+# Per-kernel settings, indexed by the gate's qubit count (index 0 is not used by the loop).
+# onematrix[i]: use a single gate matrix for the i-qubit gate kernel instead of two matrices
+# (two matrices allow reducing the number of operations by pre-computation).
+onematrix=(0 0 0 0 0 0 1 1) # g++ best
+#onematrix=(0 0 1 0 0 0 1 1) # icc best
+
+# unroll[i]: whether to unroll loops
+unroll=(1 1 1 1 1 1 0 0) # g++ best
+#unroll=(1 1 1 0 0 1 0 0) # icc best
+#unroll=(0 0 0 0 0 0 0 0)
+
+# register length: can be none, avx2, or avx512 (not read below; every variant is generated)
+avx=avx2
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+b=(0 2 4 8 16 16 16 32) # gcc & icc best
+
+for i in {1..7}
+do
+	echo "Generating $i kernel with ${b[$i]} blocks."
+	for isa in none avx avx2 avx512
+	do
+		./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} $isa > ../${isa/none/nointrin}/kernel${i}.hpp
+	done
+done
diff --git a/src/Simulation/Native/cpuid_test b/src/Simulation/Native/cpuid_test
new file mode 100644
index 00000000000..97f5752dcb7
Binary files /dev/null and b/src/Simulation/Native/cpuid_test differ
diff --git a/src/Simulation/Native/diagmatrix_test b/src/Simulation/Native/diagmatrix_test
new file mode 100644
index 00000000000..b070bbb4ab9
Binary files /dev/null and b/src/Simulation/Native/diagmatrix_test differ
diff --git a/src/Simulation/Native/doCopy.ps1 b/src/Simulation/Native/doCopy.ps1
new file mode 100644
index 00000000000..e0fb1b6e1d2
--- /dev/null
+++ b/src/Simulation/Native/doCopy.ps1
@@ -0,0 +1,13 @@
+# Copies the built simulator runtime (DLL and PDB) of configuration $bld into the native runtime folders of local test projects.
+param([string]$bld = "Release", [string]$srcRoot = "C:\depot\Git\qsharp-runtime", [string]$dstRoot = "C:\depot\Git\msr-quarc\wecker\QDK")
+"================== COPYING $bld"
+$dll = "Microsoft.Quantum.Simulator.Runtime.*"  # DLL and PDB
+$srcDir = Join-Path $srcRoot "src\Simulation\Native\build\$bld"
+foreach ($dest in "H2O","Ham","integer-factorization") {
+    foreach ($typ in "Release","Debug") {
+        $dstDir = Join-Path $dstRoot "$dest\bin\$typ\netcoreapp3.0\runtimes\win-x64\native"
+        robocopy /NJH /NJS /NP /NDL $srcDir $dstDir $dll | Out-Null
+        if ($LASTEXITCODE -ge 8) { Write-Warning "robocopy failed (exit code $LASTEXITCODE) copying to $dstDir" }  # robocopy uses 0-7 for success
+    }
+}
+exit 0
diff --git a/src/Simulation/Native/factory_test b/src/Simulation/Native/factory_test
new file mode 100644
index 00000000000..b5937b571f8
Binary files /dev/null and b/src/Simulation/Native/factory_test differ
diff --git a/src/Simulation/Native/local_test b/src/Simulation/Native/local_test
new file mode 100644
index 00000000000..feaf3627ddc
Binary files /dev/null and b/src/Simulation/Native/local_test differ
diff --git a/src/Simulation/Native/openmp_test b/src/Simulation/Native/openmp_test
new file mode 100644
index 00000000000..00e5d9f877f
Binary files /dev/null and b/src/Simulation/Native/openmp_test differ
diff --git a/src/Simulation/Native/parseLog.py b/src/Simulation/Native/parseLog.py
new file mode 100644
index 00000000000..ee7837867e0
--- /dev/null
+++ b/src/Simulation/Native/parseLog.py
@@ -0,0 +1,113 @@
+import re   
+import sys
+import numpy as np
+
+logName = sys.argv[1]  # path of the log file to summarise
+reSched = re.compile(r"^==== sched:\s+(\S+)")  # "==== sched: <name>" header line
+reFN    = re.compile(r"^(\S+)\.")  # greedy: the log name up to the last '.' of its leading non-blank run
+reNQs   = re.compile(r"nQs=(\d+) .*range=(\d+).*prb=(\d+)")  # groups: nQs, range, prb
+reSim   = re.compile(' (Generic|AVX|AVX2|AVX512)$')  # line ending in one of these simulator names
+rePars  = re.compile(r'OMP_NUM_THREADS=(\d+) fusedSpan=(\d) fusedDepth=(\d+) wfnCapacity=(\d+)')
+reInfo  = re.compile(r'sz=([.\d]+) nQs=([.\d]+) nCs=([.\d]+) flsh= *([.\de+-]+).*gts= *([.\de+-]+).*elap= *(\d+).*(.)gps= *([.\de+-]+).*fus= *([.\d]+).*ker= *([.\d]+)')
+found   = reFN.search(logName)
+env     = found.group(1)  # first CSV column of every row
+fp      = open(logName,'r')
+gpss    = []  # gps samples collected since the last dumpGpss() output
+print(f'"env","test","typ","sim","qs","threads","span","sz","gps"')
+sim     = ""  # the values below are placeholders until the matching log line has been seen
+totalQs = -1
+threads = -1
+span    = -1
+sz      = -1
+rng     = 1
+prb     = -1
+sched   = "???"
+
+prbs = [  # test names, indexed by the prb number parsed from the log
+    "ladder"   ,
+    "ladder"   ,
+    "shor_4"   ,
+    "shor_6"   ,
+    "shor_8"   ,
+    "shor_10"  ,
+    "shor_12"  ,
+    "suprem_44",
+    "suprem_55",
+    "suprem_56",
+    "qulacs_5",
+    "qulacs_10",
+    "qulacs_15",
+    "qulacs_20",
+    "qulacs_25"
+]
+def dumpGpss():
+    """Print one CSV row for the gps samples gathered so far, then clear them.
+
+    The reported value is the mean of the samples lying strictly between half and 1.5 times
+    the median; when no sample qualifies, the mean of all samples is reported instead.
+    Does nothing when no samples are pending.
+    """
+    global gpss
+    if not gpss:
+        return
+    gpsMed  = np.median(gpss)
+    kept    = [g for g in gpss if gpsMed/2.0 < g < gpsMed*1.5]
+    # The filter can reject every sample (two far-apart values, or a median of 0).
+    gps     = sum(kept)/len(kept) if kept else float(np.mean(gpss))
+
+    # rng 0 and 2 append "L" and "H" to the test name; any other value appends nothing.
+    suffix  = {0: "L", 2: "H"}.get(rng, "")
+    nam     = f'{env},{prbs[prb]}{suffix}'
+
+    print(f"{nam},{sched},{sim},{totalQs},{threads},{span},{sz},{gps:.1f}")
+
+    gpss = []
+
+# Scan the log line by line. The first pattern that matches (in the order sched, nQs, simulator,
+# parameters, info) decides how the line is handled; the first three flush pending samples.
+for inp in fp:
+    mSched, mNQs, mSim, mPars, mInfo = (
+        rx.search(inp) for rx in (reSched, reNQs, reSim, rePars, reInfo))
+
+    if mSched:
+        dumpGpss()
+        sched       = mSched.group(1)
+
+    elif mNQs:
+        dumpGpss()
+        totalQs     = mNQs.group(1)
+        rng         = int(mNQs.group(2))
+        prb         = int(mNQs.group(3))
+
+    elif mSim:
+        dumpGpss()
+        sim         = mSim.group(1)
+
+    elif mPars:
+        # A parameter line does not flush the pending samples.
+        threads     = mPars.group(1)
+        span        = mPars.group(2)
+        limit       = mPars.group(3)
+        wfnSiz      = mPars.group(4)
+
+    elif mInfo:
+        sz          = mInfo.group(1)
+        nQs         = float(mInfo.group(2))
+        nCs         = float(mInfo.group(3))
+        flushes     = mInfo.group(4)
+        gates       = mInfo.group(5)
+        elap        = mInfo.group(6)
+        # A 'k' captured just before "gps=" scales the value by 1000.
+        mul         = 1000.0 if mInfo.group(7) == 'k' else 1.0
+        gps         = float(mInfo.group(8)) * mul
+        fusions     = mInfo.group(9)
+        kernel      = mInfo.group(10)
+        gpss.append(gps)
+
+# End of input: flush whatever is still pending.
+dumpGpss()
+
+
+
+fp.close()
+
diff --git a/src/Simulation/Native/src/CMakeLists.txt b/src/Simulation/Native/src/CMakeLists.txt
index e53864f85c9..faf1f1c480c 100644
--- a/src/Simulation/Native/src/CMakeLists.txt
+++ b/src/Simulation/Native/src/CMakeLists.txt
@@ -8,8 +8,9 @@ set(AVX2FLAGS "/arch:AVX2" )
 set(AVX512FLAGS "/arch:AVX512" )
 set(FMAFLAGS "")
 else(MSVC)
-SET(AVXFLAGS "-mavx" )
-set(AVX2FLAGS -mfma;-mavx2)
+SET(AVXFLAGS "-mavx")
+set(AVX2FLAGS "-mfma -mavx2")
+set(AVX512FLAGS "-mfma -mavx512f -mavx512cd")
 set(FMAFLAGS )
 endif(MSVC)
 
@@ -19,14 +20,16 @@ configure_file(version.hpp.in ${PROJECT_BINARY_DIR}/src/version.hpp)
 add_subdirectory(util)
 add_subdirectory(simulator)
 
-set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp )
+set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp simulator/simulatoravx512.cpp )
 if(BUILD_SHARED_LIBS)
   add_library(Microsoft.Quantum.Simulator.Runtime SHARED ${SOURCES})
   set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS})
 if (MSVC)
 set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS})
+set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS})
 else(MSVC)
-set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS "-mavx2 -mfma")
+set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS})
+set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS})
 endif(MSVC)
   message (STATUS "Building shared library")
   target_compile_definitions(Microsoft.Quantum.Simulator.Runtime PRIVATE BUILD_DLL=1)
@@ -36,6 +39,7 @@ else(BUILD_SHARED_LIBS)
   add_library(Microsoft.Quantum.Simulator.Runtime STATIC ${SOURCES})
   set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS})
   set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS ${AVX2FLAGS})
+  set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS ${AVX512FLAGS})
 endif(BUILD_SHARED_LIBS)
 
 install(TARGETS Microsoft.Quantum.Simulator.Runtime
diff --git a/src/Simulation/Native/src/Makefile b/src/Simulation/Native/src/Makefile
new file mode 100644
index 00000000000..edaded273e5
--- /dev/null
+++ b/src/Simulation/Native/src/Makefile
@@ -0,0 +1,422 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.16
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target test
+test:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
+	/usr/bin/ctest --force-new-ctest-process $(ARGS)
+.PHONY : test
+
+# Special rule for the target test
+test/fast: test
+
+.PHONY : test/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/CMakeFiles/progress.marks
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule
+.PHONY : src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule
+
+# Convenience name for target.
+Microsoft.Quantum.Simulator.Runtime: src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule
+
+.PHONY : Microsoft.Quantum.Simulator.Runtime
+
+# fast build rule for target.
+Microsoft.Quantum.Simulator.Runtime/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build
+.PHONY : Microsoft.Quantum.Simulator.Runtime/fast
+
+simulator/capi.o: simulator/capi.cpp.o
+# Alias: simulator/capi.o has no recipe and only depends on simulator/capi.cpp.o.
+.PHONY : simulator/capi.o
+
+# target to build an object file
+simulator/capi.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.o
+.PHONY : simulator/capi.cpp.o
+
+simulator/capi.i: simulator/capi.cpp.i
+# Alias: simulator/capi.i has no recipe and only depends on simulator/capi.cpp.i.
+.PHONY : simulator/capi.i
+
+# target to preprocess a source file
+simulator/capi.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.i
+.PHONY : simulator/capi.cpp.i
+
+simulator/capi.s: simulator/capi.cpp.s
+# Alias: simulator/capi.s has no recipe and only depends on simulator/capi.cpp.s.
+.PHONY : simulator/capi.s
+
+# target to generate assembly for a file
+simulator/capi.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.s
+.PHONY : simulator/capi.cpp.s
+
+simulator/factory.o: simulator/factory.cpp.o
+# Alias: simulator/factory.o has no recipe and only depends on simulator/factory.cpp.o.
+.PHONY : simulator/factory.o
+
+# target to build an object file
+simulator/factory.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.o
+.PHONY : simulator/factory.cpp.o
+
+simulator/factory.i: simulator/factory.cpp.i
+# Alias: simulator/factory.i has no recipe and only depends on simulator/factory.cpp.i.
+.PHONY : simulator/factory.i
+
+# target to preprocess a source file
+simulator/factory.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.i
+.PHONY : simulator/factory.cpp.i
+
+simulator/factory.s: simulator/factory.cpp.s
+# Alias: simulator/factory.s has no recipe and only depends on simulator/factory.cpp.s.
+.PHONY : simulator/factory.s
+
+# target to generate assembly for a file
+simulator/factory.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.s
+.PHONY : simulator/factory.cpp.s
+
+simulator/simulator.o: simulator/simulator.cpp.o
+# Alias: simulator/simulator.o has no recipe and only depends on simulator/simulator.cpp.o.
+.PHONY : simulator/simulator.o
+
+# target to build an object file
+simulator/simulator.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.o
+.PHONY : simulator/simulator.cpp.o
+
+simulator/simulator.i: simulator/simulator.cpp.i
+# Alias: simulator/simulator.i has no recipe and only depends on simulator/simulator.cpp.i.
+.PHONY : simulator/simulator.i
+
+# target to preprocess a source file
+simulator/simulator.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.i
+.PHONY : simulator/simulator.cpp.i
+
+simulator/simulator.s: simulator/simulator.cpp.s
+# Alias: simulator/simulator.s has no recipe and only depends on simulator/simulator.cpp.s.
+.PHONY : simulator/simulator.s
+
+# target to generate assembly for a file
+simulator/simulator.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.s
+.PHONY : simulator/simulator.cpp.s
+
+simulator/simulatoravx.o: simulator/simulatoravx.cpp.o
+# Alias: simulator/simulatoravx.o has no recipe and only depends on simulator/simulatoravx.cpp.o.
+.PHONY : simulator/simulatoravx.o
+
+# target to build an object file
+simulator/simulatoravx.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.o
+.PHONY : simulator/simulatoravx.cpp.o
+
+simulator/simulatoravx.i: simulator/simulatoravx.cpp.i
+# Alias: simulator/simulatoravx.i has no recipe and only depends on simulator/simulatoravx.cpp.i.
+.PHONY : simulator/simulatoravx.i
+
+# target to preprocess a source file
+simulator/simulatoravx.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.i
+.PHONY : simulator/simulatoravx.cpp.i
+
+simulator/simulatoravx.s: simulator/simulatoravx.cpp.s
+# Alias: simulator/simulatoravx.s has no recipe and only depends on simulator/simulatoravx.cpp.s.
+.PHONY : simulator/simulatoravx.s
+
+# target to generate assembly for a file
+simulator/simulatoravx.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.s
+.PHONY : simulator/simulatoravx.cpp.s
+
+simulator/simulatoravx2.o: simulator/simulatoravx2.cpp.o
+# Alias: simulator/simulatoravx2.o has no recipe and only depends on simulator/simulatoravx2.cpp.o.
+.PHONY : simulator/simulatoravx2.o
+
+# target to build an object file
+simulator/simulatoravx2.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.o
+.PHONY : simulator/simulatoravx2.cpp.o
+
+simulator/simulatoravx2.i: simulator/simulatoravx2.cpp.i
+# Alias: simulator/simulatoravx2.i has no recipe and only depends on simulator/simulatoravx2.cpp.i.
+.PHONY : simulator/simulatoravx2.i
+
+# target to preprocess a source file
+simulator/simulatoravx2.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.i
+.PHONY : simulator/simulatoravx2.cpp.i
+
+simulator/simulatoravx2.s: simulator/simulatoravx2.cpp.s
+# Alias: simulator/simulatoravx2.s has no recipe and only depends on simulator/simulatoravx2.cpp.s.
+.PHONY : simulator/simulatoravx2.s
+
+# target to generate assembly for a file
+simulator/simulatoravx2.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.s
+.PHONY : simulator/simulatoravx2.cpp.s
+
+simulator/simulatoravx512.o: simulator/simulatoravx512.cpp.o
+# Alias: simulator/simulatoravx512.o has no recipe and only depends on simulator/simulatoravx512.cpp.o.
+.PHONY : simulator/simulatoravx512.o
+
+# target to build an object file
+simulator/simulatoravx512.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.o
+.PHONY : simulator/simulatoravx512.cpp.o
+
+simulator/simulatoravx512.i: simulator/simulatoravx512.cpp.i
+# Alias: simulator/simulatoravx512.i has no recipe and only depends on simulator/simulatoravx512.cpp.i.
+.PHONY : simulator/simulatoravx512.i
+
+# target to preprocess a source file
+simulator/simulatoravx512.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.i
+.PHONY : simulator/simulatoravx512.cpp.i
+
+simulator/simulatoravx512.s: simulator/simulatoravx512.cpp.s
+# Alias: simulator/simulatoravx512.s has no recipe and only depends on simulator/simulatoravx512.cpp.s.
+.PHONY : simulator/simulatoravx512.s
+
+# target to generate assembly for a file
+simulator/simulatoravx512.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.s
+.PHONY : simulator/simulatoravx512.cpp.s
+
+util/openmp.o: util/openmp.cpp.o
+# Alias: util/openmp.o has no recipe and only depends on util/openmp.cpp.o.
+.PHONY : util/openmp.o
+
+# target to build an object file
+util/openmp.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.o
+.PHONY : util/openmp.cpp.o
+
+util/openmp.i: util/openmp.cpp.i
+# Alias: util/openmp.i has no recipe and only depends on util/openmp.cpp.i.
+.PHONY : util/openmp.i
+
+# target to preprocess a source file
+util/openmp.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.i
+.PHONY : util/openmp.cpp.i
+
+util/openmp.s: util/openmp.cpp.s
+# Alias: util/openmp.s has no recipe and only depends on util/openmp.cpp.s.
+.PHONY : util/openmp.s
+
+# target to generate assembly for a file
+util/openmp.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.s
+.PHONY : util/openmp.cpp.s
+
+# Help Target: echoes a hard-coded list of some of the targets this Makefile accepts.
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... install/strip"
+	@echo "... install/local"
+	@echo "... install"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... edit_cache"
+	@echo "... test"
+	@echo "... Microsoft.Quantum.Simulator.Runtime"
+	@echo "... simulator/capi.o"
+	@echo "... simulator/capi.i"
+	@echo "... simulator/capi.s"
+	@echo "... simulator/factory.o"
+	@echo "... simulator/factory.i"
+	@echo "... simulator/factory.s"
+	@echo "... simulator/simulator.o"
+	@echo "... simulator/simulator.i"
+	@echo "... simulator/simulator.s"
+	@echo "... simulator/simulatoravx.o"
+	@echo "... simulator/simulatoravx.i"
+	@echo "... simulator/simulatoravx.s"
+	@echo "... simulator/simulatoravx2.o"
+	@echo "... simulator/simulatoravx2.i"
+	@echo "... simulator/simulatoravx2.s"
+	@echo "... simulator/simulatoravx512.o"
+	@echo "... simulator/simulatoravx512.i"
+	@echo "... simulator/simulatoravx512.s"
+	@echo "... util/openmp.o"
+	@echo "... util/openmp.i"
+	@echo "... util/openmp.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated. The recipe runs from a hard-coded absolute build directory.
+cmake_check_build_system:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/Simulation/Native/src/config.hpp b/src/Simulation/Native/src/config.hpp
new file mode 100644
index 00000000000..363baa4d241
--- /dev/null
+++ b/src/Simulation/Native/src/config.hpp
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <complex>
+
+// check if we want to force single precision (not set in this configuration)
+/* #undef USE_SINGLE_PRECISION */
+
+// check if we have AVX intrinsics (not set in this configuration)
+/* #undef HAVE_INTRINSICS */
+
+// check if we have AVX-512 intrinsics (not set in this configuration)
+/* #undef HAVE_AVX512 */
+
+// check if we want to use fused kernels (set in this configuration)
+#define USE_GATE_FUSION
+// shared-library build flag; in this header it only takes part in the MSVC decoration test below
+#define BUILD_SHARED_LIBS
+// NOTE(review): the "/* #undef ... */" lines look like CMake configure_file output - confirm this generated header should be committed.
+// MICROSOFT_QUANTUM_DECL: dllexport when BUILD_DLL is defined, dllimport otherwise; both macros are empty unless _MSC_VER and BUILD_SHARED_LIBS are defined
+#if defined (_MSC_VER) && defined (BUILD_SHARED_LIBS)
+
+#ifdef BUILD_DLL
+#define MICROSOFT_QUANTUM_DECL __declspec(dllexport)
+#else
+#define MICROSOFT_QUANTUM_DECL __declspec(dllimport)
+#endif
+#define MICROSOFT_QUANTUM_DECL_IMPORT __declspec(dllimport)
+#else
+#define MICROSOFT_QUANTUM_DECL
+#define MICROSOFT_QUANTUM_DECL_IMPORT
+#endif
+// SIMULATOR selection: SimulatorAVX512 if HAVE_AVX512, else SimulatorAVX2 if HAVE_FMA, else SimulatorAVX; SimulatorGeneric without HAVE_INTRINSICS
+#ifdef HAVE_INTRINSICS
+#ifdef HAVE_AVX512
+#define SIMULATOR SimulatorAVX512
+#else
+#ifdef HAVE_FMA
+#define SIMULATOR SimulatorAVX2
+#else
+#define SIMULATOR SimulatorAVX
+#endif
+#endif
+#else
+#define SIMULATOR SimulatorGeneric
+#endif
+// NOTE(review): HAVE_FMA is never configured in this header - presumably supplied by the build (compiler flags); confirm.
+
diff --git a/src/Simulation/Native/src/external/avx/kernel1.hpp b/src/Simulation/Native/src/external/avx/kernel1.hpp
index eac0cf47ea4..24799c4524e 100644
--- a/src/Simulation/Native/src/external/avx/kernel1.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel1.hpp
@@ -22,7 +22,7 @@ inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m, M con
 template <class V, class M>
 void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 {
-     std::size_t n = psi.size();
+    std::size_t n = psi.size();
 	std::size_t d0 = 1ULL << id0;
 	auto m = matrix;
 	std::size_t dsorted[] = {d0};
@@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
@@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				if (((i0 + i1)&ctrlmask) == ctrlmask)
diff --git a/src/Simulation/Native/src/external/avx/kernel2.hpp b/src/Simulation/Native/src/external/avx/kernel2.hpp
index 24f6c8ee13d..7d52dc39eec 100644
--- a/src/Simulation/Native/src/external/avx/kernel2.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel2.hpp
@@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
@@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
diff --git a/src/Simulation/Native/src/external/avx/kernel3.hpp b/src/Simulation/Native/src/external/avx/kernel3.hpp
index a63dbc28693..58248d4742e 100644
--- a/src/Simulation/Native/src/external/avx/kernel3.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel3.hpp
@@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx/kernel4.hpp b/src/Simulation/Native/src/external/avx/kernel4.hpp
index 92f573a0255..7ddcd504404 100644
--- a/src/Simulation/Native/src/external/avx/kernel4.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel4.hpp
@@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx/kernel5.hpp b/src/Simulation/Native/src/external/avx/kernel5.hpp
index 0cdba4b89c5..72078dd6fd4 100644
--- a/src/Simulation/Native/src/external/avx/kernel5.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel5.hpp
@@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx/kernel6.hpp b/src/Simulation/Native/src/external/avx/kernel6.hpp
index 343c6e3154c..89a4364b22c 100644
--- a/src/Simulation/Native/src/external/avx/kernel6.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel6.hpp
@@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx/kernel7.hpp b/src/Simulation/Native/src/external/avx/kernel7.hpp
index ecd0f45f3cf..8dfda9eee71 100644
--- a/src/Simulation/Native/src/external/avx/kernel7.hpp
+++ b/src/Simulation/Native/src/external/avx/kernel7.hpp
@@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx2/kernel1.hpp b/src/Simulation/Native/src/external/avx2/kernel1.hpp
index eac0cf47ea4..198676259e4 100644
--- a/src/Simulation/Native/src/external/avx2/kernel1.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel1.hpp
@@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
@@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				if (((i0 + i1)&ctrlmask) == ctrlmask)
diff --git a/src/Simulation/Native/src/external/avx2/kernel2.hpp b/src/Simulation/Native/src/external/avx2/kernel2.hpp
index 24f6c8ee13d..7d52dc39eec 100644
--- a/src/Simulation/Native/src/external/avx2/kernel2.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel2.hpp
@@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
@@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
diff --git a/src/Simulation/Native/src/external/avx2/kernel3.hpp b/src/Simulation/Native/src/external/avx2/kernel3.hpp
index a63dbc28693..58248d4742e 100644
--- a/src/Simulation/Native/src/external/avx2/kernel3.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel3.hpp
@@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx2/kernel4.hpp b/src/Simulation/Native/src/external/avx2/kernel4.hpp
index 92f573a0255..7ddcd504404 100644
--- a/src/Simulation/Native/src/external/avx2/kernel4.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel4.hpp
@@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx2/kernel5.hpp b/src/Simulation/Native/src/external/avx2/kernel5.hpp
index 0cdba4b89c5..72078dd6fd4 100644
--- a/src/Simulation/Native/src/external/avx2/kernel5.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel5.hpp
@@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx2/kernel6.hpp b/src/Simulation/Native/src/external/avx2/kernel6.hpp
index 343c6e3154c..89a4364b22c 100644
--- a/src/Simulation/Native/src/external/avx2/kernel6.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel6.hpp
@@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx2/kernel7.hpp b/src/Simulation/Native/src/external/avx2/kernel7.hpp
index ecd0f45f3cf..8dfda9eee71 100644
--- a/src/Simulation/Native/src/external/avx2/kernel7.hpp
+++ b/src/Simulation/Native/src/external/avx2/kernel7.hpp
@@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/avx512/kernel1.hpp b/src/Simulation/Native/src/external/avx512/kernel1.hpp
new file mode 100644
index 00000000000..19f2c473370
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel1.hpp
@@ -0,0 +1,85 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m, M const& mt)
+{
+	__m256d v[1];
+	// Single-target core: combines psi[I] and psi[I + d0] with m/mt and writes both entries back (load1/fma/store are defined outside this file).
+	v[0] = load1(&psi[I]);
+
+	__m256d tmp[1] = {_mm256_setzero_pd()};
+	// First contribution: entry psi[I] with m[0]/mt[0].
+	tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+	// Second contribution: entry psi[I + d0] with m[1]/mt[1].
+	v[0] = load1(&psi[I + d0]);
+
+	tmp[0] = fma(v[0], m[1], mt[1], tmp[0]);
+	store((double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	// NOTE(review): uses 256-bit __m256d vectors although this file lives under avx512/ - presumably intentional for the one-target case; confirm.
+}
+
+// Applies matrix to target bit id0 of psi; bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size();
+	std::size_t d0 = 1ULL << id0;
+	auto m = matrix;
+	std::size_t dsorted[] = {d0};
+	permute_qubits_and_matrix(dsorted, 1, m);
+	// mm[b] is built by loada() from m[0][b] and m[1][b] (r and c only ever take the value 0).
+	__m256d mm[2];
+	for (unsigned b = 0; b < 2; ++b){
+		for (unsigned r = 0; r < 1; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mm[b*1+r*1+c] = loada(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]);
+			}
+		}
+	}
+	// mmt[b] is built from the same two entries, but through loadbm().
+	__m256d mmt[2];
+	for (unsigned b = 0; b < 2; ++b){
+		for (unsigned r = 0; r < 1; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mmt[b*1+r*1+c] = loadbm(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]);
+			}
+		}
+	}
+	// Run kernel_core on every base index; when ctrlmask != 0 only indices with all ctrlmask bits set are touched.
+	// Non-MSVC: nested i0/i1 loops (collapse count comes from LOOP_COLLAPSE1, defined elsewhere). MSVC: one flat signed-index loop.
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+				kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
+			}
+		}
+	}
+	else{
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+				if (((i0 + i1)&ctrlmask) == ctrlmask)
+					kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0];
+    // Skip indices that have the dmask bit set: kernel_core(i) also reads and writes psi[i + dsorted[0]].
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm, mmt);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm, mmt);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel2.hpp b/src/Simulation/Native/src/external/avx512/kernel2.hpp
new file mode 100644
index 00000000000..9a47f3044fb
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel2.hpp
@@ -0,0 +1,98 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m, M const& mt)
+{
+	__m512d v[1];
+	// Two-target core: accumulates psi[I], psi[I+d0], psi[I+d1], psi[I+d0+d1] against m[0..3]/mt[0..3] (load1x4/fma/store are defined outside this file).
+	v[0] = load1x4(&psi[I]);
+
+	__m512d tmp[1] = {_mm512_setzero_pd()};
+
+	tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+
+	v[0] = load1x4(&psi[I + d0]);
+
+	tmp[0] = fma(v[0], m[1], mt[1], tmp[0]);
+
+	v[0] = load1x4(&psi[I + d1]);
+
+	tmp[0] = fma(v[0], m[2], mt[2], tmp[0]);
+
+	v[0] = load1x4(&psi[I + d0 + d1]);
+	// Last contribution; the single store() below then writes tmp[0] back to all four entries.
+	tmp[0] = fma(v[0], m[3], mt[3], tmp[0]);
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+
+}
+
+// Applies matrix to target bits id1, id0 of psi; bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size();
+	std::size_t d0 = 1ULL << id0;
+	std::size_t d1 = 1ULL << id1;
+	auto m = matrix;
+	std::size_t dsorted[] = {d0, d1};
+	permute_qubits_and_matrix(dsorted, 2, m);
+	// mm[b] is built by loada() from m[0][b], m[1][b], m[2][b] and m[3][b] (r and c only ever take the value 0).
+	__m512d mm[4];
+	for (unsigned b = 0; b < 4; ++b){
+		for (unsigned r = 0; r < 1; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mm[b*1+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+	// mmt[b] is built from the same four entries, but through loadbm().
+	__m512d mmt[4];
+	for (unsigned b = 0; b < 4; ++b){
+		for (unsigned r = 0; r < 1; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mmt[b*1+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+	// Run kernel_core on every base index (its d0 receives dsorted[1], its d1 receives dsorted[0]); ctrlmask != 0 restricts to indices with all ctrlmask bits set.
+	// Non-MSVC: nested loops, presumably relying on dsorted being in descending order after permute_qubits_and_matrix - confirm. MSVC: one flat signed-index loop.
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+					kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt);
+				}
+			}
+		}
+	}
+	else{
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+					if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+						kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt);
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1];
+    // Skip indices with either dmask bit set: kernel_core(i) also reads and writes the entries offset by dsorted[0] and dsorted[1].
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel3.hpp b/src/Simulation/Native/src/external/avx512/kernel3.hpp
new file mode 100644
index 00000000000..a0f27741672
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel3.hpp
@@ -0,0 +1,128 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M> // Core of the 3-index kernel: combines the 8 entries psi[I + (any sum of d0, d1, d2)] with the packed coefficients in m/mt and passes the 8 results to store().
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m, M const& mt)
+{
+	__m512d v[1];
+	// psi: indexable state container; I: base index; d0..d2: index offsets; m, mt: packed coefficient arrays, elements 0..15 of each are read below.
+	v[0] = load1x4(&psi[I]);
+	// NOTE(review): load1x4, fma and store are defined outside this view; presumably load1x4 replicates one psi entry across the register and fma is a multiply-accumulate using m and mt -- confirm.
+	__m512d tmp[2] = {_mm512_setzero_pd(), _mm512_setzero_pd()};
+	// Input entry k (k = 0..7, bit j of k selects offset d_j) is accumulated into tmp[0] with m[2k], mt[2k] and into tmp[1] with m[2k+1], mt[2k+1].
+	tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+	tmp[1] = fma(v[0], m[1], mt[1], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d0]);
+
+	tmp[0] = fma(v[0], m[2], mt[2], tmp[0]);
+	tmp[1] = fma(v[0], m[3], mt[3], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d1]);
+
+	tmp[0] = fma(v[0], m[4], mt[4], tmp[0]);
+	tmp[1] = fma(v[0], m[5], mt[5], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d0 + d1]);
+
+	tmp[0] = fma(v[0], m[6], mt[6], tmp[0]);
+	tmp[1] = fma(v[0], m[7], mt[7], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d2]);
+
+	tmp[0] = fma(v[0], m[8], mt[8], tmp[0]);
+	tmp[1] = fma(v[0], m[9], mt[9], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d0 + d2]);
+
+	tmp[0] = fma(v[0], m[10], mt[10], tmp[0]);
+	tmp[1] = fma(v[0], m[11], mt[11], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d1 + d2]);
+
+	tmp[0] = fma(v[0], m[12], mt[12], tmp[0]);
+	tmp[1] = fma(v[0], m[13], mt[13], tmp[1]);
+
+	v[0] = load1x4(&psi[I + d0 + d1 + d2]);
+
+	tmp[0] = fma(v[0], m[14], mt[14], tmp[0]);
+	tmp[1] = fma(v[0], m[15], mt[15], tmp[1]);
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+	// tmp[0] goes to store() with the four entries without d2, tmp[1] with the four entries that include d2; both calls come after the last load, so no load sees an updated entry.
+}
+
+// Applies matrix (rows/columns 0..7 are read) to psi on bit indices id2, id1, id0, which are given from high to low (e.g. control first for CNOT); if ctrlmask != 0, only indices with every ctrlmask bit set are processed.
+template <class V, class M>
+void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size(); // number of entries in psi
+	std::size_t d0 = 1ULL << id0; // d_k = 2^id_k is the index offset that belongs to bit id_k
+	std::size_t d1 = 1ULL << id1;
+	std::size_t d2 = 1ULL << id2;
+	auto m = matrix; // local copy handed to permute_qubits_and_matrix; the caller's matrix is const here
+	std::size_t dsorted[] = {d0, d1, d2};
+	permute_qubits_and_matrix(dsorted, 3, m); // NOTE(review): defined elsewhere; presumably sorts dsorted descending and reorders m to match -- the loop bounds below only tile the index space if dsorted[0] > dsorted[1] > dsorted[2]; confirm
+	// Pack m for kernel_core: mm[b*2 + r] is built by loada from column b, rows 4r..4r+3 (the c loop runs exactly once).
+	__m512d mm[16];
+	for (unsigned b = 0; b < 8; ++b){
+		for (unsigned r = 0; r < 2; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mm[b*2+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+	// mmt has the same layout as mm but is built with loadbm; NOTE(review): loada/loadbm are defined elsewhere -- how the two packings differ is not visible here.
+	__m512d mmt[16];
+	for (unsigned b = 0; b < 8; ++b){
+		for (unsigned r = 0; r < 2; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mmt[b*2+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+
+	// Non-MSVC path: the loop nest visits base indices i0+i1+i2+i3 (presumably exactly those with the three target bits clear, given descending dsorted); LOOP_COLLAPSE3 is a macro defined elsewhere.
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+						kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+					}
+				}
+			}
+		}
+	}
+	else{ // controlled case: same loop nest, but base indices without every ctrlmask bit set are skipped
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+						if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
+							kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+					}
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2]; // sum of the three offsets; equals their bitwise OR when the bit indices are distinct
+    // MSVC path: a single flat loop with a signed index replaces the collapsed nest; (i & dmask) == zero keeps only indices in which none of the three target bits is set.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel4.hpp b/src/Simulation/Native/src/external/avx512/kernel4.hpp
new file mode 100644
index 00000000000..e956661a996
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel4.hpp
@@ -0,0 +1,207 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M> // Core of the 4-index kernel: combines the 16 entries psi[I + (any sum of d0..d3)] with the packed coefficients in m/mt and passes the 16 results to store().
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m, M const& mt)
+{
+	__m512d v[1];
+	// psi: indexable state container; I: base index; d0..d3: index offsets; m, mt: packed coefficient arrays, elements 0..63 of each are read below.
+	v[0] = load1x4(&psi[I]);
+	// NOTE(review): load1x4, fma and store are defined outside this view; presumably load1x4 replicates one psi entry across the register and fma is a multiply-accumulate using m and mt -- confirm.
+	__m512d tmp[4] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+	// Input entry k (k = 0..15, bit j of k selects offset d_j) is accumulated into tmp[i] (i = 0..3) with m[4k+i], mt[4k+i].
+	tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+	tmp[1] = fma(v[0], m[1], mt[1], tmp[1]);
+	tmp[2] = fma(v[0], m[2], mt[2], tmp[2]);
+	tmp[3] = fma(v[0], m[3], mt[3], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0]);
+
+	tmp[0] = fma(v[0], m[4], mt[4], tmp[0]);
+	tmp[1] = fma(v[0], m[5], mt[5], tmp[1]);
+	tmp[2] = fma(v[0], m[6], mt[6], tmp[2]);
+	tmp[3] = fma(v[0], m[7], mt[7], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d1]);
+
+	tmp[0] = fma(v[0], m[8], mt[8], tmp[0]);
+	tmp[1] = fma(v[0], m[9], mt[9], tmp[1]);
+	tmp[2] = fma(v[0], m[10], mt[10], tmp[2]);
+	tmp[3] = fma(v[0], m[11], mt[11], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d1]);
+
+	tmp[0] = fma(v[0], m[12], mt[12], tmp[0]);
+	tmp[1] = fma(v[0], m[13], mt[13], tmp[1]);
+	tmp[2] = fma(v[0], m[14], mt[14], tmp[2]);
+	tmp[3] = fma(v[0], m[15], mt[15], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d2]);
+
+	tmp[0] = fma(v[0], m[16], mt[16], tmp[0]);
+	tmp[1] = fma(v[0], m[17], mt[17], tmp[1]);
+	tmp[2] = fma(v[0], m[18], mt[18], tmp[2]);
+	tmp[3] = fma(v[0], m[19], mt[19], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d2]);
+
+	tmp[0] = fma(v[0], m[20], mt[20], tmp[0]);
+	tmp[1] = fma(v[0], m[21], mt[21], tmp[1]);
+	tmp[2] = fma(v[0], m[22], mt[22], tmp[2]);
+	tmp[3] = fma(v[0], m[23], mt[23], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d1 + d2]);
+
+	tmp[0] = fma(v[0], m[24], mt[24], tmp[0]);
+	tmp[1] = fma(v[0], m[25], mt[25], tmp[1]);
+	tmp[2] = fma(v[0], m[26], mt[26], tmp[2]);
+	tmp[3] = fma(v[0], m[27], mt[27], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d1 + d2]);
+
+	tmp[0] = fma(v[0], m[28], mt[28], tmp[0]);
+	tmp[1] = fma(v[0], m[29], mt[29], tmp[1]);
+	tmp[2] = fma(v[0], m[30], mt[30], tmp[2]);
+	tmp[3] = fma(v[0], m[31], mt[31], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d3]);
+
+	tmp[0] = fma(v[0], m[32], mt[32], tmp[0]);
+	tmp[1] = fma(v[0], m[33], mt[33], tmp[1]);
+	tmp[2] = fma(v[0], m[34], mt[34], tmp[2]);
+	tmp[3] = fma(v[0], m[35], mt[35], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d3]);
+
+	tmp[0] = fma(v[0], m[36], mt[36], tmp[0]);
+	tmp[1] = fma(v[0], m[37], mt[37], tmp[1]);
+	tmp[2] = fma(v[0], m[38], mt[38], tmp[2]);
+	tmp[3] = fma(v[0], m[39], mt[39], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d1 + d3]);
+
+	tmp[0] = fma(v[0], m[40], mt[40], tmp[0]);
+	tmp[1] = fma(v[0], m[41], mt[41], tmp[1]);
+	tmp[2] = fma(v[0], m[42], mt[42], tmp[2]);
+	tmp[3] = fma(v[0], m[43], mt[43], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d1 + d3]);
+
+	tmp[0] = fma(v[0], m[44], mt[44], tmp[0]);
+	tmp[1] = fma(v[0], m[45], mt[45], tmp[1]);
+	tmp[2] = fma(v[0], m[46], mt[46], tmp[2]);
+	tmp[3] = fma(v[0], m[47], mt[47], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[48], mt[48], tmp[0]);
+	tmp[1] = fma(v[0], m[49], mt[49], tmp[1]);
+	tmp[2] = fma(v[0], m[50], mt[50], tmp[2]);
+	tmp[3] = fma(v[0], m[51], mt[51], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[52], mt[52], tmp[0]);
+	tmp[1] = fma(v[0], m[53], mt[53], tmp[1]);
+	tmp[2] = fma(v[0], m[54], mt[54], tmp[2]);
+	tmp[3] = fma(v[0], m[55], mt[55], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d1 + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[56], mt[56], tmp[0]);
+	tmp[1] = fma(v[0], m[57], mt[57], tmp[1]);
+	tmp[2] = fma(v[0], m[58], mt[58], tmp[2]);
+	tmp[3] = fma(v[0], m[59], mt[59], tmp[3]);
+
+	v[0] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[60], mt[60], tmp[0]);
+	tmp[1] = fma(v[0], m[61], mt[61], tmp[1]);
+	tmp[2] = fma(v[0], m[62], mt[62], tmp[2]);
+	tmp[3] = fma(v[0], m[63], mt[63], tmp[3]);
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+	store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+	// tmp[i] goes to store() with the four entries {I, I+d0, I+d1, I+d0+d1} shifted by d2 (bit 0 of i) and d3 (bit 1 of i); all store calls come after the last load, so no load sees an updated entry.
+}
+
+// Applies matrix (rows/columns 0..15 are read) to psi on bit indices id3..id0, which are given from high to low (e.g. control first for CNOT); if ctrlmask != 0, only indices with every ctrlmask bit set are processed.
+template <class V, class M>
+void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size(); // number of entries in psi
+	std::size_t d0 = 1ULL << id0; // d_k = 2^id_k is the index offset that belongs to bit id_k
+	std::size_t d1 = 1ULL << id1;
+	std::size_t d2 = 1ULL << id2;
+	std::size_t d3 = 1ULL << id3;
+	auto m = matrix; // local copy handed to permute_qubits_and_matrix; the caller's matrix is const here
+	std::size_t dsorted[] = {d0, d1, d2, d3};
+	permute_qubits_and_matrix(dsorted, 4, m); // NOTE(review): defined elsewhere; presumably sorts dsorted descending and reorders m to match -- the loop bounds below only tile the index space if dsorted is strictly descending; confirm
+	// Pack m for kernel_core: mm[b*4 + r] is built by loada from column b, rows 4r..4r+3 (the c loop runs exactly once).
+	__m512d mm[64];
+	for (unsigned b = 0; b < 16; ++b){
+		for (unsigned r = 0; r < 4; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mm[b*4+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+	// mmt has the same layout as mm but is built with loadbm; NOTE(review): loada/loadbm are defined elsewhere -- how the two packings differ is not visible here.
+	__m512d mmt[64];
+	for (unsigned b = 0; b < 16; ++b){
+		for (unsigned r = 0; r < 4; ++r){
+			for (unsigned c = 0; c < 1; ++c){
+				mmt[b*4+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+			}
+		}
+	}
+
+	// Non-MSVC path: the loop nest visits base indices i0+...+i4 (presumably exactly those with the four target bits clear, given descending dsorted); LOOP_COLLAPSE4 is a macro defined elsewhere.
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+							kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+						}
+					}
+				}
+			}
+		}
+	}
+	else{ // controlled case: same loop nest, but base indices without every ctrlmask bit set are skipped
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+							if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
+								kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+						}
+					}
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3]; // sum of the four offsets; equals their bitwise OR when the bit indices are distinct
+    // MSVC path: a single flat loop with a signed index replaces the collapsed nest; (i & dmask) == zero keeps only indices in which none of the four target bits is set.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel5.hpp b/src/Simulation/Native/src/external/avx512/kernel5.hpp
new file mode 100644
index 00000000000..ec1cdb918e6
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel5.hpp
@@ -0,0 +1,296 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M> // Core of the 5-index kernel: combines the 32 entries psi[I + (any sum of d0..d4)] with the packed coefficients in m/mt and passes the 32 results to store().
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m, M const& mt)
+{
+	__m512d v[2];
+	// psi: indexable state container; I: base index; d0..d4: index offsets; m, mt: packed coefficient arrays, elements 0..255 of each are read below.
+	v[0] = load1x4(&psi[I]);
+	v[1] = load1x4(&psi[I + d0]);
+	// NOTE(review): load1x4, fma and store are defined outside this view; presumably load1x4 replicates one psi entry across the register and fma is a multiply-accumulate using m and mt -- confirm.
+	__m512d tmp[8] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+	// Entries are read in pairs (v[0] without d0, v[1] with d0); pair p (p = 0..15, bit j of p selects d_(j+1)) is accumulated into tmp[i] with m/mt[16p+2i] for v[0] and m/mt[16p+2i+1] for v[1].
+	tmp[0] = fma(v[0], m[0], mt[0], fma(v[1], m[1], mt[1], tmp[0]));
+	tmp[1] = fma(v[0], m[2], mt[2], fma(v[1], m[3], mt[3], tmp[1]));
+	tmp[2] = fma(v[0], m[4], mt[4], fma(v[1], m[5], mt[5], tmp[2]));
+	tmp[3] = fma(v[0], m[6], mt[6], fma(v[1], m[7], mt[7], tmp[3]));
+	tmp[4] = fma(v[0], m[8], mt[8], fma(v[1], m[9], mt[9], tmp[4]));
+	tmp[5] = fma(v[0], m[10], mt[10], fma(v[1], m[11], mt[11], tmp[5]));
+	tmp[6] = fma(v[0], m[12], mt[12], fma(v[1], m[13], mt[13], tmp[6]));
+	tmp[7] = fma(v[0], m[14], mt[14], fma(v[1], m[15], mt[15], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1]);
+	v[1] = load1x4(&psi[I + d0 + d1]);
+
+	tmp[0] = fma(v[0], m[16], mt[16], fma(v[1], m[17], mt[17], tmp[0]));
+	tmp[1] = fma(v[0], m[18], mt[18], fma(v[1], m[19], mt[19], tmp[1]));
+	tmp[2] = fma(v[0], m[20], mt[20], fma(v[1], m[21], mt[21], tmp[2]));
+	tmp[3] = fma(v[0], m[22], mt[22], fma(v[1], m[23], mt[23], tmp[3]));
+	tmp[4] = fma(v[0], m[24], mt[24], fma(v[1], m[25], mt[25], tmp[4]));
+	tmp[5] = fma(v[0], m[26], mt[26], fma(v[1], m[27], mt[27], tmp[5]));
+	tmp[6] = fma(v[0], m[28], mt[28], fma(v[1], m[29], mt[29], tmp[6]));
+	tmp[7] = fma(v[0], m[30], mt[30], fma(v[1], m[31], mt[31], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d2]);
+	v[1] = load1x4(&psi[I + d0 + d2]);
+
+	tmp[0] = fma(v[0], m[32], mt[32], fma(v[1], m[33], mt[33], tmp[0]));
+	tmp[1] = fma(v[0], m[34], mt[34], fma(v[1], m[35], mt[35], tmp[1]));
+	tmp[2] = fma(v[0], m[36], mt[36], fma(v[1], m[37], mt[37], tmp[2]));
+	tmp[3] = fma(v[0], m[38], mt[38], fma(v[1], m[39], mt[39], tmp[3]));
+	tmp[4] = fma(v[0], m[40], mt[40], fma(v[1], m[41], mt[41], tmp[4]));
+	tmp[5] = fma(v[0], m[42], mt[42], fma(v[1], m[43], mt[43], tmp[5]));
+	tmp[6] = fma(v[0], m[44], mt[44], fma(v[1], m[45], mt[45], tmp[6]));
+	tmp[7] = fma(v[0], m[46], mt[46], fma(v[1], m[47], mt[47], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d2]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d2]);
+
+	tmp[0] = fma(v[0], m[48], mt[48], fma(v[1], m[49], mt[49], tmp[0]));
+	tmp[1] = fma(v[0], m[50], mt[50], fma(v[1], m[51], mt[51], tmp[1]));
+	tmp[2] = fma(v[0], m[52], mt[52], fma(v[1], m[53], mt[53], tmp[2]));
+	tmp[3] = fma(v[0], m[54], mt[54], fma(v[1], m[55], mt[55], tmp[3]));
+	tmp[4] = fma(v[0], m[56], mt[56], fma(v[1], m[57], mt[57], tmp[4]));
+	tmp[5] = fma(v[0], m[58], mt[58], fma(v[1], m[59], mt[59], tmp[5]));
+	tmp[6] = fma(v[0], m[60], mt[60], fma(v[1], m[61], mt[61], tmp[6]));
+	tmp[7] = fma(v[0], m[62], mt[62], fma(v[1], m[63], mt[63], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d3]);
+	v[1] = load1x4(&psi[I + d0 + d3]);
+
+	tmp[0] = fma(v[0], m[64], mt[64], fma(v[1], m[65], mt[65], tmp[0]));
+	tmp[1] = fma(v[0], m[66], mt[66], fma(v[1], m[67], mt[67], tmp[1]));
+	tmp[2] = fma(v[0], m[68], mt[68], fma(v[1], m[69], mt[69], tmp[2]));
+	tmp[3] = fma(v[0], m[70], mt[70], fma(v[1], m[71], mt[71], tmp[3]));
+	tmp[4] = fma(v[0], m[72], mt[72], fma(v[1], m[73], mt[73], tmp[4]));
+	tmp[5] = fma(v[0], m[74], mt[74], fma(v[1], m[75], mt[75], tmp[5]));
+	tmp[6] = fma(v[0], m[76], mt[76], fma(v[1], m[77], mt[77], tmp[6]));
+	tmp[7] = fma(v[0], m[78], mt[78], fma(v[1], m[79], mt[79], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d3]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d3]);
+
+	tmp[0] = fma(v[0], m[80], mt[80], fma(v[1], m[81], mt[81], tmp[0]));
+	tmp[1] = fma(v[0], m[82], mt[82], fma(v[1], m[83], mt[83], tmp[1]));
+	tmp[2] = fma(v[0], m[84], mt[84], fma(v[1], m[85], mt[85], tmp[2]));
+	tmp[3] = fma(v[0], m[86], mt[86], fma(v[1], m[87], mt[87], tmp[3]));
+	tmp[4] = fma(v[0], m[88], mt[88], fma(v[1], m[89], mt[89], tmp[4]));
+	tmp[5] = fma(v[0], m[90], mt[90], fma(v[1], m[91], mt[91], tmp[5]));
+	tmp[6] = fma(v[0], m[92], mt[92], fma(v[1], m[93], mt[93], tmp[6]));
+	tmp[7] = fma(v[0], m[94], mt[94], fma(v[1], m[95], mt[95], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d2 + d3]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[96], mt[96], fma(v[1], m[97], mt[97], tmp[0]));
+	tmp[1] = fma(v[0], m[98], mt[98], fma(v[1], m[99], mt[99], tmp[1]));
+	tmp[2] = fma(v[0], m[100], mt[100], fma(v[1], m[101], mt[101], tmp[2]));
+	tmp[3] = fma(v[0], m[102], mt[102], fma(v[1], m[103], mt[103], tmp[3]));
+	tmp[4] = fma(v[0], m[104], mt[104], fma(v[1], m[105], mt[105], tmp[4]));
+	tmp[5] = fma(v[0], m[106], mt[106], fma(v[1], m[107], mt[107], tmp[5]));
+	tmp[6] = fma(v[0], m[108], mt[108], fma(v[1], m[109], mt[109], tmp[6]));
+	tmp[7] = fma(v[0], m[110], mt[110], fma(v[1], m[111], mt[111], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d2 + d3]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+
+	tmp[0] = fma(v[0], m[112], mt[112], fma(v[1], m[113], mt[113], tmp[0]));
+	tmp[1] = fma(v[0], m[114], mt[114], fma(v[1], m[115], mt[115], tmp[1]));
+	tmp[2] = fma(v[0], m[116], mt[116], fma(v[1], m[117], mt[117], tmp[2]));
+	tmp[3] = fma(v[0], m[118], mt[118], fma(v[1], m[119], mt[119], tmp[3]));
+	tmp[4] = fma(v[0], m[120], mt[120], fma(v[1], m[121], mt[121], tmp[4]));
+	tmp[5] = fma(v[0], m[122], mt[122], fma(v[1], m[123], mt[123], tmp[5]));
+	tmp[6] = fma(v[0], m[124], mt[124], fma(v[1], m[125], mt[125], tmp[6]));
+	tmp[7] = fma(v[0], m[126], mt[126], fma(v[1], m[127], mt[127], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d4]);
+	v[1] = load1x4(&psi[I + d0 + d4]);
+
+	tmp[0] = fma(v[0], m[128], mt[128], fma(v[1], m[129], mt[129], tmp[0]));
+	tmp[1] = fma(v[0], m[130], mt[130], fma(v[1], m[131], mt[131], tmp[1]));
+	tmp[2] = fma(v[0], m[132], mt[132], fma(v[1], m[133], mt[133], tmp[2]));
+	tmp[3] = fma(v[0], m[134], mt[134], fma(v[1], m[135], mt[135], tmp[3]));
+	tmp[4] = fma(v[0], m[136], mt[136], fma(v[1], m[137], mt[137], tmp[4]));
+	tmp[5] = fma(v[0], m[138], mt[138], fma(v[1], m[139], mt[139], tmp[5]));
+	tmp[6] = fma(v[0], m[140], mt[140], fma(v[1], m[141], mt[141], tmp[6]));
+	tmp[7] = fma(v[0], m[142], mt[142], fma(v[1], m[143], mt[143], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d4]);
+
+	tmp[0] = fma(v[0], m[144], mt[144], fma(v[1], m[145], mt[145], tmp[0]));
+	tmp[1] = fma(v[0], m[146], mt[146], fma(v[1], m[147], mt[147], tmp[1]));
+	tmp[2] = fma(v[0], m[148], mt[148], fma(v[1], m[149], mt[149], tmp[2]));
+	tmp[3] = fma(v[0], m[150], mt[150], fma(v[1], m[151], mt[151], tmp[3]));
+	tmp[4] = fma(v[0], m[152], mt[152], fma(v[1], m[153], mt[153], tmp[4]));
+	tmp[5] = fma(v[0], m[154], mt[154], fma(v[1], m[155], mt[155], tmp[5]));
+	tmp[6] = fma(v[0], m[156], mt[156], fma(v[1], m[157], mt[157], tmp[6]));
+	tmp[7] = fma(v[0], m[158], mt[158], fma(v[1], m[159], mt[159], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d2 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+
+	tmp[0] = fma(v[0], m[160], mt[160], fma(v[1], m[161], mt[161], tmp[0]));
+	tmp[1] = fma(v[0], m[162], mt[162], fma(v[1], m[163], mt[163], tmp[1]));
+	tmp[2] = fma(v[0], m[164], mt[164], fma(v[1], m[165], mt[165], tmp[2]));
+	tmp[3] = fma(v[0], m[166], mt[166], fma(v[1], m[167], mt[167], tmp[3]));
+	tmp[4] = fma(v[0], m[168], mt[168], fma(v[1], m[169], mt[169], tmp[4]));
+	tmp[5] = fma(v[0], m[170], mt[170], fma(v[1], m[171], mt[171], tmp[5]));
+	tmp[6] = fma(v[0], m[172], mt[172], fma(v[1], m[173], mt[173], tmp[6]));
+	tmp[7] = fma(v[0], m[174], mt[174], fma(v[1], m[175], mt[175], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d2 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+
+	tmp[0] = fma(v[0], m[176], mt[176], fma(v[1], m[177], mt[177], tmp[0]));
+	tmp[1] = fma(v[0], m[178], mt[178], fma(v[1], m[179], mt[179], tmp[1]));
+	tmp[2] = fma(v[0], m[180], mt[180], fma(v[1], m[181], mt[181], tmp[2]));
+	tmp[3] = fma(v[0], m[182], mt[182], fma(v[1], m[183], mt[183], tmp[3]));
+	tmp[4] = fma(v[0], m[184], mt[184], fma(v[1], m[185], mt[185], tmp[4]));
+	tmp[5] = fma(v[0], m[186], mt[186], fma(v[1], m[187], mt[187], tmp[5]));
+	tmp[6] = fma(v[0], m[188], mt[188], fma(v[1], m[189], mt[189], tmp[6]));
+	tmp[7] = fma(v[0], m[190], mt[190], fma(v[1], m[191], mt[191], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+
+	tmp[0] = fma(v[0], m[192], mt[192], fma(v[1], m[193], mt[193], tmp[0]));
+	tmp[1] = fma(v[0], m[194], mt[194], fma(v[1], m[195], mt[195], tmp[1]));
+	tmp[2] = fma(v[0], m[196], mt[196], fma(v[1], m[197], mt[197], tmp[2]));
+	tmp[3] = fma(v[0], m[198], mt[198], fma(v[1], m[199], mt[199], tmp[3]));
+	tmp[4] = fma(v[0], m[200], mt[200], fma(v[1], m[201], mt[201], tmp[4]));
+	tmp[5] = fma(v[0], m[202], mt[202], fma(v[1], m[203], mt[203], tmp[5]));
+	tmp[6] = fma(v[0], m[204], mt[204], fma(v[1], m[205], mt[205], tmp[6]));
+	tmp[7] = fma(v[0], m[206], mt[206], fma(v[1], m[207], mt[207], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+
+	tmp[0] = fma(v[0], m[208], mt[208], fma(v[1], m[209], mt[209], tmp[0]));
+	tmp[1] = fma(v[0], m[210], mt[210], fma(v[1], m[211], mt[211], tmp[1]));
+	tmp[2] = fma(v[0], m[212], mt[212], fma(v[1], m[213], mt[213], tmp[2]));
+	tmp[3] = fma(v[0], m[214], mt[214], fma(v[1], m[215], mt[215], tmp[3]));
+	tmp[4] = fma(v[0], m[216], mt[216], fma(v[1], m[217], mt[217], tmp[4]));
+	tmp[5] = fma(v[0], m[218], mt[218], fma(v[1], m[219], mt[219], tmp[5]));
+	tmp[6] = fma(v[0], m[220], mt[220], fma(v[1], m[221], mt[221], tmp[6]));
+	tmp[7] = fma(v[0], m[222], mt[222], fma(v[1], m[223], mt[223], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+
+	tmp[0] = fma(v[0], m[224], mt[224], fma(v[1], m[225], mt[225], tmp[0]));
+	tmp[1] = fma(v[0], m[226], mt[226], fma(v[1], m[227], mt[227], tmp[1]));
+	tmp[2] = fma(v[0], m[228], mt[228], fma(v[1], m[229], mt[229], tmp[2]));
+	tmp[3] = fma(v[0], m[230], mt[230], fma(v[1], m[231], mt[231], tmp[3]));
+	tmp[4] = fma(v[0], m[232], mt[232], fma(v[1], m[233], mt[233], tmp[4]));
+	tmp[5] = fma(v[0], m[234], mt[234], fma(v[1], m[235], mt[235], tmp[5]));
+	tmp[6] = fma(v[0], m[236], mt[236], fma(v[1], m[237], mt[237], tmp[6]));
+	tmp[7] = fma(v[0], m[238], mt[238], fma(v[1], m[239], mt[239], tmp[7]));
+
+	v[0] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+
+	tmp[0] = fma(v[0], m[240], mt[240], fma(v[1], m[241], mt[241], tmp[0]));
+	tmp[1] = fma(v[0], m[242], mt[242], fma(v[1], m[243], mt[243], tmp[1]));
+	tmp[2] = fma(v[0], m[244], mt[244], fma(v[1], m[245], mt[245], tmp[2]));
+	tmp[3] = fma(v[0], m[246], mt[246], fma(v[1], m[247], mt[247], tmp[3]));
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+	store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+	tmp[4] = fma(v[0], m[248], mt[248], fma(v[1], m[249], mt[249], tmp[4])); // v[0] and v[1] were loaded before the store calls above, so this final accumulation does not reload psi
+	tmp[5] = fma(v[0], m[250], mt[250], fma(v[1], m[251], mt[251], tmp[5]));
+	tmp[6] = fma(v[0], m[252], mt[252], fma(v[1], m[253], mt[253], tmp[6]));
+	tmp[7] = fma(v[0], m[254], mt[254], fma(v[1], m[255], mt[255], tmp[7]));
+	store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+	// tmp[i] goes to store() with the four entries {I, I+d0, I+d1, I+d0+d1} shifted by d2 (bit 0 of i), d3 (bit 1 of i) and d4 (bit 2 of i); every store call comes after the last load.
+}
+
+// Applies matrix (rows/columns 0..31 are read) to psi on bit indices id4..id0, which are given from high to low (e.g. control first for CNOT); if ctrlmask != 0, only indices with every ctrlmask bit set are processed.
+template <class V, class M>
+void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size(); // number of entries in psi
+	std::size_t d0 = 1ULL << id0; // d_k = 2^id_k is the index offset that belongs to bit id_k
+	std::size_t d1 = 1ULL << id1;
+	std::size_t d2 = 1ULL << id2;
+	std::size_t d3 = 1ULL << id3;
+	std::size_t d4 = 1ULL << id4;
+	auto m = matrix; // local copy handed to permute_qubits_and_matrix; the caller's matrix is const here
+	std::size_t dsorted[] = {d0, d1, d2, d3, d4};
+	permute_qubits_and_matrix(dsorted, 5, m); // NOTE(review): defined elsewhere; presumably sorts dsorted descending and reorders m to match -- the loop bounds below only tile the index space if dsorted is strictly descending; confirm
+	// Pack m for kernel_core: mm[b*16 + r*2 + c] is built by loada from column 2b + c, rows 4r..4r+3.
+	__m512d mm[256];
+	for (unsigned b = 0; b < 16; ++b){
+		for (unsigned r = 0; r < 8; ++r){
+			for (unsigned c = 0; c < 2; ++c){
+				mm[b*16+r*2+c] = loada(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]);
+			}
+		}
+	}
+	// mmt has the same layout as mm but is built with loadbm; NOTE(review): loada/loadbm are defined elsewhere -- how the two packings differ is not visible here.
+	__m512d mmt[256];
+	for (unsigned b = 0; b < 16; ++b){
+		for (unsigned r = 0; r < 8; ++r){
+			for (unsigned c = 0; c < 2; ++c){
+				mmt[b*16+r*2+c] = loadbm(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]);
+			}
+		}
+	}
+
+	// Non-MSVC path: the loop nest visits base indices i0+...+i5 (presumably exactly those with the five target bits clear, given descending dsorted); LOOP_COLLAPSE5 is a macro defined elsewhere.
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+								kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	else{ // controlled case: same loop nest, but base indices without every ctrlmask bit set are skipped
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+								if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
+									kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4]; // sum of the five offsets; equals their bitwise OR when the bit indices are distinct
+    // MSVC path: a single flat loop with a signed index replaces the collapsed nest; (i & dmask) == zero keeps only indices in which none of the five target bits is set.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel6.hpp b/src/Simulation/Native/src/external/avx512/kernel6.hpp
new file mode 100644
index 00000000000..77a6a89465e
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel6.hpp
@@ -0,0 +1,252 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+// Applies the packed matrix m (1024 vectors; kernel() passes its mm[] here) in place to the 64 amplitudes psi[I + s], s ranging over every subset sum of d0..d5.
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, M const& m)
+{
+	__m512d v[4];
+	// v holds the four inputs of one section: the d0/d1 combinations for one fixed combination of d2..d5 (load1x4 is defined elsewhere).
+	v[0] = load1x4(&psi[I]);
+	v[1] = load1x4(&psi[I + d0]);
+	v[2] = load1x4(&psi[I + d1]);
+	v[3] = load1x4(&psi[I + d0 + d1]);
+	// tmp[i] accumulates the vector later stored for the i-th combination of d2..d5 (bit 0 of i <-> d2, ..., bit 3 <-> d5).
+	__m512d tmp[16] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i]))));
+	}
+	// The 15 sections below repeat this step for the remaining d2..d5 combinations; section s reads m[64*s + i*4 + k].
+
+	v[0] = load1x4(&psi[I + d2]);
+	v[1] = load1x4(&psi[I + d0 + d2]);
+	v[2] = load1x4(&psi[I + d1 + d2]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[64 + i * 4 + 0], fma(v[1], m[64 + i * 4 + 1], fma(v[2], m[64 + i * 4 + 2], fma(v[3], m[64 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3]);
+	v[1] = load1x4(&psi[I + d0 + d3]);
+	v[2] = load1x4(&psi[I + d1 + d3]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[192 + i * 4 + 0], fma(v[1], m[192 + i * 4 + 1], fma(v[2], m[192 + i * 4 + 2], fma(v[3], m[192 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4]);
+	v[1] = load1x4(&psi[I + d0 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[320 + i * 4 + 0], fma(v[1], m[320 + i * 4 + 1], fma(v[2], m[320 + i * 4 + 2], fma(v[3], m[320 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[448 + i * 4 + 0], fma(v[1], m[448 + i * 4 + 1], fma(v[2], m[448 + i * 4 + 2], fma(v[3], m[448 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d5]);
+	v[1] = load1x4(&psi[I + d0 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[576 + i * 4 + 0], fma(v[1], m[576 + i * 4 + 1], fma(v[2], m[576 + i * 4 + 2], fma(v[3], m[576 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[704 + i * 4 + 0], fma(v[1], m[704 + i * 4 + 1], fma(v[2], m[704 + i * 4 + 2], fma(v[3], m[704 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[832 + i * 4 + 0], fma(v[1], m[832 + i * 4 + 1], fma(v[2], m[832 + i * 4 + 2], fma(v[3], m[832 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]);
+	for (unsigned i = 0; i < 16; ++i){
+		tmp[i] = fma(v[0], m[960 + i * 4 + 0], fma(v[1], m[960 + i * 4 + 1], fma(v[2], m[960 + i * 4 + 2], fma(v[3], m[960 + i * 4 + 3], tmp[i]))));
+	}
+	// Every input has been read, so psi is overwritten only now: store() (defined elsewhere) gets tmp[i] plus the four d0/d1 addresses of group i.
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+	store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+	store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+	store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]);
+	store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]);
+	store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]);
+	store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]);
+
+}
+
+// Applies a 6-qubit gate to psi, restricted to indices with every ctrlmask bit set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size();
+	std::size_t d0 = 1ULL << id0;
+	std::size_t d1 = 1ULL << id1;
+	std::size_t d2 = 1ULL << id2;
+	std::size_t d3 = 1ULL << id3;
+	std::size_t d4 = 1ULL << id4;
+	std::size_t d5 = 1ULL << id5;
+	auto m = matrix;
+	std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5};
+	permute_qubits_and_matrix(dsorted, 6, m);
+	// Repack the local copy m for kernel_core: mm[b*64 + r*4 + c] = loadab() of the four entries m[4r+0..4r+3][4b+c].
+	__m512d mm[1024];
+	for (unsigned b = 0; b < 16; ++b){
+		for (unsigned r = 0; r < 16; ++r){
+			for (unsigned c = 0; c < 4; ++c){
+				mm[b*64+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]);
+			}
+		}
+	}
+	// Loop nest: each level steps 2*dsorted[k] below the previous stride; assumes the permute call left dsorted[] in descending order (TODO confirm). Strides reach kernel_core reversed.
+
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+								for (std::size_t i6 = 0; i6 < dsorted[5]; ++i6){
+									kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	else{
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+								for (std::size_t i6 = 0; i6 < dsorted[5]; ++i6){
+									if (((i0 + i1 + i2 + i3 + i4 + i5 + i6)&ctrlmask) == ctrlmask)
+										kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5];
+    // MSVC variant: one flat signed-index loop; dmask is the sum of the six strides, so (i & dmask) == 0 keeps only indices with all six target bits clear.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel7.hpp b/src/Simulation/Native/src/external/avx512/kernel7.hpp
new file mode 100644
index 00000000000..8e60b76cff2
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel7.hpp
@@ -0,0 +1,417 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+// Applies the packed matrix m (4096 vectors; kernel() passes its mm[] here) in place to the 128 amplitudes psi[I + s], s ranging over every subset sum of d0..d6.
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, std::size_t d6, M const& m)
+{
+	__m512d v[4];
+	// v holds the four inputs of one section: the d0/d1 combinations for one fixed combination of d2..d6 (load1x4 is defined elsewhere).
+	v[0] = load1x4(&psi[I]);
+	v[1] = load1x4(&psi[I + d0]);
+	v[2] = load1x4(&psi[I + d1]);
+	v[3] = load1x4(&psi[I + d0 + d1]);
+	// tmp[i] accumulates the vector later stored for the i-th combination of d2..d6 (bit 0 of i <-> d2, ..., bit 4 <-> d6).
+	__m512d tmp[32] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i]))));
+	}
+	// The 31 sections below repeat this step for the remaining d2..d6 combinations; section s reads m[128*s + i*4 + k].
+
+	v[0] = load1x4(&psi[I + d2]);
+	v[1] = load1x4(&psi[I + d0 + d2]);
+	v[2] = load1x4(&psi[I + d1 + d2]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3]);
+	v[1] = load1x4(&psi[I + d0 + d3]);
+	v[2] = load1x4(&psi[I + d1 + d3]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4]);
+	v[1] = load1x4(&psi[I + d0 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d5]);
+	v[1] = load1x4(&psi[I + d0 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1024 + i * 4 + 0], fma(v[1], m[1024 + i * 4 + 1], fma(v[2], m[1024 + i * 4 + 2], fma(v[3], m[1024 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1152 + i * 4 + 0], fma(v[1], m[1152 + i * 4 + 1], fma(v[2], m[1152 + i * 4 + 2], fma(v[3], m[1152 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1280 + i * 4 + 0], fma(v[1], m[1280 + i * 4 + 1], fma(v[2], m[1280 + i * 4 + 2], fma(v[3], m[1280 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1408 + i * 4 + 0], fma(v[1], m[1408 + i * 4 + 1], fma(v[2], m[1408 + i * 4 + 2], fma(v[3], m[1408 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1536 + i * 4 + 0], fma(v[1], m[1536 + i * 4 + 1], fma(v[2], m[1536 + i * 4 + 2], fma(v[3], m[1536 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1664 + i * 4 + 0], fma(v[1], m[1664 + i * 4 + 1], fma(v[2], m[1664 + i * 4 + 2], fma(v[3], m[1664 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1792 + i * 4 + 0], fma(v[1], m[1792 + i * 4 + 1], fma(v[2], m[1792 + i * 4 + 2], fma(v[3], m[1792 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[1920 + i * 4 + 0], fma(v[1], m[1920 + i * 4 + 1], fma(v[2], m[1920 + i * 4 + 2], fma(v[3], m[1920 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d6]);
+	v[1] = load1x4(&psi[I + d0 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2048 + i * 4 + 0], fma(v[1], m[2048 + i * 4 + 1], fma(v[2], m[2048 + i * 4 + 2], fma(v[3], m[2048 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2176 + i * 4 + 0], fma(v[1], m[2176 + i * 4 + 1], fma(v[2], m[2176 + i * 4 + 2], fma(v[3], m[2176 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2304 + i * 4 + 0], fma(v[1], m[2304 + i * 4 + 1], fma(v[2], m[2304 + i * 4 + 2], fma(v[3], m[2304 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2432 + i * 4 + 0], fma(v[1], m[2432 + i * 4 + 1], fma(v[2], m[2432 + i * 4 + 2], fma(v[3], m[2432 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d4 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d4 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2560 + i * 4 + 0], fma(v[1], m[2560 + i * 4 + 1], fma(v[2], m[2560 + i * 4 + 2], fma(v[3], m[2560 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2688 + i * 4 + 0], fma(v[1], m[2688 + i * 4 + 1], fma(v[2], m[2688 + i * 4 + 2], fma(v[3], m[2688 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2816 + i * 4 + 0], fma(v[1], m[2816 + i * 4 + 1], fma(v[2], m[2816 + i * 4 + 2], fma(v[3], m[2816 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[2944 + i * 4 + 0], fma(v[1], m[2944 + i * 4 + 1], fma(v[2], m[2944 + i * 4 + 2], fma(v[3], m[2944 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3072 + i * 4 + 0], fma(v[1], m[3072 + i * 4 + 1], fma(v[2], m[3072 + i * 4 + 2], fma(v[3], m[3072 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3200 + i * 4 + 0], fma(v[1], m[3200 + i * 4 + 1], fma(v[2], m[3200 + i * 4 + 2], fma(v[3], m[3200 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3328 + i * 4 + 0], fma(v[1], m[3328 + i * 4 + 1], fma(v[2], m[3328 + i * 4 + 2], fma(v[3], m[3328 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3456 + i * 4 + 0], fma(v[1], m[3456 + i * 4 + 1], fma(v[2], m[3456 + i * 4 + 2], fma(v[3], m[3456 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d4 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d4 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d4 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3584 + i * 4 + 0], fma(v[1], m[3584 + i * 4 + 1], fma(v[2], m[3584 + i * 4 + 2], fma(v[3], m[3584 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d4 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3712 + i * 4 + 0], fma(v[1], m[3712 + i * 4 + 1], fma(v[2], m[3712 + i * 4 + 2], fma(v[3], m[3712 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d3 + d4 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3840 + i * 4 + 0], fma(v[1], m[3840 + i * 4 + 1], fma(v[2], m[3840 + i * 4 + 2], fma(v[3], m[3840 + i * 4 + 3], tmp[i]))));
+	}
+
+
+	v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5 + d6]);
+	v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5 + d6]);
+	v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5 + d6]);
+	v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6]);
+	for (unsigned i = 0; i < 32; ++i){
+		tmp[i] = fma(v[0], m[3968 + i * 4 + 0], fma(v[1], m[3968 + i * 4 + 1], fma(v[2], m[3968 + i * 4 + 2], fma(v[3], m[3968 + i * 4 + 3], tmp[i]))));
+	}
+	// Every input has been read, so psi is overwritten only now: store() (defined elsewhere) gets tmp[i] plus the four d0/d1 addresses of group i.
+	store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+	store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+	store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+	store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+	store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]);
+	store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]);
+	store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]);
+	store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]);
+	store((double*)&psi[I + d0 + d1 + d6], (double*)&psi[I + d1 + d6], (double*)&psi[I + d0 + d6], (double*)&psi[I + d6], tmp[16]);
+	store((double*)&psi[I + d0 + d1 + d2 + d6], (double*)&psi[I + d1 + d2 + d6], (double*)&psi[I + d0 + d2 + d6], (double*)&psi[I + d2 + d6], tmp[17]);
+	store((double*)&psi[I + d0 + d1 + d3 + d6], (double*)&psi[I + d1 + d3 + d6], (double*)&psi[I + d0 + d3 + d6], (double*)&psi[I + d3 + d6], tmp[18]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d6], (double*)&psi[I + d1 + d2 + d3 + d6], (double*)&psi[I + d0 + d2 + d3 + d6], (double*)&psi[I + d2 + d3 + d6], tmp[19]);
+	store((double*)&psi[I + d0 + d1 + d4 + d6], (double*)&psi[I + d1 + d4 + d6], (double*)&psi[I + d0 + d4 + d6], (double*)&psi[I + d4 + d6], tmp[20]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4 + d6], (double*)&psi[I + d1 + d2 + d4 + d6], (double*)&psi[I + d0 + d2 + d4 + d6], (double*)&psi[I + d2 + d4 + d6], tmp[21]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4 + d6], (double*)&psi[I + d1 + d3 + d4 + d6], (double*)&psi[I + d0 + d3 + d4 + d6], (double*)&psi[I + d3 + d4 + d6], tmp[22]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d6], (double*)&psi[I + d2 + d3 + d4 + d6], tmp[23]);
+	store((double*)&psi[I + d0 + d1 + d5 + d6], (double*)&psi[I + d1 + d5 + d6], (double*)&psi[I + d0 + d5 + d6], (double*)&psi[I + d5 + d6], tmp[24]);
+	store((double*)&psi[I + d0 + d1 + d2 + d5 + d6], (double*)&psi[I + d1 + d2 + d5 + d6], (double*)&psi[I + d0 + d2 + d5 + d6], (double*)&psi[I + d2 + d5 + d6], tmp[25]);
+	store((double*)&psi[I + d0 + d1 + d3 + d5 + d6], (double*)&psi[I + d1 + d3 + d5 + d6], (double*)&psi[I + d0 + d3 + d5 + d6], (double*)&psi[I + d3 + d5 + d6], tmp[26]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d5 + d6], (double*)&psi[I + d2 + d3 + d5 + d6], tmp[27]);
+	store((double*)&psi[I + d0 + d1 + d4 + d5 + d6], (double*)&psi[I + d1 + d4 + d5 + d6], (double*)&psi[I + d0 + d4 + d5 + d6], (double*)&psi[I + d4 + d5 + d6], tmp[28]);
+	store((double*)&psi[I + d0 + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d4 + d5 + d6], (double*)&psi[I + d2 + d4 + d5 + d6], tmp[29]);
+	store((double*)&psi[I + d0 + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d3 + d4 + d5 + d6], (double*)&psi[I + d3 + d4 + d5 + d6], tmp[30]);
+	store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d2 + d3 + d4 + d5 + d6], tmp[31]);
+
+}
+
+// Applies a 7-qubit gate to psi, restricted to indices with every ctrlmask bit set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+     std::size_t n = psi.size();
+	std::size_t d0 = 1ULL << id0;
+	std::size_t d1 = 1ULL << id1;
+	std::size_t d2 = 1ULL << id2;
+	std::size_t d3 = 1ULL << id3;
+	std::size_t d4 = 1ULL << id4;
+	std::size_t d5 = 1ULL << id5;
+	std::size_t d6 = 1ULL << id6;
+	auto m = matrix;
+	std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5, d6};
+	permute_qubits_and_matrix(dsorted, 7, m);
+	// Repack the local copy m for kernel_core: mm[b*128 + r*4 + c] = loadab() of the four entries m[4r+0..4r+3][4b+c].
+	__m512d mm[4096];
+	for (unsigned b = 0; b < 32; ++b){
+		for (unsigned r = 0; r < 32; ++r){
+			for (unsigned c = 0; c < 4; ++c){
+				mm[b*128+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]);
+			}
+		}
+	}
+	// Loop nest: each level steps 2*dsorted[k] below the previous stride; assumes the permute call left dsorted[] in descending order (TODO confirm). Strides reach kernel_core reversed.
+
+#ifndef _MSC_VER
+	if (ctrlmask == 0){
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+								for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){
+									for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){
+										kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	else{
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
+		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+					for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+						for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+							for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+								for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){
+									for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){
+										if (((i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7)&ctrlmask) == ctrlmask)
+											kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5] + dsorted[6];
+    // MSVC variant: one flat signed-index loop; dmask is the sum of the seven strides, so (i & dmask) == 0 keeps only indices with all seven target bits clear.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+     }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernels.hpp b/src/Simulation/Native/src/external/avx512/kernels.hpp
new file mode 100644
index 00000000000..d5a056663bd
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernels.hpp
@@ -0,0 +1,31 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+#ifndef KERNELS_HPP_
+#define KERNELS_HPP_
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <complex>
+#include <functional>
+#include <algorithm>
+#include "../cintrin.hpp"
+#include "util/alignedalloc.hpp"
+
+#define LOOP_COLLAPSE1 2 // collapse() depths for the OpenMP loop nests; values follow qubits + 1 (presumably one per kernelN.hpp -- confirm)
+#define LOOP_COLLAPSE2 3
+#define LOOP_COLLAPSE3 4
+#define LOOP_COLLAPSE4 5
+#define LOOP_COLLAPSE5 6
+#define LOOP_COLLAPSE6 7
+#define LOOP_COLLAPSE7 8 // matches the 8 nested loops (i0..i7) placed under collapse(LOOP_COLLAPSE7)
+
+#include "kernel1.hpp"
+#include "kernel2.hpp"
+#include "kernel3.hpp"
+#include "kernel4.hpp"
+#include "kernel5.hpp"
+#include "kernel6.hpp"
+#include "kernel7.hpp"
+
+#endif
diff --git a/src/Simulation/Native/src/external/cintrin.hpp b/src/Simulation/Native/src/external/cintrin.hpp
index 8034408c1e8..7e9eca72956 100644
--- a/src/Simulation/Native/src/external/cintrin.hpp
+++ b/src/Simulation/Native/src/external/cintrin.hpp
@@ -35,7 +35,15 @@ inline void permute_qubits_and_matrix(I *delta_list, unsigned n, M & matrix){
 }
 
 inline std::complex<double> fma(std::complex<double> const& c1, std::complex<double> const& c2, std::complex<double> const& a){
-	return c1*c2 + a;
+	// Complex multiply-add: returns c1*c2 + a, expanded by hand into real arithmetic.
+	// Only the standard real()/imag() accessors are used, so a single code path
+	// serves every compiler and nothing depends on a library-internal member
+	// layout (such as MSVC's _Val array).
+	double const re1 = c1.real(), im1 = c1.imag();
+	double const re2 = c2.real(), im2 = c2.imag();
+	double const r = (re1 * re2 - im1 * im2) + a.real();
+	double const i = (re1 * im2 + im1 * re2) + a.imag();
+	return std::complex<double>(r, i);
 }
 
 inline __m256d fma(__m256d const& c1, __m256d const& c2, __m256d const& a){
diff --git a/src/Simulation/Native/src/external/fused.hpp b/src/Simulation/Native/src/external/fused.hpp
index 6c1137ceb3b..2b170461e41 100644
--- a/src/Simulation/Native/src/external/fused.hpp
+++ b/src/Simulation/Native/src/external/fused.hpp
@@ -5,6 +5,8 @@
 #include "config.hpp"
 #include "external/fusion.hpp"
 #include "simulator/kernels.hpp"
+#include <string>
+#include <thread>
 
 #ifndef HAVE_INTRINSICS
 #include "external/nointrin/kernels.hpp"
@@ -15,7 +17,7 @@
 #ifdef HAVE_FMA
 #include "external/avx2/kernels.hpp"
 #else
-#include "external/avx2/kernels.hpp"
+#include "external/avx/kernels.hpp"
 #endif
 #endif
 #endif
@@ -29,15 +31,35 @@ namespace SIMULATOR
 
 class Fused
   {
+
   public:
-    Fused() {}
+      Fused()
+        : wfnCapacity(0u)      // no wave-function capacity cached yet
+        , maxFusedSpan(4)      // default span limit until it is re-tuned at runtime
+        , maxFusedDepth(999)   // default depth limit until it is re-tuned at runtime
+      {}
 
     inline void reset()
     {
       fusedgates = Fusion();
     }
 
+    const Fusion& get_fusedgates() const { // read-only access to the Fusion object held by this instance
+        return fusedgates;
+    }
     
+    void set_fusedgates(Fusion newFusedGates) const { // const is possible because fusedgates is a mutable member
+        fusedgates = std::move(newFusedGates);         // the parameter is already a private copy: move it, don't copy again
+    }
+
+    int maxSpan() const { // current fused-span limit, returned by value (a top-level const here would be ignored)
+        return maxFusedSpan;
+    }
+
+    int maxDepth() const { // current fused-depth limit, returned by value (a top-level const here would be ignored)
+        return maxFusedDepth;
+    }
+
     template <class T, class A>
     void flush(std::vector<T, A>& wfn) const
     {
@@ -46,9 +68,9 @@ class Fused
       
       Fusion::Matrix m;
       Fusion::IndexVector qs, cs;
-      
+
       fusedgates.perform_fusion(m, qs, cs);
-      
+
       std::size_t cmask = 0;
       for (auto c : cs)
         cmask |= (1ull << c);
@@ -70,23 +92,19 @@ class Fused
         case 5:
           ::kernel(wfn, qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask);
           break;
+        case 6:
+            ::kernel(wfn, qs[5], qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask);
+            break;
+        case 7:
+            ::kernel(wfn, qs[6], qs[5], qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask);
+            break;
       }
-      
+
       fusedgates = Fusion();
     }
     
-    template <class T, class A1, class A2>
-    bool subsytemwavefunction(std::vector<T, A1>& wfn,
-                              std::vector<unsigned> const& qs,
-                              std::vector<T, A2>& qubitswfn,
-                              double tolerance)
-    {
-      flush(wfn); // we have to flush before we can extract the state
-      return kernels::subsytemwavefunction(wfn, qs, qubitswfn, tolerance);
-    }
-    
     template <class M>
-    Fusion::Matrix convertMatrix(M const& m)
+    Fusion::Matrix convertMatrix(M const& m) const
     {
       Fusion::Matrix mat(2, Fusion::Matrix::value_type(2));
       for (unsigned i = 0; i < 2; ++i)
@@ -96,30 +114,86 @@ class Fused
     }
     
     template <class T, class A, class M>
-    void apply_controlled(std::vector<T, A>& wfn, M const& mat, std::vector<unsigned> const& cs, unsigned q)
+    void apply_controlled(std::vector<T, A>& wfn, M const& mat, std::vector<unsigned> const& cs, unsigned q) const
     {
-      if (fusedgates.num_qubits()+fusedgates.num_controls()+cs.size()>8 || fusedgates.size() > 15)
-        flush(wfn);
-      Fusion newgates = fusedgates;
-      newgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-      
-      if (newgates.num_qubits() > 4)
-      {
-        flush(wfn);
-        fusedgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-      }
-      else
-        fusedgates = newgates;
+        // Only queues the gate in fusedgates; wfn itself is not touched by this call.
+        fusedgates.insert(convertMatrix(mat), Fusion::IndexVector(1, q), cs);
     }
-    
+
     template <class T, class A, class M>
-    void apply(std::vector<T, A>& wfn, M const& mat, unsigned q)
+    void apply(std::vector<T, A>& wfn, M const& mat, unsigned q) const
     {
       std::vector<unsigned> cs;
       apply_controlled(wfn, mat, cs, q);
     }
+
+    // Re-tunes the runtime parameters (OpenMP thread count, maxFusedDepth, maxFusedSpan)
+    // whenever wfn.capacity() differs from the cached wfnCapacity. cs and q are unused.
+    // Environment overrides: OMP_NUM_THREADS (when set, the thread-count guess is skipped),
+    // QDK_SIM_FUSEDEPTH and QDK_SIM_FUSESPAN (the span is capped at 7).
+    // Always returns false.
+    template <class T, class A>
+    bool shouldFlush(std::vector<T, A>& wfn, std::vector<unsigned> const& cs, unsigned q)
+    {
+        // Copies environment variable `name` into `value` ("" when unset) and reports
+        // whether it is set. On MSVC the buffer allocated by _dupenv_s is freed here.
+        auto readEnv = [](const char* name, std::string& value) -> bool {
+#ifdef _MSC_VER
+            char* raw = NULL;
+            size_t len = 0;
+            if (_dupenv_s(&raw, &len, name) != 0) raw = NULL;
+#else
+            const char* raw = getenv(name);
+#endif
+            const bool isSet = (raw != NULL);
+            value = isSet ? raw : "";
+#ifdef _MSC_VER
+            free(raw);                                       // the caller owns the copy made by _dupenv_s
+#endif
+            return isSet;
+        };
+
+        // Have to update capacity as the WFN grows
+        if (wfnCapacity != wfn.capacity()) {
+            wfnCapacity = wfn.capacity();
+            const bool smallProblem = wfnCapacity < (1u << 20);
+            std::string env;
+
+            // If the user didn't force the number of threads, make an intelligent guess
+            if (!readEnv("OMP_NUM_THREADS", env)) {
+                int nMaxThrds = std::thread::hardware_concurrency();        // Logical HW threads (0 = unknown)
+                if (nMaxThrds > 4) nMaxThrds /= 2;                          // Assume we have hyperthreading (no consistent/concise way to do this)
+                if (smallProblem) {
+                    if (nMaxThrds > 8) nMaxThrds = 8;                       // Small problem, never use too many
+                    else if (nMaxThrds > 3) nMaxThrds = 3;                  // Small problem on a small machine
+                }
+                // omp_set_num_threads needs a positive count; otherwise keep the OpenMP default
+                if (nMaxThrds > 0) omp_set_num_threads(nMaxThrds);
+            }
+
+            // Set the max fused depth
+            maxFusedDepth = 999;
+            if (readEnv("QDK_SIM_FUSEDEPTH", env) && !env.empty()) {
+                maxFusedDepth = atoi(env.c_str());
+            }
+
+            // Set the fused span limit; the clamp applies on every platform
+            maxFusedSpan = smallProblem ? 2 : 4;            // 4: general sweet spot, 2: don't pre-fuse small problems
+            if (readEnv("QDK_SIM_FUSESPAN", env) && !env.empty()) {
+                maxFusedSpan = atoi(env.c_str());
+                if (maxFusedSpan > 7) maxFusedSpan = 7;     // Highest we can handle
+            }
+        }
+        return false;
+    }
+
   private:
     mutable Fusion fusedgates;
+
+    //: New runtime optimization settings
+    mutable size_t wfnCapacity;    // wfn capacity recorded by the last shouldFlush re-tune
+    mutable int    maxFusedSpan;   // span limit reported by maxSpan()
+    mutable int    maxFusedDepth;  // depth limit reported by maxDepth()
   };
   
   
diff --git a/src/Simulation/Native/src/external/fusion.hpp b/src/Simulation/Native/src/external/fusion.hpp
index 2e6b4bdb50d..88140349dba 100644
--- a/src/Simulation/Native/src/external/fusion.hpp
+++ b/src/Simulation/Native/src/external/fusion.hpp
@@ -10,6 +10,7 @@
 #include <iostream>
 #include <cassert>
 #include "util/alignedalloc.hpp"
+#include <unordered_map>
 
 class Item{
 public:  
@@ -17,14 +18,20 @@ class Item{
 	using IndexVector = std::vector<Index>;
 	using Complex = std::complex<double>;
 	using Matrix = std::vector<std::vector<Complex, Microsoft::Quantum::Simulator::AlignedAlloc<Complex, 64>>>;
-	Item(Matrix mat, IndexVector idx) : mat_(mat), idx_(idx) {}
+	Item(Matrix mat, IndexVector idx) : mat_(std::move(mat)), idx_(std::move(idx)) {} // both by-value arguments are moved in, not copied
 	Matrix& get_matrix() { return mat_; }
-	IndexVector& get_indices() { return idx_; }
+	IndexVector& get_indices() const { return idx_; }
+	void remap_idx(const std::unordered_map<unsigned, unsigned>& elemDict) const {
+		// Rewrites every entry of idx_ through elemDict, taken by reference so the map is not copied per call; an index absent from the map becomes 0.
+		for (auto& qubit : idx_)
+			qubit = elemDict.count(qubit) ? elemDict.at(qubit) : 0u;
+	}
 private:
 	Matrix mat_;
-	IndexVector idx_;
+	mutable IndexVector idx_;
 };
 
+// Class handling the fusion of gates
 class Fusion{
 public:
 	using Index = unsigned;
@@ -37,7 +44,7 @@ class Fusion{
 	Fusion() : global_factor_(1.) {}
 	
 	Index num_qubits() const {
-		return static_cast<Index>(set_.size());
+		return static_cast<Index>(target_set_.size());
 	}
 
 	Index num_controls() const {
@@ -58,9 +65,58 @@ class Fusion{
 		handle_controls(empty_matrix, empty_vec, {}); // remove all current control qubits (this is a GLOBAL factor)
 	}
 	
-	void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){
+	const IndexSet& get_target_set() const { // read-only access to target_set_
+		return target_set_;
+	}
+
+	const ItemVector& get_items() const { // read-only access to items_
+		return items_;
+	}
+
+	const IndexSet& get_ctrl_set() const { // read-only access to ctrl_set_
+		return ctrl_set_;
+	}
+
+	const Complex& get_global_factor() const { // read-only access to global_factor_
+		return global_factor_;
+	}
+
+	// Replaces `qubits` with the images of its elements under mapFromOldLocToNewLoc; elements without a mapping are dropped.
+	static void remap_qubits(std::set<Index>& qubits, const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) {
+		std::set<Index> remapped;
+		for (unsigned elem : qubits) {
+			auto const hit = mapFromOldLocToNewLoc.find(elem); // one lookup serves both the presence test and the value
+			if (hit != mapFromOldLocToNewLoc.end()) remapped.insert(hit->second);
+		}
+		qubits.swap(remapped);
+	}
+
+	void remap_target_set(const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) const { // const works because target_set_ is mutable
+		remap_qubits(target_set_, mapFromOldLocToNewLoc);
+	}
+
+	void remap_ctrl_set(const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) const { // const works because ctrl_set_ is mutable
+		remap_qubits(ctrl_set_, mapFromOldLocToNewLoc);
+	}
+	
+	void set_items(ItemVector&& newItems) const { // swaps contents: items_ takes newItems, newItems is left holding the previous items
+		items_.swap(newItems);
+	}
+
+	// Predicts how many qubits (targets + controls) this fusion would span after inserting a
+	// gate on index_list/ctrl_list, without building a temporary Fusion on every gate insert.
+	// An index that is new but listed more than once is counted once per occurrence.
+	int predict(IndexVector const& index_list, IndexVector const& ctrl_list = {}) const {
+		int cnt = static_cast<int>(num_qubits() + num_controls());
+		auto const isNew = [this](Index idx) { return target_set_.count(idx) == 0 && ctrl_set_.count(idx) == 0; };
+		for (auto idx : index_list) if (isNew(idx)) cnt++;
+		for (auto idx : ctrl_list) if (isNew(idx)) cnt++;
+		return cnt;
+	}
+
+	void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}) const {
 		for (auto idx : index_list)
-			set_.emplace(idx);
+			target_set_.emplace(idx);
 		
 		if (global_factor_ != 1. && ctrl_list.size() > 0){
 			assert(ctrl_set_.size() == 0);
@@ -73,7 +129,7 @@ class Fusion{
 	}
 	
 	void get_indices(IndexVector &indices) const{
-		for (auto idx : set_)
+		for (auto idx : target_set_)
 			indices.push_back(idx);
 	}
 	
@@ -81,7 +137,7 @@ class Fusion{
 		if (global_factor_ != 1.)
 			assert(ctrl_set_.size() == 0);
 		
-		for (auto idx : set_)
+		for (auto idx : target_set_)
 			index_list.push_back(idx);
 		
 		unsigned N = num_qubits();
@@ -97,7 +153,8 @@ class Fusion{
 			for (unsigned i = 0; i < idx.size(); ++i)
 				idx2mat[i] = static_cast<unsigned>(((std::equal_range(index_list.begin(), index_list.end(), idx[i])).first - index_list.begin()));
 			
-			for (std::size_t k = 0; k < (1ULL<<N); ++k){ // loop over big matrix columns
+			#pragma omp parallel for schedule(static)
+			for (int k = 0; k < (1ULL<<N); ++k){ // loop over big matrix columns
 				// check if column index satisfies control-mask
 				// if not: leave it unchanged
 				std::vector<Complex> oldcol(1ULL<<N);
@@ -127,7 +184,7 @@ class Fusion{
 	}
 
 private:
-	void add_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& new_ctrls){
+	void add_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& new_ctrls) const {
 		indexList.reserve(indexList.size()+new_ctrls.size());
 		indexList.insert(indexList.end(), new_ctrls.begin(), new_ctrls.end());
 		
@@ -145,7 +202,7 @@ class Fusion{
 		matrix = std::move(newmatrix);
 	}
 	
-	void handle_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& ctrlList){
+	void handle_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& ctrlList) const {
 		auto unhandled_ctrl = ctrl_set_; // will contain all ctrls that are not part of the new command
 		// --> need to be removed from the global mask and the controls incorporated into the old
 		// commands (the ones already in the list).
@@ -154,7 +211,7 @@ class Fusion{
 			if (ctrl_set_.count(ctrlIdx) == 0){ // need to either add it to the list or to the command
 				if (items_.size() > 0){ // add it to the command
 					add_controls(matrix, indexList, {ctrlIdx});
-					set_.insert(ctrlIdx);
+					target_set_.insert(ctrlIdx);
 				}
 				else // add it to the list
 					ctrl_set_.emplace(ctrlIdx);
@@ -170,17 +227,17 @@ class Fusion{
 			for (auto idx : unhandled_ctrl){
 				new_ctrls.push_back(idx);
 				ctrl_set_.erase(idx);
-				set_.insert(idx);
+				target_set_.insert(idx);
 			}
 			for (auto &item : items_)
 				add_controls(item.get_matrix(), item.get_indices(), new_ctrls);
 		}
 	}
 	
-	IndexSet set_;
-	ItemVector items_;
-	IndexSet ctrl_set_;
-	Complex global_factor_;
+	mutable IndexSet target_set_; // set of qubits being acted on
+	mutable ItemVector items_; // queue of gates to be fused
+	mutable IndexSet ctrl_set_; // set of controls
+	mutable Complex global_factor_; // initialised to 1 by the default constructor
 };
 
 #endif
diff --git a/src/Simulation/Native/src/external/nointrin/kernel1.hpp b/src/Simulation/Native/src/external/nointrin/kernel1.hpp
index a6b624d3328..5173b58d8a8 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel1.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel1.hpp
@@ -43,7 +43,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				kernel_core(psi, i0 + i1, dsorted[0], mm);
@@ -51,7 +51,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
 				if (((i0 + i1)&ctrlmask) == ctrlmask)
@@ -60,20 +60,20 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 		}
 	}
 #else
-	std::intptr_t zero = 0;
-	std::intptr_t dmask = dsorted[0];
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0];
 
-	if (ctrlmask == 0){
-		#pragma omp parallel for schedule(static)
-		for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
-			if ((i & dmask) == zero)
-				kernel_core(psi, i, dsorted[0], mm);
-	} else {
-		#pragma omp parallel for schedule(static)
-		for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
-			if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
-				kernel_core(psi, i, dsorted[0], mm);
-	}
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm);
+     }
 #endif
 }
 
diff --git a/src/Simulation/Native/src/external/nointrin/kernel2.hpp b/src/Simulation/Native/src/external/nointrin/kernel2.hpp
index 43be1a33440..dcb47fe7f48 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel2.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel2.hpp
@@ -64,7 +64,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
@@ -74,7 +74,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
diff --git a/src/Simulation/Native/src/external/nointrin/kernel3.hpp b/src/Simulation/Native/src/external/nointrin/kernel3.hpp
index da44778220a..1019845187a 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel3.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel3.hpp
@@ -129,7 +129,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -141,7 +141,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/nointrin/kernel4.hpp b/src/Simulation/Native/src/external/nointrin/kernel4.hpp
index 6bc3b303e6e..46d33620e74 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel4.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel4.hpp
@@ -354,7 +354,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -368,7 +368,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/nointrin/kernel5.hpp b/src/Simulation/Native/src/external/nointrin/kernel5.hpp
index 13d363f7df3..08657104779 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel5.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel5.hpp
@@ -643,7 +643,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -659,7 +659,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/nointrin/kernel6.hpp b/src/Simulation/Native/src/external/nointrin/kernel6.hpp
index 893bf4e35d5..7f8ea4741a3 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel6.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel6.hpp
@@ -244,7 +244,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -262,7 +262,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE6) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/external/nointrin/kernel7.hpp b/src/Simulation/Native/src/external/nointrin/kernel7.hpp
index a9537bb61d1..fc8401da66f 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel7.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel7.hpp
@@ -453,7 +453,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 
 #ifndef _MSC_VER
 	if (ctrlmask == 0){
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
@@ -473,7 +473,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi
 		}
 	}
 	else{
-		#pragma omp for collapse(LOOP_COLLAPSE7) schedule(static)
+		#pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
 		for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
 			for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
 				for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
diff --git a/src/Simulation/Native/src/simulator/CMakeLists.txt b/src/Simulation/Native/src/simulator/CMakeLists.txt
index d048feac129..b1caf02fc05 100644
--- a/src/Simulation/Native/src/simulator/CMakeLists.txt
+++ b/src/Simulation/Native/src/simulator/CMakeLists.txt
@@ -4,9 +4,12 @@
 add_executable(local_test local_test.cpp)
 add_executable(factory_test factory_test.cpp)
 add_executable(capi_test capi_test.cpp)
+add_executable(dbw_test dbw_test.cpp)
 target_link_libraries(factory_test Microsoft.Quantum.Simulator.Runtime)
 target_link_libraries(local_test Microsoft.Quantum.Simulator.Runtime)
 target_link_libraries(capi_test Microsoft.Quantum.Simulator.Runtime)
+target_link_libraries(dbw_test Microsoft.Quantum.Simulator.Runtime)
 add_test(NAME factory_test COMMAND ./factory_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 add_test(NAME local_test COMMAND ./local_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 add_test(NAME capi_test COMMAND ./capi_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+add_test(NAME dbw_test COMMAND ./dbw_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
diff --git a/src/Simulation/Native/src/simulator/Makefile b/src/Simulation/Native/src/simulator/Makefile
new file mode 100644
index 00000000000..3522e3fd641
--- /dev/null
+++ b/src/Simulation/Native/src/simulator/Makefile
@@ -0,0 +1,380 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.16
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target test
+test:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
+	/usr/bin/ctest --force-new-ctest-process $(ARGS)
+.PHONY : test
+
+# Special rule for the target test
+test/fast: test
+
+.PHONY : test/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/simulator/CMakeFiles/progress.marks
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/simulator/CMakeFiles/dbw_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/dbw_test.dir/rule
+.PHONY : src/simulator/CMakeFiles/dbw_test.dir/rule
+
+# Convenience name for target.
+dbw_test: src/simulator/CMakeFiles/dbw_test.dir/rule
+
+.PHONY : dbw_test
+
+# fast build rule for target.
+dbw_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/build
+.PHONY : dbw_test/fast
+
+# Convenience name for target.
+src/simulator/CMakeFiles/capi_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/capi_test.dir/rule
+.PHONY : src/simulator/CMakeFiles/capi_test.dir/rule
+
+# Convenience name for target.
+capi_test: src/simulator/CMakeFiles/capi_test.dir/rule
+
+.PHONY : capi_test
+
+# fast build rule for target.
+capi_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/build
+.PHONY : capi_test/fast
+
+# Convenience name for target.
+src/simulator/CMakeFiles/factory_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/factory_test.dir/rule
+.PHONY : src/simulator/CMakeFiles/factory_test.dir/rule
+
+# Convenience name for target.
+factory_test: src/simulator/CMakeFiles/factory_test.dir/rule
+
+.PHONY : factory_test
+
+# fast build rule for target.
+factory_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/build
+.PHONY : factory_test/fast
+
+# Convenience name for target.
+src/simulator/CMakeFiles/local_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/local_test.dir/rule
+.PHONY : src/simulator/CMakeFiles/local_test.dir/rule
+
+# Convenience name for target.
+local_test: src/simulator/CMakeFiles/local_test.dir/rule
+
+.PHONY : local_test
+
+# fast build rule for target.
+local_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/build
+.PHONY : local_test/fast
+
+capi_test.o: capi_test.cpp.o
+
+.PHONY : capi_test.o
+
+# target to build an object file
+capi_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.o
+.PHONY : capi_test.cpp.o
+
+capi_test.i: capi_test.cpp.i
+
+.PHONY : capi_test.i
+
+# target to preprocess a source file
+capi_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.i
+.PHONY : capi_test.cpp.i
+
+capi_test.s: capi_test.cpp.s
+
+.PHONY : capi_test.s
+
+# target to generate assembly for a file
+capi_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.s
+.PHONY : capi_test.cpp.s
+
+dbw_test.o: dbw_test.cpp.o
+
+.PHONY : dbw_test.o
+
+# target to build an object file
+dbw_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.o
+.PHONY : dbw_test.cpp.o
+
+dbw_test.i: dbw_test.cpp.i
+
+.PHONY : dbw_test.i
+
+# target to preprocess a source file
+dbw_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.i
+.PHONY : dbw_test.cpp.i
+
+dbw_test.s: dbw_test.cpp.s
+
+.PHONY : dbw_test.s
+
+# target to generate assembly for a file
+dbw_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.s
+.PHONY : dbw_test.cpp.s
+
+factory_test.o: factory_test.cpp.o
+
+.PHONY : factory_test.o
+
+# target to build an object file
+factory_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.o
+.PHONY : factory_test.cpp.o
+
+factory_test.i: factory_test.cpp.i
+
+.PHONY : factory_test.i
+
+# target to preprocess a source file
+factory_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.i
+.PHONY : factory_test.cpp.i
+
+factory_test.s: factory_test.cpp.s
+
+.PHONY : factory_test.s
+
+# target to generate assembly for a file
+factory_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.s
+.PHONY : factory_test.cpp.s
+
+local_test.o: local_test.cpp.o
+
+.PHONY : local_test.o
+
+# target to build an object file
+local_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.o
+.PHONY : local_test.cpp.o
+
+local_test.i: local_test.cpp.i
+
+.PHONY : local_test.i
+
+# target to preprocess a source file
+local_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.i
+.PHONY : local_test.cpp.i
+
+local_test.s: local_test.cpp.s
+
+.PHONY : local_test.s
+
+# target to generate assembly for a file
+local_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.s
+.PHONY : local_test.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... install/strip"
+	@echo "... edit_cache"
+	@echo "... test"
+	@echo "... dbw_test"
+	@echo "... capi_test"
+	@echo "... install"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... factory_test"
+	@echo "... install/local"
+	@echo "... local_test"
+	@echo "... capi_test.o"
+	@echo "... capi_test.i"
+	@echo "... capi_test.s"
+	@echo "... dbw_test.o"
+	@echo "... dbw_test.i"
+	@echo "... dbw_test.s"
+	@echo "... factory_test.o"
+	@echo "... factory_test.i"
+	@echo "... factory_test.s"
+	@echo "... local_test.o"
+	@echo "... local_test.i"
+	@echo "... local_test.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/Simulation/Native/src/simulator/Project.sln b/src/Simulation/Native/src/simulator/Project.sln
new file mode 100644
index 00000000000..8bff94040c4
--- /dev/null
+++ b/src/Simulation/Native/src/simulator/Project.sln
@@ -0,0 +1,95 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{FF099378-3036-3F4A-9265-FD7892A6D7C4}"
+	ProjectSection(ProjectDependencies) = postProject
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318}
+		{9B43304E-07FF-358F-9B80-6563FEACD34B} = {9B43304E-07FF-358F-9B80-6563FEACD34B}
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642} = {B9922A7F-66CA-38A9-AB14-F37716C2E642}
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B} = {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}
+		{96344D76-033E-328A-A304-6957F95DA3E2} = {96344D76-033E-328A-A304-6957F95DA3E2}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{71F2A928-1769-3C7F-96A5-9D3162F0F318}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "capi_test", "capi_test.vcxproj", "{9B43304E-07FF-358F-9B80-6563FEACD34B}"
+	ProjectSection(ProjectDependencies) = postProject
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dbw_test", "dbw_test.vcxproj", "{B9922A7F-66CA-38A9-AB14-F37716C2E642}"
+	ProjectSection(ProjectDependencies) = postProject
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "factory_test", "factory_test.vcxproj", "{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}"
+	ProjectSection(ProjectDependencies) = postProject
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "local_test", "local_test.vcxproj", "{96344D76-033E-328A-A304-6957F95DA3E2}"
+	ProjectSection(ProjectDependencies) = postProject
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318}
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+		MinSizeRel|x64 = MinSizeRel|x64
+		RelWithDebInfo|x64 = RelWithDebInfo|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{FF099378-3036-3F4A-9265-FD7892A6D7C4}.Debug|x64.ActiveCfg = Debug|x64
+		{FF099378-3036-3F4A-9265-FD7892A6D7C4}.Release|x64.ActiveCfg = Release|x64
+		{FF099378-3036-3F4A-9265-FD7892A6D7C4}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{FF099378-3036-3F4A-9265-FD7892A6D7C4}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.Debug|x64.ActiveCfg = Debug|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.Debug|x64.Build.0 = Debug|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.Release|x64.ActiveCfg = Release|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.Release|x64.Build.0 = Release|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{71F2A928-1769-3C7F-96A5-9D3162F0F318}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.Debug|x64.ActiveCfg = Debug|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.Debug|x64.Build.0 = Debug|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.Release|x64.ActiveCfg = Release|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.Release|x64.Build.0 = Release|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{9B43304E-07FF-358F-9B80-6563FEACD34B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.Debug|x64.ActiveCfg = Debug|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.Debug|x64.Build.0 = Debug|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.Release|x64.ActiveCfg = Release|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.Release|x64.Build.0 = Release|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{B9922A7F-66CA-38A9-AB14-F37716C2E642}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Debug|x64.ActiveCfg = Debug|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Debug|x64.Build.0 = Debug|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Release|x64.ActiveCfg = Release|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Release|x64.Build.0 = Release|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.Debug|x64.ActiveCfg = Debug|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.Debug|x64.Build.0 = Debug|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.Release|x64.ActiveCfg = Release|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.Release|x64.Build.0 = Release|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{96344D76-033E-328A-A304-6957F95DA3E2}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {075CF7D4-156C-30A3-9A71-3554AECB5B07}
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal
diff --git a/src/Simulation/Native/src/simulator/capi.cpp b/src/Simulation/Native/src/simulator/capi.cpp
index 344445f46c4..544f1f25261 100644
--- a/src/Simulation/Native/src/simulator/capi.cpp
+++ b/src/Simulation/Native/src/simulator/capi.cpp
@@ -13,7 +13,7 @@ MICROSOFT_QUANTUM_DECL unsigned init()
 {
   return Microsoft::Quantum::Simulator::create();
 }
-  
+
 MICROSOFT_QUANTUM_DECL void destroy(_In_ unsigned id)
   {
     Microsoft::Quantum::Simulator::destroy(id);
diff --git a/src/Simulation/Native/src/simulator/capi_test.cpp b/src/Simulation/Native/src/simulator/capi_test.cpp
index 945127e260d..179b6a94960 100644
--- a/src/Simulation/Native/src/simulator/capi_test.cpp
+++ b/src/Simulation/Native/src/simulator/capi_test.cpp
@@ -107,24 +107,24 @@ void test_gates()
     allocateQubit(sim_id, 0);
     allocateQubit(sim_id, 1);
 
-     CRx(sim_id, 1.0, 0, 1);
+    dump(sim_id, "test_gatesA");
+    CRx(sim_id, 1.0, 0, 1);
 
-    assert(M(sim_id, 1)==false);
+    dump(sim_id, "test_gatesB");
+    assert(M(sim_id, 1) == false);
 
     X(sim_id, 0);
-     CRx(sim_id, 1.0, 0, 1);
+    CRx(sim_id, 1.0, 0, 1);
 
     H(sim_id, 1);
     CRz(sim_id, -1.0, 0, 1);
     H(sim_id, 1);
 
-    assert(M(sim_id, 1)==false);
+    assert(M(sim_id, 1) == false);
 
     X(sim_id, 1);
 
-    assert(M(sim_id, 1)==true);
-
-    X(sim_id, 1);
+    assert(M(sim_id, 1) == true);
 
     release(sim_id, 0);
     release(sim_id, 1);
@@ -132,7 +132,6 @@ void test_gates()
     destroy(sim_id);
 }
 
-
 void test_allocate()
 {
     auto sim_id = init();
@@ -348,14 +347,13 @@ void test_permute_basis()
     Ry(sim_id, -1.1, 3);
     CX(sim_id, 1, 2);
     H(sim_id, 1);
-
     // Dump(sim_id, "permute-end.txt");
     assert(M(sim_id, 0) == false);
     assert(M(sim_id, 1) == false);
     assert(M(sim_id, 2) == false);
     assert(M(sim_id, 3) == false);
     assert(M(sim_id, 4) == false);
-
+ 
     for (unsigned i = 0; i < nqubits + 1; ++i)
         release(sim_id, i);
     destroy(sim_id);
diff --git a/src/Simulation/Native/src/simulator/dbw_test.cpp b/src/Simulation/Native/src/simulator/dbw_test.cpp
new file mode 100644
index 00000000000..ac4df0f1448
--- /dev/null
+++ b/src/Simulation/Native/src/simulator/dbw_test.cpp
@@ -0,0 +1,240 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "simulator/capi.hpp"
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <complex>
+#include <array>
+#include <omp.h>
+#include <chrono>
+#include <regex>
+#include <string>
+#include <vector>
+#include <stdarg.h>
+
+#include "util/cpuid.hpp"
+#include "capi.hpp"
+#include <cstdarg>
+
+using namespace std;
+
+// Convenience wrappers that forward to the C API entry points with either a single control
+// (count 1 and the address of c) or a literal basis code.
+// NOTE(review): gates.hpp's Basis enum lists PauliZ = 2 and PauliY = 3, so confirm that the
+// code 2 used by Ry and the code 3 used by CRz select the axes their names suggest.
+
+// Forwards to MCX with one control (c) and target q.
+void CX(unsigned sim_id, unsigned c, unsigned q) { MCX(sim_id, 1, &c, q); }
+
+// Forwards to MCZ with one control (c) and target q.
+void CZ(unsigned sim_id, unsigned c, unsigned q) { MCZ(sim_id, 1, &c, q); }
+
+// Forwards to R with basis code 2, angle phi and target q.
+void Ry(unsigned sim_id, double phi, unsigned q) { R(sim_id, 2, phi, q); }
+
+// Forwards to MCR with basis code 3, angle phi, one control (c) and target q.
+void CRz(unsigned sim_id, double phi, unsigned c, unsigned q)
+{
+    MCR(sim_id, 3, phi, 1, &c, q);
+}
+
+// Forwards to MCR with basis code 1, angle phi, one control (c) and target q.
+void CRx(unsigned sim_id, double phi, unsigned c, unsigned q)
+{
+    MCR(sim_id, 1, phi, 1, &c, q);
+}
+
+// Prints label, the ids reported by DumpIds, then one "index:<TAB>real<TAB>imag" line per Dump entry.
+void dump(unsigned sim_id, const char* label)
+{
+    auto print_id = [](unsigned id) { std::cout << id << " "; };   // ids are space-separated
+    auto print_amplitude = [](size_t index, double re, double im) {
+        std::cout << index << ":\t" << re << '\t' << im << '\n';
+        return true;   // presumably tells Dump to keep iterating -- confirm in capi.hpp
+    };
+
+    std::cout << label << "\n" << "wave function for ids (least to most significant): [";
+    DumpIds(sim_id, print_id);
+    std::cout << "]\n";
+    Dump(sim_id, print_amplitude);
+}
+
+// Builds a synthetic gate list for k in [circStart, circStop). Each entry is the list of
+// wires one gate touches: every k > 0 contributes five {k-1, k} entries, and every k that is
+// a multiple of 5 (including 0) then contributes five {k} entries.
+std::vector<std::vector<std::int32_t>> loadPrb(int circStart, int circStop) {
+    std::vector<std::vector<std::int32_t>> rslt;
+    for (int k = circStart; k < circStop; k++) {
+        // Two-wire entries come first, followed by the optional one-wire entries.
+        if (k > 0) {
+            const std::vector<std::int32_t> edge = { k - 1, k };
+            for (int j = 0; j < 5; j++) rslt.push_back(edge);
+        }
+        if (k % 5 == 0) {
+            const std::vector<std::int32_t> single = { k };
+            for (int j = 0; j < 5; j++) rslt.push_back(single);
+        }
+    }
+    return rslt;
+}
+
+// Parses a delim-separated list of integers such as "0,3,7"; empty fields are skipped.
+std::vector<std::int32_t> splitNums(const std::string& str, char delim = ',') {
+    std::vector<std::int32_t> values;
+    for (size_t tokBegin = str.find_first_not_of(delim), tokEnd = 0; tokBegin != std::string::npos;
+         tokBegin = str.find_first_not_of(delim, tokEnd)) {
+        tokEnd = str.find(delim, tokBegin);   // npos for the final field; substr then runs to the end
+        values.push_back(std::stoi(str.substr(tokBegin, tokEnd - tokBegin)));
+    }
+    return values;
+}
+// Reads the log file fName and returns one wire-index list per gate line (throws if it cannot be opened).
+std::vector<std::vector<std::int32_t>> loadTest(char* fName,bool doClusters) {
+    std::vector<std::vector<std::int32_t>> rslt;
+    std::vector<std::int32_t> empty;   // pushed once per cluster header as a marker entry
+    string line;
+    ifstream file(fName);
+    if (!file.is_open()) throw(std::invalid_argument("Can't open input file"));
+    // phase: 0 seek "=== Original:", 1 read its gates, 2 seek "=== Clusters", 3 seek a cluster header, 4 read cluster gates, 99 stop
+    int phase = 0;
+    if (doClusters) phase = 2;   // cluster mode never visits phases 0 and 1
+    // reGate groups: (1) leading number, (2) text before '[', (3) bracket contents handed to splitNums
+    regex reOrig("^=== Original:.*[\r]?");
+    regex reGate("^\\s*(\\d+):\\s+(.+)\\[(.*)\\].*[\r]?");
+    regex reClusters("^=== Clusters.*[\r]?");
+    regex reCluster("^==== cluster\\[\\s*(\\d+)\\]:.*[\r]?");
+    smatch sm;
+    // Single pass over the file; each line is interpreted according to the current phase.
+    while (getline(file, line)) {
+        if (phase == 99) break;
+        switch (phase) {
+        case 0:   // waiting for the "=== Original:" header
+            if (regex_match(line, sm, reOrig)) phase = 1;
+            break;
+        case 1:   // gate lines of the original circuit
+            if (regex_match(line, sm, reGate)) {
+                auto qs = splitNums(sm[3]);
+                rslt.push_back(qs);
+            }
+            else phase = 99;   // first non-gate line ends the section and the parse
+            break;
+        case 2:   // waiting for the "=== Clusters" header
+            if (regex_match(line, reClusters)) 
+                phase = 3;
+            break;
+        case 4:   // gate lines of the current cluster
+            if (regex_match(line, sm, reGate)) {
+                auto qs = splitNums(sm[3]);
+                rslt.push_back(qs);
+                break;
+            }
+            else {
+                phase = 3;   // not a gate: re-examine this same line as a possible cluster header
+                [[fallthrough]];
+            }
+        case 3:   // waiting for a "==== cluster[n]:" header
+            if (regex_match(line, sm, reCluster)) {
+                rslt.push_back(empty);   // empty entry marks the start of a cluster
+                phase = 4;
+            }
+            break;
+        }
+    }
+    file.close();
+    return rslt;
+}
+
+// printf-style formatting into buf, whose capacity is bufSiz bytes including the terminator.
+void mySprintf(char* buf, int bufSiz, const char* fmt, ...) {
+    if (buf == nullptr || bufSiz <= 0) return;   // nothing can be written safely
+    va_list args;
+    va_start(args, fmt);   // standard macro; replaces the MSVC-internal __crt_va_start
+#ifdef _MSC_VER
+    vsprintf_s(buf, bufSiz, fmt, args);
+#else
+    // vsnprintf truncates to bufSiz - 1 characters plus '\0'; vsprintf ignored bufSiz and could overrun buf.
+    vsnprintf(buf, static_cast<size_t>(bufSiz), fmt, args);
+#endif
+    va_end(args);
+}
+
+// Number of qubits a gate list needs: highest wire index + 1, or 0 when no wire appears at all.
+int numQs(const std::vector<std::vector<std::int32_t>>& prb) {
+    int mx = -1;
+    for (const auto& gate : prb)   // by reference: no copy of the list or of its inner vectors
+        for (std::int32_t wire : gate) if (wire > mx) mx = wire;
+    return (mx + 1);
+}
+
+// Benchmark driver: replays the gates of advantage_44_4.log `loops` times, prints gates/sec per
+// reporting interval, and returns -1 if the best rate is under the build-dependent threshold, else 0.
+int main()
+{
+    int                     nQs;
+    vector<vector<int32_t>> prb;
+    char                    fName[30];
+
+    // Perform a small number of loops on the 4x4 advantage circuit.
+    int sizR = 4;
+    int sizC = 4;
+    int loops = 10;
+    mySprintf(fName, sizeof(fName), "advantage_%d%d_4.log", sizR, sizC);
+
+    prb         = loadTest(fName, false);
+    nQs         = numQs(prb);
+    int gateCnt = (int)prb.size();
+    double maxGps = 0.0;
+
+#ifdef NDEBUG
+    double gpsFailureThreshold = 1000.0;
+#else
+    double gpsFailureThreshold = 60.0;
+#endif
+
+    printf("==== Starting %s (%d gates), Failure threshold %.2e gps\n", fName, gateCnt, gpsFailureThreshold);
+
+    auto sim_id = init();
+    for (int q = 0; q < nQs; q++) allocateQubit(sim_id, q);
+
+    const auto start = std::chrono::steady_clock::now();   // monotonic clock for interval timing
+    // Report about ten times per run, and at least every loop, so loops < 10 cannot yield `i % 0`.
+    const int itvl = (loops >= 10) ? loops / 10 : 1;
+    for (int i = 0; i < loops; i++) {
+        for (const auto& qs : prb) {   // by reference: no per-gate vector copy inside the timed region
+            uint32_t cs[2];
+            switch (qs.size()) {
+            case 0: // No op
+                break;
+            case 1:
+                H(sim_id, qs[0]);
+                break;
+            case 2:
+                CX(sim_id, qs[0], qs[1]);
+                break;
+            case 3:
+                cs[0] = (uint32_t)qs[0];
+                cs[1] = (uint32_t)qs[1];
+                MCX(sim_id, 2, cs, qs[2]);
+                break;
+            default:
+                throw(std::invalid_argument("Didn't expect more then 3 wire gates"));
+            }
+        }
+        for (int q = 0; q < nQs; q++) M(sim_id, q);
+
+        if (i % itvl == (itvl - 1)) {
+            const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
+            // i + 1 loops have completed here; using i reported 0 gps for the first interval.
+            double gps = (double)gateCnt * (double)(i + 1) / elapsed.count();
+            printf("Loops[%4d]: GPS = %.2e\n", i, gps);
+            fflush(stdout);
+            if (gps > maxGps) maxGps = gps;
+        }
+    }
+    destroy(sim_id);
+    return (maxGps < gpsFailureThreshold) ? -1 : 0;   // explicit success code
+}
diff --git a/src/Simulation/Native/src/simulator/factory.cpp b/src/Simulation/Native/src/simulator/factory.cpp
index b80b02b35f6..b4ea5fd4e63 100644
--- a/src/Simulation/Native/src/simulator/factory.cpp
+++ b/src/Simulation/Native/src/simulator/factory.cpp
@@ -13,15 +13,19 @@ namespace Microsoft
   {
     namespace SimulatorGeneric
     {
-      MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
+      Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
     }
     namespace SimulatorAVX
     {
-      MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
+      Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
     }
     namespace SimulatorAVX2
     {
-      MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
+      Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
+    }
+    namespace SimulatorAVX512
+    {
+      Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
     }
   }
 }
@@ -35,11 +39,15 @@ namespace Microsoft
       std::shared_mutex _mutex;
       std::vector<std::shared_ptr<SimulatorInterface>> _psis;
 
-      SimulatorInterface* createSimulator(unsigned maxlocal)
-      {
-        if (haveFMA() && haveAVX2())
+     SimulatorInterface* createSimulator(unsigned maxlocal)
+     {
+       if (haveAVX512())
+        {
+            return SimulatorAVX512::createSimulator(maxlocal);
+        }
+        else if (haveFMA() && haveAVX2())
         {
-          return SimulatorAVX2::createSimulator(maxlocal);
+           return SimulatorAVX2::createSimulator(maxlocal);
         }
         else if(haveAVX())
         {
diff --git a/src/Simulation/Native/src/simulator/gates.hpp b/src/Simulation/Native/src/simulator/gates.hpp
index 3458a3295f0..15034067c06 100644
--- a/src/Simulation/Native/src/simulator/gates.hpp
+++ b/src/Simulation/Native/src/simulator/gates.hpp
@@ -16,448 +16,453 @@
 
 namespace Microsoft
 {
-namespace Quantum
-{
-namespace SIMULATOR
-{
-namespace Gates
-{
-
-/// a type for runtime basis specification
-enum Basis
-{
-    PauliI = 0,
-    PauliX = 1,
-    PauliY = 3,
-    PauliZ = 2
-};
-
-/// a general one qubit gate, storing the qubit number
-class OneQubitGate
-{
-  public:
-    OneQubitGate(unsigned q) : qubit_(q)
-    {
-    }
-    unsigned qubit() const
-    {
-        return qubit_;
-    }
-
-  private:
-    unsigned qubit_;
-};
-
-/// a general one qubit roitation gate, storing the qubit number and angle
-class RotationGate : public OneQubitGate
-{
-  public:
-    RotationGate(double phi, unsigned q) : OneQubitGate(q), angle_(phi)
+    namespace Quantum
     {
-    }
-    double angle() const
-    {
-        return angle_;
-    }
-
-  private:
-    double angle_;
-};
-
-/// The Pauli X gate
-class X : public OneQubitGate
-{
-  public:
-    X(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "X";
-    }
-
-    TinyMatrix<RealType, 2> matrix() const
-    {
-        RealType mat[2][2] = {{0., 1.}, {1., 0.}};
-        return TinyMatrix<RealType, 2>(mat);
-    }
-};
-
-/// The Pauli Y gate
-class Y : public OneQubitGate
-{
-  public:
-    Y(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "Y";
-    }
-
-    TinyMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        val_t mat[2][2] = {{val_t(0.), val_t(0.)}, {val_t(0.), val_t(0.)}};
-        mat[0][1] = val_t(0., -1.);
-        mat[1][0] = val_t(0., 1.);
-        return TinyMatrix<val_t, 2>(mat);
-    }
-};
-
-/// The Pauli Z gate
-class Z : public OneQubitGate
-{
-  public:
-    Z(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "Z";
-    }
-
-    DiagMatrix<RealType, 2> matrix() const
-    {
-        RealType diag[2] = {1., -1.};
-        return DiagMatrix<RealType, 2>(diag);
-    }
-};
-
-/// The Hadamard gate
-class H : public OneQubitGate
-{
-  public:
-    H(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "H";
-    }
-
-    TinyMatrix<RealType, 2> matrix() const
-    {
-        RealType r = std::sqrt(0.5);
-        RealType mat[2][2] = {{r, r}, {r, -r}};
-        return TinyMatrix<RealType, 2, 2>(mat);
-    }
-};
-
-/// The Y-version of a Hadamard gate
-class HY : public OneQubitGate
-{
-  public:
-    HY(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "HY";
-    }
-
-    TinyMatrix<ComplexType, 2> matrix() const
-    {
-        ComplexType r(std::sqrt(0.5), 0.);
-        ComplexType i(0., std::sqrt(0.5));
-        ComplexType mat[2][2] = {{r, r}, {i, -i}};
-        return TinyMatrix<ComplexType, 2>(mat);
-    }
-};
-
-/// The adjoint Y-version of a Hadamard gate
-class AdjHY : public OneQubitGate
-{
-  public:
-    AdjHY(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "AdjHY";
-    }
-
-    TinyMatrix<ComplexType, 2> matrix() const
-    {
-        ComplexType r(std::sqrt(0.5), 0.);
-        ComplexType i(0., std::sqrt(0.5));
-        ComplexType mat[2][2] = {{r, -i}, {r, i}};
-        return TinyMatrix<ComplexType, 2>(mat);
-    }
-};
-
-/// The S (phase) gate
-class S : public OneQubitGate
-{
-  public:
-    S(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "S";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        val_t diag[2] = {val_t(1.), val_t(0., 1.)};
-        return DiagMatrix<val_t, 2>(diag);
-    }
-};
-
-/// The adjoint of the S (phase) gate
-class AdjS : public OneQubitGate
-{
-  public:
-    AdjS(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "AdjS";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        val_t diag[2] = {val_t(1.), val_t(0., -1.)};
-        return DiagMatrix<val_t, 2>(diag);
-    }
-};
-
-/// The T (pi/8) gate
-class T : public OneQubitGate
-{
-  public:
-    T(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "T";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        RealType r = std::sqrt(0.5);
-        val_t diag[2] = {val_t(1.), val_t(r, r)};
-        return DiagMatrix<val_t, 2>(diag);
-    }
-};
-
-/// The T (pi/8) gate
-class AdjT : public OneQubitGate
-{
-  public:
-    AdjT(unsigned q) : OneQubitGate(q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "AdjT";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        RealType r = std::sqrt(0.5);
-        val_t diag[2] = {val_t(1.), val_t(r, -r)};
-        return DiagMatrix<val_t, 2>(diag);
-    }
-};
-
-/// The G gate
-class G : public RotationGate
-{
-  public:
-    G(RealType phi, unsigned q) : RotationGate(phi, q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "G";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        DiagMatrix<ComplexType, 2> d;
-        ComplexType arg(0., 0.5 * angle());
-        d(0, 0) = d(1, 1) = std::exp(-arg);
-        return d;
-    }
-};
-
-/// The Rx gate
-class Rx : public RotationGate
-{
-  public:
-    Rx(RealType phi, unsigned q) : RotationGate(phi, q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "Rx";
-    }
-
-    TinyMatrix<ComplexType, 2> matrix() const
-    {
-        using val_t = ComplexType;
-        val_t s(0., -std::sin(0.5 * angle()));
-        val_t c = std::cos(0.5 * angle());
-        val_t mat[2][2] = {{c, s}, {s, c}};
-        return TinyMatrix<val_t, 2>(mat);
-    }
-};
-
-/// The Ry gate
-class Ry : public RotationGate
-{
-  public:
-    Ry(RealType phi, unsigned q) : RotationGate(phi, q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "Ry";
-    }
-
-    TinyMatrix<RealType, 2> matrix() const
-    {
-        RealType s = std::sin(0.5 * angle());
-        RealType c = std::cos(0.5 * angle());
-        RealType mat[2][2] = {{c, -s}, {s, c}};
-        ;
-        return TinyMatrix<RealType, 2>(mat);
-    }
-};
-
-/// The Rz gate
-class Rz : public RotationGate
-{
-  public:
-    Rz(RealType phi, unsigned q) : RotationGate(phi, q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "Rz";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        DiagMatrix<ComplexType, 2> d;
-        ComplexType arg(0., 0.5 * angle());
-        d(0, 0) = std::exp(-arg);
-        d(1, 1) = std::exp(arg);
-        return d;
-    }
-};
-
-/// The R1 gate
-class R1 : public RotationGate
-{
-  public:
-    R1(RealType phi, unsigned q) : RotationGate(phi, q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "R1";
-    }
-
-    DiagMatrix<ComplexType, 2> matrix() const
-    {
-        DiagMatrix<ComplexType, 2> d;
-        ComplexType arg(0., angle());
-        d(0, 0) = 1.;
-        d(1, 1) = std::exp(-arg);
-        return d;
-    }
-};
-
-/// The R1 gate
-class R1Frac : public R1
-{
-  public:
-    R1Frac(int k, int n, unsigned q) : R1(M_PI * static_cast<RealType>(k) / static_cast<RealType>(1 << n), q)
-    {
-    }
-
-    std::string name() const
-    {
-        return "R1Frac";
-    }
-};
-
-/// The R gate for rotation around an arbitrary basis
-class R : public RotationGate
-{
-  public:
-    R(Basis b, RealType phi, unsigned q) : RotationGate(phi, q), b_(b)
-    {
-    }
-
-    std::string name() const
-    {
-        return "R";
-    }
-
-    TinyMatrix<ComplexType, 2> matrix() const
-    {
-        switch (b_)
+        namespace SIMULATOR
         {
-            case PauliI:
-                return G(angle(), qubit()).matrix();
-                break;
-            case PauliX:
-                return Rx(angle(), qubit()).matrix();
-                break;
-            case PauliY:
-                return Ry(angle(), qubit()).matrix();
-                break;
-            case PauliZ:
-                return Rz(angle(), qubit()).matrix();
-                break;
-            default:
-                assert(false);
+            namespace Gates
+            {
+
+                /// Runtime selector for a Pauli basis; the R gate uses it to choose which rotation matrix to build.
+                enum Basis
+                {
+                    PauliI = 0,
+                    PauliX = 1,
+                    PauliY = 3, // NOTE(review): Y = 3 / Z = 2 is not sequential; presumably mirrors an external Pauli encoding - confirm before renumbering
+                    PauliZ = 2
+                };
+
+                /// Base class for single-qubit gates: stores the qubit number and declares the virtual matrix() accessor.
+                class OneQubitGate
+                {
+                public:
+                    OneQubitGate(unsigned q) : qubit_(q)
+                    {
+                    }
+                    unsigned qubit() const // the qubit number passed to the constructor
+                    {
+                        return qubit_;
+                    }
+
+                    virtual TinyMatrix<ComplexType, 2> matrix() const // derived gates return their 2x2 matrix here
+                    {
+                        throw "Calling overriden function"; // base version never returns; note that it throws a const char*, not a std::exception
+                    }
+
+                private:
+                    unsigned qubit_;
+                };
+
+                /// A general one-qubit rotation gate, storing the qubit number and the rotation angle.
+                class RotationGate : public OneQubitGate
+                {
+                public:
+                    RotationGate(double phi, unsigned q) : OneQubitGate(q), angle_(phi) // phi is stored as double regardless of RealType
+                    {
+                    }
+                    double angle() const // the angle passed to the constructor, unchanged
+                    {
+                        return angle_;
+                    }
+
+                private:
+                    double angle_;
+                };
+
+                /// The Pauli X gate: matrix [[0, 1], [1, 0]]
+                class X : public OneQubitGate
+                {
+                public:
+                    X(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "X";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        RealType mat[2][2] = { {0., 1.}, {1., 0.} };
+                        return TinyMatrix<ComplexType, 2>(TinyMatrix<RealType, 2>(mat)); // explicit real -> complex conversion, same form as Ry
+                    }
+                };
+
+                /// The Pauli Y gate: matrix [[0, -i], [i, 0]]
+                class Y : public OneQubitGate
+                {
+                public:
+                    Y(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "Y";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        val_t mat[2][2] = { {val_t(0.), val_t(0.)}, {val_t(0.), val_t(0.)} }; // start from the zero matrix
+                        mat[0][1] = val_t(0., -1.); // -i
+                        mat[1][0] = val_t(0., 1.);  // +i
+                        return TinyMatrix<val_t, 2>(mat);
+                    }
+                };
+
+                /// The Pauli Z gate: diagonal matrix diag(1, -1)
+                class Z : public OneQubitGate
+                {
+                public:
+                    Z(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "Z";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        RealType diag[2] = { 1., -1. };
+                        return TinyMatrix<ComplexType, 2>(DiagMatrix<RealType, 2>(diag)); // built as a real diagonal, then converted to the complex matrix type
+                    }
+                };
+
+                /// The Hadamard gate: sqrt(1/2) * [[1, 1], [1, -1]]
+                class H : public OneQubitGate
+                {
+                public:
+                    H(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "H";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        RealType r = std::sqrt(0.5); // 1/sqrt(2)
+                        RealType mat[2][2] = { {r, r}, {r, -r} };
+                        return TinyMatrix<ComplexType, 2>(TinyMatrix<RealType, 2, 2>(mat)); // real matrix converted to the complex return type
+                    }
+                };
+
+                /// The Y-version of a Hadamard gate: sqrt(1/2) * [[1, 1], [i, -i]]
+                class HY : public OneQubitGate
+                {
+                public:
+                    HY(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "HY";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        ComplexType r(std::sqrt(0.5), 0.); // real entry 1/sqrt(2)
+                        ComplexType i(0., std::sqrt(0.5)); // imaginary entry i/sqrt(2)
+                        ComplexType mat[2][2] = { {r, r}, {i, -i} };
+                        return TinyMatrix<ComplexType, 2>(mat);
+                    }
+                };
+
+                /// The adjoint Y-version of a Hadamard gate: sqrt(1/2) * [[1, -i], [1, i]]
+                class AdjHY : public OneQubitGate
+                {
+                public:
+                    AdjHY(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "AdjHY";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        ComplexType r(std::sqrt(0.5), 0.); // real entry 1/sqrt(2)
+                        ComplexType i(0., std::sqrt(0.5)); // imaginary entry i/sqrt(2)
+                        ComplexType mat[2][2] = { {r, -i}, {r, i} }; // conjugate transpose of the HY matrix
+                        return TinyMatrix<ComplexType, 2>(mat);
+                    }
+                };
+
+                /// The S (phase) gate: diagonal matrix diag(1, i)
+                class S : public OneQubitGate
+                {
+                public:
+                    S(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "S";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        val_t diag[2] = { val_t(1.), val_t(0., 1.) }; // 1 and i
+                        return TinyMatrix<ComplexType, 2>(DiagMatrix<val_t, 2>(diag));
+                    }
+                };
+
+                /// The adjoint of the S (phase) gate: diagonal matrix diag(1, -i)
+                class AdjS : public OneQubitGate
+                {
+                public:
+                    AdjS(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "AdjS";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        val_t diag[2] = { val_t(1.), val_t(0., -1.) }; // 1 and -i
+                        return TinyMatrix<ComplexType, 2>(DiagMatrix<val_t, 2>(diag));
+                    }
+                };
+
+                /// The T (pi/8) gate: diagonal matrix diag(1, (1 + i)/sqrt(2))
+                class T : public OneQubitGate
+                {
+                public:
+                    T(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "T";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        RealType r = std::sqrt(0.5); // 1/sqrt(2)
+                        val_t diag[2] = { val_t(1.), val_t(r, r) }; // second entry is r + i*r
+                        return TinyMatrix<ComplexType, 2>(DiagMatrix<val_t, 2>(diag));
+                    }
+                };
+
+                /// The adjoint of the T (pi/8) gate: diagonal matrix diag(1, (1 - i)/sqrt(2))
+                class AdjT : public OneQubitGate
+                {
+                public:
+                    AdjT(unsigned q) : OneQubitGate(q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "AdjT";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        RealType r = std::sqrt(0.5); // 1/sqrt(2)
+                        val_t diag[2] = { val_t(1.), val_t(r, -r) }; // second entry is r - i*r
+                        return TinyMatrix<ComplexType, 2>(DiagMatrix<val_t, 2>(diag));
+                    }
+                };
+
+                /// The G gate: applies the same factor exp(-i * phi / 2) to both diagonal entries
+                class G : public RotationGate
+                {
+                public:
+                    G(RealType phi, unsigned q) : RotationGate(phi, q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "G";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        DiagMatrix<ComplexType, 2> d;
+                        ComplexType arg(0., 0.5 * angle()); // i * phi / 2
+                        d(0, 0) = d(1, 1) = std::exp(-arg);
+                        return TinyMatrix<ComplexType, 2>(d);
+                    }
+                };
+
+                /// The Rx gate: [[cos(phi/2), -i sin(phi/2)], [-i sin(phi/2), cos(phi/2)]]
+                class Rx : public RotationGate
+                {
+                public:
+                    Rx(RealType phi, unsigned q) : RotationGate(phi, q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "Rx";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        using val_t = ComplexType;
+                        val_t s(0., -std::sin(0.5 * angle())); // off-diagonal entry: -i * sin(phi/2)
+                        val_t c = std::cos(0.5 * angle());     // diagonal entry: cos(phi/2)
+                        val_t mat[2][2] = { {c, s}, {s, c} };
+                        return TinyMatrix<val_t, 2>(mat);
+                    }
+                };
+
+                /// The Ry gate: real matrix [[cos(phi/2), -sin(phi/2)], [sin(phi/2), cos(phi/2)]]
+                class Ry : public RotationGate
+                {
+                public:
+                    Ry(RealType phi, unsigned q) : RotationGate(phi, q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "Ry";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        RealType s = std::sin(0.5 * angle());
+                        RealType c = std::cos(0.5 * angle());
+                        RealType mat[2][2] = { {c, -s}, {s, c} };
+                        // the matrix is purely real; widen it to the complex return type
+                        return TinyMatrix<ComplexType, 2>(TinyMatrix<RealType, 2>(mat));
+                    }
+                };
+
+                /// The Rz gate: diagonal matrix diag(exp(-i * phi / 2), exp(i * phi / 2))
+                class Rz : public RotationGate
+                {
+                public:
+                    Rz(RealType phi, unsigned q) : RotationGate(phi, q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "Rz";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        DiagMatrix<ComplexType, 2> d;
+                        ComplexType arg(0., 0.5 * angle()); // i * phi / 2
+                        d(0, 0) = std::exp(-arg);
+                        d(1, 1) = std::exp(arg);
+                        return TinyMatrix<ComplexType, 2>(d);
+                    }
+                };
+
+                /// The R1 gate: diagonal matrix diag(1, exp(-i * phi)) as implemented here
+                class R1 : public RotationGate
+                {
+                public:
+                    R1(RealType phi, unsigned q) : RotationGate(phi, q)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "R1";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        DiagMatrix<ComplexType, 2> d;
+                        ComplexType arg(0., angle()); // i * phi (full angle, unlike Rz which halves it)
+                        d(0, 0) = 1.;
+                        d(1, 1) = std::exp(-arg);
+                        return TinyMatrix<ComplexType, 2>(d);
+                    }
+                };
+
+                /// The R1 gate with a fractional angle: constructs R1 with phi = pi * k / 2^n
+                class R1Frac : public R1
+                {
+                public:
+                    R1Frac(int k, int n, unsigned q) : R1(M_PI* static_cast<RealType>(k) / static_cast<RealType>(1ll << n), q) // 1ll keeps the shift in long long; NOTE(review): n is presumably small and non-negative - confirm against callers
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "R1Frac";
+                    }
+                };
+
+                /// The R gate: a rotation whose matrix is chosen at run time from the stored Pauli basis
+                class R : public RotationGate
+                {
+                public:
+                    R(Basis b, RealType phi, unsigned q) : RotationGate(phi, q), b_(b)
+                    {
+                    }
+
+                    std::string name() const
+                    {
+                        return "R";
+                    }
+
+                    TinyMatrix<ComplexType, 2> matrix() const override // override: compiler checks the signature against OneQubitGate::matrix
+                    {
+                        switch (b_) // delegate to the gate matching the basis: I -> G, X -> Rx, Y -> Ry, Z -> Rz
+                        {
+                        case PauliI:
+                            return G(angle(), qubit()).matrix();
+                            break;
+                        case PauliX:
+                            return Rx(angle(), qubit()).matrix();
+                            break;
+                        case PauliY:
+                            return Ry(angle(), qubit()).matrix();
+                            break;
+                        case PauliZ:
+                            return Rz(angle(), qubit()).matrix();
+                            break;
+                        default:
+                            assert(false); // b_ holds a value outside the Basis enumerators
+                        }
+                        // only reached for an invalid basis when assert is compiled out: returns a default-constructed matrix
+                        return TinyMatrix<ComplexType, 2>();
+                    }
+
+                private:
+                    Basis b_;
+                };
+
+                class RFrac : public R // the R gate with a fractional angle: phi = -2 * pi * k / 2^n
+                {
+                public:
+                    RFrac(Basis b, int k, int n, unsigned q) : R(b, -2. * M_PI * static_cast<RealType>(k) / static_cast<RealType>(1ll << n), q) // 1ll keeps the shift in long long; NOTE(review): n is presumably small and non-negative - confirm against callers
+                    {
+                    }
+                    std::string name() const
+                    {
+                        return "RFrac";
+                    }
+                };
+            }
         }
-        // dummy return
-        return TinyMatrix<ComplexType, 2>();
-    }
-
-  private:
-    Basis b_;
-};
-
-class RFrac : public R
-{
-  public:
-    RFrac(Basis b, int k, int n, unsigned q) : R(b, -2. * M_PI * static_cast<RealType>(k) / static_cast<RealType>(1 << n), q)
-    {
-    }
-    std::string name() const
-    {
-        return "RFrac";
     }
-};
-}
-}
-}
 }
diff --git a/src/Simulation/Native/src/simulator/kernels.hpp b/src/Simulation/Native/src/simulator/kernels.hpp
index 68a710de36a..21447c5b535 100644
--- a/src/Simulation/Native/src/simulator/kernels.hpp
+++ b/src/Simulation/Native/src/simulator/kernels.hpp
@@ -398,7 +398,6 @@ void subsytemwavefunction_by_pivot(std::vector<T, A1> const& wfn,
 
     std::vector<size_t> chunks;
     
-#pragma omp parallel
     {
 #pragma omp single
         chunks = split_interval_in_chunks(max, omp_get_num_threads());
@@ -446,7 +445,6 @@ bool istensorproduct(std::vector<T, A1> const& wfn,
     std::size_t compl_st = compl_bits.to_ullong();
 
     std::atomic<bool> go(true);
-#pragma omp parallel
     {
         int thread_id = omp_get_thread_num();
         if (thread_id < chunks.size() - 1)
diff --git a/src/Simulation/Native/src/simulator/local_test.cpp b/src/Simulation/Native/src/simulator/local_test.cpp
index 36820b89852..6f91d06b8fc 100644
--- a/src/Simulation/Native/src/simulator/local_test.cpp
+++ b/src/Simulation/Native/src/simulator/local_test.cpp
@@ -9,7 +9,6 @@
 
 using namespace Microsoft::Quantum::SIMULATOR;
 
-
 void test_exp()
 {
     SimulatorType sim;
@@ -324,7 +323,6 @@ void test_extract_qubits_state()
     test_extract_qubits_cat_state(4, {1, 2}, {1, 3});
     test_extract_qubits_cat_state(4, {1, 3}, {0, 1});
     test_extract_qubits_cat_state(4, {2, 3}, {1, 2});
-
     test_extract_qubits_cat_state(12, {2, 4, 5, 6, 7}, {0, 1, 2});
     test_extract_qubits_cat_state(6, {0, 1, 3}, {0, 1});
     test_extract_qubits_cat_state(10, {0, 5}, {5, 6});
diff --git a/src/Simulation/Native/src/simulator/simulator.hpp b/src/Simulation/Native/src/simulator/simulator.hpp
index 453fa4d1b89..41efd245de9 100644
--- a/src/Simulation/Native/src/simulator/simulator.hpp
+++ b/src/Simulation/Native/src/simulator/simulator.hpp
@@ -99,6 +99,7 @@ namespace SIMULATOR
     bool release(unsigned q)
     {
       recursive_lock_type l(mutex());
+      flush();
       bool allok = isclassical(q);
       if (allok)
       allok = (psi.getvalue(q)==false);
diff --git a/src/Simulation/Native/src/simulator/simulatoravx512.cpp b/src/Simulation/Native/src/simulator/simulatoravx512.cpp
new file mode 100644
index 00000000000..88bc438dccc
--- /dev/null
+++ b/src/Simulation/Native/src/simulator/simulatoravx512.cpp
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#define HAVE_INTRINSICS
+#define HAVE_AVX512
+#define HAVE_FMA
+
+#include "simulator/simulator.hpp"
+
+
+namespace sim = Microsoft::Quantum::SimulatorAVX512;
+
+MICROSOFT_QUANTUM_DECL Microsoft::Quantum::Simulator::SimulatorInterface* sim::createSimulator(unsigned maxlocal) // factory for the SimulatorAVX512 namespace variant
+{
+  return new sim::SimulatorType(maxlocal); // heap-allocated and returned as a raw pointer; NOTE(review): the caller presumably owns and deletes it - confirm
+}
diff --git a/src/Simulation/Native/src/simulator/wavefunction.hpp b/src/Simulation/Native/src/simulator/wavefunction.hpp
index 1d3a390642c..748aef30787 100644
--- a/src/Simulation/Native/src/simulator/wavefunction.hpp
+++ b/src/Simulation/Native/src/simulator/wavefunction.hpp
@@ -12,6 +12,9 @@
 #include <limits>
 #include <random>
 #include <vector>
+#include <unordered_set>
+#include <unordered_map>
+#include <string.h>
 
 #include "types.hpp"
 #include "gates.hpp"
@@ -20,41 +23,122 @@
 
 namespace Microsoft
 {
-namespace Quantum
-{
-namespace SIMULATOR
-{
-    namespace detail
+    namespace Quantum
     {
-        inline std::size_t get_register(const std::vector<unsigned>& qs, std::size_t basis_state)
-        {
-            std::size_t result = 0;
-            for (unsigned i = 0; i < qs.size(); ++i)
-                result |= ((basis_state >> qs[i]) & 1) << i;
-            return result;
-
-        }
-
-        inline std::size_t set_register(const std::vector<unsigned>& qs, std::size_t qmask, std::size_t basis_state, std::size_t original = 0ull)
+        namespace SIMULATOR
         {
-            std::size_t result = original & ~qmask;
-            for (unsigned i = 0; i < qs.size(); ++i)
-                result |= ((basis_state >> i) & 1) << qs[i];
-            return result;
-        }
-    }
+            namespace detail // bit-manipulation helpers for packing/unpacking a qubit register in a basis-state index
+            {
+                inline std::size_t get_register(const std::vector<unsigned>& qs, std::size_t basis_state) // gathers the bits of basis_state at positions qs[0], qs[1], ... into a compact value
+                {
+                    std::size_t result = 0;
+                    for (unsigned i = 0; i < qs.size(); ++i)
+                        result |= ((basis_state >> qs[i]) & 1) << i; // bit qs[i] of the input becomes bit i of the result
+                    return result;
+
+                }
+
+                inline std::size_t set_register(const std::vector<unsigned>& qs, std::size_t qmask, std::size_t basis_state, std::size_t original = 0ull) // scatters the low bits of basis_state to positions qs[i] on top of original
+                {
+                    std::size_t result = original & ~qmask; // clear the bits selected by qmask; NOTE(review): qmask presumably has exactly the qs bits set - confirm against callers
+                    for (unsigned i = 0; i < qs.size(); ++i)
+                        result |= ((basis_state >> i) & 1) << qs[i]; // bit i of the input becomes bit qs[i] of the result
+                    return result;
+                }
+            }
+
+            // Wraps one gate for clustering: its control qubits, its target qubit and its 2x2 matrix.
+            class GateWrapper {
+            public:
+                GateWrapper(std::vector<unsigned> controls, unsigned target, TinyMatrix<ComplexType, 2> mat) : controls_(controls), target_(target), mat_(mat) {}
+                std::vector<unsigned> get_controls() const { return controls_; } // copy of the control qubit ids (empty for an uncontrolled gate)
+                unsigned get_target() const { return target_; }                  // the target qubit id
+                TinyMatrix<ComplexType, 2> get_mat() const { return mat_; }      // copy of the gate matrix
+            private:
+                std::vector<unsigned> controls_;
+                unsigned target_;
+                TinyMatrix<ComplexType, 2> mat_;
+            };
+
+            // A cluster for the scheduling logic: a list of gates plus the qubit ids those gates touch.
+            class Cluster {
+            public:
+                Cluster(std::vector<unsigned> qids, std::vector<GateWrapper> gates) : qids_(qids), gates_(gates) {}
+                std::vector<unsigned> get_qids() const { return qids_; }        // copy of the qubit ids
+                std::vector<GateWrapper> get_gates() const { return gates_; }   // copy of the gate list
+
+                void setQids(std::vector<unsigned> qids) {
+                    qids_ = qids;
+                }
+
+                void append_gates(std::vector<GateWrapper> gates) {
+                    gates_.insert(gates_.end(), gates.begin(), gates.end());
+                }
+
+                size_t size() const {
+                    return gates_.size();
+                }
+
+                // Greedy search for the next cluster that can be merged into this one. On success the cluster is removed from nextClusters and returned with the merged qubit list; otherwise an empty cluster and an empty list are returned.
+                std::pair<Cluster, std::vector<unsigned>> next_cluster(std::vector<Cluster>& nextClusters, unsigned maxWidth) const {
+                    std::vector<unsigned>   myUnion;                                // My qubits touched + Next qubits touched
+                    std::vector<unsigned>   myDiff;                                 // New qubits touched by Next
+                    std::vector<unsigned>   myInter;                                // Old qubits touched by Next
+                    std::vector<unsigned>   allInter;                               // New qubits of Next that an earlier skipped cluster already touched
+                    std::set<unsigned>      myTouched(qids_.begin(), qids_.end());  // My qubits touched; NOTE(review): needs <set>, <algorithm>, <iterator> - confirm they are included
+                    std::set<unsigned>      allTouched = myTouched;                 // All the qubits touched so far
+
+                    int lastNexts = (int)nextClusters.size() - 1;                   // nexts are stored in reverse order, so walk from the back
+                    for (int i = 0; i <= lastNexts; i++) {                          // Look at the clusters that follow us
+                        auto   nextQs = nextClusters[lastNexts-i].get_qids();       // Pull off one future cluster
+                        std::sort(nextQs.begin(), nextQs.end());                    // Has to be sorted for set operations
+                        myUnion.clear();
+                        std::set_union(nextQs.begin(), nextQs.end(),                // See what qubits we and the future cluster touch
+                            myTouched.begin(), myTouched.end(),
+                            std::back_inserter(myUnion));
+                        if (myUnion.size() <= maxWidth) {                           // It's a candidate if it's not beyond our allowed width
+                            myDiff.clear();
+                            std::set_difference(nextQs.begin(), nextQs.end(),       // Figure out if any of the future qubits aren't already seen by us
+                                myTouched.begin(), myTouched.end(),
+                                std::back_inserter(myDiff));
+                            allInter.clear();
+                            std::set_intersection(myDiff.begin(), myDiff.end(),     // These are any new qubits that might have already been touched
+                                allTouched.begin(), allTouched.end(),
+                                std::back_inserter(allInter));
+                            if (allInter.size() == 0) {                             // If the new qubits are untouched... then this is allowed
+                                auto cl = nextClusters[lastNexts-i];
+                                nextClusters.erase(nextClusters.begin() + (lastNexts-i));       // Remove the future cluster
+                                return std::make_pair(cl, myUnion);                 // ... and hand it back together with the merged qubit list
+                            }
+                        }
+                        myInter.clear();
+                        std::set_intersection(nextQs.begin(), nextQs.end(),         // If a future cluster touches any of our qubits... we've hit a hard wall
+                            myTouched.begin(), myTouched.end(),
+                            std::back_inserter(myInter));
+                        if (myInter.size() != 0) break;
+
+                        allTouched.insert(nextQs.begin(), nextQs.end());            // Add in all qubits touched, and try the next cluster
+                    }
+                    Cluster defCl = Cluster({}, {});                                // Couldn't find any more clusters to add
+                    std::vector<unsigned> defVec = {};
+                    return std::make_pair(defCl, defVec);
+                }
+            private:
+                std::vector<unsigned> qids_;
+                std::vector<GateWrapper> gates_;
+            };
 
 /// A wave function class to store and manipulate the state of qubits
 
 template <class T = ComplexType>
 class Wavefunction
 {
-  public:
+public:
     using value_type = T;
     using qubit_t = unsigned;
     using RngEngine = std::mt19937;
 
-    constexpr qubit_t invalid_qubit() const { return std::numeric_limits<qubit_t>::max();}
+    constexpr qubit_t invalid_qubit() const { return std::numeric_limits<qubit_t>::max(); }
 
     /// allocate a wave function for zero qubits
     Wavefunction(unsigned /*ignore*/) : num_qubits_(0), wfn_(1, 1.), usage_(0)
@@ -74,7 +158,7 @@ class Wavefunction
 
     ~Wavefunction()
     {
-      flush();
+        flush();
     }
 
     unsigned qubit(unsigned q) const
@@ -90,10 +174,34 @@ class Wavefunction
 
     void flush() const
     {
-        fused_.flush(wfn_);
+        int maxSpan = fused_.maxSpan(); // passed to make_clusters as its fuseSpan limit
+        auto clusters = make_clusters(maxSpan, gatelist_); // making clusters with gates in the queue
+
+        if (clusters.empty()) {
+            fused_.flush(wfn_); // no clusters were built: just flush the fuser as before
+        }
+        else {
+            // apply the gates cluster by cluster, flushing the fuser after each cluster
+            for (std::size_t i = 0; i < clusters.size(); i++) { // unsigned index: no signed/unsigned comparison with size()
+                Cluster& cl = clusters.at(i); // reference: avoids deep-copying the cluster on every iteration
+
+                for (GateWrapper& gate : cl.get_gates()) { // iterate the returned gate list in place instead of copying each gate
+                    std::vector<unsigned> cs = gate.get_controls();
+                    if (cs.size() == 0) {
+                        fused_.apply(wfn_, gate.get_mat(), qubit(gate.get_target())); // uncontrolled gate
+                    }
+                    else {
+                        fused_.apply_controlled(wfn_, gate.get_mat(), qubits(cs), qubit(gate.get_target())); // controlled gate
+                    }
+                }
+
+                fused_.flush(wfn_);
+            }
+        }
+        gatelist_.clear(); // the queue has been consumed
     }
 
-	/// allocate a qubit and grow the wave function
+    /// allocate a qubit and grow the wave function
     unsigned allocate()
     {
         assert(usage_ != 2);
@@ -113,7 +221,7 @@ class Wavefunction
         }
     }
 
-	/// allocate a qubit and grow the wave function
+    /// allocate a qubit and grow the wave function
     void allocateQubit(unsigned id)
     {
         assert(usage_ != 1);
@@ -134,7 +242,7 @@ class Wavefunction
     /// \pre the qubit has to be in a classical state in the computational basis
     void release(qubit_t q)
     {
-        unsigned p = qubit(q);
+        unsigned p = qubit(q); //returns qubitmap_[q]
         flush();
         kernels::collapse(wfn_, p, getvalue(q), true);
         for (int i = 0; i < qubitmap_.size(); ++i)
@@ -196,9 +304,9 @@ class Wavefunction
     }
 
     void apply_controlled_exp(std::vector<Gates::Basis> const& bs,
-                              double phi,
-                              std::vector<unsigned> const& cs,
-                              std::vector<unsigned> const& qs)
+        double phi,
+        std::vector<unsigned> const& cs,
+        std::vector<unsigned> const& qs)
     {
         flush();
         kernels::apply_controlled_exp(wfn_, bs, phi, qubits(cs), qubits(qs));
@@ -238,11 +346,66 @@ class Wavefunction
         rng_.seed(s);
     }
 
+    // Builds the clusters to be flushed: one cluster per gate first, then greedy merging for widths 1..fuseSpan
+    std::vector<Cluster> make_clusters(unsigned fuseSpan, std::vector<GateWrapper> gates) const {
+        std::vector<Cluster> curClusters;
+
+        if (gates.size() > 0) {  // an empty gate list falls through and returns an empty cluster list
+            // Create the initial clusters, each containing exactly one gate
+            for (int i = 0; i < gates.size(); i++) {
+                std::vector<unsigned> qids;
+                std::vector<unsigned> controlQids = gates[i].get_controls();
+                if (controlQids.size() > 0) {
+                    qids = controlQids;
+                }
+                qids.push_back(gates[i].get_target());  // qids = the gate's controls followed by its target
+                Cluster newCl = Cluster(qids, { gates[i] });
+                curClusters.push_back(newCl);
+            }
+            // Merge the clusters using a greedy algorithm
+            for (int i = 1; i < (int)fuseSpan + 1; i++) {                                   // Build clusters of width 1,2,...,fuseSpan
+                std::reverse(curClusters.begin(), curClusters.end());                       // Keep everything in reverse order
+                auto prevClusters = curClusters;                                            // Save away the last set of clusters built
+                curClusters.clear();
+                auto prevCluster = prevClusters.back();                                     // Pop the first cluster
+                prevClusters.pop_back();
+                while (prevClusters.size() > 0)  {                                          // While there are more clusters...
+                    auto foundCompat = prevCluster.next_cluster(prevClusters, i);           // See if we can accumulate anyone who follows
+                    Cluster clusterFound = foundCompat.first;
+                    std::vector<unsigned> foundTotQids = foundCompat.second;
+                    if (clusterFound.get_gates().size() == 0 ||                             // Can't append any more clusters to this one
+                        (int)prevCluster.size() >= fused_.maxDepth()) {                     // ... or we've reached max depth
+                        curClusters.push_back(prevCluster);                                 // Save this cluster
+                        if (prevCluster.size() > 0)  // NOTE(review): tests prevCluster, not prevClusters, before back() below - confirm intent
+                        {
+                            prevCluster = prevClusters.back();
+                            prevClusters.pop_back();
+                        }
+                    }
+                    else {
+                        prevCluster.setQids(foundTotQids);                                  // New version of our cluster (appended)
+                        prevCluster.append_gates(clusterFound.get_gates());
+                    }
+                }                                                                           // Keep looking for clusters to add
+                curClusters.push_back(prevCluster);                                         // Save the final cluster
+            }                                                                               // Start all over with the next larger span
+        }
+        
+        return curClusters;
+    }
+
     /// generic application of a gate
     template <class Gate>
     void apply(Gate const& g)
     {
-        fused_.apply(wfn_, g.matrix(), qubit(g));
+        std::vector<qubit_t> cs;  // empty control list: this gate is uncontrolled
+        GateWrapper gateApplied = GateWrapper(cs, g.qubit(), g.matrix());  // record the gate instead of applying it immediately
+        gatelist_.push_back(gateApplied);  // queued here; flush() applies and clears the list
+        if (gatelist_.size() > 999) {  // flush once 1000 gates are queued
+            flush();
+        }
+
+        int doFlush = fused_.shouldFlush(wfn_, cs, g.qubit());  // NOTE(review): doFlush is never read here - confirm the call is needed for side effects
     }
 
     /// generic application of a multiply controlled gate
@@ -250,7 +413,13 @@ class Wavefunction
     void apply_controlled(std::vector<qubit_t> cs, Gate const& g)
     {
         std::vector<qubit_t> pcs = qubits(cs);
-        fused_.apply_controlled(wfn_, g.matrix(), pcs, qubit(g));
+        GateWrapper gateApplied = GateWrapper(cs, g.qubit(), g.matrix());  // NOTE(review): stores cs, so pcs above is no longer used in this body - confirm it can go
+        gatelist_.push_back(gateApplied);  // queued here; flush() applies and clears the list
+        if (gatelist_.size() > 999) {  // flush once 1000 gates are queued
+            flush();
+        }
+        
+        int doFlush = fused_.shouldFlush(wfn_, cs, g.qubit());  // NOTE(review): doFlush is never read here - confirm the call is needed for side effects
     }
 
     /// generic application of a controlled gate
@@ -274,7 +443,8 @@ class Wavefunction
     template <class A>
     bool subsytemwavefunction(std::vector<unsigned> const& qs, std::vector<T, A>& qubitswfn, double tolerance)
     {
-        return fused_.subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance);
+        flush(); // apply all queued gates first so wfn_ is up to date before the state is extracted
+        return kernels::subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance);
     }
 
 
@@ -338,8 +508,9 @@ class Wavefunction
   private:
     unsigned num_qubits_;             // for convenience
     mutable WavefunctionStorage wfn_; // storing the wave function
-    std::vector<qubit_t> qubitmap_;   // mapping of logical to physical qubits
-	  int usage_;
+    mutable std::vector<qubit_t> qubitmap_;   // mapping of logical to physical qubits
+	int usage_;
+    mutable std::vector<GateWrapper> gatelist_;
 
     // randomness support
     RngEngine rng_;
diff --git a/src/Simulation/Native/src/util/Makefile b/src/Simulation/Native/src/util/Makefile
new file mode 100644
index 00000000000..b980489f2aa
--- /dev/null
+++ b/src/Simulation/Native/src/util/Makefile
@@ -0,0 +1,518 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.16
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target test
+test:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
+	/usr/bin/ctest --force-new-ctest-process $(ARGS)
+.PHONY : test
+
+# Special rule for the target test
+test/fast: test
+
+.PHONY : test/fast
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/util/CMakeFiles/progress.marks
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/util/CMakeFiles/tinymatrix_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/tinymatrix_test.dir/rule
+.PHONY : src/util/CMakeFiles/tinymatrix_test.dir/rule
+
+# Convenience name for target.
+tinymatrix_test: src/util/CMakeFiles/tinymatrix_test.dir/rule
+
+.PHONY : tinymatrix_test
+
+# fast build rule for target.
+tinymatrix_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/build
+.PHONY : tinymatrix_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/bititerator_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/bititerator_test.dir/rule
+.PHONY : src/util/CMakeFiles/bititerator_test.dir/rule
+
+# Convenience name for target.
+bititerator_test: src/util/CMakeFiles/bititerator_test.dir/rule
+
+.PHONY : bititerator_test
+
+# fast build rule for target.
+bititerator_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/build
+.PHONY : bititerator_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/bitops_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/bitops_test.dir/rule
+.PHONY : src/util/CMakeFiles/bitops_test.dir/rule
+
+# Convenience name for target.
+bitops_test: src/util/CMakeFiles/bitops_test.dir/rule
+
+.PHONY : bitops_test
+
+# fast build rule for target.
+bitops_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/build
+.PHONY : bitops_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/openmp_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/openmp_test.dir/rule
+.PHONY : src/util/CMakeFiles/openmp_test.dir/rule
+
+# Convenience name for target.
+openmp_test: src/util/CMakeFiles/openmp_test.dir/rule
+
+.PHONY : openmp_test
+
+# fast build rule for target.
+openmp_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/build
+.PHONY : openmp_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/cpuid_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/cpuid_test.dir/rule
+.PHONY : src/util/CMakeFiles/cpuid_test.dir/rule
+
+# Convenience name for target.
+cpuid_test: src/util/CMakeFiles/cpuid_test.dir/rule
+
+.PHONY : cpuid_test
+
+# fast build rule for target.
+cpuid_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/build
+.PHONY : cpuid_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/argmaxnrm2_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/argmaxnrm2_test.dir/rule
+.PHONY : src/util/CMakeFiles/argmaxnrm2_test.dir/rule
+
+# Convenience name for target.
+argmaxnrm2_test: src/util/CMakeFiles/argmaxnrm2_test.dir/rule
+
+.PHONY : argmaxnrm2_test
+
+# fast build rule for target.
+argmaxnrm2_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/build
+.PHONY : argmaxnrm2_test/fast
+
+# Convenience name for target.
+src/util/CMakeFiles/diagmatrix_test.dir/rule:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/diagmatrix_test.dir/rule
+.PHONY : src/util/CMakeFiles/diagmatrix_test.dir/rule
+
+# Convenience name for target.
+diagmatrix_test: src/util/CMakeFiles/diagmatrix_test.dir/rule
+
+.PHONY : diagmatrix_test
+
+# fast build rule for target.
+diagmatrix_test/fast:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/build
+.PHONY : diagmatrix_test/fast
+
+argmaxnrm2_test.o: argmaxnrm2_test.cpp.o
+
+.PHONY : argmaxnrm2_test.o
+
+# target to build an object file
+argmaxnrm2_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.o
+.PHONY : argmaxnrm2_test.cpp.o
+
+argmaxnrm2_test.i: argmaxnrm2_test.cpp.i
+
+.PHONY : argmaxnrm2_test.i
+
+# target to preprocess a source file
+argmaxnrm2_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.i
+.PHONY : argmaxnrm2_test.cpp.i
+
+argmaxnrm2_test.s: argmaxnrm2_test.cpp.s
+
+.PHONY : argmaxnrm2_test.s
+
+# target to generate assembly for a file
+argmaxnrm2_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.s
+.PHONY : argmaxnrm2_test.cpp.s
+
+bititerator_test.o: bititerator_test.cpp.o
+
+.PHONY : bititerator_test.o
+
+# target to build an object file
+bititerator_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.o
+.PHONY : bititerator_test.cpp.o
+
+bititerator_test.i: bititerator_test.cpp.i
+
+.PHONY : bititerator_test.i
+
+# target to preprocess a source file
+bititerator_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.i
+.PHONY : bititerator_test.cpp.i
+
+bititerator_test.s: bititerator_test.cpp.s
+
+.PHONY : bititerator_test.s
+
+# target to generate assembly for a file
+bititerator_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.s
+.PHONY : bititerator_test.cpp.s
+
+bitops_test.o: bitops_test.cpp.o
+
+.PHONY : bitops_test.o
+
+# target to build an object file
+bitops_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.o
+.PHONY : bitops_test.cpp.o
+
+bitops_test.i: bitops_test.cpp.i
+
+.PHONY : bitops_test.i
+
+# target to preprocess a source file
+bitops_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.i
+.PHONY : bitops_test.cpp.i
+
+bitops_test.s: bitops_test.cpp.s
+
+.PHONY : bitops_test.s
+
+# target to generate assembly for a file
+bitops_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.s
+.PHONY : bitops_test.cpp.s
+
+cpuid_test.o: cpuid_test.cpp.o
+
+.PHONY : cpuid_test.o
+
+# target to build an object file
+cpuid_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.o
+.PHONY : cpuid_test.cpp.o
+
+cpuid_test.i: cpuid_test.cpp.i
+
+.PHONY : cpuid_test.i
+
+# target to preprocess a source file
+cpuid_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.i
+.PHONY : cpuid_test.cpp.i
+
+cpuid_test.s: cpuid_test.cpp.s
+
+.PHONY : cpuid_test.s
+
+# target to generate assembly for a file
+cpuid_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.s
+.PHONY : cpuid_test.cpp.s
+
+diagmatrix_test.o: diagmatrix_test.cpp.o
+
+.PHONY : diagmatrix_test.o
+
+# target to build an object file
+diagmatrix_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.o
+.PHONY : diagmatrix_test.cpp.o
+
+diagmatrix_test.i: diagmatrix_test.cpp.i
+
+.PHONY : diagmatrix_test.i
+
+# target to preprocess a source file
+diagmatrix_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.i
+.PHONY : diagmatrix_test.cpp.i
+
+diagmatrix_test.s: diagmatrix_test.cpp.s
+
+.PHONY : diagmatrix_test.s
+
+# target to generate assembly for a file
+diagmatrix_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.s
+.PHONY : diagmatrix_test.cpp.s
+
+openmp_test.o: openmp_test.cpp.o
+
+.PHONY : openmp_test.o
+
+# target to build an object file
+openmp_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.o
+.PHONY : openmp_test.cpp.o
+
+openmp_test.i: openmp_test.cpp.i
+
+.PHONY : openmp_test.i
+
+# target to preprocess a source file
+openmp_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.i
+.PHONY : openmp_test.cpp.i
+
+openmp_test.s: openmp_test.cpp.s
+
+.PHONY : openmp_test.s
+
+# target to generate assembly for a file
+openmp_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.s
+.PHONY : openmp_test.cpp.s
+
+tinymatrix_test.o: tinymatrix_test.cpp.o
+
+.PHONY : tinymatrix_test.o
+
+# target to build an object file
+tinymatrix_test.cpp.o:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.o
+.PHONY : tinymatrix_test.cpp.o
+
+tinymatrix_test.i: tinymatrix_test.cpp.i
+
+.PHONY : tinymatrix_test.i
+
+# target to preprocess a source file
+tinymatrix_test.cpp.i:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.i
+.PHONY : tinymatrix_test.cpp.i
+
+tinymatrix_test.s: tinymatrix_test.cpp.s
+
+.PHONY : tinymatrix_test.s
+
+# target to generate assembly for a file
+tinymatrix_test.cpp.s:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.s
+.PHONY : tinymatrix_test.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... install/strip"
+	@echo "... tinymatrix_test"
+	@echo "... bititerator_test"
+	@echo "... install/local"
+	@echo "... bitops_test"
+	@echo "... openmp_test"
+	@echo "... cpuid_test"
+	@echo "... argmaxnrm2_test"
+	@echo "... test"
+	@echo "... edit_cache"
+	@echo "... rebuild_cache"
+	@echo "... list_install_components"
+	@echo "... diagmatrix_test"
+	@echo "... install"
+	@echo "... argmaxnrm2_test.o"
+	@echo "... argmaxnrm2_test.i"
+	@echo "... argmaxnrm2_test.s"
+	@echo "... bititerator_test.o"
+	@echo "... bititerator_test.i"
+	@echo "... bititerator_test.s"
+	@echo "... bitops_test.o"
+	@echo "... bitops_test.i"
+	@echo "... bitops_test.s"
+	@echo "... cpuid_test.o"
+	@echo "... cpuid_test.i"
+	@echo "... cpuid_test.s"
+	@echo "... diagmatrix_test.o"
+	@echo "... diagmatrix_test.i"
+	@echo "... diagmatrix_test.s"
+	@echo "... openmp_test.o"
+	@echo "... openmp_test.i"
+	@echo "... openmp_test.s"
+	@echo "... tinymatrix_test.o"
+	@echo "... tinymatrix_test.i"
+	@echo "... tinymatrix_test.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/Simulation/Native/src/util/cpuid.hpp b/src/Simulation/Native/src/util/cpuid.hpp
index cbce00cf2d1..705d65dec85 100644
--- a/src/Simulation/Native/src/util/cpuid.hpp
+++ b/src/Simulation/Native/src/util/cpuid.hpp
@@ -40,7 +40,7 @@ namespace Microsoft
       {
 #ifndef _MSC_VER
         //__builtin_cpu_init();
-        return false; // __builtin_cpu_supports("avx512bw");
+          return (__builtin_cpu_supports("avx512f") != 0 && __builtin_cpu_supports("avx512cd") != 0);
 #else
         int cpuInfo[4];
         __cpuid(cpuInfo,0);
diff --git a/src/Simulation/Native/src/version.hpp b/src/Simulation/Native/src/version.hpp
new file mode 100644
index 00000000000..b55afdfd213
--- /dev/null
+++ b/src/Simulation/Native/src/version.hpp
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION */
+/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MAJOR */
+/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MINOR */
+/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_PATCH */
+/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_STRING */
+#define MICROSOFT_QUANTUM_SIMULATOR_YEAR "2020"
diff --git a/src/Simulation/Native/stats.xlsx b/src/Simulation/Native/stats.xlsx
new file mode 100644
index 00000000000..321f140939c
Binary files /dev/null and b/src/Simulation/Native/stats.xlsx differ
diff --git a/src/Simulation/Native/tinymatrix_test b/src/Simulation/Native/tinymatrix_test
new file mode 100644
index 00000000000..2c7100d1b31
Binary files /dev/null and b/src/Simulation/Native/tinymatrix_test differ
diff --git a/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj b/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj
index ff3f9c59da5..ba121246d35 100644
--- a/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj
+++ b/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <Import Project="..\Common\AssemblyCommon.props" />
   <Import Project="..\Common\Simulators.Dev.props" />
diff --git a/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj b/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj
index 6e8467e5159..807c708c9b3 100644
--- a/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj
+++ b/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <Import Project="..\Common\AssemblyCommon.props" />
   <Import Project="..\Common\DebugSymbols.props" />
diff --git a/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj
index 27cc28e4a65..957ffe6c2bd 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <OutputType>Library</OutputType>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj b/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj
index 8396c045f93..e85a1a19e96 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <TargetFramework>netcoreapp3.1</TargetFramework>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj
index 83d11ae6541..5ac7dcf5f7e 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <OutputType>Library</OutputType>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj
index df4d6e13a43..ebdaa51f4c0 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj	
+++ b/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj	
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
   <PropertyGroup>
     <TargetFramework>netstandard2.1</TargetFramework>
     <CsharpGeneration>false</CsharpGeneration>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj
index c01bc8d99bc..56799905ef0 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <TargetFramework>netstandard2.1</TargetFramework>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj
index c01bc8d99bc..56799905ef0 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <TargetFramework>netstandard2.1</TargetFramework>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj
index fcc18c5313d..df2e5362b29 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <OutputType>Library</OutputType>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj
index e74705db790..283f23eec8b 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj
index 6e76c47fe19..682391cfdac 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
diff --git a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
index 03d1c8a4a8e..22e1d8d901d 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <PropertyGroup>
     <TargetFramework>netcoreapp3.1</TargetFramework>
diff --git a/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj b/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj
index ca7ce29588f..6fa960aef8f 100644
--- a/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj
+++ b/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <Import Project="..\Common\AssemblyCommon.props" />
   <Import Project="..\Common\DebugSymbols.props" />
diff --git a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj
index 00cd0c2aeda..25a7446c6f1 100644
--- a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj
+++ b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.2008.2604-alpha">
+﻿<Project Sdk="Microsoft.Quantum.Sdk/0.12.20082705-beta">
 
   <Import Project="..\Common\AssemblyCommon.props" />
   <Import Project="..\Common\DebugSymbols.props" />