diff --git a/.gitignore b/.gitignore index 354f05b9d0d..d3bbf343ae3 100644 --- a/.gitignore +++ b/.gitignore @@ -337,3 +337,4 @@ ASALocalRun/ .mfractor/ /src/Simulation/Simulators.Tests/TestProjects/QsharpExe/built /src/Simulation/Simulators.Tests/TestProjects/TargetedExe/built +dbw_test diff --git a/AdvantageBenchmark/privateBuild/.editorconfig b/AdvantageBenchmark/privateBuild/.editorconfig new file mode 100644 index 00000000000..6872d54293d --- /dev/null +++ b/AdvantageBenchmark/privateBuild/.editorconfig @@ -0,0 +1,4 @@ +[*.cs] + +# SA1025: Code should not contain multiple whitespace in a row +dotnet_diagnostic.SA1025.severity = none diff --git a/AdvantageBenchmark/privateBuild/Program.cs b/AdvantageBenchmark/privateBuild/Program.cs new file mode 100644 index 00000000000..5306ff2a683 --- /dev/null +++ b/AdvantageBenchmark/privateBuild/Program.cs @@ -0,0 +1,71 @@ +namespace quantum +{ + using System; + using System.Diagnostics; + using Microsoft.Quantum.Simulation.Simulators; + + class Program + { + public static void Main(string[] args) + { + Console.WriteLine($"CSV,test,loop,secs,gates,THREADS,FUSESPAN,FUSEDEPTH,Gates/sec"); + var envThr = System.Environment.GetEnvironmentVariable("OMP_NUM_THREADS"); + var envFus = System.Environment.GetEnvironmentVariable("QDK_SIM_FUSESPAN"); + var envDep = System.Environment.GetEnvironmentVariable("QDK_SIM_FUSEDEPTH"); + if (envThr == null || envThr.Length == 0) envThr = "Default"; + if (envFus == null || envFus.Length == 0) envFus = "Default"; + if (envDep == null || envDep.Length == 0) envDep = "99"; + + int tstMin = 0; + int tstMax = 3; + int loopCnt = 10; + + if (args.Length > 0) tstMin = Convert.ToInt32(args[0]); + if (args.Length > 1) tstMax = Convert.ToInt32(args[1]); + if (args.Length > 2) loopCnt = Convert.ToInt32(args[2]); + + using (var sim = new QuantumSimulator()) + { + long gates = 1; + TimeSpan ts; + double tSecs; + double gps; + string tstName = ""; + Stopwatch stopWatch = new Stopwatch(); + + for (int tst = 
tstMin; tst <= tstMax; tst++) + { + for (int loop = 0; loop < loopCnt; loop++) + { + stopWatch.Restart(); + switch (tst) + { + case 0: + gates = Dummy.Run(sim).Result; + tstName = "Dummy"; + break; + case 1: + gates = Advantage44.Run(sim).Result; + tstName = "4x4"; + break; + case 2: + gates = Advantage55.Run(sim).Result; + tstName = "5x5"; + break; + case 3: + gates = Advantage56.Run(sim).Result; + tstName = "5x6"; + break; + } + stopWatch.Stop(); + ts = stopWatch.Elapsed; + tSecs = ts.TotalSeconds; + gps = gates / tSecs; + + Console.WriteLine($"CSV,{tstName},{loop:D2},{tSecs:F2},{gates:E2},{envThr},{envFus},{envDep},{gps:E2}"); + } + } + } + } + } +} diff --git a/AdvantageBenchmark/privateBuild/Quantum.qs b/AdvantageBenchmark/privateBuild/Quantum.qs new file mode 100644 index 00000000000..c34c15a047f --- /dev/null +++ b/AdvantageBenchmark/privateBuild/Quantum.qs @@ -0,0 +1,826 @@ +namespace quantum { + open Microsoft.Quantum.Canon; + open Microsoft.Quantum.Intrinsic; + open Microsoft.Quantum.Diagnostics; + open Microsoft.Quantum.Measurement; + + operation CZ (a : Qubit, b : Qubit) : Unit + { + body (...) 
+ { + H(b); + CNOT(a, b); + H(b); + } + + adjoint self; + } + + operation Dummy(): Int { + using (q = Qubit[2]) { + CZ(q[0],q[1]); + ResetAll(q); + } + return(1); + } + + operation Advantage44() : Int { + let loops = 200; + let gateCnt = (171+27*2) * loops; + using (q = Qubit[16]) { + for (loop in 0..(loops-1)) { + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[2],q[3]); + CZ(q[10],q[11]); + CZ(q[4],q[5]); + CZ(q[12],q[13]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[0],q[1]); + CZ(q[8],q[9]); + CZ(q[6],q[7]); + CZ(q[14],q[15]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[5],q[9]); + CZ(q[7],q[11]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[4],q[8]); + CZ(q[6],q[10]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[3],q[4]); + CZ(q[11],q[12]); + CZ(q[5],q[6]); + CZ(q[13],q[14]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[1],q[2]); + CZ(q[9],q[10]); + CZ(q[7],q[8]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[0],q[4]); + CZ(q[2],q[6]); + CZ(q[9],q[13]); + CZ(q[11],q[15]); + 
H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + CZ(q[8],q[12]); + CZ(q[10],q[14]); + CZ(q[1],q[5]); + CZ(q[3],q[7]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + for (q1 in q) { let _ = M(q1); } + } + ResetAll(q); + } + return(gateCnt); + } + + operation Advantage55() : Int { + let loops = 1; + let gateCnt = (269+44*2) * loops; + using (q = Qubit[25]) { + for (loop in 0..(loops-1)) { + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[2],q[3]); + CZ(q[12],q[13]); + CZ(q[22],q[23]); + CZ(q[5],q[6]); + CZ(q[9],q[10]); + CZ(q[15],q[16]); + CZ(q[19],q[20]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[0],q[1]); + CZ(q[4],q[5]); + CZ(q[10],q[11]); + CZ(q[14],q[15]); + CZ(q[20],q[21]); + CZ(q[7],q[8]); + CZ(q[17],q[18]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[15],q[20]); + CZ(q[17],q[22]); + CZ(q[19],q[24]); + CZ(q[6],q[11]); + CZ(q[8],q[13]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); 
+ H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[5],q[10]); + CZ(q[7],q[12]); + CZ(q[9],q[14]); + CZ(q[16],q[21]); + CZ(q[18],q[23]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[3],q[4]); + CZ(q[13],q[14]); + CZ(q[23],q[24]); + CZ(q[6],q[7]); + CZ(q[16],q[17]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[1],q[2]); + CZ(q[11],q[12]); + CZ(q[21],q[22]); + CZ(q[8],q[9]); + CZ(q[18],q[19]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[0],q[5]); + CZ(q[2],q[7]); + CZ(q[4],q[9]); + CZ(q[11],q[16]); + CZ(q[13],q[18]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + CZ(q[10],q[15]); + CZ(q[12],q[17]); + CZ(q[14],q[19]); + CZ(q[1],q[6]); + CZ(q[3],q[8]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); 
+ H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + for (q1 in q) { let _ = M(q1); } + } + ResetAll(q); + } + return(gateCnt); + } + + operation Advantage56() : Int { + let loops = 1; + let gateCnt = (323+53*2) * loops; + using (q = Qubit[30]) { + for (loop in 0..(loops-1)) { + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[2],q[3]); + CZ(q[14],q[15]); + CZ(q[26],q[27]); + CZ(q[6],q[7]); + CZ(q[10],q[11]); + CZ(q[18],q[19]); + CZ(q[22],q[23]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[0],q[1]); + CZ(q[4],q[5]); + CZ(q[12],q[13]); + CZ(q[16],q[17]); + CZ(q[24],q[25]); + CZ(q[28],q[29]); + CZ(q[8],q[9]); + CZ(q[20],q[21]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[18],q[24]); + CZ(q[20],q[26]); + CZ(q[22],q[28]); + CZ(q[7],q[13]); + CZ(q[9],q[15]); + CZ(q[11],q[17]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + 
H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[6],q[12]); + CZ(q[8],q[14]); + CZ(q[10],q[16]); + CZ(q[19],q[25]); + CZ(q[21],q[27]); + CZ(q[23],q[29]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[3],q[4]); + CZ(q[15],q[16]); + CZ(q[27],q[28]); + CZ(q[7],q[8]); + CZ(q[11],q[12]); + CZ(q[19],q[20]); + CZ(q[23],q[24]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[1],q[2]); + CZ(q[5],q[6]); + CZ(q[13],q[14]); + CZ(q[17],q[18]); + CZ(q[25],q[26]); + CZ(q[9],q[10]); + CZ(q[21],q[22]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[0],q[6]); + CZ(q[2],q[8]); + CZ(q[4],q[10]); + CZ(q[13],q[19]); + CZ(q[15],q[21]); + CZ(q[17],q[23]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + CZ(q[12],q[18]); + 
CZ(q[14],q[20]); + CZ(q[16],q[22]); + CZ(q[1],q[7]); + CZ(q[3],q[9]); + CZ(q[5],q[11]); + H(q[0]); + H(q[1]); + H(q[2]); + H(q[3]); + H(q[4]); + H(q[5]); + H(q[6]); + H(q[7]); + H(q[8]); + H(q[9]); + H(q[10]); + H(q[11]); + H(q[12]); + H(q[13]); + H(q[14]); + H(q[15]); + H(q[16]); + H(q[17]); + H(q[18]); + H(q[19]); + H(q[20]); + H(q[21]); + H(q[22]); + H(q[23]); + H(q[24]); + H(q[25]); + H(q[26]); + H(q[27]); + H(q[28]); + H(q[29]); + for (q1 in q) { let _ = M(q1); } + } + ResetAll(q); + } + return(gateCnt); + } + +} diff --git a/AdvantageBenchmark/privateBuild/advantage.sln b/AdvantageBenchmark/privateBuild/advantage.sln new file mode 100644 index 00000000000..91b39bc2b87 Binary files /dev/null and b/AdvantageBenchmark/privateBuild/advantage.sln differ diff --git a/AdvantageBenchmark/privateBuild/host.csproj b/AdvantageBenchmark/privateBuild/host.csproj new file mode 100644 index 00000000000..73723279d53 --- /dev/null +++ b/AdvantageBenchmark/privateBuild/host.csproj @@ -0,0 +1,11 @@ + + + + + + Exe + netcoreapp3.1 + false + + + diff --git a/AdvantageBenchmark/privateBuild/parseLog.py b/AdvantageBenchmark/privateBuild/parseLog.py new file mode 100644 index 00000000000..fbf1f395fe0 --- /dev/null +++ b/AdvantageBenchmark/privateBuild/parseLog.py @@ -0,0 +1,50 @@ +import re +import sys +import numpy as np +from collections import namedtuple + +info = namedtuple('Info','test loop secs gates threads span depth gps') +logName = sys.argv[1] +reHead = re.compile(r"^CSV,test,") +reInfo = re.compile(r'^CSV,([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,\s]+)') +fp = open(logName,'r') +infos = [] + +print('test,secs,gates,threads,span,depth,gps') + +def dumpGpss(): + global infos + if len(infos) > 0: + gpss = [float(i.gps) for i in infos] + gpsMed = np.median(gpss) + cnt = 0.0 + tot = 0.0 + #for gps in gpss: + # if gps > gpsMed/2.0 and gps < gpsMed*1.5: + # cnt += 1.0 + # tot += gps + #if cnt > 0: gps = tot/cnt + #else: gps = np.average(gpss) + gps = 
np.max(gpss) + + idx = int(len(infos)/2) + itm = infos[idx] + print(f"{itm.test},{itm.secs},{itm.gates},{itm.threads},{itm.span},{itm.depth},{gps:.1f}") + infos = [] + +while True: + inp = fp.readline() + if inp == "": + dumpGpss() + break + found = reHead.search(inp) + if found: + dumpGpss() + continue + found = reInfo.search(inp) + if found: + infos.append(info(found.group(1),found.group(2),found.group(3),found.group(4), + found.group(5),found.group(6),found.group(7),found.group(8))) + continue + +fp.close() diff --git a/AdvantageBenchmark/privateBuild/runTest.ps1 b/AdvantageBenchmark/privateBuild/runTest.ps1 new file mode 100644 index 00000000000..12fe3870497 --- /dev/null +++ b/AdvantageBenchmark/privateBuild/runTest.ps1 @@ -0,0 +1,9 @@ +for ($tst=1; $tst -le 2; $tst++) { + for ($thrd=4; $thrd -ge 1; $thrd--) { + for ($span=4; $span -ge 0; $span--) { + $env:OMP_NUM_THREADS = $thrd + $env:QDK_SIM_FUSESPAN = $span + .\bin\Release\netcoreapp3.1\host.exe $tst $tst 5 + } + } +} diff --git a/AdvantageBenchmark/privateBuild/runTest.sh b/AdvantageBenchmark/privateBuild/runTest.sh new file mode 100755 index 00000000000..c8dc2155faa --- /dev/null +++ b/AdvantageBenchmark/privateBuild/runTest.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for tst in {1..3} +do + for thrd in {20..2..-2} + do + for span in {7..0..-1} + do + export OMP_NUM_THREADS=$thrd + export QDK_SIM_FUSESPAN=$span + ./bin/Release/netcoreapp3.1/host $tst $tst 5 + done + done +done diff --git a/AdvantageBenchmark/readme.md b/AdvantageBenchmark/readme.md new file mode 100644 index 00000000000..33242bd4fec --- /dev/null +++ b/AdvantageBenchmark/readme.md @@ -0,0 +1,15 @@ +# Advantage Benchmark + +## Purpose + +This benchmark is intended to provide an easy way to verify the performance characteristcs of a given release build of the QDK simulator vs the current tree. 
The releasedBuild folder contains projects that will build the quantum advantage Q# program with a QDK from a NuGet source and verify the gates-per-second execution of that program. The privateBuild folder compiles the same Q# program with the runtime in the current source tree instead. + +## Executing the benchmark + +To execute the benchmark, compile each version of advantage.sln using `dotnet build .\advantage.sln -c Release` from its respective folder. Then the executable to run will be either `bin\Release\netcoreapp3.1\host.exe` in the privateBuild folder or `host\bin\Release\netcoreapp3.1\host.exe` in the releasedBuild folder. This executable takes parameters describing which test circuits to execute and how many loops to perform as integer arguments, such that `host.exe 1 1 5` will run 5 loops of test 1 and `host.exe 0 3 100` will run 100 loops of tests 0 through 3. Check the contents of `privateBuild\Program.cs` to see the tests that correspond to each identifier; for most machines, test 1 (the advantage 4x4 circuit) is the best choice for benchmarking. + +The benchmark can also be run via runTest.ps1 or runTest.sh, which performs a sweep across configured environment variables that adjust the number of threads used and gates fused in simulating the circuit. See the definition of the script used on your platform to understand how it configures the `OMP_NUM_THREADS` and `QDK_SIM_FUSESPAN` environment variables. + +## Collecting results + +The output of `host.exe` is a table showing the gates-per-second along with other identifying information for the run, output at intervals during the looped execution. When driven via runTest.ps1/.sh, the output will be a larger table of all the results for the various combinations of threads and fusion spans. 
To help collect these results into a meaningful table, the parseLog.py script will convert the output from a runTest execution into a CSV file with the single highest gates-per-second observed for a given thread/fuse-span combination. This can then be loaded into a spreadsheet program for easier graphing or other visualization. diff --git a/AdvantageBenchmark/releasedBuild/advantage.sln b/AdvantageBenchmark/releasedBuild/advantage.sln new file mode 100644 index 00000000000..4571a900f07 --- /dev/null +++ b/AdvantageBenchmark/releasedBuild/advantage.sln @@ -0,0 +1,48 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "quantum", "quantum\quantum.csproj", "{576A1AEE-9051-458D-B3E8-EFE3F64235B0}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "host", "host\host.csproj", "{642AEC30-F51D-4547-A1FD-A8AC759A75A5}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|Any CPU.Build.0 = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x64.ActiveCfg = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x64.Build.0 = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x86.ActiveCfg = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Debug|x86.Build.0 = Debug|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|Any CPU.Build.0 = Release|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x64.ActiveCfg = Release|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x64.Build.0 = Release|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x86.ActiveCfg = Release|Any CPU + {576A1AEE-9051-458D-B3E8-EFE3F64235B0}.Release|x86.Build.0 = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x64.ActiveCfg = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x64.Build.0 = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x86.ActiveCfg = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Debug|x86.Build.0 = Debug|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|Any CPU.Build.0 = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x64.ActiveCfg = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x64.Build.0 = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x86.ActiveCfg = Release|Any CPU + {642AEC30-F51D-4547-A1FD-A8AC759A75A5}.Release|x86.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/AdvantageBenchmark/releasedBuild/host/host.csproj b/AdvantageBenchmark/releasedBuild/host/host.csproj new file mode 100644 index 00000000000..8b145ec5eef --- /dev/null +++ b/AdvantageBenchmark/releasedBuild/host/host.csproj @@ -0,0 +1,16 @@ + + + + + + + + + + + + Exe + netcoreapp3.1 + + + diff --git a/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj b/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj new file mode 100644 index 00000000000..18667e533f6 --- /dev/null +++ b/AdvantageBenchmark/releasedBuild/quantum/quantum.csproj @@ -0,0 +1,11 @@ + + + + + + + + netstandard2.1 + + + 
diff --git a/build/test.ps1 b/build/test.ps1 index 09e0e49fb9f..e88962d2a17 100644 --- a/build/test.ps1 +++ b/build/test.ps1 @@ -8,7 +8,8 @@ if ($Env:ENABLE_NATIVE -ne "false") { Write-Host "##[info]Test Native simulator" pushd (Join-Path $PSScriptRoot "../src/Simulation/Native/build") cmake --build . --config $Env:BUILD_CONFIGURATION - ctest -C $Env:BUILD_CONFIGURATION + cp ../advantage_44_4.log . + ctest -C $Env:BUILD_CONFIGURATION --verbose if ($LastExitCode -ne 0) { Write-Host "##vso[task.logissue type=error;]Failed to test Native Simulator" $script:all_ok = $False diff --git a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj index 75496178ba5..b47f0af63be 100644 --- a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj +++ b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj @@ -21,7 +21,7 @@ - + diff --git a/src/Simulation/Native/.gitignore b/src/Simulation/Native/.gitignore index 06c6a304f20..8f1fc6e9a01 100644 --- a/src/Simulation/Native/.gitignore +++ b/src/Simulation/Native/.gitignore @@ -2,3 +2,11 @@ build /.vs /vs2017 +*.csv +foo* +*.filters +*.cmake +*.vcxproj +CMakeFiles/ +CMakeCache.txt +*.so diff --git a/src/Simulation/Native/CMakeLists.txt b/src/Simulation/Native/CMakeLists.txt index b9c22c575f7..c56b6f95a3e 100644 --- a/src/Simulation/Native/CMakeLists.txt +++ b/src/Simulation/Native/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) ADD_DEFINITIONS(-D_SCL_SECURE_NO_WARNINGS) # Configuration options (choose one to turn on) -option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(ENABLE_OPENMP "Enable OpenMP Parallelization" ON) option(USE_SINGLE_PRECISION "Use single-precision floating point operations" OFF) option(HAVE_INTRINSICS "Have AVX intrinsics" OFF) diff --git a/src/Simulation/Native/CMakeSettings.json 
b/src/Simulation/Native/CMakeSettings.json new file mode 100644 index 00000000000..ee45e8257c1 --- /dev/null +++ b/src/Simulation/Native/CMakeSettings.json @@ -0,0 +1,28 @@ +{ + "configurations": [ + { + "name": "x64-Debug", + "generator": "Ninja", + "configurationType": "Debug", + "inheritEnvironments": [ "msvc_x64_x64" ], + "buildRoot": "${projectDir}\\out\\build\\${name}", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "variables": [] + }, + { + "name": "x64-Release", + "generator": "Ninja", + "configurationType": "RelWithDebInfo", + "buildRoot": "${projectDir}\\out\\build\\${name}", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [] + } + ] +} \ No newline at end of file diff --git a/src/Simulation/Native/Makefile b/src/Simulation/Native/Makefile new file mode 100644 index 00000000000..bedf9d3e28f --- /dev/null +++ b/src/Simulation/Native/Makefile @@ -0,0 +1,364 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.16 + +# Default target executed when no arguments are given to make. +default_target: all + +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + + +# A target that is always out of date. 
+cmake_force: + +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." 
+ /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components + +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target test +test: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..." 
+ /usr/bin/ctest --force-new-ctest-process $(ARGS) +.PHONY : test + +# Special rule for the target test +test/fast: test + +.PHONY : test/fast + +# The main all target +all: cmake_check_build_system + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles/progress.marks + $(MAKE) -f CMakeFiles/Makefile2 all + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + $(MAKE) -f CMakeFiles/Makefile2 clean +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + $(MAKE) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + $(MAKE) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +#============================================================================= +# Target rules for targets named Microsoft.Quantum.Simulator.Runtime + +# Build rule for target. +Microsoft.Quantum.Simulator.Runtime: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 Microsoft.Quantum.Simulator.Runtime +.PHONY : Microsoft.Quantum.Simulator.Runtime + +# fast build rule for target. +Microsoft.Quantum.Simulator.Runtime/fast: + $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build +.PHONY : Microsoft.Quantum.Simulator.Runtime/fast + +#============================================================================= +# Target rules for targets named tinymatrix_test + +# Build rule for target. 
+tinymatrix_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 tinymatrix_test +.PHONY : tinymatrix_test + +# fast build rule for target. +tinymatrix_test/fast: + $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/build +.PHONY : tinymatrix_test/fast + +#============================================================================= +# Target rules for targets named bititerator_test + +# Build rule for target. +bititerator_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 bititerator_test +.PHONY : bititerator_test + +# fast build rule for target. +bititerator_test/fast: + $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/build +.PHONY : bititerator_test/fast + +#============================================================================= +# Target rules for targets named bitops_test + +# Build rule for target. +bitops_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 bitops_test +.PHONY : bitops_test + +# fast build rule for target. +bitops_test/fast: + $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/build +.PHONY : bitops_test/fast + +#============================================================================= +# Target rules for targets named openmp_test + +# Build rule for target. +openmp_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 openmp_test +.PHONY : openmp_test + +# fast build rule for target. +openmp_test/fast: + $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/build +.PHONY : openmp_test/fast + +#============================================================================= +# Target rules for targets named cpuid_test + +# Build rule for target. +cpuid_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 cpuid_test +.PHONY : cpuid_test + +# fast build rule for target. 
+cpuid_test/fast: + $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/build +.PHONY : cpuid_test/fast + +#============================================================================= +# Target rules for targets named argmaxnrm2_test + +# Build rule for target. +argmaxnrm2_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 argmaxnrm2_test +.PHONY : argmaxnrm2_test + +# fast build rule for target. +argmaxnrm2_test/fast: + $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/build +.PHONY : argmaxnrm2_test/fast + +#============================================================================= +# Target rules for targets named diagmatrix_test + +# Build rule for target. +diagmatrix_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 diagmatrix_test +.PHONY : diagmatrix_test + +# fast build rule for target. +diagmatrix_test/fast: + $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/build +.PHONY : diagmatrix_test/fast + +#============================================================================= +# Target rules for targets named dbw_test + +# Build rule for target. +dbw_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 dbw_test +.PHONY : dbw_test + +# fast build rule for target. +dbw_test/fast: + $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/build +.PHONY : dbw_test/fast + +#============================================================================= +# Target rules for targets named capi_test + +# Build rule for target. +capi_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 capi_test +.PHONY : capi_test + +# fast build rule for target. 
+capi_test/fast: + $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/build +.PHONY : capi_test/fast + +#============================================================================= +# Target rules for targets named factory_test + +# Build rule for target. +factory_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 factory_test +.PHONY : factory_test + +# fast build rule for target. +factory_test/fast: + $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/build +.PHONY : factory_test/fast + +#============================================================================= +# Target rules for targets named local_test + +# Build rule for target. +local_test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 local_test +.PHONY : local_test + +# fast build rule for target. +local_test/fast: + $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/build +.PHONY : local_test/fast + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... install/strip" + @echo "... install/local" + @echo "... install" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... edit_cache" + @echo "... test" + @echo "... Microsoft.Quantum.Simulator.Runtime" + @echo "... tinymatrix_test" + @echo "... bititerator_test" + @echo "... bitops_test" + @echo "... openmp_test" + @echo "... cpuid_test" + @echo "... argmaxnrm2_test" + @echo "... diagmatrix_test" + @echo "... dbw_test" + @echo "... capi_test" + @echo "... factory_test" + @echo "... local_test" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. 
+ +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/Simulation/Native/advantage_44_4.log b/src/Simulation/Native/advantage_44_4.log new file mode 100644 index 00000000000..fc883d684bd --- /dev/null +++ b/src/Simulation/Native/advantage_44_4.log @@ -0,0 +1,354 @@ +=== Original: + 0: H[0] + 1: H[1] + 2: H[2] + 3: H[3] + 4: H[4] + 5: H[5] + 6: H[6] + 7: H[7] + 8: H[8] + 9: H[9] +10: H[10] +11: H[11] +12: H[12] +13: H[13] +14: H[14] +15: H[15] +16: CZ[2, 3] +17: CZ[10, 11] +18: CZ[4, 5] +19: CZ[12, 13] +20: H[0] +21: H[1] +22: H[2] +23: H[3] +24: H[4] +25: H[5] +26: H[6] +27: H[7] +28: H[8] +29: H[9] +30: H[10] +31: H[11] +32: H[12] +33: H[13] +34: H[14] +35: H[15] +36: CZ[0, 1] +37: CZ[8, 9] +38: CZ[6, 7] +39: CZ[14, 15] +40: H[0] +41: H[1] +42: H[2] +43: H[3] +44: H[4] +45: H[5] +46: H[6] +47: H[7] +48: H[8] +49: H[9] +50: H[10] +51: H[11] +52: H[12] +53: H[13] +54: H[14] +55: H[15] +56: CZ[5, 9] +57: CZ[7, 11] +58: H[0] +59: H[1] +60: H[2] +61: H[3] +62: H[4] +63: H[5] +64: H[6] +65: H[7] +66: H[8] +67: H[9] +68: H[10] +69: H[11] +70: H[12] +71: H[13] +72: H[14] +73: H[15] +74: CZ[4, 8] +75: CZ[6, 10] +76: H[0] +77: H[1] +78: H[2] +79: H[3] +80: H[4] +81: H[5] +82: H[6] +83: H[7] +84: H[8] +85: H[9] +86: H[10] +87: H[11] +88: H[12] +89: H[13] +90: H[14] +91: H[15] +92: CZ[3, 4] +93: CZ[11, 12] +94: CZ[5, 6] +95: CZ[13, 14] +96: H[0] +97: H[1] +98: H[2] +99: H[3] +100: H[4] +101: H[5] +102: H[6] +103: H[7] +104: H[8] +105: H[9] +106: H[10] +107: H[11] +108: H[12] +109: H[13] +110: H[14] +111: H[15] +112: CZ[1, 2] +113: CZ[9, 10] +114: CZ[7, 8] +115: H[0] +116: H[1] +117: H[2] +118: H[3] +119: H[4] +120: H[5] +121: H[6] +122: H[7] +123: H[8] +124: H[9] 
+125: H[10] +126: H[11] +127: H[12] +128: H[13] +129: H[14] +130: H[15] +131: CZ[0, 4] +132: CZ[2, 6] +133: CZ[9, 13] +134: CZ[11, 15] +135: H[0] +136: H[1] +137: H[2] +138: H[3] +139: H[4] +140: H[5] +141: H[6] +142: H[7] +143: H[8] +144: H[9] +145: H[10] +146: H[11] +147: H[12] +148: H[13] +149: H[14] +150: H[15] +151: CZ[8, 12] +152: CZ[10, 14] +153: CZ[1, 5] +154: CZ[3, 7] +155: H[0] +156: H[1] +157: H[2] +158: H[3] +159: H[4] +160: H[5] +161: H[6] +162: H[7] +163: H[8] +164: H[9] +165: H[10] +166: H[11] +167: H[12] +168: H[13] +169: H[14] +170: H[15] +=== Clusters (cost= 5.034): +==== cluster[ 0]: depth=26 width=4 + 0: H[8] + 1: H[9] + 2: H[4] + 3: H[5] + 4: H[8] + 5: H[9] + 6: CZ[4, 5] + 7: H[5] + 8: CZ[8, 9] + 9: H[9] +10: H[4] +11: H[5] +12: H[8] +13: CZ[5, 9] +14: H[4] +15: H[5] +16: H[8] +17: H[9] +18: H[4] +19: H[5] +20: CZ[4, 8] +21: H[9] +22: H[4] +23: H[8] +24: H[9] +25: H[8] +==== cluster[ 1]: depth=26 width=4 +26: H[10] +27: H[11] +28: H[6] +29: H[7] +30: CZ[10, 11] +31: H[11] +32: H[6] +33: H[7] +34: H[10] +35: H[11] +36: CZ[6, 7] +37: H[7] +38: H[10] +39: CZ[7, 11] +40: H[6] +41: H[7] +42: H[10] +43: H[11] +44: H[6] +45: H[7] +46: CZ[6, 10] +47: H[11] +48: H[6] +49: H[7] +50: H[10] +51: H[10] +==== cluster[ 2]: depth=30 width=4 +52: H[0] +53: H[1] +54: H[2] +55: H[3] +56: H[0] +57: H[1] +58: CZ[2, 3] +59: H[3] +60: CZ[0, 1] +61: H[1] +62: H[2] +63: H[3] +64: H[0] +65: H[1] +66: H[2] +67: H[3] +68: H[0] +69: H[1] +70: H[2] +71: H[3] +72: H[0] +73: H[1] +74: H[2] +75: H[0] +76: H[2] +77: H[0] +78: CZ[1, 2] +79: H[2] +80: H[1] +81: H[1] +==== cluster[ 3]: depth=30 width=4 +82: H[12] +83: H[13] +84: H[14] +85: H[15] +86: CZ[12, 13] +87: H[13] +88: H[14] +89: H[15] +90: H[12] +91: H[13] +92: CZ[14, 15] +93: H[15] +94: H[12] +95: H[13] +96: H[14] +97: H[15] +98: H[12] +99: H[13] +100: H[14] +101: H[15] +102: H[12] +103: H[14] +104: H[15] +105: CZ[13, 14] +106: H[14] +107: H[15] +108: H[13] +109: H[14] +110: H[13] +111: H[14] +==== cluster[ 4]: depth=14 
width=4 +112: CZ[7, 8] +113: CZ[3, 4] +114: H[4] +115: H[7] +116: H[8] +117: H[3] +118: H[4] +119: H[7] +120: H[8] +121: H[3] +122: H[3] +123: CZ[3, 7] +124: H[3] +125: H[7] +==== cluster[ 5]: depth=11 width=3 +126: CZ[11, 12] +127: H[12] +128: H[11] +129: H[12] +130: H[11] +131: H[12] +132: CZ[11, 15] +133: H[11] +134: H[11] +135: H[15] +136: H[15] +==== cluster[ 6]: depth=11 width=3 +137: CZ[5, 6] +138: H[6] +139: H[5] +140: H[6] +141: CZ[2, 6] +142: H[5] +143: H[6] +144: H[2] +145: H[5] +146: H[6] +147: H[2] +==== cluster[ 7]: depth= 7 width=3 +148: CZ[9, 10] +149: H[10] +150: H[9] +151: H[10] +152: CZ[10, 14] +153: H[10] +154: H[14] +==== cluster[ 8]: depth= 8 width=4 +155: CZ[0, 4] +156: CZ[8, 12] +157: H[4] +158: H[12] +159: H[0] +160: H[8] +161: H[4] +162: H[0] +==== cluster[ 9]: depth= 8 width=4 +163: CZ[9, 13] +164: CZ[1, 5] +165: H[13] +166: H[1] +167: H[9] +168: H[5] +169: H[13] +170: H[9] diff --git a/src/Simulation/Native/argmaxnrm2_test b/src/Simulation/Native/argmaxnrm2_test new file mode 100644 index 00000000000..caad0d10c76 Binary files /dev/null and b/src/Simulation/Native/argmaxnrm2_test differ diff --git a/src/Simulation/Native/bititerator_test b/src/Simulation/Native/bititerator_test new file mode 100644 index 00000000000..a506e00ca46 Binary files /dev/null and b/src/Simulation/Native/bititerator_test differ diff --git a/src/Simulation/Native/bitops_test b/src/Simulation/Native/bitops_test new file mode 100644 index 00000000000..7b1a9213aa9 Binary files /dev/null and b/src/Simulation/Native/bitops_test differ diff --git a/src/Simulation/Native/capi_test b/src/Simulation/Native/capi_test new file mode 100644 index 00000000000..ab5ec3c9ce2 Binary files /dev/null and b/src/Simulation/Native/capi_test differ diff --git a/src/Simulation/Native/codegen/codegen_fma.py b/src/Simulation/Native/codegen/codegen_fma.py new file mode 100644 index 00000000000..6e523aa1498 --- /dev/null +++ b/src/Simulation/Native/codegen/codegen_fma.py @@ -0,0 +1,434 @@ 
+#!/usr/bin/env python3 +# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger +# Code generator for n-qubit gate + +import sys + + +def avx_type(complex_avx_len): + if complex_avx_len == 2: + return "__m256d" + elif complex_avx_len == 4: + return "__m512d" + elif complex_avx_len == 1: + return "std::complex" + else: + raise Exception("Unknown avx type.") + + +def avx_prefix(complex_avx_len): + if complex_avx_len == 2: + return "_mm256" + elif complex_avx_len == 4: + return "_mm512" + else: + raise Exception("Unknown avx type.") + + +def generate_kernel_core(N, n, kernelarray, blocks, only_one_matrix, unroll_loops, avx_len): + indent = 1 + + kernelarray.append("// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger\n\ntemplate \ninline void kernel_core(V& psi, std::size_t I") + for i in range(n): + kernelarray.append(", std::size_t d" + str(i)) + + kernelarray.append(", M const& m") + if not only_one_matrix: + kernelarray.append(", M const& mt") + kernelarray.append(")\n{\n") + + indices = [""]*N + for num in range(N): + tmp = "I" + for b in range(n): + if (num>>b) & 1: + tmp = tmp + " + d"+str(b) + indices[num] = tmp + + add = ["\t" + avx_type(avx_len) + " v[" + str(int(N/blocks)) + "];\n"] + for b in range(blocks): + if avx_len == 4: + x4 = "x4" + else: + x4 = "" + for num in range(int(N/blocks)*b, int(N/blocks)*(b+1)): + add.append("\n\tv[" + str(int(num % (N/blocks))) + "] = ") + if avx_len > 1: + add.append("load1" + x4 + "(&") + add.append("psi[" + indices[num] + "]") + if avx_len > 1: + add.append(")") + add.append(";") + add.append("\n") + if b == 0: + add.append("\n\t" + avx_type(avx_len) + " tmp[" + str(int(N/avx_len)) + "] = {") + for i in range(int(N/avx_len)): + if avx_len > 1: + add.append(avx_prefix(avx_len) + "_setzero_pd(), ") + else: + add.append("0., ") + add[-1] = add[-1][:-2] + "};\n" + + if unroll_loops: + inline_FMAs = False + miniblocks = N/avx_len/4 + miniblocks = max(miniblocks, 1) + for mb in range(int(miniblocks)): + for rb 
in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + + for i in range(int(N/blocks)): + if not inline_FMAs or avx_len == 1: + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)+i+r*int(N/blocks)) +"], ") + else: + add.append(avx_prefix(avx_len) + "_fmadd_pd(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and not only_one_matrix and not avx_len == 1: + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + for i in range(int(N/blocks)): + add.append(avx_prefix(avx_len) + "_fmadd_pd(" + avx_prefix(avx_len) + "_permute_pd(v[" + str(i) + "], 5), mt[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and only_one_matrix and avx_len > 1: + raise Exception("Not implemented yet!") + + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + if b == blocks-1: + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + else: + add.append("\tfor (unsigned i = 0; i < " + str(int(N/avx_len)) + "; ++i){\n\t\ttmp[i] = ") + for i in range(int(N/blocks)): + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * "+str(int(N/blocks)) + " + " + str(i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * 
" + str(int(N/blocks)) + " + " + str(i) +"], ") + add.append("tmp[i]") + add.append(")"*int(N/blocks)+";") + add.append("\n\t}\n") + if b == blocks-1: + for r in range(int(N/avx_len)): + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + + add.append("\n") + kernelarray.append("".join(add)) + add=[""] + kernelarray.append("".join(add)) + kernelarray.append("\n}\n\n") + +def generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx_len): + kernel = "" + + N = 1<\n" + kernel = kernel + "void kernel(V& psi" + for i in range(n-1,-1,-1): + kernel = kernel + ", unsigned id"+str(i) + kernel = kernel + ", M const& matrix, std::size_t ctrlmask)\n{\n std::size_t n = psi.size();\n" + + for i in idx: + kernel = kernel + "\tstd::size_t d"+str(i)+" = 1ULL << id"+str(i)+";\n" + + kernel += ("\tauto m = matrix;\n" + "\tstd::size_t dsorted[] = {") + add = ["d0"] + for i in range(1,n): + add.append(", d" + str(i)) + add.append("};\n") + add.append("\tpermute_qubits_and_matrix(dsorted, " + str(n) + ", m);\n") + kernel += "".join(add) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mm[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loada") + if only_one_matrix: + add[-1] = add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mm[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + 
str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmm[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = ") + if avx_len > 1: + add.append("loada") + if only_one_matrix: + add[-1] = add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");" + else: + add.append("m[r][c+b*"+str(int(N/blocks))+"];") + add.append("\n\t\t\t}\n\t\t}\n\t}\n") + kernelarray.append("".join(add)) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mmt[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loadbm") + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mmt[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmmt[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = loadbm(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");\n\t\t\t}\n\t\t}\n\t}\n" + + if only_one_matrix: + add = [] + + add.append("\n\n") + kernelarray.append("".join(add)) + + nc = len(idx)-1 + add = [] + indent = 1 + kernelarray.append("#ifndef _MSC_VER\n") + kernelarray.append("\t"*indent + "if (ctrlmask == 0){\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static) proc_bind(spread)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t 
i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + + kernelarray.append("\t"*indent + "kernel_core(psi, i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + # if controlmask != 0 + indent = 1 + kernelarray.append("\t"*indent + "else{\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + kernelarray.append("\t"*indent + "if (((i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + kernelarray.append(")&ctrlmask) == ctrlmask)\n") + kernelarray.append("\t"*(indent+1) + "kernel_core(psi, i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", 
mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + +################ Start of _MSC_VER code block ################## + kernelarray.append("#else\n") + kernelarray.append(" std::intptr_t zero = 0;\n") + kernelarray.append(" std::intptr_t dmask = dsorted[0]"); + for i in range(n-1): kernelarray.append(" + dsorted["+str(i+1)+"]") + kernelarray.append( ";\n") + kernelarray.append("\n"); + kernelarray.append(" if (ctrlmask == 0){\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + # if controlmask != 0 + kernelarray.append(" } else {\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + kernelarray.append(" }\n") + kernelarray.append("#endif\n") + + kernelarray.append("}\n") + kernel = "".join([kernel,"".join(kernelarray)]) + return kernel + +def generate_includes(N): + return "#include \n#include \n#include \n#include \n#include \n#include \n" + \ + "#include \"alignedallocator.hpp\"\n#include \"timing.hpp\"\n#include \"cintrin.hpp\"\n" + \ + "#include \n#include \n\n" + \ + "#include \"util/par_for.hpp\"\n" + \ + "using namespace std;\n#define LOOP_COLLAPSE" + str(N) + " 
" + str(N+1) + "\n" + +def generate_main(n): + N = str(1 << n) + text = "using rowtype = vector,aligned_allocator,64>>;\nusing matrixtype = vector;\n\nint main(int argc, char *argv[]){" + text = text + "\n\tassert(argc > "+str(1+n)+");" + text = text + "\n\tsize_t N = 1ULL << atoi(argv[1]);" + for i in range(n): + text = text + "\n\tunsigned i" + str(i) + " = atoi(argv[" + str(i+2) + "]);" + + text = text + "\n\tmatrixtype m("+N+", rowtype("+N+"));"; + text = text + "\n\tfor (unsigned i = 0; i < "+N+"; ++i)\n\t\tfor (unsigned j = 0; j < "+N+"; ++j)\n\t\t\tm[i][j] = drand48();\n" + + text = text + "\n\tTimer t;\n\tfor (unsigned threads = 1; threads <= 24; ++threads){" + text = text + "\n\t\tomp_set_num_threads(threads);" + text = text + "\n\t\trowtype psi(N);\n\t\t#pragma omp parallel\n\t\t{\n\t\t\t#pragma omp for schedule(static)\n\t\t\tfor (size_t i = 0; i < psi.size(); ++i)\n\t\t\t\tpsi[i] = drand48();\n" + text = text + "\n\t\t\t#pragma omp single\n\t\t\tt.start();" + text = text + "\n\t\t\tkernel(psi, N" + for i in range(n): + text = text + ", i" + str(i) + text = text + ", m, 0);" + text = text + "\n\t\t\t#pragma omp waitall\n\t\t\t#pragma omp single\n\t\t\t{ cout << \"threads: \" << threads << \", time:\" << t.stop()*1.e-6 << \"\\n\"; }" + text = text + "\n\t\t}" # end for + text = text + "\n\t}" # end for + text = text + "\n\n}" # end main + return text + + +##################################################### +# MAIN # +##################################################### + +if len(sys.argv) < 2: + print("Generates the code for an n-qubit gate.\nUsage:\n./codegen_fma.py [n_qubits] {n_blocks} {only one matrix?} {unroll loops?} {none|avx2|avx512}\n\n") + exit() + +n = int(sys.argv[1]) # number of qubits + +try: # number of blocks + blocks = int(sys.argv[2]) +except Exception: + blocks = 1 + +try: + only_one_matrix = int(sys.argv[3]) +except Exception: + only_one_matrix = False + +try: + unroll_loops = int(sys.argv[4]) +except Exception: + unroll_loops = 
False + +try: + avx = str(sys.argv[5]) + if avx == "avx512": + avx = 4 + elif avx == "avx2" or avx == "avx": + avx = 2 + elif avx == "none": + avx = 1 + only_one_matrix = True + else: + raise RuntimeError("Unknown avx type: {}".format(avx)) +except IndexError: + avx = 2 + +while (1 << n)/blocks < 1: + blocks = int(blocks/2) + +if (1 << n) < avx: + avx = int(avx/2) + +kernel = generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx) # generate code for n-qubit gate + +# if user wants a main (for testing) generate as well: +for a in sys.argv: + if str(a) == "gen_main": + kernel = generate_includes(n) + kernel + generate_main(n) + +print(kernel) diff --git a/src/Simulation/Native/codegen/codegen_test.cpp b/src/Simulation/Native/codegen/codegen_test.cpp new file mode 100644 index 00000000000..1ff778ce96d --- /dev/null +++ b/src/Simulation/Native/codegen/codegen_test.cpp @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include +#include "alignedalloc.hpp" +//#include "timing.hpp" +#include "cintrin.hpp" +#include +#include + +#include "util/par_for.hpp" +using namespace std; +#define LOOP_COLLAPSE1 2 +// (C) 2018 ETH Zurich, ITP, Thomas H�ner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m) +{ + std::complex v[2]; + + v[0] = psi[I]; + v[1] = psi[I + d0]; + + std::complex tmp[2] = {0., 0.}; + + tmp[0] = fma(v[0], m[0], fma(v[1], m[1], tmp[0])); + tmp[1] = fma(v[0], m[2], fma(v[1], m[3], tmp[1])); + psi[I] = tmp[0]; + psi[I + d0] = tmp[1]; + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + auto m = matrix; + std::size_t dsorted[] = {d0}; + permute_qubits_and_matrix(dsorted, 1, m); + + std::complex mm[4]; + for (unsigned b = 0; b < 1; ++b){ + for (unsigned r = 0; r < 2; ++r){ + for (unsigned c = 0; c < 2; ++c){ + mm[b*4+r*2+c] = m[r][c+b*2]; + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + kernel_core(psi, i0 + i1, dsorted[0], mm); + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + if (((i0 + i1)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1, dsorted[0], mm); + } + } + } +#else + intptr_t zero = 0; + intptr_t dmask = dsorted[0]; + + if (ctrlmask == 0){ + auto thrdFnc= [&](size_t dsorted[],intptr_t& dmask, intptr_t& zero,V &psi,M const& m) { + return [&](unsigned i) { + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[0], m); + }; + }; + pl::async_par_for(0,n,thrdFnc(dsorted,dmask,zero,psi,m)); + } else { + auto thrdFnc= [&](size_t dsorted[],size_t& ctrlmask,intptr_t& dmask, intptr_t& zero,V &psi,M const& m) { + return [&](unsigned i) { + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[0], m); + }; + }; + pl::async_par_for(0,n,thrdFnc(dsorted,ctrlmask,dmask,zero,psi,m)); + } +#endif +} +using rowtype = vector,AlignedAlloc,64>>; +using matrixtype = vector; + +int main(int argc, char *argv[]){ + assert(argc > 2); + size_t N = 1ULL << atoi(argv[1]); + unsigned i0 = atoi(argv[2]); + matrixtype m(2, rowtype(2)); + for (unsigned i = 0; i < 2; ++i) + for (unsigned j = 0; j < 2; ++j) + m[i][j] = drand48(); + + 
Timer t; + for (unsigned threads = 1; threads <= 24; ++threads){ + omp_set_num_threads(threads); + rowtype psi(N); + #pragma omp parallel + { + #pragma omp for schedule(static) + for (size_t i = 0; i < psi.size(); ++i) + psi[i] = drand48(); + + #pragma omp single + t.start(); + kernel(psi, N, i0, m, 0); + #pragma omp waitall + #pragma omp single + { cout << "threads: " << threads << ", time:" << t.stop()*1.e-6 << "\n"; } + } + } + +} diff --git a/src/Simulation/Native/codegen/generate.ps1 b/src/Simulation/Native/codegen/generate.ps1 new file mode 100644 index 00000000000..1cc506d187b --- /dev/null +++ b/src/Simulation/Native/codegen/generate.ps1 @@ -0,0 +1,21 @@ +# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel +# instead of using two matrices (which allows to reduce the number of operations +# by pre-computation) +$onematrix=(0,0,0,0,0,0,1,1) # g++ best + +# unroll[i] determines whether to unroll loops +$unroll=(1,1,1,1,1,1,0,0) # g++ best + +# register length to use: can be none, avx2, or avx512 +# avx=avx2 + +# blocking: must be a power of two and at most 2^k for a k-qubit gate +$b=(0,2,4,8,16,16,16,32) # gcc & icc best + +foreach ($i in 1..7) { + "Generating $i kernel with $b[$i] blocks." 
+ python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] none > nointrin/kernel$i.hpp + python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx > avx/kernel$i.hpp + python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx2 > avx2/kernel$i.hpp + python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx512 > avx512/kernel$i.hpp +} \ No newline at end of file diff --git a/src/Simulation/Native/codegen/generate.sh b/src/Simulation/Native/codegen/generate.sh new file mode 100644 index 00000000000..2ec0557d571 --- /dev/null +++ b/src/Simulation/Native/codegen/generate.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel +# instead of using two matrices (which allows to reduce the number of operations +# by pre-computation) +onematrix=(0 0 0 0 0 0 1 1) # g++ best +#onematrix=(0 0 1 0 0 0 1 1) # icc best + +# unroll[i] determines whether to unroll loops +unroll=(1 1 1 1 1 1 0 0) # g++ best +#unroll=(1 1 1 0 0 1 0 0) # icc best +#unroll=(0 0 0 0 0 0 0 0) + +# register length to use: can be none, avx2, or avx512 +avx=avx2 + +# blocking: must be a power of two and at most 2^k for a k-qubit gate +b=(0 2 4 8 16 16 16 32) # gcc & icc best + +for i in {1..7} +do + echo "Generating $i kernel with ${b[$i]} blocks." 
+ ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} none > ../nointrin/kernel${i}.hpp + ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx > ../avx/kernel${i}.hpp + ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx2 > ../avx2/kernel${i}.hpp + ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx512 > ../avx512/kernel${i}.hpp +done diff --git a/src/Simulation/Native/cpuid_test b/src/Simulation/Native/cpuid_test new file mode 100644 index 00000000000..97f5752dcb7 Binary files /dev/null and b/src/Simulation/Native/cpuid_test differ diff --git a/src/Simulation/Native/diagmatrix_test b/src/Simulation/Native/diagmatrix_test new file mode 100644 index 00000000000..b070bbb4ab9 Binary files /dev/null and b/src/Simulation/Native/diagmatrix_test differ diff --git a/src/Simulation/Native/doCopy.ps1 b/src/Simulation/Native/doCopy.ps1 new file mode 100644 index 00000000000..e0fb1b6e1d2 --- /dev/null +++ b/src/Simulation/Native/doCopy.ps1 @@ -0,0 +1,13 @@ +param([string]$bld = "Release") + +"================== COPYING $bld" +$dll = "Microsoft.Quantum.Simulator.Runtime.*" # DLL and PDB +$srcDir = "C:\depot\Git\qsharp-runtime\src\Simulation\Native\build\$bld" +foreach ($dest in "H2O","Ham","integer-factorization") { + foreach ($typ in "Release","Debug") { + $dstDir = "C:\depot\Git\msr-quarc\wecker\QDK\$dest\bin\$typ" + $dstDir += "\netcoreapp3.0\runtimes\win-x64\native" + robocopy /NJH /NJS /NP /NDL $srcDir $dstDir $dll | Out-Null + } +} +exit 0 diff --git a/src/Simulation/Native/factory_test b/src/Simulation/Native/factory_test new file mode 100644 index 00000000000..b5937b571f8 Binary files /dev/null and b/src/Simulation/Native/factory_test differ diff --git a/src/Simulation/Native/local_test b/src/Simulation/Native/local_test new file mode 100644 index 00000000000..feaf3627ddc Binary files /dev/null and b/src/Simulation/Native/local_test differ diff --git a/src/Simulation/Native/openmp_test b/src/Simulation/Native/openmp_test new 
file mode 100644 index 00000000000..00e5d9f877f Binary files /dev/null and b/src/Simulation/Native/openmp_test differ diff --git a/src/Simulation/Native/parseLog.py b/src/Simulation/Native/parseLog.py new file mode 100644 index 00000000000..ee7837867e0 --- /dev/null +++ b/src/Simulation/Native/parseLog.py @@ -0,0 +1,113 @@ +import re +import sys +import numpy as np + +logName = sys.argv[1] +reSched = re.compile(r"^==== sched:\s+(\S+)") +reFN = re.compile(r"^(\S+)\.") +reNQs = re.compile(r"nQs=(\d+) .*range=(\d+).*prb=(\d+)") +reSim = re.compile(' (Generic|AVX|AVX2|AVX512)$') +rePars = re.compile(r'OMP_NUM_THREADS=(\d+) fusedSpan=(\d) fusedDepth=(\d+) wfnCapacity=(\d+)') +reInfo = re.compile(r'sz=([.\d]+) nQs=([.\d]+) nCs=([.\d]+) flsh= *([.\de+-]+).*gts= *([.\de+-]+).*elap= *(\d+).*(.)gps= *([.\de+-]+).*fus= *([.\d]+).*ker= *([.\d]+)') +found = reFN.search(logName) +env = found.group(1) +fp = open(logName,'r') +gpss = [] +print(f'"env","test","typ","sim","qs","threads","span","sz","gps"') +sim = "" +totalQs = -1 +threads = -1 +span = -1 +sz = -1 +rng = 1 +prb = -1 +sched = "???" 
+ +prbs = [ + "ladder" , + "ladder" , + "shor_4" , + "shor_6" , + "shor_8" , + "shor_10" , + "shor_12" , + "suprem_44", + "suprem_55", + "suprem_56", + "qulacs_5", + "qulacs_10", + "qulacs_15", + "qulacs_20", + "qulacs_25" +] +def dumpGpss(): + global gpss,env,sim,totalQs,threads,span,sz,rng,prb,sched + if len(gpss) > 0: + gpsMed = np.median(gpss) + cnt = 0.0 + tot = 0.0 + for gps in gpss: + if gps > gpsMed/2.0 and gps < gpsMed*1.5: + cnt += 1.0 + tot += gps + if cnt > 0: gps = tot/cnt + else: gps = gpsAvg + + nam = prbs[prb] + + if rng == 0: nam = f'{env},{nam}L' + elif rng == 2: nam = f'{env},{nam}H' + else: nam = f'{env},{nam}' + + print(f"{nam},{sched},{sim},{totalQs},{threads},{span},{sz},{gps:.1f}") + + gpss = [] + +while True: + inp = fp.readline() + if inp == "": + dumpGpss() + break + found = reSched.search(inp) + if found: + dumpGpss() + sched = found.group(1) + continue + found = reNQs.search(inp) + if found: + dumpGpss() + totalQs = found.group(1) + rng = int(found.group(2)) + prb = int(found.group(3)) + continue + found = reSim.search(inp) + if found: + dumpGpss() + sim = found.group(1) + continue + found = rePars.search(inp) + if found: + threads = found.group(1) + span = found.group(2) + limit = found.group(3) + wfnSiz = found.group(4) + continue + found = reInfo.search(inp) + if found: + sz = found.group(1) + nQs = float(found.group(2)) + nCs = float(found.group(3)) + flushes = found.group(4) + gates = found.group(5) + elap = found.group(6) + if (found.group(7) == 'k'): mul = 1000.0 + else: mul = 1.0 + gps = float(found.group(8)) * mul + fusions = found.group(9) + kernel = found.group(10) + gpss.append(gps) + continue + + +fp.close() + diff --git a/src/Simulation/Native/src/CMakeLists.txt b/src/Simulation/Native/src/CMakeLists.txt index e53864f85c9..faf1f1c480c 100644 --- a/src/Simulation/Native/src/CMakeLists.txt +++ b/src/Simulation/Native/src/CMakeLists.txt @@ -8,8 +8,9 @@ set(AVX2FLAGS "/arch:AVX2" ) set(AVX512FLAGS "/arch:AVX512" ) set(FMAFLAGS 
"") else(MSVC) -SET(AVXFLAGS "-mavx" ) -set(AVX2FLAGS -mfma;-mavx2) +SET(AVXFLAGS "-mavx") +set(AVX2FLAGS "-mfma -mavx2") +set(AVX512FLAGS "-mfma -mavx512f -mavx512cd") set(FMAFLAGS ) endif(MSVC) @@ -19,14 +20,16 @@ configure_file(version.hpp.in ${PROJECT_BINARY_DIR}/src/version.hpp) add_subdirectory(util) add_subdirectory(simulator) -set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp ) +set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp simulator/simulatoravx512.cpp ) if(BUILD_SHARED_LIBS) add_library(Microsoft.Quantum.Simulator.Runtime SHARED ${SOURCES}) set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS}) if (MSVC) set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS}) +set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS}) else(MSVC) -set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS "-mavx2 -mfma") +set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS}) +set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS}) endif(MSVC) message (STATUS "Building shared library") target_compile_definitions(Microsoft.Quantum.Simulator.Runtime PRIVATE BUILD_DLL=1) @@ -36,6 +39,7 @@ else(BUILD_SHARED_LIBS) add_library(Microsoft.Quantum.Simulator.Runtime STATIC ${SOURCES}) set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS}) set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS ${AVX2FLAGS}) + set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS ${AVX512FLAGS}) 
endif(BUILD_SHARED_LIBS) install(TARGETS Microsoft.Quantum.Simulator.Runtime diff --git a/src/Simulation/Native/src/Makefile b/src/Simulation/Native/src/Makefile new file mode 100644 index 00000000000..edaded273e5 --- /dev/null +++ b/src/Simulation/Native/src/Makefile @@ -0,0 +1,422 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.16 + +# Default target executed when no arguments are given to make. +default_target: all + +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + + +# A target that is always out of date. +cmake_force: + +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." 
+ /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components + +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target test +test: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..." + /usr/bin/ctest --force-new-ctest-process $(ARGS) +.PHONY : test + +# Special rule for the target test +test/fast: test + +.PHONY : test/fast + +# The main all target +all: cmake_check_build_system + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/CMakeFiles/progress.marks + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/all + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/clean +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule +.PHONY : src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule + +# Convenience name for target. +Microsoft.Quantum.Simulator.Runtime: src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/rule + +.PHONY : Microsoft.Quantum.Simulator.Runtime + +# fast build rule for target. +Microsoft.Quantum.Simulator.Runtime/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build +.PHONY : Microsoft.Quantum.Simulator.Runtime/fast + +simulator/capi.o: simulator/capi.cpp.o + +.PHONY : simulator/capi.o + +# target to build an object file +simulator/capi.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.o +.PHONY : simulator/capi.cpp.o + +simulator/capi.i: simulator/capi.cpp.i + +.PHONY : simulator/capi.i + +# target to preprocess a source file +simulator/capi.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.i +.PHONY : simulator/capi.cpp.i + +simulator/capi.s: simulator/capi.cpp.s + 
+.PHONY : simulator/capi.s + +# target to generate assembly for a file +simulator/capi.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/capi.cpp.s +.PHONY : simulator/capi.cpp.s + +simulator/factory.o: simulator/factory.cpp.o + +.PHONY : simulator/factory.o + +# target to build an object file +simulator/factory.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.o +.PHONY : simulator/factory.cpp.o + +simulator/factory.i: simulator/factory.cpp.i + +.PHONY : simulator/factory.i + +# target to preprocess a source file +simulator/factory.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.i +.PHONY : simulator/factory.cpp.i + +simulator/factory.s: simulator/factory.cpp.s + +.PHONY : simulator/factory.s + +# target to generate assembly for a file +simulator/factory.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/factory.cpp.s +.PHONY : simulator/factory.cpp.s + +simulator/simulator.o: simulator/simulator.cpp.o + +.PHONY : simulator/simulator.o + +# target to build an object file +simulator/simulator.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.o +.PHONY : simulator/simulator.cpp.o + +simulator/simulator.i: simulator/simulator.cpp.i + 
+.PHONY : simulator/simulator.i + +# target to preprocess a source file +simulator/simulator.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.i +.PHONY : simulator/simulator.cpp.i + +simulator/simulator.s: simulator/simulator.cpp.s + +.PHONY : simulator/simulator.s + +# target to generate assembly for a file +simulator/simulator.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulator.cpp.s +.PHONY : simulator/simulator.cpp.s + +simulator/simulatoravx.o: simulator/simulatoravx.cpp.o + +.PHONY : simulator/simulatoravx.o + +# target to build an object file +simulator/simulatoravx.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.o +.PHONY : simulator/simulatoravx.cpp.o + +simulator/simulatoravx.i: simulator/simulatoravx.cpp.i + +.PHONY : simulator/simulatoravx.i + +# target to preprocess a source file +simulator/simulatoravx.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.i +.PHONY : simulator/simulatoravx.cpp.i + +simulator/simulatoravx.s: simulator/simulatoravx.cpp.s + +.PHONY : simulator/simulatoravx.s + +# target to generate assembly for a file +simulator/simulatoravx.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make 
src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx.cpp.s +.PHONY : simulator/simulatoravx.cpp.s + +simulator/simulatoravx2.o: simulator/simulatoravx2.cpp.o + +.PHONY : simulator/simulatoravx2.o + +# target to build an object file +simulator/simulatoravx2.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.o +.PHONY : simulator/simulatoravx2.cpp.o + +simulator/simulatoravx2.i: simulator/simulatoravx2.cpp.i + +.PHONY : simulator/simulatoravx2.i + +# target to preprocess a source file +simulator/simulatoravx2.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.i +.PHONY : simulator/simulatoravx2.cpp.i + +simulator/simulatoravx2.s: simulator/simulatoravx2.cpp.s + +.PHONY : simulator/simulatoravx2.s + +# target to generate assembly for a file +simulator/simulatoravx2.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx2.cpp.s +.PHONY : simulator/simulatoravx2.cpp.s + +simulator/simulatoravx512.o: simulator/simulatoravx512.cpp.o + +.PHONY : simulator/simulatoravx512.o + +# target to build an object file +simulator/simulatoravx512.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.o +.PHONY : simulator/simulatoravx512.cpp.o + +simulator/simulatoravx512.i: simulator/simulatoravx512.cpp.i + +.PHONY : simulator/simulatoravx512.i + +# target to preprocess a 
source file +simulator/simulatoravx512.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.i +.PHONY : simulator/simulatoravx512.cpp.i + +simulator/simulatoravx512.s: simulator/simulatoravx512.cpp.s + +.PHONY : simulator/simulatoravx512.s + +# target to generate assembly for a file +simulator/simulatoravx512.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/simulator/simulatoravx512.cpp.s +.PHONY : simulator/simulatoravx512.cpp.s + +util/openmp.o: util/openmp.cpp.o + +.PHONY : util/openmp.o + +# target to build an object file +util/openmp.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.o +.PHONY : util/openmp.cpp.o + +util/openmp.i: util/openmp.cpp.i + +.PHONY : util/openmp.i + +# target to preprocess a source file +util/openmp.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.i +.PHONY : util/openmp.cpp.i + +util/openmp.s: util/openmp.cpp.s + +.PHONY : util/openmp.s + +# target to generate assembly for a file +util/openmp.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/build.make src/CMakeFiles/Microsoft.Quantum.Simulator.Runtime.dir/util/openmp.cpp.s +.PHONY : util/openmp.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... 
all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... install/strip" + @echo "... install/local" + @echo "... install" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... edit_cache" + @echo "... test" + @echo "... Microsoft.Quantum.Simulator.Runtime" + @echo "... simulator/capi.o" + @echo "... simulator/capi.i" + @echo "... simulator/capi.s" + @echo "... simulator/factory.o" + @echo "... simulator/factory.i" + @echo "... simulator/factory.s" + @echo "... simulator/simulator.o" + @echo "... simulator/simulator.i" + @echo "... simulator/simulator.s" + @echo "... simulator/simulatoravx.o" + @echo "... simulator/simulatoravx.i" + @echo "... simulator/simulatoravx.s" + @echo "... simulator/simulatoravx2.o" + @echo "... simulator/simulatoravx2.i" + @echo "... simulator/simulatoravx2.s" + @echo "... simulator/simulatoravx512.o" + @echo "... simulator/simulatoravx512.i" + @echo "... simulator/simulatoravx512.s" + @echo "... util/openmp.o" + @echo "... util/openmp.i" + @echo "... util/openmp.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/Simulation/Native/src/config.hpp b/src/Simulation/Native/src/config.hpp new file mode 100644 index 00000000000..363baa4d241 --- /dev/null +++ b/src/Simulation/Native/src/config.hpp @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include + +// check if we want to force single precision +/* #undef USE_SINGLE_PRECISION */ + +// check if we have AVX intrinsics +/* #undef HAVE_INTRINSICS */ + +// check if we have AVX-512 intrinsics +/* #undef HAVE_AVX512 */ + +// check if we want to use fused kernels +#define USE_GATE_FUSION + +#define BUILD_SHARED_LIBS + + +#if defined (_MSC_VER) && defined (BUILD_SHARED_LIBS) + +#ifdef BUILD_DLL +#define MICROSOFT_QUANTUM_DECL __declspec(dllexport) +#else +#define MICROSOFT_QUANTUM_DECL __declspec(dllimport) +#endif +#define MICROSOFT_QUANTUM_DECL_IMPORT __declspec(dllimport) +#else +#define MICROSOFT_QUANTUM_DECL +#define MICROSOFT_QUANTUM_DECL_IMPORT +#endif + +#ifdef HAVE_INTRINSICS +#ifdef HAVE_AVX512 +#define SIMULATOR SimulatorAVX512 +#else +#ifdef HAVE_FMA +#define SIMULATOR SimulatorAVX2 +#else +#define SIMULATOR SimulatorAVX +#endif +#endif +#else +#define SIMULATOR SimulatorGeneric +#endif + + diff --git a/src/Simulation/Native/src/external/avx/kernel1.hpp b/src/Simulation/Native/src/external/avx/kernel1.hpp index eac0cf47ea4..24799c4524e 100644 --- a/src/Simulation/Native/src/external/avx/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx/kernel1.hpp @@ -22,7 +22,7 @@ inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m, M con template void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) { - std::size_t n = psi.size(); + std::size_t n = psi.size(); std::size_t d0 = 1ULL << id0; auto m = matrix; std::size_t dsorted[] = {d0}; @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, 
unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx/kernel2.hpp b/src/Simulation/Native/src/external/avx/kernel2.hpp index 24f6c8ee13d..7d52dc39eec 100644 --- a/src/Simulation/Native/src/external/avx/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx/kernel3.hpp b/src/Simulation/Native/src/external/avx/kernel3.hpp index a63dbc28693..58248d4742e 100644 --- a/src/Simulation/Native/src/external/avx/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) 
schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel4.hpp b/src/Simulation/Native/src/external/avx/kernel4.hpp index 92f573a0255..7ddcd504404 100644 --- a/src/Simulation/Native/src/external/avx/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel5.hpp b/src/Simulation/Native/src/external/avx/kernel5.hpp index 0cdba4b89c5..72078dd6fd4 100644 --- 
a/src/Simulation/Native/src/external/avx/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel6.hpp b/src/Simulation/Native/src/external/avx/kernel6.hpp index 343c6e3154c..89a4364b22c 100644 --- a/src/Simulation/Native/src/external/avx/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for 
(std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel7.hpp b/src/Simulation/Native/src/external/avx/kernel7.hpp index ecd0f45f3cf..8dfda9eee71 100644 --- a/src/Simulation/Native/src/external/avx/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel1.hpp b/src/Simulation/Native/src/external/avx2/kernel1.hpp index eac0cf47ea4..198676259e4 100644 --- a/src/Simulation/Native/src/external/avx2/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel1.hpp @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; 
++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx2/kernel2.hpp b/src/Simulation/Native/src/external/avx2/kernel2.hpp index 24f6c8ee13d..7d52dc39eec 100644 --- a/src/Simulation/Native/src/external/avx2/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx2/kernel3.hpp b/src/Simulation/Native/src/external/avx2/kernel3.hpp index a63dbc28693..58248d4742e 100644 --- a/src/Simulation/Native/src/external/avx2/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for 
collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel4.hpp b/src/Simulation/Native/src/external/avx2/kernel4.hpp index 92f573a0255..7ddcd504404 100644 --- a/src/Simulation/Native/src/external/avx2/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel5.hpp 
b/src/Simulation/Native/src/external/avx2/kernel5.hpp index 0cdba4b89c5..72078dd6fd4 100644 --- a/src/Simulation/Native/src/external/avx2/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel6.hpp b/src/Simulation/Native/src/external/avx2/kernel6.hpp index 343c6e3154c..89a4364b22c 100644 --- a/src/Simulation/Native/src/external/avx2/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for 
collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel7.hpp b/src/Simulation/Native/src/external/avx2/kernel7.hpp index ecd0f45f3cf..8dfda9eee71 100644 --- a/src/Simulation/Native/src/external/avx2/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx512/kernel1.hpp b/src/Simulation/Native/src/external/avx512/kernel1.hpp new file mode 100644 index 00000000000..19f2c473370 --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel1.hpp @@ -0,0 +1,85 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m, M const& mt) +{ + __m256d v[1]; + + v[0] = load1(&psi[I]); + + __m256d tmp[1] = {_mm256_setzero_pd()}; + + tmp[0] = 
fma(v[0], m[0], mt[0], tmp[0]); + + v[0] = load1(&psi[I + d0]); + + tmp[0] = fma(v[0], m[1], mt[1], tmp[0]); + store((double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + auto m = matrix; + std::size_t dsorted[] = {d0}; + permute_qubits_and_matrix(dsorted, 1, m); + + __m256d mm[2]; + for (unsigned b = 0; b < 2; ++b){ + for (unsigned r = 0; r < 1; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mm[b*1+r*1+c] = loada(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]); + } + } + } + + __m256d mmt[2]; + for (unsigned b = 0; b < 2; ++b){ + for (unsigned r = 0; r < 1; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mmt[b*1+r*1+c] = loadbm(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + if (((i0 + i1)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[0], mm, mmt); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[0], mm, mmt); + } +#endif +} + diff --git 
a/src/Simulation/Native/src/external/avx512/kernel2.hpp b/src/Simulation/Native/src/external/avx512/kernel2.hpp new file mode 100644 index 00000000000..9a47f3044fb --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel2.hpp @@ -0,0 +1,98 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m, M const& mt) +{ + __m512d v[1]; + + v[0] = load1x4(&psi[I]); + + __m512d tmp[1] = {_mm512_setzero_pd()}; + + tmp[0] = fma(v[0], m[0], mt[0], tmp[0]); + + v[0] = load1x4(&psi[I + d0]); + + tmp[0] = fma(v[0], m[1], mt[1], tmp[0]); + + v[0] = load1x4(&psi[I + d1]); + + tmp[0] = fma(v[0], m[2], mt[2], tmp[0]); + + v[0] = load1x4(&psi[I + d0 + d1]); + + tmp[0] = fma(v[0], m[3], mt[3], tmp[0]); + store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + auto m = matrix; + std::size_t dsorted[] = {d0, d1}; + permute_qubits_and_matrix(dsorted, 2, m); + + __m512d mm[4]; + for (unsigned b = 0; b < 4; ++b){ + for (unsigned r = 0; r < 1; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mm[b*1+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + __m512d mmt[4]; + for (unsigned b = 0; b < 4; ++b){ + for (unsigned r = 0; r < 1; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mmt[b*1+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < 
dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ + kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ + if (((i0 + i1 + i2)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt); + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernel3.hpp b/src/Simulation/Native/src/external/avx512/kernel3.hpp new file mode 100644 index 00000000000..a0f27741672 --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel3.hpp @@ -0,0 +1,128 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m, M const& mt) +{ + __m512d v[1]; + + v[0] = load1x4(&psi[I]); + + __m512d tmp[2] = {_mm512_setzero_pd(), _mm512_setzero_pd()}; + + tmp[0] = fma(v[0], m[0], mt[0], tmp[0]); + tmp[1] = fma(v[0], m[1], mt[1], tmp[1]); + + v[0] = load1x4(&psi[I + d0]); + + tmp[0] = fma(v[0], m[2], mt[2], tmp[0]); + tmp[1] = fma(v[0], m[3], mt[3], tmp[1]); + + v[0] = load1x4(&psi[I + d1]); + + tmp[0] = fma(v[0], m[4], mt[4], tmp[0]); + tmp[1] = fma(v[0], m[5], mt[5], tmp[1]); + + v[0] = 
load1x4(&psi[I + d0 + d1]); + + tmp[0] = fma(v[0], m[6], mt[6], tmp[0]); + tmp[1] = fma(v[0], m[7], mt[7], tmp[1]); + + v[0] = load1x4(&psi[I + d2]); + + tmp[0] = fma(v[0], m[8], mt[8], tmp[0]); + tmp[1] = fma(v[0], m[9], mt[9], tmp[1]); + + v[0] = load1x4(&psi[I + d0 + d2]); + + tmp[0] = fma(v[0], m[10], mt[10], tmp[0]); + tmp[1] = fma(v[0], m[11], mt[11], tmp[1]); + + v[0] = load1x4(&psi[I + d1 + d2]); + + tmp[0] = fma(v[0], m[12], mt[12], tmp[0]); + tmp[1] = fma(v[0], m[13], mt[13], tmp[1]); + + v[0] = load1x4(&psi[I + d0 + d1 + d2]); + + tmp[0] = fma(v[0], m[14], mt[14], tmp[0]); + tmp[1] = fma(v[0], m[15], mt[15], tmp[1]); + store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]); + +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + std::size_t d2 = 1ULL << id2; + auto m = matrix; + std::size_t dsorted[] = {d0, d1, d2}; + permute_qubits_and_matrix(dsorted, 3, m); + + __m512d mm[16]; + for (unsigned b = 0; b < 8; ++b){ + for (unsigned r = 0; r < 2; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mm[b*2+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + __m512d mmt[16]; + for (unsigned b = 0; b < 8; ++b){ + for (unsigned r = 0; r < 2; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mmt[b*2+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 
< dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){ + kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){ + if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernel4.hpp b/src/Simulation/Native/src/external/avx512/kernel4.hpp new file mode 100644 index 00000000000..e956661a996 --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel4.hpp @@ -0,0 +1,207 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m, M const& mt) +{ + __m512d v[1]; + + v[0] = load1x4(&psi[I]); + + __m512d tmp[4] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()}; + + tmp[0] = fma(v[0], m[0], mt[0], tmp[0]); + tmp[1] = fma(v[0], m[1], 
mt[1], tmp[1]); + tmp[2] = fma(v[0], m[2], mt[2], tmp[2]); + tmp[3] = fma(v[0], m[3], mt[3], tmp[3]); + + v[0] = load1x4(&psi[I + d0]); + + tmp[0] = fma(v[0], m[4], mt[4], tmp[0]); + tmp[1] = fma(v[0], m[5], mt[5], tmp[1]); + tmp[2] = fma(v[0], m[6], mt[6], tmp[2]); + tmp[3] = fma(v[0], m[7], mt[7], tmp[3]); + + v[0] = load1x4(&psi[I + d1]); + + tmp[0] = fma(v[0], m[8], mt[8], tmp[0]); + tmp[1] = fma(v[0], m[9], mt[9], tmp[1]); + tmp[2] = fma(v[0], m[10], mt[10], tmp[2]); + tmp[3] = fma(v[0], m[11], mt[11], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d1]); + + tmp[0] = fma(v[0], m[12], mt[12], tmp[0]); + tmp[1] = fma(v[0], m[13], mt[13], tmp[1]); + tmp[2] = fma(v[0], m[14], mt[14], tmp[2]); + tmp[3] = fma(v[0], m[15], mt[15], tmp[3]); + + v[0] = load1x4(&psi[I + d2]); + + tmp[0] = fma(v[0], m[16], mt[16], tmp[0]); + tmp[1] = fma(v[0], m[17], mt[17], tmp[1]); + tmp[2] = fma(v[0], m[18], mt[18], tmp[2]); + tmp[3] = fma(v[0], m[19], mt[19], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d2]); + + tmp[0] = fma(v[0], m[20], mt[20], tmp[0]); + tmp[1] = fma(v[0], m[21], mt[21], tmp[1]); + tmp[2] = fma(v[0], m[22], mt[22], tmp[2]); + tmp[3] = fma(v[0], m[23], mt[23], tmp[3]); + + v[0] = load1x4(&psi[I + d1 + d2]); + + tmp[0] = fma(v[0], m[24], mt[24], tmp[0]); + tmp[1] = fma(v[0], m[25], mt[25], tmp[1]); + tmp[2] = fma(v[0], m[26], mt[26], tmp[2]); + tmp[3] = fma(v[0], m[27], mt[27], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d1 + d2]); + + tmp[0] = fma(v[0], m[28], mt[28], tmp[0]); + tmp[1] = fma(v[0], m[29], mt[29], tmp[1]); + tmp[2] = fma(v[0], m[30], mt[30], tmp[2]); + tmp[3] = fma(v[0], m[31], mt[31], tmp[3]); + + v[0] = load1x4(&psi[I + d3]); + + tmp[0] = fma(v[0], m[32], mt[32], tmp[0]); + tmp[1] = fma(v[0], m[33], mt[33], tmp[1]); + tmp[2] = fma(v[0], m[34], mt[34], tmp[2]); + tmp[3] = fma(v[0], m[35], mt[35], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d3]); + + tmp[0] = fma(v[0], m[36], mt[36], tmp[0]); + tmp[1] = fma(v[0], m[37], mt[37], tmp[1]); + tmp[2] = fma(v[0], 
m[38], mt[38], tmp[2]); + tmp[3] = fma(v[0], m[39], mt[39], tmp[3]); + + v[0] = load1x4(&psi[I + d1 + d3]); + + tmp[0] = fma(v[0], m[40], mt[40], tmp[0]); + tmp[1] = fma(v[0], m[41], mt[41], tmp[1]); + tmp[2] = fma(v[0], m[42], mt[42], tmp[2]); + tmp[3] = fma(v[0], m[43], mt[43], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d1 + d3]); + + tmp[0] = fma(v[0], m[44], mt[44], tmp[0]); + tmp[1] = fma(v[0], m[45], mt[45], tmp[1]); + tmp[2] = fma(v[0], m[46], mt[46], tmp[2]); + tmp[3] = fma(v[0], m[47], mt[47], tmp[3]); + + v[0] = load1x4(&psi[I + d2 + d3]); + + tmp[0] = fma(v[0], m[48], mt[48], tmp[0]); + tmp[1] = fma(v[0], m[49], mt[49], tmp[1]); + tmp[2] = fma(v[0], m[50], mt[50], tmp[2]); + tmp[3] = fma(v[0], m[51], mt[51], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d2 + d3]); + + tmp[0] = fma(v[0], m[52], mt[52], tmp[0]); + tmp[1] = fma(v[0], m[53], mt[53], tmp[1]); + tmp[2] = fma(v[0], m[54], mt[54], tmp[2]); + tmp[3] = fma(v[0], m[55], mt[55], tmp[3]); + + v[0] = load1x4(&psi[I + d1 + d2 + d3]); + + tmp[0] = fma(v[0], m[56], mt[56], tmp[0]); + tmp[1] = fma(v[0], m[57], mt[57], tmp[1]); + tmp[2] = fma(v[0], m[58], mt[58], tmp[2]); + tmp[3] = fma(v[0], m[59], mt[59], tmp[3]); + + v[0] = load1x4(&psi[I + d0 + d1 + d2 + d3]); + + tmp[0] = fma(v[0], m[60], mt[60], tmp[0]); + tmp[1] = fma(v[0], m[61], mt[61], tmp[1]); + tmp[2] = fma(v[0], m[62], mt[62], tmp[2]); + tmp[3] = fma(v[0], m[63], mt[63], tmp[3]); + store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]); + store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]); + store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + std::size_t d2 = 1ULL << id2; + std::size_t d3 = 1ULL << id3; + auto m = matrix; + std::size_t dsorted[] = {d0, d1, d2, d3}; + permute_qubits_and_matrix(dsorted, 4, m); + + __m512d mm[64]; + for (unsigned b = 0; b < 16; ++b){ + for (unsigned r = 0; r < 4; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mm[b*4+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + __m512d mmt[64]; + for (unsigned b = 0; b < 16; ++b){ + for (unsigned r = 0; r < 4; ++r){ + for (unsigned c = 0; c < 1; ++c){ + mmt[b*4+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){ + if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, 
mmt); + } + } + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernel5.hpp b/src/Simulation/Native/src/external/avx512/kernel5.hpp new file mode 100644 index 00000000000..ec1cdb918e6 --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel5.hpp @@ -0,0 +1,296 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m, M const& mt) +{ + __m512d v[2]; + + v[0] = load1x4(&psi[I]); + v[1] = load1x4(&psi[I + d0]); + + __m512d tmp[8] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()}; + + tmp[0] = fma(v[0], m[0], mt[0], fma(v[1], m[1], mt[1], tmp[0])); + tmp[1] = fma(v[0], m[2], mt[2], fma(v[1], m[3], mt[3], tmp[1])); + tmp[2] = fma(v[0], m[4], mt[4], fma(v[1], m[5], mt[5], tmp[2])); + tmp[3] = fma(v[0], m[6], mt[6], fma(v[1], m[7], mt[7], tmp[3])); + tmp[4] = fma(v[0], m[8], mt[8], fma(v[1], m[9], mt[9], tmp[4])); + tmp[5] = fma(v[0], m[10], mt[10], fma(v[1], m[11], mt[11], tmp[5])); + tmp[6] = fma(v[0], m[12], mt[12], fma(v[1], m[13], mt[13], tmp[6])); + tmp[7] = fma(v[0], m[14], mt[14], fma(v[1], m[15], mt[15], tmp[7])); + + v[0] = load1x4(&psi[I + d1]); + v[1] = 
load1x4(&psi[I + d0 + d1]); + + tmp[0] = fma(v[0], m[16], mt[16], fma(v[1], m[17], mt[17], tmp[0])); + tmp[1] = fma(v[0], m[18], mt[18], fma(v[1], m[19], mt[19], tmp[1])); + tmp[2] = fma(v[0], m[20], mt[20], fma(v[1], m[21], mt[21], tmp[2])); + tmp[3] = fma(v[0], m[22], mt[22], fma(v[1], m[23], mt[23], tmp[3])); + tmp[4] = fma(v[0], m[24], mt[24], fma(v[1], m[25], mt[25], tmp[4])); + tmp[5] = fma(v[0], m[26], mt[26], fma(v[1], m[27], mt[27], tmp[5])); + tmp[6] = fma(v[0], m[28], mt[28], fma(v[1], m[29], mt[29], tmp[6])); + tmp[7] = fma(v[0], m[30], mt[30], fma(v[1], m[31], mt[31], tmp[7])); + + v[0] = load1x4(&psi[I + d2]); + v[1] = load1x4(&psi[I + d0 + d2]); + + tmp[0] = fma(v[0], m[32], mt[32], fma(v[1], m[33], mt[33], tmp[0])); + tmp[1] = fma(v[0], m[34], mt[34], fma(v[1], m[35], mt[35], tmp[1])); + tmp[2] = fma(v[0], m[36], mt[36], fma(v[1], m[37], mt[37], tmp[2])); + tmp[3] = fma(v[0], m[38], mt[38], fma(v[1], m[39], mt[39], tmp[3])); + tmp[4] = fma(v[0], m[40], mt[40], fma(v[1], m[41], mt[41], tmp[4])); + tmp[5] = fma(v[0], m[42], mt[42], fma(v[1], m[43], mt[43], tmp[5])); + tmp[6] = fma(v[0], m[44], mt[44], fma(v[1], m[45], mt[45], tmp[6])); + tmp[7] = fma(v[0], m[46], mt[46], fma(v[1], m[47], mt[47], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d2]); + v[1] = load1x4(&psi[I + d0 + d1 + d2]); + + tmp[0] = fma(v[0], m[48], mt[48], fma(v[1], m[49], mt[49], tmp[0])); + tmp[1] = fma(v[0], m[50], mt[50], fma(v[1], m[51], mt[51], tmp[1])); + tmp[2] = fma(v[0], m[52], mt[52], fma(v[1], m[53], mt[53], tmp[2])); + tmp[3] = fma(v[0], m[54], mt[54], fma(v[1], m[55], mt[55], tmp[3])); + tmp[4] = fma(v[0], m[56], mt[56], fma(v[1], m[57], mt[57], tmp[4])); + tmp[5] = fma(v[0], m[58], mt[58], fma(v[1], m[59], mt[59], tmp[5])); + tmp[6] = fma(v[0], m[60], mt[60], fma(v[1], m[61], mt[61], tmp[6])); + tmp[7] = fma(v[0], m[62], mt[62], fma(v[1], m[63], mt[63], tmp[7])); + + v[0] = load1x4(&psi[I + d3]); + v[1] = load1x4(&psi[I + d0 + d3]); + + tmp[0] = fma(v[0], m[64], mt[64], 
fma(v[1], m[65], mt[65], tmp[0])); + tmp[1] = fma(v[0], m[66], mt[66], fma(v[1], m[67], mt[67], tmp[1])); + tmp[2] = fma(v[0], m[68], mt[68], fma(v[1], m[69], mt[69], tmp[2])); + tmp[3] = fma(v[0], m[70], mt[70], fma(v[1], m[71], mt[71], tmp[3])); + tmp[4] = fma(v[0], m[72], mt[72], fma(v[1], m[73], mt[73], tmp[4])); + tmp[5] = fma(v[0], m[74], mt[74], fma(v[1], m[75], mt[75], tmp[5])); + tmp[6] = fma(v[0], m[76], mt[76], fma(v[1], m[77], mt[77], tmp[6])); + tmp[7] = fma(v[0], m[78], mt[78], fma(v[1], m[79], mt[79], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d3]); + v[1] = load1x4(&psi[I + d0 + d1 + d3]); + + tmp[0] = fma(v[0], m[80], mt[80], fma(v[1], m[81], mt[81], tmp[0])); + tmp[1] = fma(v[0], m[82], mt[82], fma(v[1], m[83], mt[83], tmp[1])); + tmp[2] = fma(v[0], m[84], mt[84], fma(v[1], m[85], mt[85], tmp[2])); + tmp[3] = fma(v[0], m[86], mt[86], fma(v[1], m[87], mt[87], tmp[3])); + tmp[4] = fma(v[0], m[88], mt[88], fma(v[1], m[89], mt[89], tmp[4])); + tmp[5] = fma(v[0], m[90], mt[90], fma(v[1], m[91], mt[91], tmp[5])); + tmp[6] = fma(v[0], m[92], mt[92], fma(v[1], m[93], mt[93], tmp[6])); + tmp[7] = fma(v[0], m[94], mt[94], fma(v[1], m[95], mt[95], tmp[7])); + + v[0] = load1x4(&psi[I + d2 + d3]); + v[1] = load1x4(&psi[I + d0 + d2 + d3]); + + tmp[0] = fma(v[0], m[96], mt[96], fma(v[1], m[97], mt[97], tmp[0])); + tmp[1] = fma(v[0], m[98], mt[98], fma(v[1], m[99], mt[99], tmp[1])); + tmp[2] = fma(v[0], m[100], mt[100], fma(v[1], m[101], mt[101], tmp[2])); + tmp[3] = fma(v[0], m[102], mt[102], fma(v[1], m[103], mt[103], tmp[3])); + tmp[4] = fma(v[0], m[104], mt[104], fma(v[1], m[105], mt[105], tmp[4])); + tmp[5] = fma(v[0], m[106], mt[106], fma(v[1], m[107], mt[107], tmp[5])); + tmp[6] = fma(v[0], m[108], mt[108], fma(v[1], m[109], mt[109], tmp[6])); + tmp[7] = fma(v[0], m[110], mt[110], fma(v[1], m[111], mt[111], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d2 + d3]); + v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3]); + + tmp[0] = fma(v[0], m[112], mt[112], fma(v[1], 
m[113], mt[113], tmp[0])); + tmp[1] = fma(v[0], m[114], mt[114], fma(v[1], m[115], mt[115], tmp[1])); + tmp[2] = fma(v[0], m[116], mt[116], fma(v[1], m[117], mt[117], tmp[2])); + tmp[3] = fma(v[0], m[118], mt[118], fma(v[1], m[119], mt[119], tmp[3])); + tmp[4] = fma(v[0], m[120], mt[120], fma(v[1], m[121], mt[121], tmp[4])); + tmp[5] = fma(v[0], m[122], mt[122], fma(v[1], m[123], mt[123], tmp[5])); + tmp[6] = fma(v[0], m[124], mt[124], fma(v[1], m[125], mt[125], tmp[6])); + tmp[7] = fma(v[0], m[126], mt[126], fma(v[1], m[127], mt[127], tmp[7])); + + v[0] = load1x4(&psi[I + d4]); + v[1] = load1x4(&psi[I + d0 + d4]); + + tmp[0] = fma(v[0], m[128], mt[128], fma(v[1], m[129], mt[129], tmp[0])); + tmp[1] = fma(v[0], m[130], mt[130], fma(v[1], m[131], mt[131], tmp[1])); + tmp[2] = fma(v[0], m[132], mt[132], fma(v[1], m[133], mt[133], tmp[2])); + tmp[3] = fma(v[0], m[134], mt[134], fma(v[1], m[135], mt[135], tmp[3])); + tmp[4] = fma(v[0], m[136], mt[136], fma(v[1], m[137], mt[137], tmp[4])); + tmp[5] = fma(v[0], m[138], mt[138], fma(v[1], m[139], mt[139], tmp[5])); + tmp[6] = fma(v[0], m[140], mt[140], fma(v[1], m[141], mt[141], tmp[6])); + tmp[7] = fma(v[0], m[142], mt[142], fma(v[1], m[143], mt[143], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d4]); + v[1] = load1x4(&psi[I + d0 + d1 + d4]); + + tmp[0] = fma(v[0], m[144], mt[144], fma(v[1], m[145], mt[145], tmp[0])); + tmp[1] = fma(v[0], m[146], mt[146], fma(v[1], m[147], mt[147], tmp[1])); + tmp[2] = fma(v[0], m[148], mt[148], fma(v[1], m[149], mt[149], tmp[2])); + tmp[3] = fma(v[0], m[150], mt[150], fma(v[1], m[151], mt[151], tmp[3])); + tmp[4] = fma(v[0], m[152], mt[152], fma(v[1], m[153], mt[153], tmp[4])); + tmp[5] = fma(v[0], m[154], mt[154], fma(v[1], m[155], mt[155], tmp[5])); + tmp[6] = fma(v[0], m[156], mt[156], fma(v[1], m[157], mt[157], tmp[6])); + tmp[7] = fma(v[0], m[158], mt[158], fma(v[1], m[159], mt[159], tmp[7])); + + v[0] = load1x4(&psi[I + d2 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d4]); + + tmp[0] 
= fma(v[0], m[160], mt[160], fma(v[1], m[161], mt[161], tmp[0])); + tmp[1] = fma(v[0], m[162], mt[162], fma(v[1], m[163], mt[163], tmp[1])); + tmp[2] = fma(v[0], m[164], mt[164], fma(v[1], m[165], mt[165], tmp[2])); + tmp[3] = fma(v[0], m[166], mt[166], fma(v[1], m[167], mt[167], tmp[3])); + tmp[4] = fma(v[0], m[168], mt[168], fma(v[1], m[169], mt[169], tmp[4])); + tmp[5] = fma(v[0], m[170], mt[170], fma(v[1], m[171], mt[171], tmp[5])); + tmp[6] = fma(v[0], m[172], mt[172], fma(v[1], m[173], mt[173], tmp[6])); + tmp[7] = fma(v[0], m[174], mt[174], fma(v[1], m[175], mt[175], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d2 + d4]); + v[1] = load1x4(&psi[I + d0 + d1 + d2 + d4]); + + tmp[0] = fma(v[0], m[176], mt[176], fma(v[1], m[177], mt[177], tmp[0])); + tmp[1] = fma(v[0], m[178], mt[178], fma(v[1], m[179], mt[179], tmp[1])); + tmp[2] = fma(v[0], m[180], mt[180], fma(v[1], m[181], mt[181], tmp[2])); + tmp[3] = fma(v[0], m[182], mt[182], fma(v[1], m[183], mt[183], tmp[3])); + tmp[4] = fma(v[0], m[184], mt[184], fma(v[1], m[185], mt[185], tmp[4])); + tmp[5] = fma(v[0], m[186], mt[186], fma(v[1], m[187], mt[187], tmp[5])); + tmp[6] = fma(v[0], m[188], mt[188], fma(v[1], m[189], mt[189], tmp[6])); + tmp[7] = fma(v[0], m[190], mt[190], fma(v[1], m[191], mt[191], tmp[7])); + + v[0] = load1x4(&psi[I + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d3 + d4]); + + tmp[0] = fma(v[0], m[192], mt[192], fma(v[1], m[193], mt[193], tmp[0])); + tmp[1] = fma(v[0], m[194], mt[194], fma(v[1], m[195], mt[195], tmp[1])); + tmp[2] = fma(v[0], m[196], mt[196], fma(v[1], m[197], mt[197], tmp[2])); + tmp[3] = fma(v[0], m[198], mt[198], fma(v[1], m[199], mt[199], tmp[3])); + tmp[4] = fma(v[0], m[200], mt[200], fma(v[1], m[201], mt[201], tmp[4])); + tmp[5] = fma(v[0], m[202], mt[202], fma(v[1], m[203], mt[203], tmp[5])); + tmp[6] = fma(v[0], m[204], mt[204], fma(v[1], m[205], mt[205], tmp[6])); + tmp[7] = fma(v[0], m[206], mt[206], fma(v[1], m[207], mt[207], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + 
d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d1 + d3 + d4]); + + tmp[0] = fma(v[0], m[208], mt[208], fma(v[1], m[209], mt[209], tmp[0])); + tmp[1] = fma(v[0], m[210], mt[210], fma(v[1], m[211], mt[211], tmp[1])); + tmp[2] = fma(v[0], m[212], mt[212], fma(v[1], m[213], mt[213], tmp[2])); + tmp[3] = fma(v[0], m[214], mt[214], fma(v[1], m[215], mt[215], tmp[3])); + tmp[4] = fma(v[0], m[216], mt[216], fma(v[1], m[217], mt[217], tmp[4])); + tmp[5] = fma(v[0], m[218], mt[218], fma(v[1], m[219], mt[219], tmp[5])); + tmp[6] = fma(v[0], m[220], mt[220], fma(v[1], m[221], mt[221], tmp[6])); + tmp[7] = fma(v[0], m[222], mt[222], fma(v[1], m[223], mt[223], tmp[7])); + + v[0] = load1x4(&psi[I + d2 + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]); + + tmp[0] = fma(v[0], m[224], mt[224], fma(v[1], m[225], mt[225], tmp[0])); + tmp[1] = fma(v[0], m[226], mt[226], fma(v[1], m[227], mt[227], tmp[1])); + tmp[2] = fma(v[0], m[228], mt[228], fma(v[1], m[229], mt[229], tmp[2])); + tmp[3] = fma(v[0], m[230], mt[230], fma(v[1], m[231], mt[231], tmp[3])); + tmp[4] = fma(v[0], m[232], mt[232], fma(v[1], m[233], mt[233], tmp[4])); + tmp[5] = fma(v[0], m[234], mt[234], fma(v[1], m[235], mt[235], tmp[5])); + tmp[6] = fma(v[0], m[236], mt[236], fma(v[1], m[237], mt[237], tmp[6])); + tmp[7] = fma(v[0], m[238], mt[238], fma(v[1], m[239], mt[239], tmp[7])); + + v[0] = load1x4(&psi[I + d1 + d2 + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]); + + tmp[0] = fma(v[0], m[240], mt[240], fma(v[1], m[241], mt[241], tmp[0])); + tmp[1] = fma(v[0], m[242], mt[242], fma(v[1], m[243], mt[243], tmp[1])); + tmp[2] = fma(v[0], m[244], mt[244], fma(v[1], m[245], mt[245], tmp[2])); + tmp[3] = fma(v[0], m[246], mt[246], fma(v[1], m[247], mt[247], tmp[3])); + store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]); + 
store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]); + store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]); + tmp[4] = fma(v[0], m[248], mt[248], fma(v[1], m[249], mt[249], tmp[4])); + tmp[5] = fma(v[0], m[250], mt[250], fma(v[1], m[251], mt[251], tmp[5])); + tmp[6] = fma(v[0], m[252], mt[252], fma(v[1], m[253], mt[253], tmp[6])); + tmp[7] = fma(v[0], m[254], mt[254], fma(v[1], m[255], mt[255], tmp[7])); + store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]); + store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]); + store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + std::size_t d2 = 1ULL << id2; + std::size_t d3 = 1ULL << id3; + std::size_t d4 = 1ULL << id4; + auto m = matrix; + std::size_t dsorted[] = {d0, d1, d2, d3, d4}; + permute_qubits_and_matrix(dsorted, 5, m); + + __m512d mm[256]; + for (unsigned b = 0; b < 16; ++b){ + for (unsigned r = 0; r < 8; ++r){ + for (unsigned c = 0; c < 2; ++c){ + mm[b*16+r*2+c] = loada(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]); + } + } + } + + __m512d mmt[256]; + for (unsigned b = 0; b < 16; ++b){ + for (unsigned r = 0; r < 8; ++r){ + for (unsigned c = 0; c < 2; ++c){ + mmt[b*16+r*2+c] = loadbm(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * 
dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){ + if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } + } + } + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernel6.hpp b/src/Simulation/Native/src/external/avx512/kernel6.hpp new file mode 100644 index 00000000000..77a6a89465e --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel6.hpp @@ -0,0 +1,252 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, M const& m) +{ + __m512d v[4]; + + v[0] = load1x4(&psi[I]); + v[1] = load1x4(&psi[I + d0]); + v[2] = load1x4(&psi[I + d1]); + v[3] = load1x4(&psi[I + d0 + d1]); + + __m512d tmp[16] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()}; + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], 
m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2]); + v[1] = load1x4(&psi[I + d0 + d2]); + v[2] = load1x4(&psi[I + d1 + d2]); + v[3] = load1x4(&psi[I + d0 + d1 + d2]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[64 + i * 4 + 0], fma(v[1], m[64 + i * 4 + 1], fma(v[2], m[64 + i * 4 + 2], fma(v[3], m[64 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3]); + v[1] = load1x4(&psi[I + d0 + d3]); + v[2] = load1x4(&psi[I + d1 + d3]); + v[3] = load1x4(&psi[I + d0 + d1 + d3]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3]); + v[1] = load1x4(&psi[I + d0 + d2 + d3]); + v[2] = load1x4(&psi[I + d1 + d2 + d3]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[192 + i * 4 + 0], fma(v[1], m[192 + i * 4 + 1], fma(v[2], m[192 + i * 4 + 2], fma(v[3], m[192 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4]); + v[1] = load1x4(&psi[I + d0 + d4]); + v[2] = load1x4(&psi[I + d1 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d4]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d4]); + v[2] = load1x4(&psi[I + d1 + d2 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[320 + i * 4 + 0], fma(v[1], m[320 + i * 4 + 1], fma(v[2], m[320 + i * 4 + 2], fma(v[3], m[320 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d3 + d4]); + v[2] = load1x4(&psi[I + d1 + d3 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]); + for (unsigned i = 
0; i < 16; ++i){ + tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[448 + i * 4 + 0], fma(v[1], m[448 + i * 4 + 1], fma(v[2], m[448 + i * 4 + 2], fma(v[3], m[448 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d5]); + v[1] = load1x4(&psi[I + d0 + d5]); + v[2] = load1x4(&psi[I + d1 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[576 + i * 4 + 0], fma(v[1], m[576 + i * 4 + 1], fma(v[2], m[576 + i * 4 + 2], fma(v[3], m[576 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d5]); + v[1] = load1x4(&psi[I + d0 + d3 + d5]); + v[2] = load1x4(&psi[I + d1 + d3 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[704 + i * 4 + 0], fma(v[1], m[704 + i * 4 + 1], fma(v[2], m[704 + i * 4 + 2], fma(v[3], m[704 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4 + 
d5]); + v[1] = load1x4(&psi[I + d0 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[832 + i * 4 + 0], fma(v[1], m[832 + i * 4 + 1], fma(v[2], m[832 + i * 4 + 2], fma(v[3], m[832 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]); + for (unsigned i = 0; i < 16; ++i){ + tmp[i] = fma(v[0], m[960 + i * 4 + 0], fma(v[1], m[960 + i * 4 + 1], fma(v[2], m[960 + i * 4 + 2], fma(v[3], m[960 + i * 4 + 3], tmp[i])))); + } + + store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]); + store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]); + store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]); + 
store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]); + store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]); + store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]); + store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]); + store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]); + store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]); + store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]); + store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]); + store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + std::size_t d2 = 1ULL << id2; + std::size_t d3 = 1ULL << id3; + std::size_t d4 = 1ULL << id4; + std::size_t d5 = 1ULL << id5; + auto m = matrix; + std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5}; + permute_qubits_and_matrix(dsorted, 6, m); + + __m512d mm[1024]; + for (unsigned b = 0; b < 16; ++b){ + for (unsigned r = 0; r < 16; ++r){ + for (unsigned c = 0; c < 4; ++c){ + mm[b*64+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){ + for (std::size_t i6 = 0; i6 < dsorted[5]; ++i6){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } + } + } + } + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){ + for (std::size_t i6 = 0; i6 < 
dsorted[5]; ++i6){ + if (((i0 + i1 + i2 + i3 + i4 + i5 + i6)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } + } + } + } + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernel7.hpp b/src/Simulation/Native/src/external/avx512/kernel7.hpp new file mode 100644 index 00000000000..8e60b76cff2 --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernel7.hpp @@ -0,0 +1,417 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +template +inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, std::size_t d6, M const& m) +{ + __m512d v[4]; + + v[0] = load1x4(&psi[I]); + v[1] = load1x4(&psi[I + d0]); + v[2] = load1x4(&psi[I + d1]); + v[3] = load1x4(&psi[I + d0 + d1]); + + __m512d tmp[32] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), 
_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()}; + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2]); + v[1] = load1x4(&psi[I + d0 + d2]); + v[2] = load1x4(&psi[I + d1 + d2]); + v[3] = load1x4(&psi[I + d0 + d1 + d2]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3]); + v[1] = load1x4(&psi[I + d0 + d3]); + v[2] = load1x4(&psi[I + d1 + d3]); + v[3] = load1x4(&psi[I + d0 + d1 + d3]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3]); + v[1] = load1x4(&psi[I + d0 + d2 + d3]); + v[2] = load1x4(&psi[I + d1 + d2 + d3]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4]); + v[1] = load1x4(&psi[I + d0 + d4]); + v[2] = load1x4(&psi[I + d1 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d4]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d4]); + v[2] = load1x4(&psi[I + d1 + d2 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]); + for 
(unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d3 + d4]); + v[2] = load1x4(&psi[I + d1 + d3 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d5]); + v[1] = load1x4(&psi[I + d0 + d5]); + v[2] = load1x4(&psi[I + d1 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1024 + i * 4 + 0], fma(v[1], m[1024 + i * 4 + 1], fma(v[2], m[1024 + i * 4 + 2], fma(v[3], m[1024 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1152 + i * 4 + 0], fma(v[1], m[1152 + i * 4 + 1], fma(v[2], m[1152 + i * 4 + 2], fma(v[3], m[1152 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d5]); + v[1] = load1x4(&psi[I + d0 + d3 + d5]); + v[2] = load1x4(&psi[I + d1 + d3 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1280 + i * 4 + 0], fma(v[1], m[1280 + i * 4 + 1], fma(v[2], m[1280 + i * 4 + 2], fma(v[3], m[1280 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + 
d2 + d3 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1408 + i * 4 + 0], fma(v[1], m[1408 + i * 4 + 1], fma(v[2], m[1408 + i * 4 + 2], fma(v[3], m[1408 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1536 + i * 4 + 0], fma(v[1], m[1536 + i * 4 + 1], fma(v[2], m[1536 + i * 4 + 2], fma(v[3], m[1536 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1664 + i * 4 + 0], fma(v[1], m[1664 + i * 4 + 1], fma(v[2], m[1664 + i * 4 + 2], fma(v[3], m[1664 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1792 + i * 4 + 0], fma(v[1], m[1792 + i * 4 + 1], fma(v[2], m[1792 + i * 4 + 2], fma(v[3], m[1792 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[1920 + i * 4 + 0], fma(v[1], m[1920 + i * 4 + 1], fma(v[2], m[1920 + i * 4 + 2], fma(v[3], m[1920 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d6]); + v[1] = load1x4(&psi[I + d0 + d6]); + v[2] = load1x4(&psi[I + d1 + d6]); + v[3] = 
load1x4(&psi[I + d0 + d1 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2048 + i * 4 + 0], fma(v[1], m[2048 + i * 4 + 1], fma(v[2], m[2048 + i * 4 + 2], fma(v[3], m[2048 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2176 + i * 4 + 0], fma(v[1], m[2176 + i * 4 + 1], fma(v[2], m[2176 + i * 4 + 2], fma(v[3], m[2176 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d6]); + v[1] = load1x4(&psi[I + d0 + d3 + d6]); + v[2] = load1x4(&psi[I + d1 + d3 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2304 + i * 4 + 0], fma(v[1], m[2304 + i * 4 + 1], fma(v[2], m[2304 + i * 4 + 2], fma(v[3], m[2304 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2432 + i * 4 + 0], fma(v[1], m[2432 + i * 4 + 1], fma(v[2], m[2432 + i * 4 + 2], fma(v[3], m[2432 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4 + d6]); + v[1] = load1x4(&psi[I + d0 + d4 + d6]); + v[2] = load1x4(&psi[I + d1 + d4 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d4 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2560 + i * 4 + 0], fma(v[1], m[2560 + i * 4 + 1], fma(v[2], m[2560 + i * 4 + 2], fma(v[3], m[2560 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d4 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d4 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2688 + i * 4 + 0], fma(v[1], m[2688 + i * 4 + 1], fma(v[2], m[2688 
+ i * 4 + 2], fma(v[3], m[2688 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4 + d6]); + v[1] = load1x4(&psi[I + d0 + d3 + d4 + d6]); + v[2] = load1x4(&psi[I + d1 + d3 + d4 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2816 + i * 4 + 0], fma(v[1], m[2816 + i * 4 + 1], fma(v[2], m[2816 + i * 4 + 2], fma(v[3], m[2816 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[2944 + i * 4 + 0], fma(v[1], m[2944 + i * 4 + 1], fma(v[2], m[2944 + i * 4 + 2], fma(v[3], m[2944 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3072 + i * 4 + 0], fma(v[1], m[3072 + i * 4 + 1], fma(v[2], m[3072 + i * 4 + 2], fma(v[3], m[3072 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3200 + i * 4 + 0], fma(v[1], m[3200 + i * 4 + 1], fma(v[2], m[3200 + i * 4 + 2], fma(v[3], m[3200 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d3 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d3 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3328 + i * 4 + 0], fma(v[1], m[3328 + i * 4 + 1], fma(v[2], m[3328 + i * 4 + 2], fma(v[3], m[3328 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 
+ d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3456 + i * 4 + 0], fma(v[1], m[3456 + i * 4 + 1], fma(v[2], m[3456 + i * 4 + 2], fma(v[3], m[3456 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d4 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d4 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d4 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3584 + i * 4 + 0], fma(v[1], m[3584 + i * 4 + 1], fma(v[2], m[3584 + i * 4 + 2], fma(v[3], m[3584 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d4 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3712 + i * 4 + 0], fma(v[1], m[3712 + i * 4 + 1], fma(v[2], m[3712 + i * 4 + 2], fma(v[3], m[3712 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d3 + d4 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3840 + i * 4 + 0], fma(v[1], m[3840 + i * 4 + 1], fma(v[2], m[3840 + i * 4 + 2], fma(v[3], m[3840 + i * 4 + 3], tmp[i])))); + } + + + v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5 + d6]); + v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5 + d6]); + v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5 + d6]); + v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6]); + for (unsigned i = 0; i < 32; ++i){ + tmp[i] = fma(v[0], m[3968 + i * 4 + 0], fma(v[1], m[3968 + i * 4 + 1], fma(v[2], m[3968 + i * 4 + 2], fma(v[3], m[3968 + i * 4 + 3], tmp[i])))); + } + + store((double*)&psi[I + d0 + d1], 
(double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]); + store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]); + store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]); + store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]); + store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]); + store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]); + store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]); + store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]); + store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]); + store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]); + store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]); + store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]); + 
store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]); + store((double*)&psi[I + d0 + d1 + d6], (double*)&psi[I + d1 + d6], (double*)&psi[I + d0 + d6], (double*)&psi[I + d6], tmp[16]); + store((double*)&psi[I + d0 + d1 + d2 + d6], (double*)&psi[I + d1 + d2 + d6], (double*)&psi[I + d0 + d2 + d6], (double*)&psi[I + d2 + d6], tmp[17]); + store((double*)&psi[I + d0 + d1 + d3 + d6], (double*)&psi[I + d1 + d3 + d6], (double*)&psi[I + d0 + d3 + d6], (double*)&psi[I + d3 + d6], tmp[18]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d6], (double*)&psi[I + d1 + d2 + d3 + d6], (double*)&psi[I + d0 + d2 + d3 + d6], (double*)&psi[I + d2 + d3 + d6], tmp[19]); + store((double*)&psi[I + d0 + d1 + d4 + d6], (double*)&psi[I + d1 + d4 + d6], (double*)&psi[I + d0 + d4 + d6], (double*)&psi[I + d4 + d6], tmp[20]); + store((double*)&psi[I + d0 + d1 + d2 + d4 + d6], (double*)&psi[I + d1 + d2 + d4 + d6], (double*)&psi[I + d0 + d2 + d4 + d6], (double*)&psi[I + d2 + d4 + d6], tmp[21]); + store((double*)&psi[I + d0 + d1 + d3 + d4 + d6], (double*)&psi[I + d1 + d3 + d4 + d6], (double*)&psi[I + d0 + d3 + d4 + d6], (double*)&psi[I + d3 + d4 + d6], tmp[22]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d6], (double*)&psi[I + d2 + d3 + d4 + d6], tmp[23]); + store((double*)&psi[I + d0 + d1 + d5 + d6], (double*)&psi[I + d1 + d5 + d6], (double*)&psi[I + d0 + d5 + d6], (double*)&psi[I + d5 + d6], tmp[24]); + store((double*)&psi[I + d0 + d1 + d2 + d5 + d6], (double*)&psi[I + d1 + d2 + d5 + d6], (double*)&psi[I + d0 + d2 + d5 + d6], (double*)&psi[I + d2 + d5 + d6], tmp[25]); + store((double*)&psi[I + d0 + d1 + d3 + 
d5 + d6], (double*)&psi[I + d1 + d3 + d5 + d6], (double*)&psi[I + d0 + d3 + d5 + d6], (double*)&psi[I + d3 + d5 + d6], tmp[26]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d5 + d6], (double*)&psi[I + d2 + d3 + d5 + d6], tmp[27]); + store((double*)&psi[I + d0 + d1 + d4 + d5 + d6], (double*)&psi[I + d1 + d4 + d5 + d6], (double*)&psi[I + d0 + d4 + d5 + d6], (double*)&psi[I + d4 + d5 + d6], tmp[28]); + store((double*)&psi[I + d0 + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d4 + d5 + d6], (double*)&psi[I + d2 + d4 + d5 + d6], tmp[29]); + store((double*)&psi[I + d0 + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d3 + d4 + d5 + d6], (double*)&psi[I + d3 + d4 + d5 + d6], tmp[30]); + store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d2 + d3 + d4 + d5 + d6], tmp[31]); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1ULL << id0; + std::size_t d1 = 1ULL << id1; + std::size_t d2 = 1ULL << id2; + std::size_t d3 = 1ULL << id3; + std::size_t d4 = 1ULL << id4; + std::size_t d5 = 1ULL << id5; + std::size_t d6 = 1ULL << id6; + auto m = matrix; + std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5, d6}; + permute_qubits_and_matrix(dsorted, 7, m); + + __m512d mm[4096]; + for (unsigned b = 0; b < 32; ++b){ + for (unsigned r = 0; r < 32; ++r){ + for (unsigned c = 0; c < 4; ++c){ + mm[b*128+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]); + } + } + } + + +#ifndef _MSC_VER + if (ctrlmask == 0){ + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){ + for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){ + for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } + } + } + } + } + } + } + } + } + else{ + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){ + for (std::size_t i4 = 0; 
i4 < dsorted[3]; i4 += 2 * dsorted[4]){ + for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){ + for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){ + for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){ + if (((i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } + } + } + } + } + } + } + } + } +#else + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5] + dsorted[6]; + + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm); + } +#endif +} + diff --git a/src/Simulation/Native/src/external/avx512/kernels.hpp b/src/Simulation/Native/src/external/avx512/kernels.hpp new file mode 100644 index 00000000000..d5a056663bd --- /dev/null +++ b/src/Simulation/Native/src/external/avx512/kernels.hpp @@ -0,0 +1,31 @@ +// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger + +#ifndef KERNELS_HPP_ +#define KERNELS_HPP_ + +#include +#include +#include +#include +#include +#include +#include "../cintrin.hpp" +#include "util/alignedalloc.hpp" + +#define LOOP_COLLAPSE1 2 +#define LOOP_COLLAPSE2 3 +#define LOOP_COLLAPSE3 4 +#define LOOP_COLLAPSE4 5 +#define LOOP_COLLAPSE5 6 +#define LOOP_COLLAPSE6 7 +#define LOOP_COLLAPSE7 8 + +#include "kernel1.hpp" +#include "kernel2.hpp" +#include "kernel3.hpp" +#include "kernel4.hpp" +#include "kernel5.hpp" +#include 
"kernel6.hpp" +#include "kernel7.hpp" + +#endif diff --git a/src/Simulation/Native/src/external/cintrin.hpp b/src/Simulation/Native/src/external/cintrin.hpp index 8034408c1e8..7e9eca72956 100644 --- a/src/Simulation/Native/src/external/cintrin.hpp +++ b/src/Simulation/Native/src/external/cintrin.hpp @@ -35,7 +35,15 @@ inline void permute_qubits_and_matrix(I *delta_list, unsigned n, M & matrix){ } inline std::complex fma(std::complex const& c1, std::complex const& c2, std::complex const& a){ - return c1*c2 + a;
+    // Expanded complex FMA (c1 * c2 + a) with direct component access (much faster).
+    // std::complex<double> may be viewed as double[2] {real, imag} per the C++ standard,
+    // so one portable path replaces the compiler-specific access to MSVC's private _Val.
+    double const (&x)[2] = reinterpret_cast<double const (&)[2]>(c1);
+    double const (&y)[2] = reinterpret_cast<double const (&)[2]>(c2);
+    double const (&z)[2] = reinterpret_cast<double const (&)[2]>(a);
+    double r = (x[0] * y[0] - x[1] * y[1]) + z[0];
+    double i = (x[0] * y[1] + x[1] * y[0]) + z[1];
+    return std::complex<double>(r, i);
 } inline __m256d fma(__m256d const& c1, __m256d const& c2, __m256d const& a){ diff --git a/src/Simulation/Native/src/external/fused.hpp b/src/Simulation/Native/src/external/fused.hpp index 6c1137ceb3b..2b170461e41 100644 --- a/src/Simulation/Native/src/external/fused.hpp +++ b/src/Simulation/Native/src/external/fused.hpp @@ -5,6 +5,8 @@ #include "config.hpp" #include "external/fusion.hpp" #include "simulator/kernels.hpp" +#include +#include #ifndef HAVE_INTRINSICS #include "external/nointrin/kernels.hpp" @@ -15,7 +17,7 @@ #ifdef HAVE_FMA #include "external/avx2/kernels.hpp" #else -#include "external/avx2/kernels.hpp" +#include "external/avx/kernels.hpp" #endif #endif #endif @@ -29,15 +31,35 @@ namespace SIMULATOR class Fused { + public: - Fused() {} + Fused() { + wfnCapacity = 0u; // used to optimize runtime parameters + maxFusedSpan = 4; // determine span to use at runtime + maxFusedDepth = 999; // determine max depth to use at runtime + } inline void reset() { fusedgates = Fusion(); } + const Fusion& get_fusedgates() const { + return 
fusedgates; + } + void set_fusedgates(Fusion newFusedGates) const { + fusedgates = newFusedGates; + } + + const int maxSpan() const { + return maxFusedSpan; + } + + const int maxDepth() const { + return maxFusedDepth; + } + template void flush(std::vector& wfn) const { @@ -46,9 +68,9 @@ class Fused Fusion::Matrix m; Fusion::IndexVector qs, cs; - + fusedgates.perform_fusion(m, qs, cs); - + std::size_t cmask = 0; for (auto c : cs) cmask |= (1ull << c); @@ -70,23 +92,19 @@ class Fused case 5: ::kernel(wfn, qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask); break; + case 6: + ::kernel(wfn, qs[5], qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask); + break; + case 7: + ::kernel(wfn, qs[6], qs[5], qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask); + break; } - + fusedgates = Fusion(); } - template - bool subsytemwavefunction(std::vector& wfn, - std::vector const& qs, - std::vector& qubitswfn, - double tolerance) - { - flush(wfn); // we have to flush before we can extract the state - return kernels::subsytemwavefunction(wfn, qs, qubitswfn, tolerance); - } - template - Fusion::Matrix convertMatrix(M const& m) + Fusion::Matrix convertMatrix(M const& m) const { Fusion::Matrix mat(2, Fusion::Matrix::value_type(2)); for (unsigned i = 0; i < 2; ++i) @@ -96,30 +114,86 @@ class Fused } template - void apply_controlled(std::vector& wfn, M const& mat, std::vector const& cs, unsigned q) + void apply_controlled(std::vector& wfn, M const& mat, std::vector const& cs, unsigned q) const { - if (fusedgates.num_qubits()+fusedgates.num_controls()+cs.size()>8 || fusedgates.size() > 15) - flush(wfn); - Fusion newgates = fusedgates; - newgates.insert(convertMatrix(mat), std::vector(1, q), cs); - - if (newgates.num_qubits() > 4) - { - flush(wfn); - fusedgates.insert(convertMatrix(mat), std::vector(1, q), cs); - } - else - fusedgates = newgates; + Fusion::IndexVector qs = std::vector(1, q); + fusedgates.insert(convertMatrix(mat), qs, cs); } - + template - void apply(std::vector& wfn, M const& mat, unsigned q) + 
void apply(std::vector& wfn, M const& mat, unsigned q) const { std::vector cs; apply_controlled(wfn, mat, cs, q); } + + template + bool shouldFlush(std::vector& wfn, std::vector const& cs, unsigned q) + { + // Major runtime logic change here + + // Have to update capacity as the WFN grows + if (wfnCapacity != wfn.capacity()) { + wfnCapacity = wfn.capacity(); + char* envNT = NULL; + size_t len; +#ifdef _MSC_VER + errno_t err = _dupenv_s(&envNT, &len, "OMP_NUM_THREADS"); +#else + envNT = getenv("OMP_NUM_THREADS"); +#endif + if (envNT == NULL) { // If the user didn't force the number of threads, make an intelligent guess + int nMaxThrds = std::thread::hardware_concurrency(); // Logical HW threads + if (nMaxThrds > 4) nMaxThrds/= 2; // Assume we have hyperthreading (no consistent/concise way to do this) + if (wfnCapacity < 1u << 20) { + if (nMaxThrds > 8) nMaxThrds = 8; // Small problem, never use too many + else if (nMaxThrds > 3) nMaxThrds = 3; // Small problem on a small machine + } + omp_set_num_threads(nMaxThrds); + } + + // Set the max fused depth + char* envFD = NULL; + maxFusedDepth = 999; +#ifdef _MSC_VER + err = _dupenv_s(&envFD, &len, "QDK_SIM_FUSEDEPTH"); + if (envFD != NULL && len > 0) { + maxFusedDepth = atoi(envFD); + } +#else + envFD = getenv("QDK_SIM_FUSEDEPTH"); + if (envFD != NULL && strlen(envFD) > 0) { + maxFusedDepth = atoi(envFD); + } +#endif + // Set the fused span limit + char* envFS = NULL; + maxFusedSpan = 4; // General sweet spot + if (wfnCapacity < 1u << 20) maxFusedSpan = 2; // Don't pre-fuse small problems +#ifdef _MSC_VER + err = _dupenv_s(&envFS, &len, "QDK_SIM_FUSESPAN"); + if (envFS != NULL && len > 0) { + maxFusedSpan = atoi(envFS); + if (maxFusedSpan > 7) maxFusedSpan = 7; // Highest we can handle + } +#else + envFS = getenv("QDK_SIM_FUSESPAN"); + if (envFS != NULL && strlen(envFS) > 0) { + maxFusedSpan = atoi(envFS); + } +#endif + + } + return false; + } + private: mutable Fusion fusedgates; + + //: New runtime optimizatin settings 
+ mutable size_t wfnCapacity; + mutable int maxFusedSpan; + mutable int maxFusedDepth; }; diff --git a/src/Simulation/Native/src/external/fusion.hpp b/src/Simulation/Native/src/external/fusion.hpp index 2e6b4bdb50d..88140349dba 100644 --- a/src/Simulation/Native/src/external/fusion.hpp +++ b/src/Simulation/Native/src/external/fusion.hpp @@ -10,6 +10,7 @@ #include #include #include "util/alignedalloc.hpp" +#include class Item{ public: @@ -17,14 +18,20 @@ class Item{ using IndexVector = std::vector; using Complex = std::complex; using Matrix = std::vector>>; - Item(Matrix mat, IndexVector idx) : mat_(mat), idx_(idx) {} + Item(Matrix mat, IndexVector idx) : mat_(std::move(mat)), idx_(idx) {} Matrix& get_matrix() { return mat_; } - IndexVector& get_indices() { return idx_; } + IndexVector& get_indices() const { return idx_; } + void remap_idx(std::unordered_map elemDict) const { + for (size_t i = 0; i < idx_.size(); i++) { + idx_[i] = elemDict[idx_[i]]; + } + } private: Matrix mat_; - IndexVector idx_; + mutable IndexVector idx_; }; +// Class handling the fusion of gates class Fusion{ public: using Index = unsigned; @@ -37,7 +44,7 @@ class Fusion{ Fusion() : global_factor_(1.) 
{} Index num_qubits() const { - return static_cast(set_.size()); + return static_cast(target_set_.size()); } Index num_controls() const { @@ -58,9 +65,58 @@ class Fusion{ handle_controls(empty_matrix, empty_vec, {}); // remove all current control qubits (this is a GLOBAL factor) } - void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){ + const IndexSet& get_target_set() const { + return target_set_; + } + + const ItemVector& get_items() const { + return items_; + } + + const IndexSet& get_ctrl_set() const { + return ctrl_set_; + } + + const Complex& get_global_factor() const { + return global_factor_; + } + + static void remap_qubits(std::set& qubits, const std::unordered_map& mapFromOldLocToNewLoc) { + std::set tempSet; + for (unsigned elem : qubits) { + if (mapFromOldLocToNewLoc.find(elem) != mapFromOldLocToNewLoc.end()) { + tempSet.insert(mapFromOldLocToNewLoc.at(elem)); + } + } + qubits.swap(tempSet); + } + + void remap_target_set(const std::unordered_map& mapFromOldLocToNewLoc) const { + remap_qubits(target_set_, mapFromOldLocToNewLoc); + } + + void remap_ctrl_set(const std::unordered_map& mapFromOldLocToNewLoc) const { + remap_qubits(ctrl_set_, mapFromOldLocToNewLoc); + } + + void set_items(ItemVector&& newItems) const { + items_.swap(newItems); + } + + // This saves a class instance create/destroy on every gate insert + // Need a quick way to decide if we're going to grow too wide + int predict(IndexVector index_list, IndexVector const& ctrl_list = {}) { + int cnt = num_qubits() + num_controls(); + for (auto idx : index_list) + if (target_set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++; + for (auto idx : ctrl_list) + if (target_set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++; + return cnt; + } + + void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}) const { for (auto idx : index_list) - set_.emplace(idx); + target_set_.emplace(idx); if (global_factor_ != 1. 
&& ctrl_list.size() > 0){ assert(ctrl_set_.size() == 0); @@ -73,7 +129,7 @@ class Fusion{ } void get_indices(IndexVector &indices) const{ - for (auto idx : set_) + for (auto idx : target_set_) indices.push_back(idx); } @@ -81,7 +137,7 @@ class Fusion{ if (global_factor_ != 1.) assert(ctrl_set_.size() == 0); - for (auto idx : set_) + for (auto idx : target_set_) index_list.push_back(idx); unsigned N = num_qubits(); @@ -97,7 +153,8 @@ class Fusion{ for (unsigned i = 0; i < idx.size(); ++i) idx2mat[i] = static_cast(((std::equal_range(index_list.begin(), index_list.end(), idx[i])).first - index_list.begin())); - for (std::size_t k = 0; k < (1ULL< oldcol(1ULL< need to be removed from the global mask and the controls incorporated into the old // commands (the ones already in the list). @@ -154,7 +211,7 @@ class Fusion{ if (ctrl_set_.count(ctrlIdx) == 0){ // need to either add it to the list or to the command if (items_.size() > 0){ // add it to the command add_controls(matrix, indexList, {ctrlIdx}); - set_.insert(ctrlIdx); + target_set_.insert(ctrlIdx); } else // add it to the list ctrl_set_.emplace(ctrlIdx); @@ -170,17 +227,17 @@ class Fusion{ for (auto idx : unhandled_ctrl){ new_ctrls.push_back(idx); ctrl_set_.erase(idx); - set_.insert(idx); + target_set_.insert(idx); } for (auto &item : items_) add_controls(item.get_matrix(), item.get_indices(), new_ctrls); } } - IndexSet set_; - ItemVector items_; - IndexSet ctrl_set_; - Complex global_factor_; + mutable IndexSet target_set_; //set of qubits being acted on + mutable ItemVector items_; //queue if gates to be fused + mutable IndexSet ctrl_set_; //set of controls + mutable Complex global_factor_; }; #endif diff --git a/src/Simulation/Native/src/external/nointrin/kernel1.hpp b/src/Simulation/Native/src/external/nointrin/kernel1.hpp index a6b624d3328..5173b58d8a8 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel1.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel1.hpp @@ -43,7 +43,7 @@ void 
kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm); @@ -51,7 +51,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) @@ -60,20 +60,20 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } #else - std::intptr_t zero = 0; - std::intptr_t dmask = dsorted[0]; + std::intptr_t zero = 0; + std::intptr_t dmask = dsorted[0]; - if (ctrlmask == 0){ - #pragma omp parallel for schedule(static) - for (std::intptr_t i = 0; i < static_cast(n); ++i) - if ((i & dmask) == zero) - kernel_core(psi, i, dsorted[0], mm); - } else { - #pragma omp parallel for schedule(static) - for (std::intptr_t i = 0; i < static_cast(n); ++i) - if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) - kernel_core(psi, i, dsorted[0], mm); - } + if (ctrlmask == 0){ + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & dmask) == zero) + kernel_core(psi, i, dsorted[0], mm); + } else { + #pragma omp parallel for schedule(static) + for (std::intptr_t i = 0; i < static_cast(n); ++i) + if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero) + kernel_core(psi, i, dsorted[0], mm); + } #endif } diff --git a/src/Simulation/Native/src/external/nointrin/kernel2.hpp b/src/Simulation/Native/src/external/nointrin/kernel2.hpp index 43be1a33440..dcb47fe7f48 100644 --- 
a/src/Simulation/Native/src/external/nointrin/kernel2.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel2.hpp @@ -64,7 +64,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -74,7 +74,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/nointrin/kernel3.hpp b/src/Simulation/Native/src/external/nointrin/kernel3.hpp index da44778220a..1019845187a 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel3.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel3.hpp @@ -129,7 +129,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -141,7 +141,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t 
i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/nointrin/kernel4.hpp b/src/Simulation/Native/src/external/nointrin/kernel4.hpp index 6bc3b303e6e..46d33620e74 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel4.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel4.hpp @@ -354,7 +354,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -368,7 +368,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/nointrin/kernel5.hpp b/src/Simulation/Native/src/external/nointrin/kernel5.hpp index 13d363f7df3..08657104779 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel5.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel5.hpp @@ -643,7 +643,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for 
(std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -659,7 +659,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/nointrin/kernel6.hpp b/src/Simulation/Native/src/external/nointrin/kernel6.hpp index 893bf4e35d5..7f8ea4741a3 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel6.hpp +++ b/src/Simulation/Native/src/external/nointrin/kernel6.hpp @@ -244,7 +244,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -262,7 +262,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/nointrin/kernel7.hpp b/src/Simulation/Native/src/external/nointrin/kernel7.hpp index a9537bb61d1..fc8401da66f 100644 --- a/src/Simulation/Native/src/external/nointrin/kernel7.hpp +++ 
b/src/Simulation/Native/src/external/nointrin/kernel7.hpp @@ -453,7 +453,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) proc_bind(spread) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -473,7 +473,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/simulator/CMakeLists.txt b/src/Simulation/Native/src/simulator/CMakeLists.txt index d048feac129..b1caf02fc05 100644 --- a/src/Simulation/Native/src/simulator/CMakeLists.txt +++ b/src/Simulation/Native/src/simulator/CMakeLists.txt @@ -4,9 +4,12 @@ add_executable(local_test local_test.cpp) add_executable(factory_test factory_test.cpp) add_executable(capi_test capi_test.cpp) +add_executable(dbw_test dbw_test.cpp) target_link_libraries(factory_test Microsoft.Quantum.Simulator.Runtime) target_link_libraries(local_test Microsoft.Quantum.Simulator.Runtime) target_link_libraries(capi_test Microsoft.Quantum.Simulator.Runtime) +target_link_libraries(dbw_test Microsoft.Quantum.Simulator.Runtime) add_test(NAME factory_test COMMAND ./factory_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) add_test(NAME local_test COMMAND ./local_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) add_test(NAME capi_test COMMAND ./capi_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +add_test(NAME dbw_test COMMAND ./dbw_test 
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/src/Simulation/Native/src/simulator/Makefile b/src/Simulation/Native/src/simulator/Makefile new file mode 100644 index 00000000000..3522e3fd641 --- /dev/null +++ b/src/Simulation/Native/src/simulator/Makefile @@ -0,0 +1,380 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.16 + +# Default target executed when no arguments are given to make. +default_target: all + +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + + +# A target that is always out of date. +cmake_force: + +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." 
+ /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip/fast + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target test +test: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..." + /usr/bin/ctest --force-new-ctest-process $(ARGS) +.PHONY : test + +# Special rule for the target test +test/fast: test + +.PHONY : test/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components + +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local/fast + +# The main all target +all: cmake_check_build_system + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/simulator/CMakeFiles/progress.marks + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/all + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/clean +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/preinstall +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/simulator/CMakeFiles/dbw_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/dbw_test.dir/rule +.PHONY : src/simulator/CMakeFiles/dbw_test.dir/rule + +# Convenience name for target. +dbw_test: src/simulator/CMakeFiles/dbw_test.dir/rule + +.PHONY : dbw_test + +# fast build rule for target. +dbw_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/build +.PHONY : dbw_test/fast + +# Convenience name for target. +src/simulator/CMakeFiles/capi_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/capi_test.dir/rule +.PHONY : src/simulator/CMakeFiles/capi_test.dir/rule + +# Convenience name for target. +capi_test: src/simulator/CMakeFiles/capi_test.dir/rule + +.PHONY : capi_test + +# fast build rule for target. +capi_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/build +.PHONY : capi_test/fast + +# Convenience name for target. +src/simulator/CMakeFiles/factory_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/factory_test.dir/rule +.PHONY : src/simulator/CMakeFiles/factory_test.dir/rule + +# Convenience name for target. 
+factory_test: src/simulator/CMakeFiles/factory_test.dir/rule + +.PHONY : factory_test + +# fast build rule for target. +factory_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/build +.PHONY : factory_test/fast + +# Convenience name for target. +src/simulator/CMakeFiles/local_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/simulator/CMakeFiles/local_test.dir/rule +.PHONY : src/simulator/CMakeFiles/local_test.dir/rule + +# Convenience name for target. +local_test: src/simulator/CMakeFiles/local_test.dir/rule + +.PHONY : local_test + +# fast build rule for target. +local_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/build +.PHONY : local_test/fast + +capi_test.o: capi_test.cpp.o + +.PHONY : capi_test.o + +# target to build an object file +capi_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.o +.PHONY : capi_test.cpp.o + +capi_test.i: capi_test.cpp.i + +.PHONY : capi_test.i + +# target to preprocess a source file +capi_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.i +.PHONY : capi_test.cpp.i + +capi_test.s: capi_test.cpp.s + +.PHONY : capi_test.s + +# target to generate assembly for a file +capi_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/capi_test.dir/build.make src/simulator/CMakeFiles/capi_test.dir/capi_test.cpp.s +.PHONY : capi_test.cpp.s + +dbw_test.o: dbw_test.cpp.o + +.PHONY : dbw_test.o + +# target 
to build an object file +dbw_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.o +.PHONY : dbw_test.cpp.o + +dbw_test.i: dbw_test.cpp.i + +.PHONY : dbw_test.i + +# target to preprocess a source file +dbw_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.i +.PHONY : dbw_test.cpp.i + +dbw_test.s: dbw_test.cpp.s + +.PHONY : dbw_test.s + +# target to generate assembly for a file +dbw_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/dbw_test.dir/build.make src/simulator/CMakeFiles/dbw_test.dir/dbw_test.cpp.s +.PHONY : dbw_test.cpp.s + +factory_test.o: factory_test.cpp.o + +.PHONY : factory_test.o + +# target to build an object file +factory_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.o +.PHONY : factory_test.cpp.o + +factory_test.i: factory_test.cpp.i + +.PHONY : factory_test.i + +# target to preprocess a source file +factory_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.i +.PHONY : factory_test.cpp.i + +factory_test.s: factory_test.cpp.s + +.PHONY : factory_test.s + +# target to generate assembly for a file +factory_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/factory_test.dir/build.make src/simulator/CMakeFiles/factory_test.dir/factory_test.cpp.s +.PHONY : factory_test.cpp.s + +local_test.o: local_test.cpp.o + +.PHONY : local_test.o + +# target to build an object file 
+local_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.o +.PHONY : local_test.cpp.o + +local_test.i: local_test.cpp.i + +.PHONY : local_test.i + +# target to preprocess a source file +local_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.i +.PHONY : local_test.cpp.i + +local_test.s: local_test.cpp.s + +.PHONY : local_test.s + +# target to generate assembly for a file +local_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/simulator/CMakeFiles/local_test.dir/build.make src/simulator/CMakeFiles/local_test.dir/local_test.cpp.s +.PHONY : local_test.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... install/strip" + @echo "... edit_cache" + @echo "... test" + @echo "... dbw_test" + @echo "... capi_test" + @echo "... install" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... factory_test" + @echo "... install/local" + @echo "... local_test" + @echo "... capi_test.o" + @echo "... capi_test.i" + @echo "... capi_test.s" + @echo "... dbw_test.o" + @echo "... dbw_test.i" + @echo "... dbw_test.s" + @echo "... factory_test.o" + @echo "... factory_test.i" + @echo "... factory_test.s" + @echo "... local_test.o" + @echo "... local_test.i" + @echo "... local_test.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. 
+# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/Simulation/Native/src/simulator/Project.sln b/src/Simulation/Native/src/simulator/Project.sln new file mode 100644 index 00000000000..8bff94040c4 --- /dev/null +++ b/src/Simulation/Native/src/simulator/Project.sln @@ -0,0 +1,95 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{FF099378-3036-3F4A-9265-FD7892A6D7C4}" + ProjectSection(ProjectDependencies) = postProject + {71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318} + {9B43304E-07FF-358F-9B80-6563FEACD34B} = {9B43304E-07FF-358F-9B80-6563FEACD34B} + {B9922A7F-66CA-38A9-AB14-F37716C2E642} = {B9922A7F-66CA-38A9-AB14-F37716C2E642} + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B} = {9A0E62D8-1D15-3C23-9033-C3CB9F76209B} + {96344D76-033E-328A-A304-6957F95DA3E2} = {96344D76-033E-328A-A304-6957F95DA3E2} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{71F2A928-1769-3C7F-96A5-9D3162F0F318}" + ProjectSection(ProjectDependencies) = postProject + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "capi_test", "capi_test.vcxproj", "{9B43304E-07FF-358F-9B80-6563FEACD34B}" + ProjectSection(ProjectDependencies) = postProject + {71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dbw_test", "dbw_test.vcxproj", "{B9922A7F-66CA-38A9-AB14-F37716C2E642}" + ProjectSection(ProjectDependencies) = postProject + 
{71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "factory_test", "factory_test.vcxproj", "{9A0E62D8-1D15-3C23-9033-C3CB9F76209B}" + ProjectSection(ProjectDependencies) = postProject + {71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "local_test", "local_test.vcxproj", "{96344D76-033E-328A-A304-6957F95DA3E2}" + ProjectSection(ProjectDependencies) = postProject + {71F2A928-1769-3C7F-96A5-9D3162F0F318} = {71F2A928-1769-3C7F-96A5-9D3162F0F318} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + MinSizeRel|x64 = MinSizeRel|x64 + RelWithDebInfo|x64 = RelWithDebInfo|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {FF099378-3036-3F4A-9265-FD7892A6D7C4}.Debug|x64.ActiveCfg = Debug|x64 + {FF099378-3036-3F4A-9265-FD7892A6D7C4}.Release|x64.ActiveCfg = Release|x64 + {FF099378-3036-3F4A-9265-FD7892A6D7C4}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {FF099378-3036-3F4A-9265-FD7892A6D7C4}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.Debug|x64.ActiveCfg = Debug|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.Debug|x64.Build.0 = Debug|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.Release|x64.ActiveCfg = Release|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.Release|x64.Build.0 = Release|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {71F2A928-1769-3C7F-96A5-9D3162F0F318}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + 
{9B43304E-07FF-358F-9B80-6563FEACD34B}.Debug|x64.ActiveCfg = Debug|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.Debug|x64.Build.0 = Debug|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.Release|x64.ActiveCfg = Release|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.Release|x64.Build.0 = Release|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {9B43304E-07FF-358F-9B80-6563FEACD34B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.Debug|x64.ActiveCfg = Debug|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.Debug|x64.Build.0 = Debug|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.Release|x64.ActiveCfg = Release|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.Release|x64.Build.0 = Release|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {B9922A7F-66CA-38A9-AB14-F37716C2E642}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Debug|x64.ActiveCfg = Debug|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Debug|x64.Build.0 = Debug|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Release|x64.ActiveCfg = Release|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.Release|x64.Build.0 = Release|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {9A0E62D8-1D15-3C23-9033-C3CB9F76209B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.Debug|x64.ActiveCfg = Debug|x64 + 
{96344D76-033E-328A-A304-6957F95DA3E2}.Debug|x64.Build.0 = Debug|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.Release|x64.ActiveCfg = Release|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.Release|x64.Build.0 = Release|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {96344D76-033E-328A-A304-6957F95DA3E2}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {075CF7D4-156C-30A3-9A71-3554AECB5B07} + EndGlobalSection + GlobalSection(ExtensibilityAddIns) = postSolution + EndGlobalSection +EndGlobal diff --git a/src/Simulation/Native/src/simulator/capi.cpp b/src/Simulation/Native/src/simulator/capi.cpp index 344445f46c4..544f1f25261 100644 --- a/src/Simulation/Native/src/simulator/capi.cpp +++ b/src/Simulation/Native/src/simulator/capi.cpp @@ -13,7 +13,7 @@ MICROSOFT_QUANTUM_DECL unsigned init() { return Microsoft::Quantum::Simulator::create(); } - + MICROSOFT_QUANTUM_DECL void destroy(_In_ unsigned id) { Microsoft::Quantum::Simulator::destroy(id); diff --git a/src/Simulation/Native/src/simulator/capi_test.cpp b/src/Simulation/Native/src/simulator/capi_test.cpp index 945127e260d..179b6a94960 100644 --- a/src/Simulation/Native/src/simulator/capi_test.cpp +++ b/src/Simulation/Native/src/simulator/capi_test.cpp @@ -107,24 +107,24 @@ void test_gates() allocateQubit(sim_id, 0); allocateQubit(sim_id, 1); - CRx(sim_id, 1.0, 0, 1); + dump(sim_id, "test_gatesA"); + CRx(sim_id, 1.0, 0, 1); - assert(M(sim_id, 1)==false); + dump(sim_id, "test_gatesB"); + assert(M(sim_id, 1) == false); X(sim_id, 0); - CRx(sim_id, 1.0, 0, 1); + CRx(sim_id, 1.0, 0, 1); H(sim_id, 1); CRz(sim_id, -1.0, 0, 1); H(sim_id, 1); - assert(M(sim_id, 1)==false); + assert(M(sim_id, 1) == false); X(sim_id, 1); - 
assert(M(sim_id, 1)==true); - - X(sim_id, 1); + assert(M(sim_id, 1) == true); release(sim_id, 0); release(sim_id, 1); @@ -132,7 +132,6 @@ void test_gates() destroy(sim_id); } - void test_allocate() { auto sim_id = init(); @@ -348,14 +347,13 @@ void test_permute_basis() Ry(sim_id, -1.1, 3); CX(sim_id, 1, 2); H(sim_id, 1); - // Dump(sim_id, "permute-end.txt"); assert(M(sim_id, 0) == false); assert(M(sim_id, 1) == false); assert(M(sim_id, 2) == false); assert(M(sim_id, 3) == false); assert(M(sim_id, 4) == false); - + for (unsigned i = 0; i < nqubits + 1; ++i) release(sim_id, i); destroy(sim_id); diff --git a/src/Simulation/Native/src/simulator/dbw_test.cpp b/src/Simulation/Native/src/simulator/dbw_test.cpp new file mode 100644 index 00000000000..ac4df0f1448 --- /dev/null +++ b/src/Simulation/Native/src/simulator/dbw_test.cpp @@ -0,0 +1,240 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "simulator/capi.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/cpuid.hpp" +#include "capi.hpp" +#include + +using namespace std; + +// some convenience functions +void CX(unsigned sim_id, unsigned c, unsigned q) +{ + MCX(sim_id,1,&c,q); +} + +void CZ(unsigned sim_id, unsigned c, unsigned q) +{ + MCZ(sim_id,1,&c,q); +} + +void Ry(unsigned sim_id, double phi, unsigned q) +{ + R(sim_id,2,phi,q); +} + +void CRz(unsigned sim_id, double phi, unsigned c, unsigned q) +{ + MCR(sim_id,3,phi,1,&c,q); +} + +void CRx(unsigned sim_id, double phi, unsigned c, unsigned q) +{ + MCR(sim_id,1,phi,1,&c,q); +} + + +void dump(unsigned sim_id, const char* label) +{ + auto dump_callback = [](size_t idx, double r, double i) { + std::cout << idx << ":\t" << r << '\t' << i << '\n'; + return true; + }; + auto sim_ids_callback = [](unsigned idx) { std::cout << idx << " "; }; + + std::cout << label << "\n" << "wave function for ids (least to most 
significant): ["; + DumpIds(sim_id, sim_ids_callback); + std::cout << "]\n"; + Dump(sim_id, dump_callback); +} + +std::vector> loadPrb(int circStart, int circStop) { + std::vector> rslt; + for (int k = circStart; k < circStop; k++) { + unsigned c = k - 1; + if (k > 0) + for (int j = 0; j < 5; j++) { + std::vector nums = { k - 1, k }; + rslt.push_back(nums); + } + if (k % 5 == 0) { + for (int j = 0; j < 5; j++) { + std::vector nums = { k }; + rslt.push_back(nums); + } + } + } + return rslt; +} + +std::vector splitNums(const std::string& str, char delim = ',') { + std::vector nums; + size_t start; + size_t end = 0; + while ((start = str.find_first_not_of(delim, end)) != std::string::npos) { + end = str.find(delim, start); + nums.push_back(stoi(str.substr(start, end - start))); + } + return nums; +} + +std::vector> loadTest(char* fName,bool doClusters) { + std::vector> rslt; + std::vector empty; + string line; + ifstream file(fName); + if (!file.is_open()) throw(std::invalid_argument("Can't open input file")); + + int phase = 0; + if (doClusters) phase = 2; + + regex reOrig("^=== Original:.*[\r]?"); + regex reGate("^\\s*(\\d+):\\s+(.+)\\[(.*)\\].*[\r]?"); + regex reClusters("^=== Clusters.*[\r]?"); + regex reCluster("^==== cluster\\[\\s*(\\d+)\\]:.*[\r]?"); + smatch sm; + + while (getline(file, line)) { + if (phase == 99) break; + switch (phase) { + case 0: + if (regex_match(line, sm, reOrig)) phase = 1; + break; + case 1: + if (regex_match(line, sm, reGate)) { + auto qs = splitNums(sm[3]); + rslt.push_back(qs); + } + else phase = 99; + break; + case 2: + if (regex_match(line, reClusters)) + phase = 3; + break; + case 4: + if (regex_match(line, sm, reGate)) { + auto qs = splitNums(sm[3]); + rslt.push_back(qs); + break; + } + else { + phase = 3; + [[fallthrough]]; + } + case 3: + if (regex_match(line, sm, reCluster)) { + rslt.push_back(empty); + phase = 4; + } + break; + } + } + file.close(); + return rslt; +} + +void mySprintf(char* buf, int bufSiz, const char* fmt, 
...) { + va_list args; +#ifdef _MSC_VER + __crt_va_start(args, fmt); + vsprintf_s(buf, bufSiz, fmt, args); + __crt_va_end(args); +#else + va_start(args,fmt); + vsprintf(buf, fmt, args); + va_end(args); +#endif + //perror(buf); +} + +int numQs(vector> prb) { + int mx = -1; + for (auto i : prb) + for (auto j : i) + if (j > mx) mx = j; + return (mx + 1); +} + +int main() +{ + int nQs; + vector> prb; + char fName[30]; + + // Perform a small number of loops on the 4x4 advantage circuit. + int sizR = 4; + int sizC = 4; + int loops = 10; + mySprintf(fName, sizeof(fName), "advantage_%d%d_4.log", sizR, sizC); + + prb = loadTest(fName, false); + nQs = numQs(prb); + int gateCnt = (int)prb.size(); + double maxGps = 0.0; + +#ifdef NDEBUG + double gpsFailureThreshold = 1000.0; +#else + double gpsFailureThreshold = 60.0; +#endif + + printf("==== Starting %s (%d gates), Failure threshold %.2e gps\n", fName, gateCnt, gpsFailureThreshold); + + auto sim_id = init(); + for (int q = 0; q < nQs; q++) allocateQubit(sim_id, q); + + std::chrono::system_clock::time_point start = std::chrono::system_clock::now(); + int itvl = loops / 10; + for (int i = 0; i < loops; i++) { + for (int j = 0; j < prb.size(); j++) { + auto qs = prb[j]; + uint32_t cs[2]; + switch (qs.size()) { + case 0: // No op + break; + case 1: + H(sim_id, qs[0]); + break; + case 2: + CX(sim_id, qs[0], qs[1]); + break; + case 3: + cs[0] = (uint32_t)qs[0]; + cs[1] = (uint32_t)qs[1]; + MCX(sim_id, 2, cs, qs[2]); + break; + default: + throw(std::invalid_argument("Didn't expect more then 3 wire gates")); + } + + } + for (int q = 0; q < nQs; q++) M(sim_id, q); + + std::chrono::system_clock::time_point curr = std::chrono::system_clock::now(); + std::chrono::duration elapsed = curr - start; + if (i % itvl == (itvl - 1)) { + double gps = (double)gateCnt * (double)i / elapsed.count(); + printf("Loops[%4d]: GPS = %.2e\n", i, gps); + fflush(stdout); + if (gps > maxGps) maxGps = gps; + } + } + destroy(sim_id); + + if (maxGps < 
gpsFailureThreshold) return -1; +} diff --git a/src/Simulation/Native/src/simulator/factory.cpp b/src/Simulation/Native/src/simulator/factory.cpp index b80b02b35f6..b4ea5fd4e63 100644 --- a/src/Simulation/Native/src/simulator/factory.cpp +++ b/src/Simulation/Native/src/simulator/factory.cpp @@ -13,15 +13,19 @@ namespace Microsoft { namespace SimulatorGeneric { - MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); + Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); } namespace SimulatorAVX { - MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); + Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); } namespace SimulatorAVX2 { - MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); + Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); + } + namespace SimulatorAVX512 + { + Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned); } } } @@ -35,11 +39,15 @@ namespace Microsoft std::shared_mutex _mutex; std::vector> _psis; - SimulatorInterface* createSimulator(unsigned maxlocal) - { - if (haveFMA() && haveAVX2()) + SimulatorInterface* createSimulator(unsigned maxlocal) + { + if (haveAVX512()) + { + return SimulatorAVX512::createSimulator(maxlocal); + } + else if (haveFMA() && haveAVX2()) { - return SimulatorAVX2::createSimulator(maxlocal); + return SimulatorAVX2::createSimulator(maxlocal); } else if(haveAVX()) { diff --git a/src/Simulation/Native/src/simulator/gates.hpp b/src/Simulation/Native/src/simulator/gates.hpp index 3458a3295f0..15034067c06 100644 --- a/src/Simulation/Native/src/simulator/gates.hpp +++ b/src/Simulation/Native/src/simulator/gates.hpp @@ -16,448 +16,453 @@ namespace Microsoft { -namespace Quantum -{ -namespace SIMULATOR -{ -namespace Gates -{ - -/// a type for runtime basis specification 
-enum Basis -{ - PauliI = 0, - PauliX = 1, - PauliY = 3, - PauliZ = 2 -}; - -/// a general one qubit gate, storing the qubit number -class OneQubitGate -{ - public: - OneQubitGate(unsigned q) : qubit_(q) - { - } - unsigned qubit() const - { - return qubit_; - } - - private: - unsigned qubit_; -}; - -/// a general one qubit roitation gate, storing the qubit number and angle -class RotationGate : public OneQubitGate -{ - public: - RotationGate(double phi, unsigned q) : OneQubitGate(q), angle_(phi) + namespace Quantum { - } - double angle() const - { - return angle_; - } - - private: - double angle_; -}; - -/// The Pauli X gate -class X : public OneQubitGate -{ - public: - X(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "X"; - } - - TinyMatrix matrix() const - { - RealType mat[2][2] = {{0., 1.}, {1., 0.}}; - return TinyMatrix(mat); - } -}; - -/// The Pauli Y gate -class Y : public OneQubitGate -{ - public: - Y(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "Y"; - } - - TinyMatrix matrix() const - { - using val_t = ComplexType; - val_t mat[2][2] = {{val_t(0.), val_t(0.)}, {val_t(0.), val_t(0.)}}; - mat[0][1] = val_t(0., -1.); - mat[1][0] = val_t(0., 1.); - return TinyMatrix(mat); - } -}; - -/// The Pauli Z gate -class Z : public OneQubitGate -{ - public: - Z(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "Z"; - } - - DiagMatrix matrix() const - { - RealType diag[2] = {1., -1.}; - return DiagMatrix(diag); - } -}; - -/// The Hadamard gate -class H : public OneQubitGate -{ - public: - H(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "H"; - } - - TinyMatrix matrix() const - { - RealType r = std::sqrt(0.5); - RealType mat[2][2] = {{r, r}, {r, -r}}; - return TinyMatrix(mat); - } -}; - -/// The Y-version of a Hadamard gate -class HY : public OneQubitGate -{ - public: - HY(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - 
return "HY"; - } - - TinyMatrix matrix() const - { - ComplexType r(std::sqrt(0.5), 0.); - ComplexType i(0., std::sqrt(0.5)); - ComplexType mat[2][2] = {{r, r}, {i, -i}}; - return TinyMatrix(mat); - } -}; - -/// The adjoint Y-version of a Hadamard gate -class AdjHY : public OneQubitGate -{ - public: - AdjHY(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "AdjHY"; - } - - TinyMatrix matrix() const - { - ComplexType r(std::sqrt(0.5), 0.); - ComplexType i(0., std::sqrt(0.5)); - ComplexType mat[2][2] = {{r, -i}, {r, i}}; - return TinyMatrix(mat); - } -}; - -/// The S (phase) gate -class S : public OneQubitGate -{ - public: - S(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "S"; - } - - DiagMatrix matrix() const - { - using val_t = ComplexType; - val_t diag[2] = {val_t(1.), val_t(0., 1.)}; - return DiagMatrix(diag); - } -}; - -/// The adjoint of the S (phase) gate -class AdjS : public OneQubitGate -{ - public: - AdjS(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "AdjS"; - } - - DiagMatrix matrix() const - { - using val_t = ComplexType; - val_t diag[2] = {val_t(1.), val_t(0., -1.)}; - return DiagMatrix(diag); - } -}; - -/// The T (pi/8) gate -class T : public OneQubitGate -{ - public: - T(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "T"; - } - - DiagMatrix matrix() const - { - using val_t = ComplexType; - RealType r = std::sqrt(0.5); - val_t diag[2] = {val_t(1.), val_t(r, r)}; - return DiagMatrix(diag); - } -}; - -/// The T (pi/8) gate -class AdjT : public OneQubitGate -{ - public: - AdjT(unsigned q) : OneQubitGate(q) - { - } - - std::string name() const - { - return "AdjT"; - } - - DiagMatrix matrix() const - { - using val_t = ComplexType; - RealType r = std::sqrt(0.5); - val_t diag[2] = {val_t(1.), val_t(r, -r)}; - return DiagMatrix(diag); - } -}; - -/// The G gate -class G : public RotationGate -{ - public: - G(RealType phi, unsigned q) 
: RotationGate(phi, q) - { - } - - std::string name() const - { - return "G"; - } - - DiagMatrix matrix() const - { - DiagMatrix d; - ComplexType arg(0., 0.5 * angle()); - d(0, 0) = d(1, 1) = std::exp(-arg); - return d; - } -}; - -/// The Rx gate -class Rx : public RotationGate -{ - public: - Rx(RealType phi, unsigned q) : RotationGate(phi, q) - { - } - - std::string name() const - { - return "Rx"; - } - - TinyMatrix matrix() const - { - using val_t = ComplexType; - val_t s(0., -std::sin(0.5 * angle())); - val_t c = std::cos(0.5 * angle()); - val_t mat[2][2] = {{c, s}, {s, c}}; - return TinyMatrix(mat); - } -}; - -/// The Ry gate -class Ry : public RotationGate -{ - public: - Ry(RealType phi, unsigned q) : RotationGate(phi, q) - { - } - - std::string name() const - { - return "Ry"; - } - - TinyMatrix matrix() const - { - RealType s = std::sin(0.5 * angle()); - RealType c = std::cos(0.5 * angle()); - RealType mat[2][2] = {{c, -s}, {s, c}}; - ; - return TinyMatrix(mat); - } -}; - -/// The Rz gate -class Rz : public RotationGate -{ - public: - Rz(RealType phi, unsigned q) : RotationGate(phi, q) - { - } - - std::string name() const - { - return "Rz"; - } - - DiagMatrix matrix() const - { - DiagMatrix d; - ComplexType arg(0., 0.5 * angle()); - d(0, 0) = std::exp(-arg); - d(1, 1) = std::exp(arg); - return d; - } -}; - -/// The R1 gate -class R1 : public RotationGate -{ - public: - R1(RealType phi, unsigned q) : RotationGate(phi, q) - { - } - - std::string name() const - { - return "R1"; - } - - DiagMatrix matrix() const - { - DiagMatrix d; - ComplexType arg(0., angle()); - d(0, 0) = 1.; - d(1, 1) = std::exp(-arg); - return d; - } -}; - -/// The R1 gate -class R1Frac : public R1 -{ - public: - R1Frac(int k, int n, unsigned q) : R1(M_PI * static_cast(k) / static_cast(1 << n), q) - { - } - - std::string name() const - { - return "R1Frac"; - } -}; - -/// The R gate for rotation around an arbitrary basis -class R : public RotationGate -{ - public: - R(Basis b, RealType phi, 
unsigned q) : RotationGate(phi, q), b_(b) - { - } - - std::string name() const - { - return "R"; - } - - TinyMatrix matrix() const - { - switch (b_) + namespace SIMULATOR { - case PauliI: - return G(angle(), qubit()).matrix(); - break; - case PauliX: - return Rx(angle(), qubit()).matrix(); - break; - case PauliY: - return Ry(angle(), qubit()).matrix(); - break; - case PauliZ: - return Rz(angle(), qubit()).matrix(); - break; - default: - assert(false); + namespace Gates + { + + /// a type for runtime basis specification + enum Basis + { + PauliI = 0, + PauliX = 1, + PauliY = 3, + PauliZ = 2 + }; + + /// a general one qubit gate, storing the qubit number + class OneQubitGate + { + public: + OneQubitGate(unsigned q) : qubit_(q) + { + } + unsigned qubit() const + { + return qubit_; + } + + virtual TinyMatrix matrix() const + { + throw "Calling overriden function"; + } + + private: + unsigned qubit_; + }; + + /// a general one qubit roitation gate, storing the qubit number and angle + class RotationGate : public OneQubitGate + { + public: + RotationGate(double phi, unsigned q) : OneQubitGate(q), angle_(phi) + { + } + double angle() const + { + return angle_; + } + + private: + double angle_; + }; + + /// The Pauli X gate + class X : public OneQubitGate + { + public: + X(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "X"; + } + + TinyMatrix matrix() const + { + RealType mat[2][2] = { {0., 1.}, {1., 0.} }; + return TinyMatrix(mat); + } + }; + + /// The Pauli Y gate + class Y : public OneQubitGate + { + public: + Y(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "Y"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + val_t mat[2][2] = { {val_t(0.), val_t(0.)}, {val_t(0.), val_t(0.)} }; + mat[0][1] = val_t(0., -1.); + mat[1][0] = val_t(0., 1.); + return TinyMatrix(mat); + } + }; + + /// The Pauli Z gate + class Z : public OneQubitGate + { + public: + Z(unsigned q) : OneQubitGate(q) + { + } + 
+ std::string name() const + { + return "Z"; + } + + TinyMatrix matrix() const + { + RealType diag[2] = { 1., -1. }; + return TinyMatrix(DiagMatrix(diag)); + } + }; + + /// The Hadamard gate + class H : public OneQubitGate + { + public: + H(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "H"; + } + + TinyMatrix matrix() const + { + RealType r = std::sqrt(0.5); + RealType mat[2][2] = { {r, r}, {r, -r} }; + return TinyMatrix(TinyMatrix(mat)); + } + }; + + /// The Y-version of a Hadamard gate + class HY : public OneQubitGate + { + public: + HY(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "HY"; + } + + TinyMatrix matrix() const + { + ComplexType r(std::sqrt(0.5), 0.); + ComplexType i(0., std::sqrt(0.5)); + ComplexType mat[2][2] = { {r, r}, {i, -i} }; + return TinyMatrix(mat); + } + }; + + /// The adjoint Y-version of a Hadamard gate + class AdjHY : public OneQubitGate + { + public: + AdjHY(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "AdjHY"; + } + + TinyMatrix matrix() const + { + ComplexType r(std::sqrt(0.5), 0.); + ComplexType i(0., std::sqrt(0.5)); + ComplexType mat[2][2] = { {r, -i}, {r, i} }; + return TinyMatrix(mat); + } + }; + + /// The S (phase) gate + class S : public OneQubitGate + { + public: + S(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "S"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + val_t diag[2] = { val_t(1.), val_t(0., 1.) }; + return TinyMatrix(DiagMatrix(diag)); + } + }; + + /// The adjoint of the S (phase) gate + class AdjS : public OneQubitGate + { + public: + AdjS(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "AdjS"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + val_t diag[2] = { val_t(1.), val_t(0., -1.) 
}; + return TinyMatrix(DiagMatrix(diag)); + } + }; + + /// The T (pi/8) gate + class T : public OneQubitGate + { + public: + T(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "T"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + RealType r = std::sqrt(0.5); + val_t diag[2] = { val_t(1.), val_t(r, r) }; + return TinyMatrix(DiagMatrix(diag)); + } + }; + + /// The T (pi/8) gate + class AdjT : public OneQubitGate + { + public: + AdjT(unsigned q) : OneQubitGate(q) + { + } + + std::string name() const + { + return "AdjT"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + RealType r = std::sqrt(0.5); + val_t diag[2] = { val_t(1.), val_t(r, -r) }; + return TinyMatrix(DiagMatrix(diag)); + } + }; + + /// The G gate + class G : public RotationGate + { + public: + G(RealType phi, unsigned q) : RotationGate(phi, q) + { + } + + std::string name() const + { + return "G"; + } + + TinyMatrix matrix() const + { + DiagMatrix d; + ComplexType arg(0., 0.5 * angle()); + d(0, 0) = d(1, 1) = std::exp(-arg); + return TinyMatrix(d); + } + }; + + /// The Rx gate + class Rx : public RotationGate + { + public: + Rx(RealType phi, unsigned q) : RotationGate(phi, q) + { + } + + std::string name() const + { + return "Rx"; + } + + TinyMatrix matrix() const + { + using val_t = ComplexType; + val_t s(0., -std::sin(0.5 * angle())); + val_t c = std::cos(0.5 * angle()); + val_t mat[2][2] = { {c, s}, {s, c} }; + return TinyMatrix(mat); + } + }; + + /// The Ry gate + class Ry : public RotationGate + { + public: + Ry(RealType phi, unsigned q) : RotationGate(phi, q) + { + } + + std::string name() const + { + return "Ry"; + } + + TinyMatrix matrix() const + { + RealType s = std::sin(0.5 * angle()); + RealType c = std::cos(0.5 * angle()); + RealType mat[2][2] = { {c, -s}, {s, c} }; + ; + return TinyMatrix(TinyMatrix(mat)); + } + }; + + /// The Rz gate + class Rz : public RotationGate + { + public: + Rz(RealType phi, unsigned q) : 
RotationGate(phi, q) + { + } + + std::string name() const + { + return "Rz"; + } + + TinyMatrix matrix() const + { + DiagMatrix d; + ComplexType arg(0., 0.5 * angle()); + d(0, 0) = std::exp(-arg); + d(1, 1) = std::exp(arg); + return TinyMatrix(d); + } + }; + + /// The R1 gate + class R1 : public RotationGate + { + public: + R1(RealType phi, unsigned q) : RotationGate(phi, q) + { + } + + std::string name() const + { + return "R1"; + } + + TinyMatrix matrix() const + { + DiagMatrix d; + ComplexType arg(0., angle()); + d(0, 0) = 1.; + d(1, 1) = std::exp(-arg); + return TinyMatrix(d); + } + }; + + /// The R1 gate + class R1Frac : public R1 + { + public: + R1Frac(int k, int n, unsigned q) : R1(M_PI* static_cast(k) / static_cast(1ll << n), q) + { + } + + std::string name() const + { + return "R1Frac"; + } + }; + + /// The R gate for rotation around an arbitrary basis + class R : public RotationGate + { + public: + R(Basis b, RealType phi, unsigned q) : RotationGate(phi, q), b_(b) + { + } + + std::string name() const + { + return "R"; + } + + TinyMatrix matrix() const + { + switch (b_) + { + case PauliI: + return G(angle(), qubit()).matrix(); + break; + case PauliX: + return Rx(angle(), qubit()).matrix(); + break; + case PauliY: + return Ry(angle(), qubit()).matrix(); + break; + case PauliZ: + return Rz(angle(), qubit()).matrix(); + break; + default: + assert(false); + } + // dummy return + return TinyMatrix(); + } + + private: + Basis b_; + }; + + class RFrac : public R + { + public: + RFrac(Basis b, int k, int n, unsigned q) : R(b, -2. * M_PI * static_cast(k) / static_cast(1ll << n), q) + { + } + std::string name() const + { + return "RFrac"; + } + }; + } } - // dummy return - return TinyMatrix(); - } - - private: - Basis b_; -}; - -class RFrac : public R -{ - public: - RFrac(Basis b, int k, int n, unsigned q) : R(b, -2. 
* M_PI * static_cast(k) / static_cast(1 << n), q) - { - } - std::string name() const - { - return "RFrac"; } -}; -} -} -} } diff --git a/src/Simulation/Native/src/simulator/kernels.hpp b/src/Simulation/Native/src/simulator/kernels.hpp index 68a710de36a..21447c5b535 100644 --- a/src/Simulation/Native/src/simulator/kernels.hpp +++ b/src/Simulation/Native/src/simulator/kernels.hpp @@ -398,7 +398,6 @@ void subsytemwavefunction_by_pivot(std::vector const& wfn, std::vector chunks; -#pragma omp parallel { #pragma omp single chunks = split_interval_in_chunks(max, omp_get_num_threads()); @@ -446,7 +445,6 @@ bool istensorproduct(std::vector const& wfn, std::size_t compl_st = compl_bits.to_ullong(); std::atomic go(true); -#pragma omp parallel { int thread_id = omp_get_thread_num(); if (thread_id < chunks.size() - 1) diff --git a/src/Simulation/Native/src/simulator/local_test.cpp b/src/Simulation/Native/src/simulator/local_test.cpp index 36820b89852..6f91d06b8fc 100644 --- a/src/Simulation/Native/src/simulator/local_test.cpp +++ b/src/Simulation/Native/src/simulator/local_test.cpp @@ -9,7 +9,6 @@ using namespace Microsoft::Quantum::SIMULATOR; - void test_exp() { SimulatorType sim; @@ -324,7 +323,6 @@ void test_extract_qubits_state() test_extract_qubits_cat_state(4, {1, 2}, {1, 3}); test_extract_qubits_cat_state(4, {1, 3}, {0, 1}); test_extract_qubits_cat_state(4, {2, 3}, {1, 2}); - test_extract_qubits_cat_state(12, {2, 4, 5, 6, 7}, {0, 1, 2}); test_extract_qubits_cat_state(6, {0, 1, 3}, {0, 1}); test_extract_qubits_cat_state(10, {0, 5}, {5, 6}); diff --git a/src/Simulation/Native/src/simulator/simulator.hpp b/src/Simulation/Native/src/simulator/simulator.hpp index 453fa4d1b89..41efd245de9 100644 --- a/src/Simulation/Native/src/simulator/simulator.hpp +++ b/src/Simulation/Native/src/simulator/simulator.hpp @@ -99,6 +99,7 @@ namespace SIMULATOR bool release(unsigned q) { recursive_lock_type l(mutex()); + flush(); bool allok = isclassical(q); if (allok) allok = 
(psi.getvalue(q)==false); diff --git a/src/Simulation/Native/src/simulator/simulatoravx512.cpp b/src/Simulation/Native/src/simulator/simulatoravx512.cpp new file mode 100644 index 00000000000..88bc438dccc --- /dev/null +++ b/src/Simulation/Native/src/simulator/simulatoravx512.cpp @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#define HAVE_INTRINSICS +#define HAVE_AVX512 +#define HAVE_FMA + +#include "simulator/simulator.hpp" + + +namespace sim = Microsoft::Quantum::SimulatorAVX512; + +MICROSOFT_QUANTUM_DECL Microsoft::Quantum::Simulator::SimulatorInterface* sim::createSimulator(unsigned maxlocal) +{ + return new sim::SimulatorType(maxlocal); +} diff --git a/src/Simulation/Native/src/simulator/wavefunction.hpp b/src/Simulation/Native/src/simulator/wavefunction.hpp index 1d3a390642c..748aef30787 100644 --- a/src/Simulation/Native/src/simulator/wavefunction.hpp +++ b/src/Simulation/Native/src/simulator/wavefunction.hpp @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include "types.hpp" #include "gates.hpp" @@ -20,41 +23,122 @@ namespace Microsoft { -namespace Quantum -{ -namespace SIMULATOR -{ - namespace detail + namespace Quantum { - inline std::size_t get_register(const std::vector& qs, std::size_t basis_state) - { - std::size_t result = 0; - for (unsigned i = 0; i < qs.size(); ++i) - result |= ((basis_state >> qs[i]) & 1) << i; - return result; - - } - - inline std::size_t set_register(const std::vector& qs, std::size_t qmask, std::size_t basis_state, std::size_t original = 0ull) + namespace SIMULATOR { - std::size_t result = original & ~qmask; - for (unsigned i = 0; i < qs.size(); ++i) - result |= ((basis_state >> i) & 1) << qs[i]; - return result; - } - } + namespace detail + { + inline std::size_t get_register(const std::vector& qs, std::size_t basis_state) + { + std::size_t result = 0; + for (unsigned i = 0; i < qs.size(); ++i) + result |= ((basis_state >> qs[i]) 
& 1) << i; + return result; + + } + + inline std::size_t set_register(const std::vector& qs, std::size_t qmask, std::size_t basis_state, std::size_t original = 0ull) + { + std::size_t result = original & ~qmask; + for (unsigned i = 0; i < qs.size(); ++i) + result |= ((basis_state >> i) & 1) << qs[i]; + return result; + } + } + + // Creating a gate wrapper datatype to represent a gate in a cluster + class GateWrapper { + public: + GateWrapper(std::vector controls, unsigned target, TinyMatrix mat) : controls_(controls), target_(target), mat_(mat) {} + std::vector get_controls() { return controls_; } + unsigned get_target() { return target_; } + TinyMatrix get_mat() { return mat_; } + private: + std::vector controls_; + unsigned target_; + TinyMatrix mat_; + }; + + // Creating a cluster datatype for new scheduling logic + class Cluster { + public: + Cluster(std::vector qids, std::vector gates) : qids_(qids), gates_(gates) {} + std::vector get_qids() { return qids_; } + std::vector get_gates() { return gates_; } + + void setQids(std::vector qids) { + qids_ = qids; + } + + void append_gates(std::vector gates) { + gates_.insert(gates_.end(), gates.begin(), gates.end()); + } + + size_t size() { + return gates_.size(); + } + + // Greedy method that finds next appropriate cluster + std::pair> next_cluster(std::vector& nextClusters, unsigned maxWidth) { + std::vector myUnion; // My qubits touched + Next qubits touched + std::vector myDiff; // New qubits touched by Next + std::vector myInter; // Old qubits touched by Next + std::vector allInter; // My qubits + All touched qubits + std::set myTouched(qids_.begin(), qids_.end()); // My qubits touched + std::set allTouched = myTouched; // All the qubits touched so far + + int lastNexts = (int)nextClusters.size() - 1; // nexts are in reverse order (from above) + for (int i = 0; i <= lastNexts; i++) { // Look at the clusters that follow us + auto nextQs = nextClusters[lastNexts-i].get_qids(); // Pull off one future cluster + 
std::sort(nextQs.begin(), nextQs.end()); // Has to be sorted for set operations + myUnion.clear(); + std::set_union(nextQs.begin(), nextQs.end(), // See what qubits we and the future cluster touch + myTouched.begin(), myTouched.end(), + std::back_inserter(myUnion)); + if (myUnion.size() <= maxWidth) { // It's a candiate if it's not beyond our allowed width + myDiff.clear(); + std::set_difference(nextQs.begin(), nextQs.end(), // Figure out if any of the future qubits aren't already seen by us + myTouched.begin(), myTouched.end(), + std::back_inserter(myDiff)); + allInter.clear(); + std::set_intersection(myDiff.begin(), myDiff.end(), // These are any new qubits that might have already been touched + allTouched.begin(), allTouched.end(), + std::back_inserter(allInter)); + if (allInter.size() == 0) { // If the new qubits are untouched... then this is allowed + auto cl = nextClusters[lastNexts-i]; + nextClusters.erase(nextClusters.begin() + (lastNexts-i)); // Remove the future cluster + return std::make_pair(cl, myUnion); // ... and add it to our cluster (done above) + } + } + myInter.clear(); + std::set_intersection(nextQs.begin(), nextQs.end(), // If a future cluster touches any of our qubits... 
we've hit a hard wall + myTouched.begin(), myTouched.end(), + std::back_inserter(myInter)); + if (myInter.size() != 0) break; + + allTouched.insert(nextQs.begin(), nextQs.end()); // Add in all qubits touched, and try the next cluster + } + Cluster defCl = Cluster({}, {}); // Couldn't find any more clusters to add + std::vector defVec = {}; + return std::make_pair(defCl, defVec); + } + private: + std::vector qids_; + std::vector gates_; + }; /// A wave function class to store and manipulate the state of qubits template class Wavefunction { - public: +public: using value_type = T; using qubit_t = unsigned; using RngEngine = std::mt19937; - constexpr qubit_t invalid_qubit() const { return std::numeric_limits::max();} + constexpr qubit_t invalid_qubit() const { return std::numeric_limits::max(); } /// allocate a wave function for zero qubits Wavefunction(unsigned /*ignore*/) : num_qubits_(0), wfn_(1, 1.), usage_(0) @@ -74,7 +158,7 @@ class Wavefunction ~Wavefunction() { - flush(); + flush(); } unsigned qubit(unsigned q) const @@ -90,10 +174,34 @@ class Wavefunction void flush() const { - fused_.flush(wfn_); + int maxSpan = fused_.maxSpan(); + auto clusters = make_clusters(maxSpan, gatelist_); //making clusters with gates in the queue + + if (clusters.size() == 0) { + fused_.flush(wfn_); + } + else { + // logic to flush gates in each cluster + for (int i = 0; i < clusters.size(); i++) { + Cluster cl = clusters.at(i); + + for (GateWrapper gate : cl.get_gates()) { + std::vector cs = gate.get_controls(); + if (cs.size() == 0) { + fused_.apply(wfn_, gate.get_mat(), qubit(gate.get_target())); + } + else { + fused_.apply_controlled(wfn_, gate.get_mat(), qubits(cs), qubit(gate.get_target())); + } + } + + fused_.flush(wfn_); + } + } + gatelist_.clear(); } - /// allocate a qubit and grow the wave function + /// allocate a qubit and grow the wave function unsigned allocate() { assert(usage_ != 2); @@ -113,7 +221,7 @@ class Wavefunction } } - /// allocate a qubit and grow the wave 
function + /// allocate a qubit and grow the wave function void allocateQubit(unsigned id) { assert(usage_ != 1); @@ -134,7 +242,7 @@ class Wavefunction /// \pre the qubit has to be in a classical state in the computational basis void release(qubit_t q) { - unsigned p = qubit(q); + unsigned p = qubit(q); //returns qubitmap_[q] flush(); kernels::collapse(wfn_, p, getvalue(q), true); for (int i = 0; i < qubitmap_.size(); ++i) @@ -196,9 +304,9 @@ class Wavefunction } void apply_controlled_exp(std::vector const& bs, - double phi, - std::vector const& cs, - std::vector const& qs) + double phi, + std::vector const& cs, + std::vector const& qs) { flush(); kernels::apply_controlled_exp(wfn_, bs, phi, qubits(cs), qubits(qs)); @@ -238,11 +346,66 @@ class Wavefunction rng_.seed(s); } + //method that makes clusters to be flushed + std::vector make_clusters(unsigned fuseSpan, std::vector gates) const { + std::vector curClusters; + + if (gates.size() > 0) { + //creating initial cluster containing one gate each + for (int i = 0; i < gates.size(); i++) { + std::vector qids; + std::vector controlQids = gates[i].get_controls(); + if (controlQids.size() > 0) { + qids = controlQids; + } + qids.push_back(gates[i].get_target()); + Cluster newCl = Cluster(qids, { gates[i] }); + curClusters.push_back(newCl); + } + //creating clusters using greedy algorithm + for (int i = 1; i < (int)fuseSpan + 1; i++) { // Build clusters of width 1,2,... + std::reverse(curClusters.begin(), curClusters.end()); // Keep everything in reverse order + auto prevClusters = curClusters; // Save away the last set of clusters built + curClusters.clear(); + auto prevCluster = prevClusters.back(); // Pop the first cluster + prevClusters.pop_back(); + while (prevClusters.size() > 0) { // While there are more clusters... 
+ auto foundCompat = prevCluster.next_cluster(prevClusters, i); // See if we can accumlate anyone who follows + Cluster clusterFound = foundCompat.first; + std::vector foundTotQids = foundCompat.second; + if (clusterFound.get_gates().size() == 0 || // Can't append any more clusters to this one + (int)prevCluster.size() >= fused_.maxDepth()) { // ... or we're beyond max depth + curClusters.push_back(prevCluster); // Save this cluster + if (prevCluster.size() > 0) + { + prevCluster = prevClusters.back(); + prevClusters.pop_back(); + } + } + else { + prevCluster.setQids(foundTotQids); // New version of our cluster (appended) + prevCluster.append_gates(clusterFound.get_gates()); + } + } // Keep looking for clusters to add + curClusters.push_back(prevCluster); // Save the final cluster + } // Start all over with the next larger span + } + + return curClusters; + } + /// generic application of a gate template void apply(Gate const& g) { - fused_.apply(wfn_, g.matrix(), qubit(g)); + std::vector cs; + GateWrapper gateApplied = GateWrapper(cs, g.qubit(), g.matrix()); + gatelist_.push_back(gateApplied); + if (gatelist_.size() > 999) { + flush(); + } + + int doFlush = fused_.shouldFlush(wfn_, cs, g.qubit()); } /// generic application of a multiply controlled gate @@ -250,7 +413,13 @@ class Wavefunction void apply_controlled(std::vector cs, Gate const& g) { std::vector pcs = qubits(cs); - fused_.apply_controlled(wfn_, g.matrix(), pcs, qubit(g)); + GateWrapper gateApplied = GateWrapper(cs, g.qubit(), g.matrix()); + gatelist_.push_back(gateApplied); + if (gatelist_.size() > 999) { + flush(); + } + + int doFlush = fused_.shouldFlush(wfn_, cs, g.qubit()); } /// generic application of a controlled gate @@ -274,7 +443,8 @@ class Wavefunction template bool subsytemwavefunction(std::vector const& qs, std::vector& qubitswfn, double tolerance) { - return fused_.subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance); + flush(); // we have to flush before we can extract the state + 
return kernels::subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance); } @@ -338,8 +508,9 @@ class Wavefunction private: unsigned num_qubits_; // for convenience mutable WavefunctionStorage wfn_; // storing the wave function - std::vector qubitmap_; // mapping of logical to physical qubits - int usage_; + mutable std::vector qubitmap_; // mapping of logical to physical qubits + int usage_; + mutable std::vector gatelist_; // randomness support RngEngine rng_; diff --git a/src/Simulation/Native/src/util/Makefile b/src/Simulation/Native/src/util/Makefile new file mode 100644 index 00000000000..b980489f2aa --- /dev/null +++ b/src/Simulation/Native/src/util/Makefile @@ -0,0 +1,518 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.16 + +# Default target executed when no arguments are given to make. +default_target: all + +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + + +# A target that is always out of date. +cmake_force: + +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. 
+CMAKE_SOURCE_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local/fast + +# Special rule for the target test +test: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..." + /usr/bin/ctest --force-new-ctest-process $(ARGS) +.PHONY : test + +# Special rule for the target test +test/fast: test + +.PHONY : test/fast + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." 
+ /usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"libraries\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components + +.PHONY : list_install_components/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." 
+ /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# The main all target +all: cmake_check_build_system + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/src/util/CMakeFiles/progress.marks + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/all + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/clean +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/util/CMakeFiles/tinymatrix_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/tinymatrix_test.dir/rule +.PHONY : src/util/CMakeFiles/tinymatrix_test.dir/rule + +# Convenience name for target. +tinymatrix_test: src/util/CMakeFiles/tinymatrix_test.dir/rule + +.PHONY : tinymatrix_test + +# fast build rule for target. 
+tinymatrix_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/build +.PHONY : tinymatrix_test/fast + +# Convenience name for target. +src/util/CMakeFiles/bititerator_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/bititerator_test.dir/rule +.PHONY : src/util/CMakeFiles/bititerator_test.dir/rule + +# Convenience name for target. +bititerator_test: src/util/CMakeFiles/bititerator_test.dir/rule + +.PHONY : bititerator_test + +# fast build rule for target. +bititerator_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/build +.PHONY : bititerator_test/fast + +# Convenience name for target. +src/util/CMakeFiles/bitops_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/bitops_test.dir/rule +.PHONY : src/util/CMakeFiles/bitops_test.dir/rule + +# Convenience name for target. +bitops_test: src/util/CMakeFiles/bitops_test.dir/rule + +.PHONY : bitops_test + +# fast build rule for target. +bitops_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/build +.PHONY : bitops_test/fast + +# Convenience name for target. +src/util/CMakeFiles/openmp_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/openmp_test.dir/rule +.PHONY : src/util/CMakeFiles/openmp_test.dir/rule + +# Convenience name for target. +openmp_test: src/util/CMakeFiles/openmp_test.dir/rule + +.PHONY : openmp_test + +# fast build rule for target. 
+openmp_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/build +.PHONY : openmp_test/fast + +# Convenience name for target. +src/util/CMakeFiles/cpuid_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/cpuid_test.dir/rule +.PHONY : src/util/CMakeFiles/cpuid_test.dir/rule + +# Convenience name for target. +cpuid_test: src/util/CMakeFiles/cpuid_test.dir/rule + +.PHONY : cpuid_test + +# fast build rule for target. +cpuid_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/build +.PHONY : cpuid_test/fast + +# Convenience name for target. +src/util/CMakeFiles/argmaxnrm2_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/argmaxnrm2_test.dir/rule +.PHONY : src/util/CMakeFiles/argmaxnrm2_test.dir/rule + +# Convenience name for target. +argmaxnrm2_test: src/util/CMakeFiles/argmaxnrm2_test.dir/rule + +.PHONY : argmaxnrm2_test + +# fast build rule for target. +argmaxnrm2_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/build +.PHONY : argmaxnrm2_test/fast + +# Convenience name for target. +src/util/CMakeFiles/diagmatrix_test.dir/rule: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f CMakeFiles/Makefile2 src/util/CMakeFiles/diagmatrix_test.dir/rule +.PHONY : src/util/CMakeFiles/diagmatrix_test.dir/rule + +# Convenience name for target. +diagmatrix_test: src/util/CMakeFiles/diagmatrix_test.dir/rule + +.PHONY : diagmatrix_test + +# fast build rule for target. 
+diagmatrix_test/fast: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/build +.PHONY : diagmatrix_test/fast + +argmaxnrm2_test.o: argmaxnrm2_test.cpp.o + +.PHONY : argmaxnrm2_test.o + +# target to build an object file +argmaxnrm2_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.o +.PHONY : argmaxnrm2_test.cpp.o + +argmaxnrm2_test.i: argmaxnrm2_test.cpp.i + +.PHONY : argmaxnrm2_test.i + +# target to preprocess a source file +argmaxnrm2_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.i +.PHONY : argmaxnrm2_test.cpp.i + +argmaxnrm2_test.s: argmaxnrm2_test.cpp.s + +.PHONY : argmaxnrm2_test.s + +# target to generate assembly for a file +argmaxnrm2_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/argmaxnrm2_test.dir/build.make src/util/CMakeFiles/argmaxnrm2_test.dir/argmaxnrm2_test.cpp.s +.PHONY : argmaxnrm2_test.cpp.s + +bititerator_test.o: bititerator_test.cpp.o + +.PHONY : bititerator_test.o + +# target to build an object file +bititerator_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.o +.PHONY : bititerator_test.cpp.o + +bititerator_test.i: bititerator_test.cpp.i + +.PHONY : bititerator_test.i + +# target to preprocess a source file +bititerator_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.i +.PHONY : 
bititerator_test.cpp.i + +bititerator_test.s: bititerator_test.cpp.s + +.PHONY : bititerator_test.s + +# target to generate assembly for a file +bititerator_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bititerator_test.dir/build.make src/util/CMakeFiles/bititerator_test.dir/bititerator_test.cpp.s +.PHONY : bititerator_test.cpp.s + +bitops_test.o: bitops_test.cpp.o + +.PHONY : bitops_test.o + +# target to build an object file +bitops_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.o +.PHONY : bitops_test.cpp.o + +bitops_test.i: bitops_test.cpp.i + +.PHONY : bitops_test.i + +# target to preprocess a source file +bitops_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.i +.PHONY : bitops_test.cpp.i + +bitops_test.s: bitops_test.cpp.s + +.PHONY : bitops_test.s + +# target to generate assembly for a file +bitops_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/bitops_test.dir/build.make src/util/CMakeFiles/bitops_test.dir/bitops_test.cpp.s +.PHONY : bitops_test.cpp.s + +cpuid_test.o: cpuid_test.cpp.o + +.PHONY : cpuid_test.o + +# target to build an object file +cpuid_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.o +.PHONY : cpuid_test.cpp.o + +cpuid_test.i: cpuid_test.cpp.i + +.PHONY : cpuid_test.i + +# target to preprocess a source file +cpuid_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.i +.PHONY : cpuid_test.cpp.i + 
+cpuid_test.s: cpuid_test.cpp.s + +.PHONY : cpuid_test.s + +# target to generate assembly for a file +cpuid_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/cpuid_test.dir/build.make src/util/CMakeFiles/cpuid_test.dir/cpuid_test.cpp.s +.PHONY : cpuid_test.cpp.s + +diagmatrix_test.o: diagmatrix_test.cpp.o + +.PHONY : diagmatrix_test.o + +# target to build an object file +diagmatrix_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.o +.PHONY : diagmatrix_test.cpp.o + +diagmatrix_test.i: diagmatrix_test.cpp.i + +.PHONY : diagmatrix_test.i + +# target to preprocess a source file +diagmatrix_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.i +.PHONY : diagmatrix_test.cpp.i + +diagmatrix_test.s: diagmatrix_test.cpp.s + +.PHONY : diagmatrix_test.s + +# target to generate assembly for a file +diagmatrix_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/diagmatrix_test.dir/build.make src/util/CMakeFiles/diagmatrix_test.dir/diagmatrix_test.cpp.s +.PHONY : diagmatrix_test.cpp.s + +openmp_test.o: openmp_test.cpp.o + +.PHONY : openmp_test.o + +# target to build an object file +openmp_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.o +.PHONY : openmp_test.cpp.o + +openmp_test.i: openmp_test.cpp.i + +.PHONY : openmp_test.i + +# target to preprocess a source file +openmp_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make 
src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.i +.PHONY : openmp_test.cpp.i + +openmp_test.s: openmp_test.cpp.s + +.PHONY : openmp_test.s + +# target to generate assembly for a file +openmp_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/openmp_test.dir/build.make src/util/CMakeFiles/openmp_test.dir/openmp_test.cpp.s +.PHONY : openmp_test.cpp.s + +tinymatrix_test.o: tinymatrix_test.cpp.o + +.PHONY : tinymatrix_test.o + +# target to build an object file +tinymatrix_test.cpp.o: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.o +.PHONY : tinymatrix_test.cpp.o + +tinymatrix_test.i: tinymatrix_test.cpp.i + +.PHONY : tinymatrix_test.i + +# target to preprocess a source file +tinymatrix_test.cpp.i: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.i +.PHONY : tinymatrix_test.cpp.i + +tinymatrix_test.s: tinymatrix_test.cpp.s + +.PHONY : tinymatrix_test.s + +# target to generate assembly for a file +tinymatrix_test.cpp.s: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(MAKE) -f src/util/CMakeFiles/tinymatrix_test.dir/build.make src/util/CMakeFiles/tinymatrix_test.dir/tinymatrix_test.cpp.s +.PHONY : tinymatrix_test.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... install/strip" + @echo "... tinymatrix_test" + @echo "... bititerator_test" + @echo "... install/local" + @echo "... bitops_test" + @echo "... openmp_test" + @echo "... cpuid_test" + @echo "... argmaxnrm2_test" + @echo "... test" + @echo "... edit_cache" + @echo "... rebuild_cache" + @echo "... 
list_install_components" + @echo "... diagmatrix_test" + @echo "... install" + @echo "... argmaxnrm2_test.o" + @echo "... argmaxnrm2_test.i" + @echo "... argmaxnrm2_test.s" + @echo "... bititerator_test.o" + @echo "... bititerator_test.i" + @echo "... bititerator_test.s" + @echo "... bitops_test.o" + @echo "... bitops_test.i" + @echo "... bitops_test.s" + @echo "... cpuid_test.o" + @echo "... cpuid_test.i" + @echo "... cpuid_test.s" + @echo "... diagmatrix_test.o" + @echo "... diagmatrix_test.i" + @echo "... diagmatrix_test.s" + @echo "... openmp_test.o" + @echo "... openmp_test.i" + @echo "... openmp_test.s" + @echo "... tinymatrix_test.o" + @echo "... tinymatrix_test.i" + @echo "... tinymatrix_test.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + cd /mnt/c/depot/Git/qsharp-runtime/src/Simulation/Native && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/Simulation/Native/src/util/cpuid.hpp b/src/Simulation/Native/src/util/cpuid.hpp index cbce00cf2d1..705d65dec85 100644 --- a/src/Simulation/Native/src/util/cpuid.hpp +++ b/src/Simulation/Native/src/util/cpuid.hpp @@ -40,7 +40,7 @@ namespace Microsoft { #ifndef _MSC_VER //__builtin_cpu_init(); - return false; // __builtin_cpu_supports("avx512bw"); + return (__builtin_cpu_supports("avx512f") != 0 && __builtin_cpu_supports("avx512cd") != 0); #else int cpuInfo[4]; __cpuid(cpuInfo,0); diff --git a/src/Simulation/Native/src/version.hpp b/src/Simulation/Native/src/version.hpp new file mode 100644 index 00000000000..b55afdfd213 --- /dev/null +++ b/src/Simulation/Native/src/version.hpp @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MAJOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MINOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_PATCH */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_STRING */ +#define MICROSOFT_QUANTUM_SIMULATOR_YEAR "2020" diff --git a/src/Simulation/Native/stats.xlsx b/src/Simulation/Native/stats.xlsx new file mode 100644 index 00000000000..321f140939c Binary files /dev/null and b/src/Simulation/Native/stats.xlsx differ diff --git a/src/Simulation/Native/tinymatrix_test b/src/Simulation/Native/tinymatrix_test new file mode 100644 index 00000000000..2c7100d1b31 Binary files /dev/null and b/src/Simulation/Native/tinymatrix_test differ diff --git a/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj b/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj index ff3f9c59da5..ba121246d35 100644 --- a/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj +++ b/src/Simulation/QCTraceSimulator.Tests/Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj @@ -1,4 +1,4 @@ - + diff --git a/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj b/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj index 6e8467e5159..807c708c9b3 100644 --- a/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj +++ b/src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj @@ -1,4 +1,4 @@ - + diff --git a/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj index 27cc28e4a65..957ffe6c2bd 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/HoneywellExe/HoneywellExe.csproj @@ -1,4 +1,4 @@ - + 
Library diff --git a/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj b/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj index 8396c045f93..e85a1a19e96 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/IntrinsicTests/IntrinsicTests.csproj @@ -1,4 +1,4 @@ - + netcoreapp3.1 diff --git a/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj index 83d11ae6541..5ac7dcf5f7e 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/IonQExe/IonQExe.csproj @@ -1,4 +1,4 @@ - + Library diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj index df4d6e13a43..ebdaa51f4c0 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/Library with Spaces/Library with Spaces.csproj @@ -1,4 +1,4 @@ - + netstandard2.1 false diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj index c01bc8d99bc..56799905ef0 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/Library1/Library1.csproj @@ -1,4 +1,4 @@ - + netstandard2.1 diff --git a/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj b/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj index c01bc8d99bc..56799905ef0 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/Library2/Library2.csproj 
@@ -1,4 +1,4 @@ - + netstandard2.1 diff --git a/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj index fcc18c5313d..df2e5362b29 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/QCIExe/QCIExe.csproj @@ -1,4 +1,4 @@ - + Library diff --git a/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj index e74705db790..283f23eec8b 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/QsharpExe/QsharpExe.csproj @@ -1,4 +1,4 @@ - + Exe diff --git a/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj b/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj index 6e76c47fe19..682391cfdac 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/TargetedExe/TargetedExe.csproj @@ -1,4 +1,4 @@ - + Exe diff --git a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj index 03d1c8a4a8e..22e1d8d901d 100644 --- a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj +++ b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj @@ -1,4 +1,4 @@ - + netcoreapp3.1 diff --git a/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj b/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj index ca7ce29588f..6fa960aef8f 100644 --- a/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj +++ b/src/Simulation/Simulators.Tests/Tests.Microsoft.Quantum.Simulators.csproj @@ -1,4 +1,4 @@ - + diff --git 
a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj index 00cd0c2aeda..25a7446c6f1 100644 --- a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj +++ b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.csproj @@ -1,4 +1,4 @@ - +