From 333fdf684d4738085574d9fe2f55611f9c989dd9 Mon Sep 17 00:00:00 2001
From: Sarah Marshall <33814365+samarsha@users.noreply.github.com>
Date: Wed, 10 Jun 2020 16:49:01 -0700
Subject: [PATCH 1/4] Create a separate package for the entry point driver (#225)

* Add EntryPointDriver to pack.ps1
* Rename Microsoft.Quantum.Simulation.Simulators to Microsoft.Quantum.Simulators
* Rename EntryPointDriver to Microsoft.Quantum.EntryPointDriver
* Rename Tests.EntryPointDriver to Microsoft.Quantum.EntryPointDriver.Tests
* Add package description for EntryPointDriver
* Remove EntryPointDriver from CsharpGeneration package
* Update namespace in EntryPointDriver
* Update namespace in EntryPointDriver.Tests
* Update simulator test project filename
* Update manifest.ps1
* Rename Microsoft.Quantum.EntryPointDriver.Tests to Tests.Microsoft.Quantum.EntryPointDriver
* Restore name of simulators assembly
* Add DelaySign.cs to EntryPointDriver
* Use same assembly name as package name again
* Update reference to simulators csproj
* Update simulators nuspec
* Update project reference to simulators
* Update namespace names in test
* Remove package reference to Q# compiler
* Use IImmutableList instead of IReadOnlyList
* Use ImmutableList.Create with params array
* Make DriverSettings immutable
* Make DriverSettings properties internal
* Add PackageId to EntryPointDriver.csproj
---
Simulation.sln | 8 +-
build/manifest.ps1 | 6 +-
build/pack.ps1 | 1 +
src/Simulation/Common/Simulators.Dev.props | 2 +-
.../Core/Properties/AssemblyInfo.cs | 2 +-
.../Tests.CsharpGeneration.fsproj | 2 +-
src/Simulation/CsharpGeneration/EntryPoint.fs | 26 ++++-
.../CsharpGeneration/FindNuspecReferences.ps1 | 7 --
...t.Quantum.CsharpGeneration.nuspec.template | 7 --
...Microsoft.Quantum.EntryPointDriver.fsproj} | 4 +-
.../EntryPointDriver.Tests/Tests.fs | 8 +-
.../EntryPointDriver.Tests/Tests.qs | 66 ++++++-------
src/Simulation/EntryPointDriver/Azure.cs | 4 +-
src/Simulation/EntryPointDriver/Driver.cs | 95 ++++++++++---------
.../EntryPointDriver/DriverSettings.cs | 49 ++++++++++
.../EntryPointDriver/IEntryPoint.cs | 10 +-
...Microsoft.Quantum.EntryPointDriver.csproj} | 13 ++-
src/Simulation/EntryPointDriver/OptionInfo.cs | 17 ++--
src/Simulation/EntryPointDriver/Options.cs | 2 +-
src/Simulation/EntryPointDriver/Parsers.cs | 2 +-
src/Simulation/EntryPointDriver/Simulation.cs | 20 ++--
.../EntryPointDriver/TestMachines.cs | 2 +-
src/Simulation/EntryPointDriver/Validation.cs | 25 ++---
.../TestProjects/QsharpExe/QsharpExe.csproj | 3 +-
.../TestProjects/UnitTests/UnitTests.csproj | 2 +-
...Tests.Microsoft.Quantum.Simulators.csproj} | 0
.../Simulators/FindNuspecReferences.ps1 | 2 +-
...oj => Microsoft.Quantum.Simulators.csproj} | 1 -
...crosoft.Quantum.Simulators.nuspec.template | 3 +
.../Simulators/Properties/AssemblyInfo.cs | 2 +-
30 files changed, 222 insertions(+), 169 deletions(-)
rename src/Simulation/EntryPointDriver.Tests/{Tests.EntryPointDriver.fsproj => Tests.Microsoft.Quantum.EntryPointDriver.fsproj} (85%)
create mode 100644 src/Simulation/EntryPointDriver/DriverSettings.cs
rename src/Simulation/EntryPointDriver/{EntryPointDriver.csproj => Microsoft.Quantum.EntryPointDriver.csproj} (68%)
rename src/Simulation/Simulators.Tests/{Tests.Microsoft.Quantum.Simulation.Simulators.csproj => Tests.Microsoft.Quantum.Simulators.csproj} (100%)
rename src/Simulation/Simulators/{Microsoft.Quantum.Simulation.Simulators.csproj => Microsoft.Quantum.Simulators.csproj} (99%)
diff --git a/Simulation.sln b/Simulation.sln
index 2566fb7b457..7886189c882 100644
--- a/Simulation.sln
+++ b/Simulation.sln
@@ -17,11 +17,11 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Quantum.Simulatio
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Quantum.Simulation.Common", "src\Simulation\Common\Microsoft.Quantum.Simulation.Common.csproj", "{8EC46ADB-7FAA-49EA-BA63-E7B32C4F4445}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Quantum.Simulation.Simulators", "src\Simulation\Simulators\Microsoft.Quantum.Simulation.Simulators.csproj", "{72B7E75C-D305-45BD-929E-C86298AAA8DE}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Quantum.Simulators", "src\Simulation\Simulators\Microsoft.Quantum.Simulators.csproj", "{72B7E75C-D305-45BD-929E-C86298AAA8DE}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime", "src\Simulation\QCTraceSimulator.Tests\Tests.Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.csproj", "{DD50D2D9-2765-449B-8C4B-835A428E160D}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tests.Microsoft.Quantum.Simulation.Simulators", "src\Simulation\Simulators.Tests\Tests.Microsoft.Quantum.Simulation.Simulators.csproj", "{23461B29-F9DE-4F5B-BC30-50BBE1A10B48}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tests.Microsoft.Quantum.Simulators", "src\Simulation\Simulators.Tests\Tests.Microsoft.Quantum.Simulators.csproj", "{23461B29-F9DE-4F5B-BC30-50BBE1A10B48}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "simulation", "simulation", "{34D419E9-CCF1-4E48-9FA4-3AD4B86BEEB4}"
EndProject
@@ -47,9 +47,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TestProjects", "TestProject
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "QsharpExe", "src\Simulation\Simulators.Tests\TestProjects\QsharpExe\QsharpExe.csproj", "{2F5796A7-4AF8-4B78-928A-0A3A80752F9D}"
EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EntryPointDriver", "src\Simulation\EntryPointDriver\EntryPointDriver.csproj", "{944FE7EF-9220-4CC6-BB20-CE517195B922}"
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Quantum.EntryPointDriver", "src\Simulation\EntryPointDriver\Microsoft.Quantum.EntryPointDriver.csproj", "{944FE7EF-9220-4CC6-BB20-CE517195B922}"
EndProject
-Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Tests.EntryPointDriver", "src\Simulation\EntryPointDriver.Tests\Tests.EntryPointDriver.fsproj", "{E2F30496-19D8-46A8-9BC0-26936FFE70D2}"
+Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Tests.Microsoft.Quantum.EntryPointDriver", "src\Simulation\EntryPointDriver.Tests\Tests.Microsoft.Quantum.EntryPointDriver.fsproj", "{E2F30496-19D8-46A8-9BC0-26936FFE70D2}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Library1", "src\Simulation\Simulators.Tests\TestProjects\Library1\Library1.csproj", "{7256B986-6705-42FC-9F57-485D72D9DE51}"
EndProject
diff --git a/build/manifest.ps1 b/build/manifest.ps1
index 07a8fca41f3..29ea070beb7 100644
--- a/build/manifest.ps1
+++ b/build/manifest.ps1
@@ -8,6 +8,7 @@
"Microsoft.Azure.Quantum.Client",
"Microsoft.Quantum.CsharpGeneration",
"Microsoft.Quantum.Development.Kit",
+ "Microsoft.Quantum.EntryPointDriver",
"Microsoft.Quantum.QSharp.Core",
"Microsoft.Quantum.Runtime.Core",
"Microsoft.Quantum.Simulators",
@@ -19,10 +20,11 @@
".\src\simulation\CsharpGeneration.App\bin\$Env:BUILD_CONFIGURATION\netcoreapp3.1\Microsoft.Quantum.CsharpGeneration.App.dll",
".\src\simulation\CsharpGeneration.App\bin\$Env:BUILD_CONFIGURATION\netcoreapp3.1\Microsoft.Quantum.RoslynWrapper.dll",
".\src\simulation\Core\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Runtime.Core.dll",
+ ".\src\simulation\EntryPointDriver\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.EntryPointDriver.dll",
".\src\simulation\QsharpCore\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.QSharp.Core.dll",
".\src\simulation\Simulators\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Simulation.Common.dll",
".\src\simulation\Simulators\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Simulation.QCTraceSimulatorRuntime.dll",
- ".\src\simulation\Simulators\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Simulation.Simulators.dll",
+ ".\src\simulation\Simulators\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Simulators.dll",
".\src\Xunit\bin\$Env:BUILD_CONFIGURATION\netstandard2.1\Microsoft.Quantum.Xunit.dll"
) | ForEach-Object { Get-Item (Join-Path $PSScriptRoot (Join-Path ".." $_)) };
-} | Write-Output;
\ No newline at end of file
+} | Write-Output;
diff --git a/build/pack.ps1 b/build/pack.ps1
index b4b39f19e2a..893db2b5728 100644
--- a/build/pack.ps1
+++ b/build/pack.ps1
@@ -61,6 +61,7 @@ function Pack-Dotnet() {
Write-Host "##[info]Using nuget to create packages"
Pack-Dotnet '../src/Azure/Azure.Quantum.Client/Microsoft.Azure.Quantum.Client.csproj'
Pack-One '../src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.fsproj' '-IncludeReferencedProjects'
+Pack-Dotnet '../src/Simulation/EntryPointDriver/Microsoft.Quantum.EntryPointDriver.csproj'
Pack-Dotnet '../src/Simulation/Core/Microsoft.Quantum.Runtime.Core.csproj'
Pack-Dotnet '../src/Simulation/QsharpCore/Microsoft.Quantum.QSharp.Core.csproj'
Pack-One '../src/Simulation/Simulators/Microsoft.Quantum.Simulators.nuspec'
diff --git a/src/Simulation/Common/Simulators.Dev.props b/src/Simulation/Common/Simulators.Dev.props
index d459a4b6634..f92092e9568 100644
--- a/src/Simulation/Common/Simulators.Dev.props
+++ b/src/Simulation/Common/Simulators.Dev.props
@@ -28,7 +28,7 @@
-
+
diff --git a/src/Simulation/Core/Properties/AssemblyInfo.cs b/src/Simulation/Core/Properties/AssemblyInfo.cs
index 9781d4c4d84..c2a5d12dced 100644
--- a/src/Simulation/Core/Properties/AssemblyInfo.cs
+++ b/src/Simulation/Core/Properties/AssemblyInfo.cs
@@ -6,4 +6,4 @@
using System.Runtime.InteropServices;
// Allow the test assembly to use our internal methods
-[assembly: InternalsVisibleTo("Tests.Microsoft.Quantum.Simulation.Simulators" + SigningConstants.PUBLIC_KEY)]
+[assembly: InternalsVisibleTo("Tests.Microsoft.Quantum.Simulators" + SigningConstants.PUBLIC_KEY)]
diff --git a/src/Simulation/CsharpGeneration.Tests/Tests.CsharpGeneration.fsproj b/src/Simulation/CsharpGeneration.Tests/Tests.CsharpGeneration.fsproj
index 45d4e5dfe37..906f812078b 100644
--- a/src/Simulation/CsharpGeneration.Tests/Tests.CsharpGeneration.fsproj
+++ b/src/Simulation/CsharpGeneration.Tests/Tests.CsharpGeneration.fsproj
@@ -54,7 +54,7 @@
-
+
diff --git a/src/Simulation/CsharpGeneration/EntryPoint.fs b/src/Simulation/CsharpGeneration/EntryPoint.fs
index b975bf374a8..8906a7403c7 100644
--- a/src/Simulation/CsharpGeneration/EntryPoint.fs
+++ b/src/Simulation/CsharpGeneration/EntryPoint.fs
@@ -24,7 +24,25 @@ type private Parameter =
let entryPointClassName = "__QsEntryPoint__"
/// The namespace containing the non-generated parts of the entry point driver.
-let private driverNamespace = "Microsoft.Quantum.CsharpGeneration.EntryPointDriver"
+let private driverNamespace = "Microsoft.Quantum.EntryPointDriver"
+
+/// The driver settings object.
+let private driverSettings =
+ let newDriverSettings = driverNamespace + ".DriverSettings" |> ``type`` |> SyntaxFactory.ObjectCreationExpression
+ let namedArg (name : string) expr = SyntaxFactory.NameColon name |> (SyntaxFactory.Argument expr).WithNameColon
+ let immutableList elements = invoke (ident "System.Collections.Immutable.ImmutableList.Create") ``(`` elements ``)``
+ let simulatorOptionAliases =
+ [ literal <| "--" + fst CommandLineArguments.SimulatorOption
+ literal <| "-" + snd CommandLineArguments.SimulatorOption ]
+ |> immutableList
+ [ namedArg "simulatorOptionAliases" simulatorOptionAliases
+ namedArg "quantumSimulatorName" <| literal AssemblyConstants.QuantumSimulator
+ namedArg "toffoliSimulatorName" <| literal AssemblyConstants.ToffoliSimulator
+ namedArg "resourcesEstimatorName" <| literal AssemblyConstants.ResourcesEstimator ]
+ |> SyntaxFactory.SeparatedList
+ |> SyntaxFactory.ArgumentList
+ |> newDriverSettings.WithArgumentList
+ :> ExpressionSyntax
/// A sequence of all of the named parameters in the argument tuple and their respective C# and Q# types.
let rec private parameters context doc = function
@@ -104,7 +122,7 @@ let private mainMethod context entryPoint =
let callableName, argTypeName, returnTypeName = callableTypeNames context entryPoint
let driverType = generic (driverNamespace + ".Driver") ``<<`` [callableName; argTypeName; returnTypeName] ``>>``
let entryPointInstance = ``new`` (``type`` entryPointClassName) ``(`` [] ``)``
- let driver = ``new`` driverType ``(`` [entryPointInstance] ``)``
+ let driver = ``new`` driverType ``(`` [driverSettings; entryPointInstance] ``)``
let commandLineArgsName = "args"
arrow_method "System.Threading.Tasks.Task" "Main" ``<<`` [] ``>>``
``(`` [param commandLineArgsName ``of`` (``type`` "string[]")] ``)``
@@ -121,7 +139,7 @@ let private entryPointClass context entryPoint =
context.assemblyConstants.TryGetValue AssemblyConstants.DefaultSimulator
|> snd
|> (fun value -> if String.IsNullOrWhiteSpace value then AssemblyConstants.QuantumSimulator else value)
- let defaultSimulatorProperty = property "DefaultSimulator" "string" (literal defaultSimulator)
+ let defaultSimulatorNameProperty = property "DefaultSimulatorName" "string" (literal defaultSimulator)
let infoProperty =
property "Info"
(sprintf "EntryPointInfo<%s, %s>" argTypeName returnTypeName)
@@ -129,7 +147,7 @@ let private entryPointClass context entryPoint =
let members : MemberDeclarationSyntax list = [
summaryProperty
parameterOptionsProperty parameters
- defaultSimulatorProperty
+ defaultSimulatorNameProperty
infoProperty
customSimulatorFactory defaultSimulator
createArgument context entryPoint
diff --git a/src/Simulation/CsharpGeneration/FindNuspecReferences.ps1 b/src/Simulation/CsharpGeneration/FindNuspecReferences.ps1
index 68d012099af..0c3695812f9 100644
--- a/src/Simulation/CsharpGeneration/FindNuspecReferences.ps1
+++ b/src/Simulation/CsharpGeneration/FindNuspecReferences.ps1
@@ -59,13 +59,6 @@ function Add-PackageReferenceDependencies($ProjectFileName) {
# Add dependencies for the projects included in this NuGet package.
Add-PackageReferenceDependencies 'Microsoft.Quantum.CsharpGeneration.fsproj'
-Add-PackageReferenceDependencies '..\EntryPointDriver\EntryPointDriver.csproj'
-
-# Manually add EntryPointDriver's project references as package references to avoid a build-time dependency cycle.
-# $version$ is replaced with the current package version when the package is built.
-Add-Dependency 'Microsoft.Quantum.Runtime.Core' '$version$'
-Add-Dependency 'Microsoft.Quantum.Simulators' '$version$'
-Add-Dependency 'Microsoft.Azure.Quantum.Client' '$version$'
$nuspec.package.metadata.AppendChild($dependencies)
$nuspec.Save([Path]::Combine((Get-Location), $target))
diff --git a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.nuspec.template b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.nuspec.template
index cf3797a7b0a..997341009c3 100644
--- a/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.nuspec.template
+++ b/src/Simulation/CsharpGeneration/Microsoft.Quantum.CsharpGeneration.nuspec.template
@@ -23,12 +23,5 @@
-
-
-
-
diff --git a/src/Simulation/EntryPointDriver.Tests/Tests.EntryPointDriver.fsproj b/src/Simulation/EntryPointDriver.Tests/Tests.Microsoft.Quantum.EntryPointDriver.fsproj
similarity index 85%
rename from src/Simulation/EntryPointDriver.Tests/Tests.EntryPointDriver.fsproj
rename to src/Simulation/EntryPointDriver.Tests/Tests.Microsoft.Quantum.EntryPointDriver.fsproj
index 24ea7c82cda..8ab17e90bed 100644
--- a/src/Simulation/EntryPointDriver.Tests/Tests.EntryPointDriver.fsproj
+++ b/src/Simulation/EntryPointDriver.Tests/Tests.Microsoft.Quantum.EntryPointDriver.fsproj
@@ -4,7 +4,7 @@
netcoreapp3.1
false
false
- Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests
+ Microsoft.Quantum.EntryPointDriver.Tests
@@ -29,7 +29,7 @@
-
+
diff --git a/src/Simulation/EntryPointDriver.Tests/Tests.fs b/src/Simulation/EntryPointDriver.Tests/Tests.fs
index 808b4b6b607..57f283a2d73 100644
--- a/src/Simulation/EntryPointDriver.Tests/Tests.fs
+++ b/src/Simulation/EntryPointDriver.Tests/Tests.fs
@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-module Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests
+module Microsoft.Quantum.EntryPointDriver.Tests
open System
open System.Collections.Immutable
@@ -33,7 +33,7 @@ let private intrinsicFile = Path.GetFullPath "Intrinsic.qs"
let private testFile = Path.GetFullPath "Tests.qs"
/// The namespace used for the test cases.
-let private testNamespace = "Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests"
+let private testNamespace = "Microsoft.Quantum.EntryPointDriver.Tests"
/// The test case for the given test number.
let private testCase =
@@ -95,12 +95,12 @@ let private compileCsharp (sources : string seq) =
"System.Runtime"
"System.Runtime.Extensions"
"System.Runtime.Numerics"
- "Microsoft.Quantum.CsharpGeneration.EntryPointDriver"
+ "Microsoft.Quantum.EntryPointDriver"
"Microsoft.Quantum.QSharp.Core"
"Microsoft.Quantum.QsDataStructures"
"Microsoft.Quantum.Runtime.Core"
"Microsoft.Quantum.Simulation.Common"
- "Microsoft.Quantum.Simulation.Simulators"
+ "Microsoft.Quantum.Simulators"
]
|> List.map (fun name -> upcast MetadataReference.CreateFromFile (referencedAssembly name))
diff --git a/src/Simulation/EntryPointDriver.Tests/Tests.qs b/src/Simulation/EntryPointDriver.Tests/Tests.qs
index bfe7a467c8d..89ff907bde9 100644
--- a/src/Simulation/EntryPointDriver.Tests/Tests.qs
+++ b/src/Simulation/EntryPointDriver.Tests/Tests.qs
@@ -5,14 +5,14 @@
// No Options
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ReturnUnit() : Unit { }
}
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ReturnInt() : Int {
return 42;
@@ -21,7 +21,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ReturnString() : String {
return "Hello, World!";
@@ -34,7 +34,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Single Option
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptInt(n : Int) : Int {
return n;
@@ -43,7 +43,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptBigInt(n : BigInt) : BigInt {
return n;
@@ -52,7 +52,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptDouble(n : Double) : Double {
return n;
@@ -61,7 +61,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptBool(b : Bool) : Bool {
return b;
@@ -70,7 +70,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptPauli(p : Pauli) : Pauli {
return p;
@@ -79,7 +79,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptResult(r : Result) : Result {
return r;
@@ -88,7 +88,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptRange(r : Range) : Range {
return r;
@@ -97,7 +97,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptString(s : String) : String {
return s;
@@ -106,7 +106,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptUnit(u : Unit) : Unit {
return u;
@@ -115,7 +115,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptStringArray(xs : String[]) : String[] {
return xs;
@@ -124,7 +124,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptBigIntArray(bs : BigInt[]) : BigInt[] {
return bs;
@@ -133,7 +133,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptPauliArray(ps : Pauli[]) : Pauli[] {
return ps;
@@ -142,7 +142,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptRangeArray(rs : Range[]) : Range[] {
return rs;
@@ -151,7 +151,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptResultArray(rs : Result[]) : Result[] {
return rs;
@@ -160,7 +160,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation AcceptUnitArray(us : Unit[]) : Unit[] {
return us;
@@ -173,7 +173,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Multiple Options
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation TwoOptions(n : Int, b : Bool) : String {
return $"{n} {b}";
@@ -182,7 +182,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ThreeOptions(n : Int, b : Bool, xs : String[]) : String {
return $"{n} {b} {xs}";
@@ -195,7 +195,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Tuples
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation RedundantOneTuple((x : Int)) : String {
return $"{x}";
@@ -204,7 +204,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation RedundantTwoTuple((x : Int, y : Int)) : String {
return $"{x} {y}";
@@ -213,7 +213,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation OneTuple(x : Int, (y : Int)) : String {
return $"{x} {y}";
@@ -222,7 +222,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation TwoTuple(x : Int, (y : Int, z : Int)) : String {
return $"{x} {y} {z}";
@@ -235,7 +235,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Name Conversion
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation CamelCase(camelCaseName : String) : String {
return camelCaseName;
@@ -244,7 +244,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation SingleLetter(x : String) : String {
return x;
@@ -257,7 +257,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Shadowing
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ShadowSimulator(simulator : String) : String {
return simulator;
@@ -266,7 +266,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ShadowS(s : String) : String {
return s;
@@ -275,7 +275,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ShadowVersion(version : String) : String {
return version;
@@ -284,7 +284,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ShadowTarget(target : String) : String {
return target;
@@ -293,7 +293,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// ---
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
@EntryPoint()
operation ShadowShots(shots : Int) : Int {
return shots;
@@ -306,7 +306,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Simulators
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
open Microsoft.Quantum.Intrinsic;
@EntryPoint()
@@ -332,7 +332,7 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
// Help
//
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Tests {
+namespace Microsoft.Quantum.EntryPointDriver.Tests {
/// # Summary
/// This test checks that the entry point documentation appears correctly in the command line help message.
///
diff --git a/src/Simulation/EntryPointDriver/Azure.cs b/src/Simulation/EntryPointDriver/Azure.cs
index 48116911142..86ad79791a3 100644
--- a/src/Simulation/EntryPointDriver/Azure.cs
+++ b/src/Simulation/EntryPointDriver/Azure.cs
@@ -8,9 +8,9 @@
using Microsoft.Azure.Quantum.Exceptions;
using Microsoft.Quantum.Runtime;
using Microsoft.Quantum.Simulation.Common.Exceptions;
-using static Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Driver;
+using static Microsoft.Quantum.EntryPointDriver.Driver;
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver
+namespace Microsoft.Quantum.EntryPointDriver
{
///
/// Provides entry point submission to Azure Quantum.
diff --git a/src/Simulation/EntryPointDriver/Driver.cs b/src/Simulation/EntryPointDriver/Driver.cs
index 36dc8d72c56..daabf992cde 100644
--- a/src/Simulation/EntryPointDriver/Driver.cs
+++ b/src/Simulation/EntryPointDriver/Driver.cs
@@ -2,6 +2,7 @@
// Licensed under the MIT License.
using System;
+using System.Collections.Immutable;
using System.CommandLine;
using System.CommandLine.Builder;
using System.CommandLine.Help;
@@ -11,11 +12,10 @@
using System.Linq;
using System.Text;
using System.Threading.Tasks;
-using Microsoft.Quantum.QsCompiler.ReservedKeywords;
using Microsoft.Quantum.Simulation.Core;
-using static Microsoft.Quantum.CsharpGeneration.EntryPointDriver.Driver;
+using static Microsoft.Quantum.EntryPointDriver.Driver;
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver
+namespace Microsoft.Quantum.EntryPointDriver
{
///
/// The entry point driver is the entry point for the C# application that executes the Q# entry point.
@@ -25,6 +25,11 @@ namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver
/// The entry point's return type.
public sealed class Driver where TCallable : AbstractCallable, ICallable
{
+ /// <summary>
+ /// The driver settings.
+ /// </summary>
+ private readonly DriverSettings settings;
+
///
/// The entry point.
///
@@ -38,24 +43,22 @@ public sealed class Driver where TCallable : AbstractCalla
/// <summary>
/// Creates a new driver for the entry point.
/// </summary>
+ /// <param name="settings">The driver settings.</param>
/// <param name="entryPoint">The entry point.</param>
- public Driver(IEntryPoint entryPoint)
+ public Driver(DriverSettings settings, IEntryPoint entryPoint)
{
+ this.settings = settings;
this.entryPoint = entryPoint;
SimulatorOption = new OptionInfo(
- new[]
- {
- "--" + CommandLineArguments.SimulatorOption.Item1,
- "-" + CommandLineArguments.SimulatorOption.Item2
- },
- entryPoint.DefaultSimulator,
+ settings.SimulatorOptionAliases,
+ entryPoint.DefaultSimulatorName,
"The name of the simulator to use.",
suggestions: new[]
{
- AssemblyConstants.QuantumSimulator,
- AssemblyConstants.ToffoliSimulator,
- AssemblyConstants.ResourcesEstimator,
- entryPoint.DefaultSimulator
+ settings.QuantumSimulatorName,
+ settings.ToffoliSimulatorName,
+ settings.ResourcesEstimatorName,
+ entryPoint.DefaultSimulatorName
});
}
@@ -119,28 +122,28 @@ public async Task Run(string[] args)
/// The exit code.
private async Task Simulate(ParseResult parseResult, string simulator) =>
await Simulation.Simulate(
- entryPoint, parseResult, DefaultIfShadowed(SimulatorOption, simulator));
+ settings, entryPoint, parseResult, DefaultIfShadowed(SimulatorOption, simulator));
/// <summary>
/// Submits the entry point to Azure Quantum.
/// </summary>
/// <param name="parseResult">The command-line parsing result.</param>
- /// <param name="settings">The submission settings.</param>
- private async Task Submit(ParseResult parseResult, AzureSettings settings) =>
+ /// <param name="azureSettings">The Azure submission settings.</param>
+ private async Task Submit(ParseResult parseResult, AzureSettings azureSettings) =>
await Azure.Submit(entryPoint, parseResult, new AzureSettings
{
- Target = settings.Target,
- Storage = settings.Storage,
- Subscription = settings.Subscription,
- ResourceGroup = settings.ResourceGroup,
- Workspace = settings.Workspace,
- AadToken = DefaultIfShadowed(AadTokenOption, settings.AadToken),
- BaseUri = DefaultIfShadowed(BaseUriOption, settings.BaseUri),
- JobName = DefaultIfShadowed(JobNameOption, settings.JobName),
- Shots = DefaultIfShadowed(ShotsOption, settings.Shots),
- Output = DefaultIfShadowed(OutputOption, settings.Output),
- DryRun = DefaultIfShadowed(DryRunOption, settings.DryRun),
- Verbose = DefaultIfShadowed(VerboseOption, settings.Verbose)
+ Target = azureSettings.Target,
+ Storage = azureSettings.Storage,
+ Subscription = azureSettings.Subscription,
+ ResourceGroup = azureSettings.ResourceGroup,
+ Workspace = azureSettings.Workspace,
+ AadToken = DefaultIfShadowed(AadTokenOption, azureSettings.AadToken),
+ BaseUri = DefaultIfShadowed(BaseUriOption, azureSettings.BaseUri),
+ JobName = DefaultIfShadowed(JobNameOption, azureSettings.JobName),
+ Shots = DefaultIfShadowed(ShotsOption, azureSettings.Shots),
+ Output = DefaultIfShadowed(OutputOption, azureSettings.Output),
+ DryRun = DefaultIfShadowed(DryRunOption, azureSettings.DryRun),
+ Verbose = DefaultIfShadowed(VerboseOption, azureSettings.Verbose)
});
///
@@ -206,55 +209,55 @@ internal static class Driver
/// The target option.
///
internal static readonly OptionInfo TargetOption = new OptionInfo(
- new[] { "--target" }, "The target device ID.");
+ ImmutableList.Create("--target"), "The target device ID.");
///
/// The storage option.
///
internal static readonly OptionInfo StorageOption = new OptionInfo(
- new[] { "--storage" }, "The storage account connection string.");
-
+ ImmutableList.Create("--storage"), "The storage account connection string.");
+
///
/// The subscription option.
///
internal static readonly OptionInfo SubscriptionOption = new OptionInfo(
- new[] { "--subscription" }, "The subscription ID.");
+ ImmutableList.Create("--subscription"), "The subscription ID.");
///
/// The resource group option.
///
internal static readonly OptionInfo ResourceGroupOption = new OptionInfo(
- new[] { "--resource-group" }, "The resource group name.");
+ ImmutableList.Create("--resource-group"), "The resource group name.");
///
/// The workspace option.
///
internal static readonly OptionInfo WorkspaceOption = new OptionInfo(
- new[] { "--workspace" }, "The workspace name.");
+ ImmutableList.Create("--workspace"), "The workspace name.");
///
/// The AAD token option.
///
internal static readonly OptionInfo AadTokenOption = new OptionInfo(
- new[] { "--aad-token" }, default, "The Azure Active Directory authentication token.");
-
+ ImmutableList.Create("--aad-token"), default, "The Azure Active Directory authentication token.");
+
///
/// The base URI option.
///
internal static readonly OptionInfo BaseUriOption = new OptionInfo(
- new[] { "--base-uri" }, default, "The base URI of the Azure Quantum endpoint.");
+ ImmutableList.Create("--base-uri"), default, "The base URI of the Azure Quantum endpoint.");
///
/// The job name option.
///
internal static readonly OptionInfo JobNameOption = new OptionInfo(
- new[] { "--job-name" }, default, "The name of the submitted job.");
-
+ ImmutableList.Create("--job-name"), default, "The name of the submitted job.");
+
///
/// The shots option.
///
internal static readonly OptionInfo ShotsOption = new OptionInfo(
- new[] { "--shots" },
+ ImmutableList.Create("--shots"),
500,
"The number of times the program is executed on the target machine.",
validator: result =>
@@ -266,7 +269,7 @@ internal static class Driver
/// The output option.
///
internal static readonly OptionInfo OutputOption = new OptionInfo(
- new[] { "--output" },
+ ImmutableList.Create("--output"),
OutputFormat.FriendlyUri,
"The information to show in the output after the job is submitted.");
@@ -274,7 +277,7 @@ internal static class Driver
/// The dry run option.
///
internal static readonly OptionInfo DryRunOption = new OptionInfo(
- new[] { "--dry-run" },
+ ImmutableList.Create("--dry-run"),
false,
"Validate the program and options, but do not submit to Azure Quantum.");
@@ -282,7 +285,7 @@ internal static class Driver
/// The verbose option.
///
internal static readonly OptionInfo VerboseOption = new OptionInfo(
- new[] { "--verbose" }, false, "Show additional information about the submission.");
+ ImmutableList.Create("--verbose"), false, "Show additional information about the submission.");
///
/// Displays a message to the console using the given color and text writer.
@@ -308,7 +311,9 @@ internal sealed class QsHelpBuilder : HelpBuilder
/// Creates a new help builder using the given console.
///
/// The console to use.
- internal QsHelpBuilder(IConsole console) : base(console) { }
+ internal QsHelpBuilder(IConsole console) : base(console)
+ {
+ }
protected override string ArgumentDescriptor(IArgument argument)
{
diff --git a/src/Simulation/EntryPointDriver/DriverSettings.cs b/src/Simulation/EntryPointDriver/DriverSettings.cs
new file mode 100644
index 00000000000..4b513cf50e4
--- /dev/null
+++ b/src/Simulation/EntryPointDriver/DriverSettings.cs
@@ -0,0 +1,49 @@
+using System.Collections.Immutable;
+
+namespace Microsoft.Quantum.EntryPointDriver
+{
+    /// <summary>
+    /// Settings for the entry point driver.
+    /// </summary>
+    public sealed class DriverSettings
+    {
+        /// <summary>
+        /// The aliases for the simulator command-line option.
+        /// </summary>
+        internal IImmutableList<string> SimulatorOptionAliases { get; }
+
+        /// <summary>
+        /// The name of the quantum simulator.
+        /// </summary>
+        internal string QuantumSimulatorName { get; }
+
+        /// <summary>
+        /// The name of the Toffoli simulator.
+        /// </summary>
+        internal string ToffoliSimulatorName { get; }
+
+        /// <summary>
+        /// The name of the resources estimator.
+        /// </summary>
+        internal string ResourcesEstimatorName { get; }
+
+        /// <summary>
+        /// Creates a new driver settings instance.
+        /// </summary>
+        /// <param name="simulatorOptionAliases">The aliases for the simulator command-line option.</param>
+        /// <param name="quantumSimulatorName">The name of the quantum simulator.</param>
+        /// <param name="toffoliSimulatorName">The name of the Toffoli simulator.</param>
+        /// <param name="resourcesEstimatorName">The name of the resources estimator.</param>
+        public DriverSettings(
+            IImmutableList<string> simulatorOptionAliases,
+            string quantumSimulatorName,
+            string toffoliSimulatorName,
+            string resourcesEstimatorName)
+        {
+            SimulatorOptionAliases = simulatorOptionAliases;
+            QuantumSimulatorName = quantumSimulatorName;
+            ToffoliSimulatorName = toffoliSimulatorName;
+            ResourcesEstimatorName = resourcesEstimatorName;
+        }
+    }
+}
diff --git a/src/Simulation/EntryPointDriver/IEntryPoint.cs b/src/Simulation/EntryPointDriver/IEntryPoint.cs
index 65250d086d3..b06f3e7c016 100644
--- a/src/Simulation/EntryPointDriver/IEntryPoint.cs
+++ b/src/Simulation/EntryPointDriver/IEntryPoint.cs
@@ -7,7 +7,7 @@
using System.CommandLine.Parsing;
using Microsoft.Quantum.Simulation.Core;
-namespace Microsoft.Quantum.CsharpGeneration.EntryPointDriver
+namespace Microsoft.Quantum.EntryPointDriver
{
///
/// The interface between the entry point and the command-line program.
@@ -24,17 +24,17 @@ public interface IEntryPoint
/// The summary from the entry point's documentation comment.
///
string Summary { get; }
-
+
///
/// The command-line options corresponding to the entry point's parameters.
///
IEnumerable
-
diff --git a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.nuspec.template b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.nuspec.template
index 36b266522d6..756eaca2e6b 100644
--- a/src/Simulation/Simulators/Microsoft.Quantum.Simulators.nuspec.template
+++ b/src/Simulation/Simulators/Microsoft.Quantum.Simulators.nuspec.template
@@ -22,6 +22,9 @@
+
+
+
diff --git a/src/Simulation/Simulators/Properties/AssemblyInfo.cs b/src/Simulation/Simulators/Properties/AssemblyInfo.cs
index ed09e5cdc2b..4dc75e0a503 100644
--- a/src/Simulation/Simulators/Properties/AssemblyInfo.cs
+++ b/src/Simulation/Simulators/Properties/AssemblyInfo.cs
@@ -6,4 +6,4 @@
using System.Runtime.InteropServices;
// Allow the test assembly to use our internal methods
-[assembly: InternalsVisibleTo("Tests.Microsoft.Quantum.Simulation.Simulators" + SigningConstants.PUBLIC_KEY)]
\ No newline at end of file
+[assembly: InternalsVisibleTo("Tests.Microsoft.Quantum.Simulators" + SigningConstants.PUBLIC_KEY)]
\ No newline at end of file
From bd1f2cbfcd4ed3ed3514c138ec6b174c7f381f51 Mon Sep 17 00:00:00 2001
From: Chris Granade
Date: Fri, 12 Jun 2020 23:14:59 -0700
Subject: [PATCH 2/4] Fix NUGET_VERSION issues (#251)
* Fix assembly vs nuget version in pack.ps1.
* Fix project vs package reference.
* Feedback from @bettinaheim.
---
build/pack.ps1 | 3 ++-
.../Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj | 1 -
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/build/pack.ps1 b/build/pack.ps1
index 893db2b5728..1dd8dd7fca2 100644
--- a/build/pack.ps1
+++ b/build/pack.ps1
@@ -46,7 +46,8 @@ function Pack-Dotnet() {
-c $Env:BUILD_CONFIGURATION `
-v detailed `
@args `
- /property:Version=$Env:NUGET_VERSION `
+ /property:Version=$Env:ASSEMBLY_VERSION `
+ /property:PackageVersion=$Env:NUGET_VERSION `
$option1 `
$option2 `
$option3
diff --git a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
index 2e5d0bd73cd..6ff0e165ec6 100644
--- a/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
+++ b/src/Simulation/Simulators.Tests/TestProjects/UnitTests/UnitTests.csproj
@@ -18,7 +18,6 @@
-
From 7b17b2e3cc72c3301ab46e2f877d30c8549933ea Mon Sep 17 00:00:00 2001
From: Dave Wecker
Date: Mon, 15 Jun 2020 12:20:18 -0700
Subject: [PATCH 3/4] Added AVX512
---
src/Simulation/Native/src/CMakeLists.txt | 12 +-
.../Native/src/external/avx512/kernel1.hpp | 85 ++++
.../Native/src/external/avx512/kernel2.hpp | 98 ++++
.../Native/src/external/avx512/kernel3.hpp | 128 ++++++
.../Native/src/external/avx512/kernel4.hpp | 207 +++++++++
.../Native/src/external/avx512/kernel5.hpp | 296 +++++++++++++
.../Native/src/external/avx512/kernel6.hpp | 252 +++++++++++
.../Native/src/external/avx512/kernel7.hpp | 417 ++++++++++++++++++
.../Native/src/external/avx512/kernels.hpp | 31 ++
.../Native/src/simulator/factory.cpp | 4 +
.../Native/src/simulator/simulatoravx512.cpp | 16 +
src/Simulation/Native/src/util/cpuid.hpp | 2 +-
12 files changed, 1543 insertions(+), 5 deletions(-)
create mode 100644 src/Simulation/Native/src/external/avx512/kernel1.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel2.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel3.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel4.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel5.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel6.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernel7.hpp
create mode 100644 src/Simulation/Native/src/external/avx512/kernels.hpp
create mode 100644 src/Simulation/Native/src/simulator/simulatoravx512.cpp
diff --git a/src/Simulation/Native/src/CMakeLists.txt b/src/Simulation/Native/src/CMakeLists.txt
index e53864f85c9..08da4708397 100644
--- a/src/Simulation/Native/src/CMakeLists.txt
+++ b/src/Simulation/Native/src/CMakeLists.txt
@@ -8,8 +8,9 @@ set(AVX2FLAGS "/arch:AVX2" )
set(AVX512FLAGS "/arch:AVX512" )
set(FMAFLAGS "")
else(MSVC)
-SET(AVXFLAGS "-mavx" )
-set(AVX2FLAGS -mfma;-mavx2)
+SET(AVXFLAGS "-mavx")
+set(AVX2FLAGS "-mfma -mavx2")
+set(AVX512FLAGS "-mavx512f -mavx512cd")
set(FMAFLAGS )
endif(MSVC)
@@ -19,14 +20,16 @@ configure_file(version.hpp.in ${PROJECT_BINARY_DIR}/src/version.hpp)
add_subdirectory(util)
add_subdirectory(simulator)
-set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp )
+set(SOURCES simulator/factory.cpp simulator/capi.cpp simulator/simulator.cpp util/openmp.cpp simulator/simulatoravx.cpp simulator/simulatoravx2.cpp simulator/simulatoravx512.cpp )
if(BUILD_SHARED_LIBS)
add_library(Microsoft.Quantum.Simulator.Runtime SHARED ${SOURCES})
set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS})
if (MSVC)
set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS})
+set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS})
else(MSVC)
-set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS "-mavx2 -mfma")
+set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX2FLAGS})
+set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS -mfma COMPILE_FLAGS ${AVX512FLAGS})
endif(MSVC)
message (STATUS "Building shared library")
target_compile_definitions(Microsoft.Quantum.Simulator.Runtime PRIVATE BUILD_DLL=1)
@@ -36,6 +39,7 @@ else(BUILD_SHARED_LIBS)
add_library(Microsoft.Quantum.Simulator.Runtime STATIC ${SOURCES})
set_source_files_properties(simulator/simulatoravx.cpp PROPERTIES COMPILE_FLAGS ${AVXFLAGS})
set_source_files_properties(simulator/simulatoravx2.cpp PROPERTIES COMPILE_FLAGS ${AVX2FLAGS})
+ set_source_files_properties(simulator/simulatoravx512.cpp PROPERTIES COMPILE_FLAGS ${AVX512FLAGS})
endif(BUILD_SHARED_LIBS)
install(TARGETS Microsoft.Quantum.Simulator.Runtime
diff --git a/src/Simulation/Native/src/external/avx512/kernel1.hpp b/src/Simulation/Native/src/external/avx512/kernel1.hpp
new file mode 100644
index 00000000000..11f839e5025
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel1.hpp
@@ -0,0 +1,85 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m, M const& mt)
+{
+    __m256d v[1];
+    // Applies the packed gate (m, mt) to the pair psi[I], psi[I + d0], overwriting both in place.
+    v[0] = load1(&psi[I]);
+    // Single accumulator register for the results of both outputs; starts at zero.
+    __m256d tmp[1] = {_mm256_setzero_pd()};
+    // load1/fma/store are helpers defined outside this file; fma presumably accumulates v times (m, mt) into tmp.
+    tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+    // Second input, d0 elements further into psi.
+    v[0] = load1(&psi[I + d0]);
+
+    tmp[0] = fma(v[0], m[1], mt[1], tmp[0]);
+    store((double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+
+}
+
+// Applies the one-qubit gate `matrix` to bit index id0 of psi, only at base indices with all ctrlmask bits set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;  // distance in psi between the two elements of a pair
+    auto m = matrix;               // local copy handed to the permutation helper
+    std::size_t dsorted[] = {d0};
+    permute_qubits_and_matrix(dsorted, 1, m);  // defined elsewhere; presumably sorts dsorted and permutes m to match
+    // Repack m into registers with loada (helper defined elsewhere); passed to kernel_core as `m`.
+    __m256d mm[2];
+    for (unsigned b = 0; b < 2; ++b){
+        for (unsigned r = 0; r < 1; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mm[b*1+r*1+c] = loada(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]);
+            }
+        }
+    }
+    // Companion registers built by loadbm (helper defined elsewhere); passed to kernel_core as `mt`.
+    __m256d mmt[2];
+    for (unsigned b = 0; b < 2; ++b){
+        for (unsigned r = 0; r < 1; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mmt[b*1+r*1+c] = loadbm(&m[2*r+0][c+b*1], &m[2*r+1][c+b*1]);
+            }
+        }
+    }
+    // Both branches below visit every base index whose target bit is clear:
+    // nested strided loops by default, a flat mask-filtered loop when compiled with MSVC.
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, dsorted[0], mm, mmt);
+            }
+        }
+    }
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0];  // target bit; must be clear in a base index
+    // NOTE(review): signed index and no collapse, presumably to satisfy MSVC's OpenMP support - confirm.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm, mmt);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm, mmt);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel2.hpp b/src/Simulation/Native/src/external/avx512/kernel2.hpp
new file mode 100644
index 00000000000..9153a865dd7
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel2.hpp
@@ -0,0 +1,98 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m, M const& mt)
+{
+    __m512d v[1];
+    // Applies the packed two-qubit gate (m, mt) to psi at I plus every combination of d0 and d1, in place.
+    v[0] = load1x4(&psi[I]);
+    // Single accumulator register for the four outputs written by the final store; starts at zero.
+    __m512d tmp[1] = {_mm512_setzero_pd()};
+    // load1x4/fma/store are helpers defined outside this file; fma presumably accumulates v times (m, mt) into tmp.
+    tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+
+    v[0] = load1x4(&psi[I + d0]);
+
+    tmp[0] = fma(v[0], m[1], mt[1], tmp[0]);
+
+    v[0] = load1x4(&psi[I + d1]);
+
+    tmp[0] = fma(v[0], m[2], mt[2], tmp[0]);
+
+    v[0] = load1x4(&psi[I + d0 + d1]);
+
+    tmp[0] = fma(v[0], m[3], mt[3], tmp[0]);
+    store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+
+}
+
+// Applies the two-qubit gate `matrix` to bit indices id1, id0 of psi, only at base indices with all ctrlmask bits set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;  // distance in psi contributed by bit id0
+    std::size_t d1 = 1ULL << id1;  // distance in psi contributed by bit id1
+    auto m = matrix;               // local copy handed to the permutation helper
+    std::size_t dsorted[] = {d0, d1};
+    permute_qubits_and_matrix(dsorted, 2, m);  // defined elsewhere; presumably sorts dsorted descending and permutes m to match
+    // Repack m into registers with loada (helper defined elsewhere); passed to kernel_core as `m`.
+    __m512d mm[4];
+    for (unsigned b = 0; b < 4; ++b){
+        for (unsigned r = 0; r < 1; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mm[b*1+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Companion registers built by loadbm (helper defined elsewhere); passed to kernel_core as `mt`.
+    __m512d mmt[4];
+    for (unsigned b = 0; b < 4; ++b){
+        for (unsigned r = 0; r < 1; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mmt[b*1+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Both branches below visit base indices whose target bits are clear:
+    // nested strided loops by default, a flat mask-filtered loop when compiled with MSVC.
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt);
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+                        kernel_core(psi, i0 + i1 + i2, dsorted[1], dsorted[0], mm, mmt);
+                }
+            }
+        }
+    }
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1];  // all target bits; must be clear in a base index
+    // NOTE(review): signed index and no collapse, presumably to satisfy MSVC's OpenMP support - confirm.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[1], dsorted[0], mm, mmt);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel3.hpp b/src/Simulation/Native/src/external/avx512/kernel3.hpp
new file mode 100644
index 00000000000..e4db4808042
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel3.hpp
@@ -0,0 +1,128 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m, M const& mt)
+{
+    __m512d v[1];
+    // Applies the packed three-qubit gate (m, mt) to psi at I plus every combination of d0, d1, d2, in place.
+    v[0] = load1x4(&psi[I]);
+    // Two accumulator registers: tmp[0] is stored to the offsets without d2, tmp[1] to those with d2.
+    __m512d tmp[2] = {_mm512_setzero_pd(), _mm512_setzero_pd()};
+    // load1x4/fma/store are helpers defined outside this file; fma presumably accumulates v times (m, mt) into tmp.
+    tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+    tmp[1] = fma(v[0], m[1], mt[1], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d0]);
+
+    tmp[0] = fma(v[0], m[2], mt[2], tmp[0]);
+    tmp[1] = fma(v[0], m[3], mt[3], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d1]);
+
+    tmp[0] = fma(v[0], m[4], mt[4], tmp[0]);
+    tmp[1] = fma(v[0], m[5], mt[5], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d0 + d1]);
+
+    tmp[0] = fma(v[0], m[6], mt[6], tmp[0]);
+    tmp[1] = fma(v[0], m[7], mt[7], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d2]);
+
+    tmp[0] = fma(v[0], m[8], mt[8], tmp[0]);
+    tmp[1] = fma(v[0], m[9], mt[9], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d0 + d2]);
+
+    tmp[0] = fma(v[0], m[10], mt[10], tmp[0]);
+    tmp[1] = fma(v[0], m[11], mt[11], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d1 + d2]);
+
+    tmp[0] = fma(v[0], m[12], mt[12], tmp[0]);
+    tmp[1] = fma(v[0], m[13], mt[13], tmp[1]);
+
+    v[0] = load1x4(&psi[I + d0 + d1 + d2]);
+
+    tmp[0] = fma(v[0], m[14], mt[14], tmp[0]);
+    tmp[1] = fma(v[0], m[15], mt[15], tmp[1]);
+    store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+    store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+
+}
+
+// Applies the three-qubit gate `matrix` to bit indices id2, id1, id0 of psi, only at base indices with all ctrlmask bits set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;  // distance in psi contributed by bit id0
+    std::size_t d1 = 1ULL << id1;  // distance in psi contributed by bit id1
+    std::size_t d2 = 1ULL << id2;  // distance in psi contributed by bit id2
+    auto m = matrix;               // local copy handed to the permutation helper
+    std::size_t dsorted[] = {d0, d1, d2};
+    permute_qubits_and_matrix(dsorted, 3, m);  // defined elsewhere; presumably sorts dsorted descending and permutes m to match
+    // Repack m into registers with loada (helper defined elsewhere); passed to kernel_core as `m`.
+    __m512d mm[16];
+    for (unsigned b = 0; b < 8; ++b){
+        for (unsigned r = 0; r < 2; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mm[b*2+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Companion registers built by loadbm (helper defined elsewhere); passed to kernel_core as `mt`.
+    __m512d mmt[16];
+    for (unsigned b = 0; b < 8; ++b){
+        for (unsigned r = 0; r < 2; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mmt[b*2+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Both branches below visit base indices whose target bits are clear:
+    // nested strided loops by default, a flat mask-filtered loop when compiled with MSVC.
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
+                            kernel_core(psi, i0 + i1 + i2 + i3, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2];  // all target bits; must be clear in a base index
+    // NOTE(review): signed index and no collapse, presumably to satisfy MSVC's OpenMP support - confirm.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel4.hpp b/src/Simulation/Native/src/external/avx512/kernel4.hpp
new file mode 100644
index 00000000000..16bfc1ff86e
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel4.hpp
@@ -0,0 +1,207 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m, M const& mt)
+{
+    __m512d v[1];
+    // Applies the packed four-qubit gate (m, mt) to psi at I plus every combination of d0..d3, in place.
+    v[0] = load1x4(&psi[I]);
+    // Four accumulator registers; each is stored to one group of four offsets at the end of the function.
+    __m512d tmp[4] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+    // load1x4/fma/store are helpers defined outside this file; fma presumably accumulates v times (m, mt) into tmp.
+    tmp[0] = fma(v[0], m[0], mt[0], tmp[0]);
+    tmp[1] = fma(v[0], m[1], mt[1], tmp[1]);
+    tmp[2] = fma(v[0], m[2], mt[2], tmp[2]);
+    tmp[3] = fma(v[0], m[3], mt[3], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0]);
+
+    tmp[0] = fma(v[0], m[4], mt[4], tmp[0]);
+    tmp[1] = fma(v[0], m[5], mt[5], tmp[1]);
+    tmp[2] = fma(v[0], m[6], mt[6], tmp[2]);
+    tmp[3] = fma(v[0], m[7], mt[7], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d1]);
+
+    tmp[0] = fma(v[0], m[8], mt[8], tmp[0]);
+    tmp[1] = fma(v[0], m[9], mt[9], tmp[1]);
+    tmp[2] = fma(v[0], m[10], mt[10], tmp[2]);
+    tmp[3] = fma(v[0], m[11], mt[11], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d1]);
+
+    tmp[0] = fma(v[0], m[12], mt[12], tmp[0]);
+    tmp[1] = fma(v[0], m[13], mt[13], tmp[1]);
+    tmp[2] = fma(v[0], m[14], mt[14], tmp[2]);
+    tmp[3] = fma(v[0], m[15], mt[15], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d2]);
+
+    tmp[0] = fma(v[0], m[16], mt[16], tmp[0]);
+    tmp[1] = fma(v[0], m[17], mt[17], tmp[1]);
+    tmp[2] = fma(v[0], m[18], mt[18], tmp[2]);
+    tmp[3] = fma(v[0], m[19], mt[19], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d2]);
+
+    tmp[0] = fma(v[0], m[20], mt[20], tmp[0]);
+    tmp[1] = fma(v[0], m[21], mt[21], tmp[1]);
+    tmp[2] = fma(v[0], m[22], mt[22], tmp[2]);
+    tmp[3] = fma(v[0], m[23], mt[23], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d1 + d2]);
+
+    tmp[0] = fma(v[0], m[24], mt[24], tmp[0]);
+    tmp[1] = fma(v[0], m[25], mt[25], tmp[1]);
+    tmp[2] = fma(v[0], m[26], mt[26], tmp[2]);
+    tmp[3] = fma(v[0], m[27], mt[27], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d1 + d2]);
+
+    tmp[0] = fma(v[0], m[28], mt[28], tmp[0]);
+    tmp[1] = fma(v[0], m[29], mt[29], tmp[1]);
+    tmp[2] = fma(v[0], m[30], mt[30], tmp[2]);
+    tmp[3] = fma(v[0], m[31], mt[31], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d3]);
+
+    tmp[0] = fma(v[0], m[32], mt[32], tmp[0]);
+    tmp[1] = fma(v[0], m[33], mt[33], tmp[1]);
+    tmp[2] = fma(v[0], m[34], mt[34], tmp[2]);
+    tmp[3] = fma(v[0], m[35], mt[35], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d3]);
+
+    tmp[0] = fma(v[0], m[36], mt[36], tmp[0]);
+    tmp[1] = fma(v[0], m[37], mt[37], tmp[1]);
+    tmp[2] = fma(v[0], m[38], mt[38], tmp[2]);
+    tmp[3] = fma(v[0], m[39], mt[39], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d1 + d3]);
+
+    tmp[0] = fma(v[0], m[40], mt[40], tmp[0]);
+    tmp[1] = fma(v[0], m[41], mt[41], tmp[1]);
+    tmp[2] = fma(v[0], m[42], mt[42], tmp[2]);
+    tmp[3] = fma(v[0], m[43], mt[43], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d1 + d3]);
+
+    tmp[0] = fma(v[0], m[44], mt[44], tmp[0]);
+    tmp[1] = fma(v[0], m[45], mt[45], tmp[1]);
+    tmp[2] = fma(v[0], m[46], mt[46], tmp[2]);
+    tmp[3] = fma(v[0], m[47], mt[47], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d2 + d3]);
+
+    tmp[0] = fma(v[0], m[48], mt[48], tmp[0]);
+    tmp[1] = fma(v[0], m[49], mt[49], tmp[1]);
+    tmp[2] = fma(v[0], m[50], mt[50], tmp[2]);
+    tmp[3] = fma(v[0], m[51], mt[51], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d2 + d3]);
+
+    tmp[0] = fma(v[0], m[52], mt[52], tmp[0]);
+    tmp[1] = fma(v[0], m[53], mt[53], tmp[1]);
+    tmp[2] = fma(v[0], m[54], mt[54], tmp[2]);
+    tmp[3] = fma(v[0], m[55], mt[55], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d1 + d2 + d3]);
+
+    tmp[0] = fma(v[0], m[56], mt[56], tmp[0]);
+    tmp[1] = fma(v[0], m[57], mt[57], tmp[1]);
+    tmp[2] = fma(v[0], m[58], mt[58], tmp[2]);
+    tmp[3] = fma(v[0], m[59], mt[59], tmp[3]);
+
+    v[0] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+
+    tmp[0] = fma(v[0], m[60], mt[60], tmp[0]);
+    tmp[1] = fma(v[0], m[61], mt[61], tmp[1]);
+    tmp[2] = fma(v[0], m[62], mt[62], tmp[2]);
+    tmp[3] = fma(v[0], m[63], mt[63], tmp[3]);
+    store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+    store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+    store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+
+}
+
+// Applies the four-qubit gate `matrix` to bit indices id3..id0 of psi, only at base indices with all ctrlmask bits set. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;  // distance in psi contributed by bit id0
+    std::size_t d1 = 1ULL << id1;  // distance in psi contributed by bit id1
+    std::size_t d2 = 1ULL << id2;  // distance in psi contributed by bit id2
+    std::size_t d3 = 1ULL << id3;  // distance in psi contributed by bit id3
+    auto m = matrix;               // local copy handed to the permutation helper
+    std::size_t dsorted[] = {d0, d1, d2, d3};
+    permute_qubits_and_matrix(dsorted, 4, m);  // defined elsewhere; presumably sorts dsorted descending and permutes m to match
+    // Repack m into registers with loada (helper defined elsewhere); passed to kernel_core as `m`.
+    __m512d mm[64];
+    for (unsigned b = 0; b < 16; ++b){
+        for (unsigned r = 0; r < 4; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mm[b*4+r*1+c] = loada(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Companion registers built by loadbm (helper defined elsewhere); passed to kernel_core as `mt`.
+    __m512d mmt[64];
+    for (unsigned b = 0; b < 16; ++b){
+        for (unsigned r = 0; r < 4; ++r){
+            for (unsigned c = 0; c < 1; ++c){
+                mmt[b*4+r*1+c] = loadbm(&m[4*r+0][c+b*1], &m[4*r+1][c+b*1], &m[4*r+2][c+b*1], &m[4*r+3][c+b*1]);
+            }
+        }
+    }
+    // Both branches below visit base indices whose target bits are clear:
+    // nested strided loops by default, a flat mask-filtered loop when compiled with MSVC.
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+#else
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3];  // all target bits; must be clear in a base index
+    // NOTE(review): signed index and no collapse, presumably to satisfy MSVC's OpenMP support - confirm.
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel5.hpp b/src/Simulation/Native/src/external/avx512/kernel5.hpp
new file mode 100644
index 00000000000..6d1a030edda
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel5.hpp
@@ -0,0 +1,296 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+// Core of the 5-qubit kernel. Touches the 32 psi locations I + (any subset sum of d0..d4): each of the 16
+// steps loads the pair at offsets S and S + d0 and folds it into tmp[0..7] using 16 entries of m and mt;
+// store() then writes tmp back to those locations. load1x4, the 4-argument fma and store are defined elsewhere.
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m, M const& mt)
+{
+    __m512d v[2];
+    // Inputs at offsets 0 and d0; matrix entries 0..15.
+    v[0] = load1x4(&psi[I]);
+    v[1] = load1x4(&psi[I + d0]);
+    __m512d tmp[8] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+    tmp[0] = fma(v[0], m[0], mt[0], fma(v[1], m[1], mt[1], tmp[0]));
+    tmp[1] = fma(v[0], m[2], mt[2], fma(v[1], m[3], mt[3], tmp[1]));
+    tmp[2] = fma(v[0], m[4], mt[4], fma(v[1], m[5], mt[5], tmp[2]));
+    tmp[3] = fma(v[0], m[6], mt[6], fma(v[1], m[7], mt[7], tmp[3]));
+    tmp[4] = fma(v[0], m[8], mt[8], fma(v[1], m[9], mt[9], tmp[4]));
+    tmp[5] = fma(v[0], m[10], mt[10], fma(v[1], m[11], mt[11], tmp[5]));
+    tmp[6] = fma(v[0], m[12], mt[12], fma(v[1], m[13], mt[13], tmp[6]));
+    tmp[7] = fma(v[0], m[14], mt[14], fma(v[1], m[15], mt[15], tmp[7]));
+
+    // Inputs at offsets d1 and d0 + d1; matrix entries 16..31.
+    v[0] = load1x4(&psi[I + d1]);
+    v[1] = load1x4(&psi[I + d0 + d1]);
+    tmp[0] = fma(v[0], m[16], mt[16], fma(v[1], m[17], mt[17], tmp[0]));
+    tmp[1] = fma(v[0], m[18], mt[18], fma(v[1], m[19], mt[19], tmp[1]));
+    tmp[2] = fma(v[0], m[20], mt[20], fma(v[1], m[21], mt[21], tmp[2]));
+    tmp[3] = fma(v[0], m[22], mt[22], fma(v[1], m[23], mt[23], tmp[3]));
+    tmp[4] = fma(v[0], m[24], mt[24], fma(v[1], m[25], mt[25], tmp[4]));
+    tmp[5] = fma(v[0], m[26], mt[26], fma(v[1], m[27], mt[27], tmp[5]));
+    tmp[6] = fma(v[0], m[28], mt[28], fma(v[1], m[29], mt[29], tmp[6]));
+    tmp[7] = fma(v[0], m[30], mt[30], fma(v[1], m[31], mt[31], tmp[7]));
+
+    // Inputs at offsets d2 and d0 + d2; matrix entries 32..47.
+    v[0] = load1x4(&psi[I + d2]);
+    v[1] = load1x4(&psi[I + d0 + d2]);
+    tmp[0] = fma(v[0], m[32], mt[32], fma(v[1], m[33], mt[33], tmp[0]));
+    tmp[1] = fma(v[0], m[34], mt[34], fma(v[1], m[35], mt[35], tmp[1]));
+    tmp[2] = fma(v[0], m[36], mt[36], fma(v[1], m[37], mt[37], tmp[2]));
+    tmp[3] = fma(v[0], m[38], mt[38], fma(v[1], m[39], mt[39], tmp[3]));
+    tmp[4] = fma(v[0], m[40], mt[40], fma(v[1], m[41], mt[41], tmp[4]));
+    tmp[5] = fma(v[0], m[42], mt[42], fma(v[1], m[43], mt[43], tmp[5]));
+    tmp[6] = fma(v[0], m[44], mt[44], fma(v[1], m[45], mt[45], tmp[6]));
+    tmp[7] = fma(v[0], m[46], mt[46], fma(v[1], m[47], mt[47], tmp[7]));
+
+    // Inputs at offsets d1 + d2 and d0 + d1 + d2; matrix entries 48..63.
+    v[0] = load1x4(&psi[I + d1 + d2]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d2]);
+    tmp[0] = fma(v[0], m[48], mt[48], fma(v[1], m[49], mt[49], tmp[0]));
+    tmp[1] = fma(v[0], m[50], mt[50], fma(v[1], m[51], mt[51], tmp[1]));
+    tmp[2] = fma(v[0], m[52], mt[52], fma(v[1], m[53], mt[53], tmp[2]));
+    tmp[3] = fma(v[0], m[54], mt[54], fma(v[1], m[55], mt[55], tmp[3]));
+    tmp[4] = fma(v[0], m[56], mt[56], fma(v[1], m[57], mt[57], tmp[4]));
+    tmp[5] = fma(v[0], m[58], mt[58], fma(v[1], m[59], mt[59], tmp[5]));
+    tmp[6] = fma(v[0], m[60], mt[60], fma(v[1], m[61], mt[61], tmp[6]));
+    tmp[7] = fma(v[0], m[62], mt[62], fma(v[1], m[63], mt[63], tmp[7]));
+
+    // Inputs at offsets d3 and d0 + d3; matrix entries 64..79.
+    v[0] = load1x4(&psi[I + d3]);
+    v[1] = load1x4(&psi[I + d0 + d3]);
+    tmp[0] = fma(v[0], m[64], mt[64], fma(v[1], m[65], mt[65], tmp[0]));
+    tmp[1] = fma(v[0], m[66], mt[66], fma(v[1], m[67], mt[67], tmp[1]));
+    tmp[2] = fma(v[0], m[68], mt[68], fma(v[1], m[69], mt[69], tmp[2]));
+    tmp[3] = fma(v[0], m[70], mt[70], fma(v[1], m[71], mt[71], tmp[3]));
+    tmp[4] = fma(v[0], m[72], mt[72], fma(v[1], m[73], mt[73], tmp[4]));
+    tmp[5] = fma(v[0], m[74], mt[74], fma(v[1], m[75], mt[75], tmp[5]));
+    tmp[6] = fma(v[0], m[76], mt[76], fma(v[1], m[77], mt[77], tmp[6]));
+    tmp[7] = fma(v[0], m[78], mt[78], fma(v[1], m[79], mt[79], tmp[7]));
+
+    // Inputs at offsets d1 + d3 and d0 + d1 + d3; matrix entries 80..95.
+    v[0] = load1x4(&psi[I + d1 + d3]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d3]);
+    tmp[0] = fma(v[0], m[80], mt[80], fma(v[1], m[81], mt[81], tmp[0]));
+    tmp[1] = fma(v[0], m[82], mt[82], fma(v[1], m[83], mt[83], tmp[1]));
+    tmp[2] = fma(v[0], m[84], mt[84], fma(v[1], m[85], mt[85], tmp[2]));
+    tmp[3] = fma(v[0], m[86], mt[86], fma(v[1], m[87], mt[87], tmp[3]));
+    tmp[4] = fma(v[0], m[88], mt[88], fma(v[1], m[89], mt[89], tmp[4]));
+    tmp[5] = fma(v[0], m[90], mt[90], fma(v[1], m[91], mt[91], tmp[5]));
+    tmp[6] = fma(v[0], m[92], mt[92], fma(v[1], m[93], mt[93], tmp[6]));
+    tmp[7] = fma(v[0], m[94], mt[94], fma(v[1], m[95], mt[95], tmp[7]));
+
+    // Inputs at offsets d2 + d3 and d0 + d2 + d3; matrix entries 96..111.
+    v[0] = load1x4(&psi[I + d2 + d3]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+    tmp[0] = fma(v[0], m[96], mt[96], fma(v[1], m[97], mt[97], tmp[0]));
+    tmp[1] = fma(v[0], m[98], mt[98], fma(v[1], m[99], mt[99], tmp[1]));
+    tmp[2] = fma(v[0], m[100], mt[100], fma(v[1], m[101], mt[101], tmp[2]));
+    tmp[3] = fma(v[0], m[102], mt[102], fma(v[1], m[103], mt[103], tmp[3]));
+    tmp[4] = fma(v[0], m[104], mt[104], fma(v[1], m[105], mt[105], tmp[4]));
+    tmp[5] = fma(v[0], m[106], mt[106], fma(v[1], m[107], mt[107], tmp[5]));
+    tmp[6] = fma(v[0], m[108], mt[108], fma(v[1], m[109], mt[109], tmp[6]));
+    tmp[7] = fma(v[0], m[110], mt[110], fma(v[1], m[111], mt[111], tmp[7]));
+
+    // Inputs at offsets d1 + d2 + d3 and d0 + d1 + d2 + d3; matrix entries 112..127.
+    v[0] = load1x4(&psi[I + d1 + d2 + d3]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+    tmp[0] = fma(v[0], m[112], mt[112], fma(v[1], m[113], mt[113], tmp[0]));
+    tmp[1] = fma(v[0], m[114], mt[114], fma(v[1], m[115], mt[115], tmp[1]));
+    tmp[2] = fma(v[0], m[116], mt[116], fma(v[1], m[117], mt[117], tmp[2]));
+    tmp[3] = fma(v[0], m[118], mt[118], fma(v[1], m[119], mt[119], tmp[3]));
+    tmp[4] = fma(v[0], m[120], mt[120], fma(v[1], m[121], mt[121], tmp[4]));
+    tmp[5] = fma(v[0], m[122], mt[122], fma(v[1], m[123], mt[123], tmp[5]));
+    tmp[6] = fma(v[0], m[124], mt[124], fma(v[1], m[125], mt[125], tmp[6]));
+    tmp[7] = fma(v[0], m[126], mt[126], fma(v[1], m[127], mt[127], tmp[7]));
+
+    // Inputs at offsets d4 and d0 + d4; matrix entries 128..143.
+    v[0] = load1x4(&psi[I + d4]);
+    v[1] = load1x4(&psi[I + d0 + d4]);
+    tmp[0] = fma(v[0], m[128], mt[128], fma(v[1], m[129], mt[129], tmp[0]));
+    tmp[1] = fma(v[0], m[130], mt[130], fma(v[1], m[131], mt[131], tmp[1]));
+    tmp[2] = fma(v[0], m[132], mt[132], fma(v[1], m[133], mt[133], tmp[2]));
+    tmp[3] = fma(v[0], m[134], mt[134], fma(v[1], m[135], mt[135], tmp[3]));
+    tmp[4] = fma(v[0], m[136], mt[136], fma(v[1], m[137], mt[137], tmp[4]));
+    tmp[5] = fma(v[0], m[138], mt[138], fma(v[1], m[139], mt[139], tmp[5]));
+    tmp[6] = fma(v[0], m[140], mt[140], fma(v[1], m[141], mt[141], tmp[6]));
+    tmp[7] = fma(v[0], m[142], mt[142], fma(v[1], m[143], mt[143], tmp[7]));
+
+    // Inputs at offsets d1 + d4 and d0 + d1 + d4; matrix entries 144..159.
+    v[0] = load1x4(&psi[I + d1 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d4]);
+    tmp[0] = fma(v[0], m[144], mt[144], fma(v[1], m[145], mt[145], tmp[0]));
+    tmp[1] = fma(v[0], m[146], mt[146], fma(v[1], m[147], mt[147], tmp[1]));
+    tmp[2] = fma(v[0], m[148], mt[148], fma(v[1], m[149], mt[149], tmp[2]));
+    tmp[3] = fma(v[0], m[150], mt[150], fma(v[1], m[151], mt[151], tmp[3]));
+    tmp[4] = fma(v[0], m[152], mt[152], fma(v[1], m[153], mt[153], tmp[4]));
+    tmp[5] = fma(v[0], m[154], mt[154], fma(v[1], m[155], mt[155], tmp[5]));
+    tmp[6] = fma(v[0], m[156], mt[156], fma(v[1], m[157], mt[157], tmp[6]));
+    tmp[7] = fma(v[0], m[158], mt[158], fma(v[1], m[159], mt[159], tmp[7]));
+
+    // Inputs at offsets d2 + d4 and d0 + d2 + d4; matrix entries 160..175.
+    v[0] = load1x4(&psi[I + d2 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+    tmp[0] = fma(v[0], m[160], mt[160], fma(v[1], m[161], mt[161], tmp[0]));
+    tmp[1] = fma(v[0], m[162], mt[162], fma(v[1], m[163], mt[163], tmp[1]));
+    tmp[2] = fma(v[0], m[164], mt[164], fma(v[1], m[165], mt[165], tmp[2]));
+    tmp[3] = fma(v[0], m[166], mt[166], fma(v[1], m[167], mt[167], tmp[3]));
+    tmp[4] = fma(v[0], m[168], mt[168], fma(v[1], m[169], mt[169], tmp[4]));
+    tmp[5] = fma(v[0], m[170], mt[170], fma(v[1], m[171], mt[171], tmp[5]));
+    tmp[6] = fma(v[0], m[172], mt[172], fma(v[1], m[173], mt[173], tmp[6]));
+    tmp[7] = fma(v[0], m[174], mt[174], fma(v[1], m[175], mt[175], tmp[7]));
+
+    // Inputs at offsets d1 + d2 + d4 and d0 + d1 + d2 + d4; matrix entries 176..191.
+    v[0] = load1x4(&psi[I + d1 + d2 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+    tmp[0] = fma(v[0], m[176], mt[176], fma(v[1], m[177], mt[177], tmp[0]));
+    tmp[1] = fma(v[0], m[178], mt[178], fma(v[1], m[179], mt[179], tmp[1]));
+    tmp[2] = fma(v[0], m[180], mt[180], fma(v[1], m[181], mt[181], tmp[2]));
+    tmp[3] = fma(v[0], m[182], mt[182], fma(v[1], m[183], mt[183], tmp[3]));
+    tmp[4] = fma(v[0], m[184], mt[184], fma(v[1], m[185], mt[185], tmp[4]));
+    tmp[5] = fma(v[0], m[186], mt[186], fma(v[1], m[187], mt[187], tmp[5]));
+    tmp[6] = fma(v[0], m[188], mt[188], fma(v[1], m[189], mt[189], tmp[6]));
+    tmp[7] = fma(v[0], m[190], mt[190], fma(v[1], m[191], mt[191], tmp[7]));
+
+    // Inputs at offsets d3 + d4 and d0 + d3 + d4; matrix entries 192..207.
+    v[0] = load1x4(&psi[I + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+    tmp[0] = fma(v[0], m[192], mt[192], fma(v[1], m[193], mt[193], tmp[0]));
+    tmp[1] = fma(v[0], m[194], mt[194], fma(v[1], m[195], mt[195], tmp[1]));
+    tmp[2] = fma(v[0], m[196], mt[196], fma(v[1], m[197], mt[197], tmp[2]));
+    tmp[3] = fma(v[0], m[198], mt[198], fma(v[1], m[199], mt[199], tmp[3]));
+    tmp[4] = fma(v[0], m[200], mt[200], fma(v[1], m[201], mt[201], tmp[4]));
+    tmp[5] = fma(v[0], m[202], mt[202], fma(v[1], m[203], mt[203], tmp[5]));
+    tmp[6] = fma(v[0], m[204], mt[204], fma(v[1], m[205], mt[205], tmp[6]));
+    tmp[7] = fma(v[0], m[206], mt[206], fma(v[1], m[207], mt[207], tmp[7]));
+
+    // Inputs at offsets d1 + d3 + d4 and d0 + d1 + d3 + d4; matrix entries 208..223.
+    v[0] = load1x4(&psi[I + d1 + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+    tmp[0] = fma(v[0], m[208], mt[208], fma(v[1], m[209], mt[209], tmp[0]));
+    tmp[1] = fma(v[0], m[210], mt[210], fma(v[1], m[211], mt[211], tmp[1]));
+    tmp[2] = fma(v[0], m[212], mt[212], fma(v[1], m[213], mt[213], tmp[2]));
+    tmp[3] = fma(v[0], m[214], mt[214], fma(v[1], m[215], mt[215], tmp[3]));
+    tmp[4] = fma(v[0], m[216], mt[216], fma(v[1], m[217], mt[217], tmp[4]));
+    tmp[5] = fma(v[0], m[218], mt[218], fma(v[1], m[219], mt[219], tmp[5]));
+    tmp[6] = fma(v[0], m[220], mt[220], fma(v[1], m[221], mt[221], tmp[6]));
+    tmp[7] = fma(v[0], m[222], mt[222], fma(v[1], m[223], mt[223], tmp[7]));
+
+    // Inputs at offsets d2 + d3 + d4 and d0 + d2 + d3 + d4; matrix entries 224..239.
+    v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+    tmp[0] = fma(v[0], m[224], mt[224], fma(v[1], m[225], mt[225], tmp[0]));
+    tmp[1] = fma(v[0], m[226], mt[226], fma(v[1], m[227], mt[227], tmp[1]));
+    tmp[2] = fma(v[0], m[228], mt[228], fma(v[1], m[229], mt[229], tmp[2]));
+    tmp[3] = fma(v[0], m[230], mt[230], fma(v[1], m[231], mt[231], tmp[3]));
+    tmp[4] = fma(v[0], m[232], mt[232], fma(v[1], m[233], mt[233], tmp[4]));
+    tmp[5] = fma(v[0], m[234], mt[234], fma(v[1], m[235], mt[235], tmp[5]));
+    tmp[6] = fma(v[0], m[236], mt[236], fma(v[1], m[237], mt[237], tmp[6]));
+    tmp[7] = fma(v[0], m[238], mt[238], fma(v[1], m[239], mt[239], tmp[7]));
+
+    // Last inputs (d1 + d2 + d3 + d4, with and without d0); entries 240..255. No loads follow, so tmp[0..3] are stored before tmp[4..7] get their final update.
+    v[0] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+    tmp[0] = fma(v[0], m[240], mt[240], fma(v[1], m[241], mt[241], tmp[0]));
+    tmp[1] = fma(v[0], m[242], mt[242], fma(v[1], m[243], mt[243], tmp[1]));
+    tmp[2] = fma(v[0], m[244], mt[244], fma(v[1], m[245], mt[245], tmp[2]));
+    tmp[3] = fma(v[0], m[246], mt[246], fma(v[1], m[247], mt[247], tmp[3]));
+    store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+    store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+    store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+    tmp[4] = fma(v[0], m[248], mt[248], fma(v[1], m[249], mt[249], tmp[4]));
+    tmp[5] = fma(v[0], m[250], mt[250], fma(v[1], m[251], mt[251], tmp[5]));
+    tmp[6] = fma(v[0], m[252], mt[252], fma(v[1], m[253], mt[253], tmp[6]));
+    tmp[7] = fma(v[0], m[254], mt[254], fma(v[1], m[255], mt[255], tmp[7]));
+    store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+    store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+    store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+}
+
+// Applies the gate given by `matrix` to the five qubits id4..id0 of the state vector psi (n = psi.size() entries).
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+// When ctrlmask is non-zero, kernel_core only runs for base indices i with (i & ctrlmask) == ctrlmask.
+// permute_qubits_and_matrix, loada, loadbm, kernel_core and LOOP_COLLAPSE5 are defined elsewhere.
+template <class V, class M>
+void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;
+    std::size_t d1 = 1ULL << id1;
+    std::size_t d2 = 1ULL << id2;
+    std::size_t d3 = 1ULL << id3;
+    std::size_t d4 = 1ULL << id4;
+    auto m = matrix;  // working copy: `matrix` is const, and m is handed to permute_qubits_and_matrix
+    std::size_t dsorted[] = {d0, d1, d2, d3, d4};
+    permute_qubits_and_matrix(dsorted, 5, m);
+
+    // Pack m (as left by permute_qubits_and_matrix) into 256 registers: 4 rows x 1 column per loada call.
+    __m512d mm[256];
+    for (unsigned b = 0; b < 16; ++b)
+        for (unsigned r = 0; r < 8; ++r)
+            for (unsigned c = 0; c < 2; ++c)
+                mm[b*16+r*2+c] = loada(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]);
+
+    // Same entries and index layout, packed with loadbm; handed to kernel_core as its `mt` argument.
+    __m512d mmt[256];
+    for (unsigned b = 0; b < 16; ++b)
+        for (unsigned r = 0; r < 8; ++r)
+            for (unsigned c = 0; c < 2; ++c)
+                mmt[b*16+r*2+c] = loadbm(&m[4*r+0][c+b*2], &m[4*r+1][c+b*2], &m[4*r+2][c+b*2], &m[4*r+3][c+b*2]);
+
+    // Nested loops skip the five target bits; this presumes dsorted is in descending order after the permute (TODO confirm).
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+#else
+    // MSVC path: one flat loop with a signed index and no collapse clause; (i & dmask) == 0 keeps indices whose target bits are all clear.
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4];
+
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm, mmt);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel6.hpp b/src/Simulation/Native/src/external/avx512/kernel6.hpp
new file mode 100644
index 00000000000..26bbcb6240d
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel6.hpp
@@ -0,0 +1,252 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+// Core of the 6-qubit kernel. Touches the 64 psi locations I + (any subset sum of d0..d5): each of the 16 steps loads the four
+// entries at offsets S, S + d0, S + d1, S + d0 + d1 and folds them into tmp[0..15] using 64 entries of m; store() writes tmp back.
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, M const& m)
+{
+    __m512d v[4];
+    // Inputs with S = 0; matrix entries 0..63.
+    v[0] = load1x4(&psi[I]);
+    v[1] = load1x4(&psi[I + d0]);
+    v[2] = load1x4(&psi[I + d1]);
+    v[3] = load1x4(&psi[I + d0 + d1]);
+    __m512d tmp[16] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2; matrix entries 64..127.
+    v[0] = load1x4(&psi[I + d2]);
+    v[1] = load1x4(&psi[I + d0 + d2]);
+    v[2] = load1x4(&psi[I + d1 + d2]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[64 + i * 4 + 0], fma(v[1], m[64 + i * 4 + 1], fma(v[2], m[64 + i * 4 + 2], fma(v[3], m[64 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d3; matrix entries 128..191.
+    v[0] = load1x4(&psi[I + d3]);
+    v[1] = load1x4(&psi[I + d0 + d3]);
+    v[2] = load1x4(&psi[I + d1 + d3]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d3]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d3; matrix entries 192..255.
+    v[0] = load1x4(&psi[I + d2 + d3]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d3]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[192 + i * 4 + 0], fma(v[1], m[192 + i * 4 + 1], fma(v[2], m[192 + i * 4 + 2], fma(v[3], m[192 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d4; matrix entries 256..319.
+    v[0] = load1x4(&psi[I + d4]);
+    v[1] = load1x4(&psi[I + d0 + d4]);
+    v[2] = load1x4(&psi[I + d1 + d4]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d4]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d4; matrix entries 320..383.
+    v[0] = load1x4(&psi[I + d2 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d4]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[320 + i * 4 + 0], fma(v[1], m[320 + i * 4 + 1], fma(v[2], m[320 + i * 4 + 2], fma(v[3], m[320 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d3 + d4; matrix entries 384..447.
+    v[0] = load1x4(&psi[I + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+    v[2] = load1x4(&psi[I + d1 + d3 + d4]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d3 + d4; matrix entries 448..511.
+    v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[448 + i * 4 + 0], fma(v[1], m[448 + i * 4 + 1], fma(v[2], m[448 + i * 4 + 2], fma(v[3], m[448 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d5; matrix entries 512..575.
+    v[0] = load1x4(&psi[I + d5]);
+    v[1] = load1x4(&psi[I + d0 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d5; matrix entries 576..639.
+    v[0] = load1x4(&psi[I + d2 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[576 + i * 4 + 0], fma(v[1], m[576 + i * 4 + 1], fma(v[2], m[576 + i * 4 + 2], fma(v[3], m[576 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d3 + d5; matrix entries 640..703.
+    v[0] = load1x4(&psi[I + d3 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d3 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d3 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d3 + d5; matrix entries 704..767.
+    v[0] = load1x4(&psi[I + d2 + d3 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[704 + i * 4 + 0], fma(v[1], m[704 + i * 4 + 1], fma(v[2], m[704 + i * 4 + 2], fma(v[3], m[704 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d4 + d5; matrix entries 768..831.
+    v[0] = load1x4(&psi[I + d4 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d4 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d4 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d4 + d5; matrix entries 832..895.
+    v[0] = load1x4(&psi[I + d2 + d4 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[832 + i * 4 + 0], fma(v[1], m[832 + i * 4 + 1], fma(v[2], m[832 + i * 4 + 2], fma(v[3], m[832 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d3 + d4 + d5; matrix entries 896..959.
+    v[0] = load1x4(&psi[I + d3 + d4 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i]))));
+    }
+
+    // Inputs with S = d2 + d3 + d4 + d5; matrix entries 960..1023.
+    v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]);
+    v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]);
+    v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]);
+    v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]);
+    for (unsigned i = 0; i < 16; ++i){
+        tmp[i] = fma(v[0], m[960 + i * 4 + 0], fma(v[1], m[960 + i * 4 + 1], fma(v[2], m[960 + i * 4 + 2], fma(v[3], m[960 + i * 4 + 3], tmp[i]))));
+    }
+    // Write-back: tmp[k] goes to the four locations S, S + d0, S + d1, S + d0 + d1 for the k-th subset sum S of d2..d5.
+    store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+    store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+    store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+    store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+    store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+    store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+    store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]);
+    store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]);
+    store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]);
+    store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]);
+    store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]);
+    store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]);
+    store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]);
+}
+
+// Applies the gate given by `matrix` to the six qubits id5..id0 of the state vector psi (n = psi.size() entries).
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+// When ctrlmask is non-zero, kernel_core only runs for base indices i with (i & ctrlmask) == ctrlmask.
+template <class V, class M>
+void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;
+    std::size_t d1 = 1ULL << id1;
+    std::size_t d2 = 1ULL << id2;
+    std::size_t d3 = 1ULL << id3;
+    std::size_t d4 = 1ULL << id4;
+    std::size_t d5 = 1ULL << id5;
+    auto m = matrix;  // working copy: `matrix` is const, and m is handed to permute_qubits_and_matrix
+    std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5};
+    permute_qubits_and_matrix(dsorted, 6, m);
+
+    // Pack m (as left by permute_qubits_and_matrix) into 1024 registers: 4 rows x 1 column per loadab call (defined elsewhere).
+    __m512d mm[1024];
+    for (unsigned b = 0; b < 16; ++b)
+        for (unsigned r = 0; r < 16; ++r)
+            for (unsigned c = 0; c < 4; ++c)
+                mm[b*64+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]);
+
+    // Nested loops skip the six target bits; this presumes dsorted is in descending order after the permute (TODO confirm).
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+                                for (std::size_t i6 = 0; i6 < dsorted[5]; ++i6){
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+                                for (std::size_t i6 = 0; i6 < dsorted[5]; ++i6){
+                                    if (((i0 + i1 + i2 + i3 + i4 + i5 + i6)&ctrlmask) == ctrlmask)
+                                        kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+#else
+    // MSVC path: one flat loop with a signed index and no collapse clause; (i & dmask) == 0 keeps indices whose target bits are all clear.
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5];
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+    } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+    }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernel7.hpp b/src/Simulation/Native/src/external/avx512/kernel7.hpp
new file mode 100644
index 00000000000..8f80d7c7d34
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernel7.hpp
@@ -0,0 +1,417 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+// kernel_core, 7 offsets: reads the 128 entries psi[I + (any sum of a subset of d0..d6)], accumulates them against m[0..4095] into 32 vectors, then overwrites the same 128 entries.
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, std::size_t d5, std::size_t d6, M const& m)
+{
+ __m512d v[4];
+// Input group 0: the four entries at I plus {0, d0, d1, d0 + d1}. What load1x4 puts in each vector lane is defined outside this file.
+ v[0] = load1x4(&psi[I]);
+ v[1] = load1x4(&psi[I + d0]);
+ v[2] = load1x4(&psi[I + d1]);
+ v[3] = load1x4(&psi[I + d0 + d1]);
+// 32 accumulators, all starting at zero; tmp[k] is handed to the k-th store() call at the end of the function.
+ __m512d tmp[32] = {_mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd(), _mm512_setzero_pd()};
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[0 + i * 4 + 0], fma(v[1], m[0 + i * 4 + 1], fma(v[2], m[0 + i * 4 + 2], fma(v[3], m[0 + i * 4 + 3], tmp[i]))));
+ }
+// Input group 1 adds d2. Every later group adds the next combination of d2..d6 (d2 is the lowest bit of the group number) and raises the m base by 128.
+
+ v[0] = load1x4(&psi[I + d2]);
+ v[1] = load1x4(&psi[I + d0 + d2]);
+ v[2] = load1x4(&psi[I + d1 + d2]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[128 + i * 4 + 0], fma(v[1], m[128 + i * 4 + 1], fma(v[2], m[128 + i * 4 + 2], fma(v[3], m[128 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3]);
+ v[1] = load1x4(&psi[I + d0 + d3]);
+ v[2] = load1x4(&psi[I + d1 + d3]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[256 + i * 4 + 0], fma(v[1], m[256 + i * 4 + 1], fma(v[2], m[256 + i * 4 + 2], fma(v[3], m[256 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[384 + i * 4 + 0], fma(v[1], m[384 + i * 4 + 1], fma(v[2], m[384 + i * 4 + 2], fma(v[3], m[384 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d4]);
+ v[1] = load1x4(&psi[I + d0 + d4]);
+ v[2] = load1x4(&psi[I + d1 + d4]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d4]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[512 + i * 4 + 0], fma(v[1], m[512 + i * 4 + 1], fma(v[2], m[512 + i * 4 + 2], fma(v[3], m[512 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d4]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d4]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d4]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[640 + i * 4 + 0], fma(v[1], m[640 + i * 4 + 1], fma(v[2], m[640 + i * 4 + 2], fma(v[3], m[640 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d4]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d4]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d4]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[768 + i * 4 + 0], fma(v[1], m[768 + i * 4 + 1], fma(v[2], m[768 + i * 4 + 2], fma(v[3], m[768 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d4]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[896 + i * 4 + 0], fma(v[1], m[896 + i * 4 + 1], fma(v[2], m[896 + i * 4 + 2], fma(v[3], m[896 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d5]);
+ v[1] = load1x4(&psi[I + d0 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1024 + i * 4 + 0], fma(v[1], m[1024 + i * 4 + 1], fma(v[2], m[1024 + i * 4 + 2], fma(v[3], m[1024 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1152 + i * 4 + 0], fma(v[1], m[1152 + i * 4 + 1], fma(v[2], m[1152 + i * 4 + 2], fma(v[3], m[1152 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1280 + i * 4 + 0], fma(v[1], m[1280 + i * 4 + 1], fma(v[2], m[1280 + i * 4 + 2], fma(v[3], m[1280 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1408 + i * 4 + 0], fma(v[1], m[1408 + i * 4 + 1], fma(v[2], m[1408 + i * 4 + 2], fma(v[3], m[1408 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d4 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d4 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d4 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1536 + i * 4 + 0], fma(v[1], m[1536 + i * 4 + 1], fma(v[2], m[1536 + i * 4 + 2], fma(v[3], m[1536 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d4 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1664 + i * 4 + 0], fma(v[1], m[1664 + i * 4 + 1], fma(v[2], m[1664 + i * 4 + 2], fma(v[3], m[1664 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d4 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1792 + i * 4 + 0], fma(v[1], m[1792 + i * 4 + 1], fma(v[2], m[1792 + i * 4 + 2], fma(v[3], m[1792 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[1920 + i * 4 + 0], fma(v[1], m[1920 + i * 4 + 1], fma(v[2], m[1920 + i * 4 + 2], fma(v[3], m[1920 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d6]);
+ v[1] = load1x4(&psi[I + d0 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2048 + i * 4 + 0], fma(v[1], m[2048 + i * 4 + 1], fma(v[2], m[2048 + i * 4 + 2], fma(v[3], m[2048 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2176 + i * 4 + 0], fma(v[1], m[2176 + i * 4 + 1], fma(v[2], m[2176 + i * 4 + 2], fma(v[3], m[2176 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2304 + i * 4 + 0], fma(v[1], m[2304 + i * 4 + 1], fma(v[2], m[2304 + i * 4 + 2], fma(v[3], m[2304 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2432 + i * 4 + 0], fma(v[1], m[2432 + i * 4 + 1], fma(v[2], m[2432 + i * 4 + 2], fma(v[3], m[2432 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d4 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d4 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d4 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d4 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2560 + i * 4 + 0], fma(v[1], m[2560 + i * 4 + 1], fma(v[2], m[2560 + i * 4 + 2], fma(v[3], m[2560 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d4 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d4 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d4 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2688 + i * 4 + 0], fma(v[1], m[2688 + i * 4 + 1], fma(v[2], m[2688 + i * 4 + 2], fma(v[3], m[2688 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d4 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d4 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d4 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2816 + i * 4 + 0], fma(v[1], m[2816 + i * 4 + 1], fma(v[2], m[2816 + i * 4 + 2], fma(v[3], m[2816 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d4 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[2944 + i * 4 + 0], fma(v[1], m[2944 + i * 4 + 1], fma(v[2], m[2944 + i * 4 + 2], fma(v[3], m[2944 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3072 + i * 4 + 0], fma(v[1], m[3072 + i * 4 + 1], fma(v[2], m[3072 + i * 4 + 2], fma(v[3], m[3072 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3200 + i * 4 + 0], fma(v[1], m[3200 + i * 4 + 1], fma(v[2], m[3200 + i * 4 + 2], fma(v[3], m[3200 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3328 + i * 4 + 0], fma(v[1], m[3328 + i * 4 + 1], fma(v[2], m[3328 + i * 4 + 2], fma(v[3], m[3328 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3456 + i * 4 + 0], fma(v[1], m[3456 + i * 4 + 1], fma(v[2], m[3456 + i * 4 + 2], fma(v[3], m[3456 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d4 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d4 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d4 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d4 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3584 + i * 4 + 0], fma(v[1], m[3584 + i * 4 + 1], fma(v[2], m[3584 + i * 4 + 2], fma(v[3], m[3584 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d4 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d4 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d4 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d4 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3712 + i * 4 + 0], fma(v[1], m[3712 + i * 4 + 1], fma(v[2], m[3712 + i * 4 + 2], fma(v[3], m[3712 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d3 + d4 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d3 + d4 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d3 + d4 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d3 + d4 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3840 + i * 4 + 0], fma(v[1], m[3840 + i * 4 + 1], fma(v[2], m[3840 + i * 4 + 2], fma(v[3], m[3840 + i * 4 + 3], tmp[i]))));
+ }
+
+
+ v[0] = load1x4(&psi[I + d2 + d3 + d4 + d5 + d6]);
+ v[1] = load1x4(&psi[I + d0 + d2 + d3 + d4 + d5 + d6]);
+ v[2] = load1x4(&psi[I + d1 + d2 + d3 + d4 + d5 + d6]);
+ v[3] = load1x4(&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6]);
+ for (unsigned i = 0; i < 32; ++i){
+ tmp[i] = fma(v[0], m[3968 + i * 4 + 0], fma(v[1], m[3968 + i * 4 + 1], fma(v[2], m[3968 + i * 4 + 2], fma(v[3], m[3968 + i * 4 + 3], tmp[i]))));
+ }
+// Write-back: call k passes tmp[k] with four destinations, I + {d0 + d1, d1, d0, 0} plus the same d2..d6 combination that input group k used.
+ store((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], (double*)&psi[I + d0], (double*)&psi[I], tmp[0]);
+ store((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], (double*)&psi[I + d0 + d2], (double*)&psi[I + d2], tmp[1]);
+ store((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], (double*)&psi[I + d0 + d3], (double*)&psi[I + d3], tmp[2]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], (double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], tmp[3]);
+ store((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], (double*)&psi[I + d0 + d4], (double*)&psi[I + d4], tmp[4]);
+ store((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], (double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], tmp[5]);
+ store((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], (double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], tmp[6]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], (double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], tmp[7]);
+ store((double*)&psi[I + d0 + d1 + d5], (double*)&psi[I + d1 + d5], (double*)&psi[I + d0 + d5], (double*)&psi[I + d5], tmp[8]);
+ store((double*)&psi[I + d0 + d1 + d2 + d5], (double*)&psi[I + d1 + d2 + d5], (double*)&psi[I + d0 + d2 + d5], (double*)&psi[I + d2 + d5], tmp[9]);
+ store((double*)&psi[I + d0 + d1 + d3 + d5], (double*)&psi[I + d1 + d3 + d5], (double*)&psi[I + d0 + d3 + d5], (double*)&psi[I + d3 + d5], tmp[10]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d5], (double*)&psi[I + d1 + d2 + d3 + d5], (double*)&psi[I + d0 + d2 + d3 + d5], (double*)&psi[I + d2 + d3 + d5], tmp[11]);
+ store((double*)&psi[I + d0 + d1 + d4 + d5], (double*)&psi[I + d1 + d4 + d5], (double*)&psi[I + d0 + d4 + d5], (double*)&psi[I + d4 + d5], tmp[12]);
+ store((double*)&psi[I + d0 + d1 + d2 + d4 + d5], (double*)&psi[I + d1 + d2 + d4 + d5], (double*)&psi[I + d0 + d2 + d4 + d5], (double*)&psi[I + d2 + d4 + d5], tmp[13]);
+ store((double*)&psi[I + d0 + d1 + d3 + d4 + d5], (double*)&psi[I + d1 + d3 + d4 + d5], (double*)&psi[I + d0 + d3 + d4 + d5], (double*)&psi[I + d3 + d4 + d5], tmp[14]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d1 + d2 + d3 + d4 + d5], (double*)&psi[I + d0 + d2 + d3 + d4 + d5], (double*)&psi[I + d2 + d3 + d4 + d5], tmp[15]);
+ store((double*)&psi[I + d0 + d1 + d6], (double*)&psi[I + d1 + d6], (double*)&psi[I + d0 + d6], (double*)&psi[I + d6], tmp[16]);
+ store((double*)&psi[I + d0 + d1 + d2 + d6], (double*)&psi[I + d1 + d2 + d6], (double*)&psi[I + d0 + d2 + d6], (double*)&psi[I + d2 + d6], tmp[17]);
+ store((double*)&psi[I + d0 + d1 + d3 + d6], (double*)&psi[I + d1 + d3 + d6], (double*)&psi[I + d0 + d3 + d6], (double*)&psi[I + d3 + d6], tmp[18]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d6], (double*)&psi[I + d1 + d2 + d3 + d6], (double*)&psi[I + d0 + d2 + d3 + d6], (double*)&psi[I + d2 + d3 + d6], tmp[19]);
+ store((double*)&psi[I + d0 + d1 + d4 + d6], (double*)&psi[I + d1 + d4 + d6], (double*)&psi[I + d0 + d4 + d6], (double*)&psi[I + d4 + d6], tmp[20]);
+ store((double*)&psi[I + d0 + d1 + d2 + d4 + d6], (double*)&psi[I + d1 + d2 + d4 + d6], (double*)&psi[I + d0 + d2 + d4 + d6], (double*)&psi[I + d2 + d4 + d6], tmp[21]);
+ store((double*)&psi[I + d0 + d1 + d3 + d4 + d6], (double*)&psi[I + d1 + d3 + d4 + d6], (double*)&psi[I + d0 + d3 + d4 + d6], (double*)&psi[I + d3 + d4 + d6], tmp[22]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d6], (double*)&psi[I + d2 + d3 + d4 + d6], tmp[23]);
+ store((double*)&psi[I + d0 + d1 + d5 + d6], (double*)&psi[I + d1 + d5 + d6], (double*)&psi[I + d0 + d5 + d6], (double*)&psi[I + d5 + d6], tmp[24]);
+ store((double*)&psi[I + d0 + d1 + d2 + d5 + d6], (double*)&psi[I + d1 + d2 + d5 + d6], (double*)&psi[I + d0 + d2 + d5 + d6], (double*)&psi[I + d2 + d5 + d6], tmp[25]);
+ store((double*)&psi[I + d0 + d1 + d3 + d5 + d6], (double*)&psi[I + d1 + d3 + d5 + d6], (double*)&psi[I + d0 + d3 + d5 + d6], (double*)&psi[I + d3 + d5 + d6], tmp[26]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d5 + d6], (double*)&psi[I + d2 + d3 + d5 + d6], tmp[27]);
+ store((double*)&psi[I + d0 + d1 + d4 + d5 + d6], (double*)&psi[I + d1 + d4 + d5 + d6], (double*)&psi[I + d0 + d4 + d5 + d6], (double*)&psi[I + d4 + d5 + d6], tmp[28]);
+ store((double*)&psi[I + d0 + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d4 + d5 + d6], (double*)&psi[I + d2 + d4 + d5 + d6], tmp[29]);
+ store((double*)&psi[I + d0 + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d3 + d4 + d5 + d6], (double*)&psi[I + d3 + d4 + d5 + d6], tmp[30]);
+ store((double*)&psi[I + d0 + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d1 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d0 + d2 + d3 + d4 + d5 + d6], (double*)&psi[I + d2 + d3 + d4 + d5 + d6], tmp[31]);
+
+}
+
+// Applies a 7-qubit matrix to psi via kernel_core. Bit indices id[.] are given from high to low (e.g. control first for CNOT); ctrlmask == 0 means no control condition.
+template <class V, class M>
+void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+ std::size_t n = psi.size();
+ std::size_t d0 = 1ULL << id0; // d_k is the index offset contributed by bit id_k
+ std::size_t d1 = 1ULL << id1;
+ std::size_t d2 = 1ULL << id2;
+ std::size_t d3 = 1ULL << id3;
+ std::size_t d4 = 1ULL << id4;
+ std::size_t d5 = 1ULL << id5;
+ std::size_t d6 = 1ULL << id6;
+ auto m = matrix; // local copy, so the helper below is never handed the caller's matrix
+ std::size_t dsorted[] = {d0, d1, d2, d3, d4, d5, d6};
+ permute_qubits_and_matrix(dsorted, 7, m); // NOTE(review): the nested loops below only make sense if this leaves dsorted in descending order — confirm in the helper
+// Pack m for kernel_core: entry b*128 + r*4 + c combines rows 4r..4r+3 of column 4b + c (the lane layout is decided by loadab).
+ __m512d mm[4096]; // 32 * 32 * 4 vectors of 64 bytes each = 256 KiB local array
+ for (unsigned b = 0; b < 32; ++b){
+ for (unsigned r = 0; r < 32; ++r){
+ for (unsigned c = 0; c < 4; ++c){
+ mm[b*128+r*4+c] = loadab(&m[4*r+0][c+b*4], &m[4*r+1][c+b*4], &m[4*r+2][c+b*4], &m[4*r+3][c+b*4]);
+ }
+ }
+ }
+// Both branches below call kernel_core once per base index; with ctrlmask != 0 a base index is skipped unless all ctrlmask bits are set in it.
+
+#ifndef _MSC_VER // nested-loop form for every compiler except MSVC
+ if (ctrlmask == 0){
+ #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
+ for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+ for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+ for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+ for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+ for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){
+ for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){
+ kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ else{
+ #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static)
+ for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+ for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+ for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+ for (std::size_t i5 = 0; i5 < dsorted[4]; i5 += 2 * dsorted[5]){
+ for (std::size_t i6 = 0; i6 < dsorted[5]; i6 += 2 * dsorted[6]){
+ for (std::size_t i7 = 0; i7 < dsorted[6]; ++i7){
+ if (((i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7)&ctrlmask) == ctrlmask)
+ kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+#else // _MSC_VER: one flat loop with a signed index (presumably because MSVC's OpenMP lacks collapse and unsigned loop variables — confirm)
+ std::intptr_t zero = 0;
+ std::intptr_t dmask = dsorted[0] + dsorted[1] + dsorted[2] + dsorted[3] + dsorted[4] + dsorted[5] + dsorted[6]; // sum of the seven single-bit offsets
+// Visit every index and keep only those where (i & dmask) == 0, i.e. none of the seven offset bits is set.
+ if (ctrlmask == 0){
+ #pragma omp parallel for schedule(static)
+ for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+ if ((i & dmask) == zero)
+ kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+ } else {
+ #pragma omp parallel for schedule(static)
+ for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+ if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+ kernel_core(psi, i, dsorted[6], dsorted[5], dsorted[4], dsorted[3], dsorted[2], dsorted[1], dsorted[0], mm);
+ }
+#endif
+}
+
diff --git a/src/Simulation/Native/src/external/avx512/kernels.hpp b/src/Simulation/Native/src/external/avx512/kernels.hpp
new file mode 100644
index 00000000000..d5a056663bd
--- /dev/null
+++ b/src/Simulation/Native/src/external/avx512/kernels.hpp
@@ -0,0 +1,31 @@
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+#ifndef KERNELS_HPP_
+#define KERNELS_HPP_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../cintrin.hpp"
+#include "util/alignedalloc.hpp"
+// LOOP_COLLAPSEn is the collapse() argument of the "#pragma omp parallel for" in the n-qubit kernel; the 6- and 7-qubit kernels apply it to nests of 7 and 8 loops.
+#define LOOP_COLLAPSE1 2
+#define LOOP_COLLAPSE2 3
+#define LOOP_COLLAPSE3 4
+#define LOOP_COLLAPSE4 5
+#define LOOP_COLLAPSE5 6
+#define LOOP_COLLAPSE6 7
+#define LOOP_COLLAPSE7 8
+
+#include "kernel1.hpp"
+#include "kernel2.hpp"
+#include "kernel3.hpp"
+#include "kernel4.hpp"
+#include "kernel5.hpp"
+#include "kernel6.hpp"
+#include "kernel7.hpp"
+
+#endif
diff --git a/src/Simulation/Native/src/simulator/factory.cpp b/src/Simulation/Native/src/simulator/factory.cpp
index 6d46fbcf7f7..f2023319b95 100644
--- a/src/Simulation/Native/src/simulator/factory.cpp
+++ b/src/Simulation/Native/src/simulator/factory.cpp
@@ -22,6 +22,10 @@ namespace Microsoft
{
MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
}
+ namespace SimulatorAVX512 // presumably matches the createSimulator defined in simulatoravx512.cpp — confirm
+ {
+ MICROSOFT_QUANTUM_DECL_IMPORT Microsoft::Quantum::Simulator::SimulatorInterface* createSimulator(unsigned);
+ }
}
}
diff --git a/src/Simulation/Native/src/simulator/simulatoravx512.cpp b/src/Simulation/Native/src/simulator/simulatoravx512.cpp
new file mode 100644
index 00000000000..88bc438dccc
--- /dev/null
+++ b/src/Simulation/Native/src/simulator/simulatoravx512.cpp
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#define HAVE_INTRINSICS
+#define HAVE_AVX512
+#define HAVE_FMA
+
+#include "simulator/simulator.hpp"
+
+
+namespace sim = Microsoft::Quantum::SimulatorAVX512;
+// Factory for the translation unit built with HAVE_AVX512: constructs a sim::SimulatorType from maxlocal and returns it as a SimulatorInterface*.
+MICROSOFT_QUANTUM_DECL Microsoft::Quantum::Simulator::SimulatorInterface* sim::createSimulator(unsigned maxlocal)
+{
+ return new sim::SimulatorType(maxlocal); // allocated with new and never freed here; the receiver of the raw pointer must release it
+}
diff --git a/src/Simulation/Native/src/util/cpuid.hpp b/src/Simulation/Native/src/util/cpuid.hpp
index cbce00cf2d1..705d65dec85 100644
--- a/src/Simulation/Native/src/util/cpuid.hpp
+++ b/src/Simulation/Native/src/util/cpuid.hpp
@@ -40,7 +40,7 @@ namespace Microsoft
{
#ifndef _MSC_VER
//__builtin_cpu_init();
- return false; // __builtin_cpu_supports("avx512bw");
+ return (__builtin_cpu_supports("avx512f") != 0 && __builtin_cpu_supports("avx512cd") != 0);
#else
int cpuInfo[4];
__cpuid(cpuInfo,0);
From 21cd32e5c8a31c9d3bc1e792b781617ba4d9eec3 Mon Sep 17 00:00:00 2001
From: "Stefan J. Wernli"
Date: Mon, 15 Jun 2020 17:30:25 -0700
Subject: [PATCH 4/4] Add fma for AVX512
---
src/Simulation/Native/src/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Simulation/Native/src/CMakeLists.txt b/src/Simulation/Native/src/CMakeLists.txt
index 08da4708397..faf1f1c480c 100644
--- a/src/Simulation/Native/src/CMakeLists.txt
+++ b/src/Simulation/Native/src/CMakeLists.txt
@@ -10,7 +10,7 @@ set(FMAFLAGS "")
else(MSVC)
SET(AVXFLAGS "-mavx")
set(AVX2FLAGS "-mfma -mavx2")
-set(AVX512FLAGS "-mavx512f -mavx512cd")
+set(AVX512FLAGS "-mfma -mavx512f -mavx512cd")
set(FMAFLAGS )
endif(MSVC)