diff --git a/src/Simulation/Native/CMakeLists.txt b/src/Simulation/Native/CMakeLists.txt index 0d5e14a053c..fd2d668f1c9 100644 --- a/src/Simulation/Native/CMakeLists.txt +++ b/src/Simulation/Native/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) ADD_DEFINITIONS(-D_SCL_SECURE_NO_WARNINGS) # Configuration options (choose one to turn on) -option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(ENABLE_OPENMP "Enable OpenMP Parallelization" ON) option(USE_SINGLE_PRECISION "Use single-precision floating point operations" OFF) option(HAVE_INTRINSICS "Have AVX intrinsics" OFF) diff --git a/src/Simulation/Native/codegen/codegen_fma.py b/src/Simulation/Native/codegen/codegen_fma.py new file mode 100644 index 00000000000..d54638b81ba --- /dev/null +++ b/src/Simulation/Native/codegen/codegen_fma.py @@ -0,0 +1,434 @@ +#!/usr/bin/env python3 +# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger +# Code generator for n-qubit gate + +import sys + + +def avx_type(complex_avx_len): + if complex_avx_len == 2: + return "__m256d" + elif complex_avx_len == 4: + return "__m512d" + elif complex_avx_len == 1: + return "std::complex" + else: + raise Exception("Unknown avx type.") + + +def avx_prefix(complex_avx_len): + if complex_avx_len == 2: + return "_mm256" + elif complex_avx_len == 4: + return "_mm512" + else: + raise Exception("Unknown avx type.") + + +def generate_kernel_core(N, n, kernelarray, blocks, only_one_matrix, unroll_loops, avx_len): + indent = 1 + + kernelarray.append("// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger\n\ntemplate \ninline void kernel_core(V& psi, std::size_t I") + for i in range(n): + kernelarray.append(", std::size_t d" + str(i)) + + kernelarray.append(", M const& m") + if not only_one_matrix: + kernelarray.append(", M const& mt") + kernelarray.append(")\n{\n") + + indices = [""]*N + for num in range(N): + tmp = "I" + for b in range(n): + if (num>>b) & 1: + tmp = tmp + " + d"+str(b) + indices[num] = tmp + + add = ["\t" + avx_type(avx_len) + " v[" + str(int(N/blocks)) + "];\n"] + for b in range(blocks): + if avx_len == 4: + x4 = "x4" + else: + x4 = "" + for num in range(int(N/blocks)*b, int(N/blocks)*(b+1)): + add.append("\n\tv[" + str(int(num % (N/blocks))) + "] = ") + if avx_len > 1: + add.append("load1" + x4 + "(&") + add.append("psi[" + indices[num] + "]") + if avx_len > 1: + add.append(")") + add.append(";") + add.append("\n") + if b == 0: + add.append("\n\t" + avx_type(avx_len) + " tmp[" + str(int(N/avx_len)) + "] = {") + for i in range(int(N/avx_len)): + if avx_len > 1: + add.append(avx_prefix(avx_len) + "_setzero_pd(), ") + else: + add.append("0., ") + add[-1] = add[-1][:-2] + "};\n" + + if unroll_loops: + inline_FMAs = False + miniblocks = N/avx_len/4 + miniblocks = max(miniblocks, 1) + for mb in range(int(miniblocks)): + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + + for i in range(int(N/blocks)): + if not inline_FMAs or avx_len == 1: + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)+i+r*int(N/blocks)) +"], ") + else: + add.append(avx_prefix(avx_len) + "_fmadd_pd(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and not 
only_one_matrix and not avx_len == 1: + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + for i in range(int(N/blocks)): + add.append(avx_prefix(avx_len) + "_fmadd_pd(" + avx_prefix(avx_len) + "_permute_pd(v[" + str(i) + "], 5), mt[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and only_one_matrix and avx_len > 1: + raise Exception("Not implemented yet!") + + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + if b == blocks-1: + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + else: + add.append("\tfor (unsigned i = 0; i < " + str(int(N/avx_len)) + "; ++i){\n\t\ttmp[i] = ") + for i in range(int(N/blocks)): + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * "+str(int(N/blocks)) + " + " + str(i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * " + str(int(N/blocks)) + " + " + str(i) +"], ") + add.append("tmp[i]") + add.append(")"*int(N/blocks)+";") + add.append("\n\t}\n") + if b == blocks-1: + for r in range(int(N/avx_len)): + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + + add.append("\n") + kernelarray.append("".join(add)) + add=[""] + kernelarray.append("".join(add)) + kernelarray.append("\n}\n\n") + +def generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx_len): + kernel = "" + + N = 1<\n" + kernel = kernel + "void kernel(V& psi" + for i in range(n-1,-1,-1): + kernel = kernel + ", unsigned id"+str(i) + kernel = kernel + ", M const& matrix, std::size_t ctrlmask)\n{\n std::size_t n = psi.size();\n" + + for i in idx: + kernel = kernel + "\tstd::size_t d"+str(i)+" = 1ULL << id"+str(i)+";\n" + + kernel += ("\tauto m = matrix;\n" + "\tstd::size_t dsorted[] = {") + add = ["d0"] + for i in range(1,n): + add.append(", d" + str(i)) + add.append("};\n") + add.append("\tpermute_qubits_and_matrix(dsorted, " + str(n) + ", m);\n") + kernel += "".join(add) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mm[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loada") + if only_one_matrix: + add[-1] = add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mm[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmm[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = ") + if avx_len > 1: + add.append("loada") + if only_one_matrix: + add[-1] = 
add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");" + else: + add.append("m[r][c+b*"+str(int(N/blocks))+"];") + add.append("\n\t\t\t}\n\t\t}\n\t}\n") + kernelarray.append("".join(add)) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mmt[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loadbm") + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mmt[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmmt[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = loadbm(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");\n\t\t\t}\n\t\t}\n\t}\n" + + if only_one_matrix: + add = [] + + add.append("\n\n") + kernelarray.append("".join(add)) + + nc = len(idx)-1 + add = [] + indent = 1 + kernelarray.append("#ifndef _MSC_VER\n") + kernelarray.append("\t"*indent + "if (ctrlmask == 0){\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + + kernelarray.append("\t"*indent + "kernel_core(psi, i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + # if controlmask != 0 + indent = 1 + kernelarray.append("\t"*indent + "else{\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + kernelarray.append("\t"*indent + "if (((i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + kernelarray.append(")&ctrlmask) == ctrlmask)\n") + kernelarray.append("\t"*(indent+1) + "kernel_core(psi, i0") + 
add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + +################ Start of _MSC_VER code block ################## + kernelarray.append("#else\n") + kernelarray.append(" std::intptr_t zero = 0;\n") + kernelarray.append(" std::intptr_t dmask = dsorted[0]"); + for i in range(n-1): kernelarray.append(" + dsorted["+str(i+1)+"]") + kernelarray.append( ";\n") + kernelarray.append("\n"); + kernelarray.append(" if (ctrlmask == 0){\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + # if controlmask != 0 + kernelarray.append(" } else {\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + kernelarray.append(" }\n") + kernelarray.append("#endif\n") + + kernelarray.append("}\n") + kernel = "".join([kernel,"".join(kernelarray)]) + return kernel + +def generate_includes(N): + return "#include \n#include \n#include \n#include \n#include \n#include \n" + \ + "#include \"alignedallocator.hpp\"\n#include \"timing.hpp\"\n#include \"cintrin.hpp\"\n" + \ + "#include \n#include \n\n" + \ + "#include \"util/par_for.hpp\"\n" + \ + "using namespace std;\n#define LOOP_COLLAPSE" + str(N) + " " + str(N+1) + "\n" + +def generate_main(n): + N = str(1 << n) + text = "using rowtype = vector,aligned_allocator,64>>;\nusing matrixtype = vector;\n\nint main(int argc, char *argv[]){" + text = text + "\n\tassert(argc > "+str(1+n)+");" + text = text + "\n\tsize_t N = 1ULL << atoi(argv[1]);" + for i in range(n): + text = text + "\n\tunsigned i" + str(i) + " = atoi(argv[" + str(i+2) + "]);" + + text = text + "\n\tmatrixtype m("+N+", rowtype("+N+"));"; + text = text + "\n\tfor (unsigned i = 0; i < "+N+"; ++i)\n\t\tfor (unsigned j = 0; j < "+N+"; ++j)\n\t\t\tm[i][j] = drand48();\n" + + text = text + "\n\tTimer t;\n\tfor (unsigned threads = 1; threads <= 24; ++threads){" + text = text + "\n\t\tomp_set_num_threads(threads);" + text = text + "\n\t\trowtype psi(N);\n\t\t#pragma omp parallel\n\t\t{\n\t\t\t#pragma omp for schedule(static)\n\t\t\tfor (size_t i = 0; i < psi.size(); ++i)\n\t\t\t\tpsi[i] = drand48();\n" + text = text + "\n\t\t\t#pragma omp single\n\t\t\tt.start();" + text = text + "\n\t\t\tkernel(psi, N" + for i in range(n): + text = text + ", i" + str(i) + text = text + ", m, 0);" + text = text + "\n\t\t\t#pragma omp waitall\n\t\t\t#pragma omp single\n\t\t\t{ cout << \"threads: \" << threads << \", time:\" << t.stop()*1.e-6 << \"\\n\"; }" + text = text + "\n\t\t}" # end for + text = text + "\n\t}" # end for + 
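The _MSC_VER branch above trades the collapsed loop nest for one flat loop over all basis-state indices, filtering with bit masks: dmask has one bit set per target qubit, and ctrlmask one bit per control. A small standalone sketch of that selection logic, with made-up qubit positions:

```cpp
// Sketch (illustrative values, not from the patch) of the index selection in
// the _MSC_VER branch above: an index i is the base of a 2^n-amplitude block
// iff (i & dmask) == 0, and it participates in a controlled gate iff
// (i & ctrlmask) == ctrlmask.
#include <cstdint>
#include <cstdio>

int main() {
    std::intptr_t d0 = 1, d1 = 4;    // target qubits 0 and 2
    std::intptr_t dmask = d0 + d1;
    std::intptr_t ctrlmask = 1 << 1; // control on qubit 1

    for (std::intptr_t i = 0; i < 16; ++i)
        if ((i & ctrlmask) == ctrlmask && (i & dmask) == 0)
            std::printf("apply kernel_core at block base %lld\n",
                        static_cast<long long>(i));
    // Prints block bases 2 and 10: the only indices of a 4-qubit register
    // with qubit 1 set and qubits 0 and 2 clear.
    return 0;
}
```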
text = text + "\n\n}" # end main
+    return text
+
+
+#####################################################
+#                       MAIN                        #
+#####################################################
+
+if len(sys.argv) < 2:
+    print("Generates the code for an n-qubit gate.\nUsage:\n./codegen_fma.py [n_qubits] {n_blocks} {only one matrix?} {unroll loops?} {none|avx2|avx512}\n\n")
+    exit()
+
+n = int(sys.argv[1])  # number of qubits
+
+try:  # number of blocks
+    blocks = int(sys.argv[2])
+except Exception:
+    blocks = 1
+
+try:
+    only_one_matrix = int(sys.argv[3])
+except Exception:
+    only_one_matrix = False
+
+try:
+    unroll_loops = int(sys.argv[4])
+except Exception:
+    unroll_loops = False
+
+try:
+    avx = str(sys.argv[5])
+    if avx == "avx512":
+        avx = 4
+    elif avx == "avx2" or avx == "avx":
+        avx = 2
+    elif avx == "none":
+        avx = 1
+        only_one_matrix = True
+    else:
+        raise RuntimeError("Unknown avx type: {}".format(avx))
+except IndexError:
+    avx = 2
+
+while (1 << n)/blocks < 1:
+    blocks = int(blocks/2)
+
+if (1 << n) < avx:
+    avx = int(avx/2)
+
+kernel = generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx)  # generate code for n-qubit gate
+
+# if the user wants a main (for testing), generate it as well:
+for a in sys.argv:
+    if str(a) == "gen_main":
+        kernel = generate_includes(n) + kernel + generate_main(n)
+
+print(kernel)
diff --git a/src/Simulation/Native/codegen/codegen_test.cpp b/src/Simulation/Native/codegen/codegen_test.cpp
new file mode 100644
index 00000000000..1ff778ce96d
--- /dev/null
+++ b/src/Simulation/Native/codegen/codegen_test.cpp
@@ -0,0 +1,127 @@
+#include <vector>
+#include <complex>
+#include <iostream>
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+#include "alignedalloc.hpp"
+//#include "timing.hpp"
+#include "cintrin.hpp"
+#include <omp.h>
+#include <cstdint>
+
+#include "util/par_for.hpp"
+using namespace std;
+#define LOOP_COLLAPSE1 2
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m)
+{
+    std::complex<double> v[2];
+
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+
+    std::complex<double> tmp[2] = {0., 0.};
+
+    tmp[0] = fma(v[0], m[0], fma(v[1], m[1], tmp[0]));
+    tmp[1] = fma(v[0], m[2], fma(v[1], m[3], tmp[1]));
+    psi[I] = tmp[0];
+    psi[I + d0] = tmp[1];
+
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;
+    auto m = matrix;
+    std::size_t dsorted[] = {d0};
+    permute_qubits_and_matrix(dsorted, 1, m);
+
+    std::complex<double> mm[4];
+    for (unsigned b = 0; b < 1; ++b){
+        for (unsigned r = 0; r < 2; ++r){
+            for (unsigned c = 0; c < 2; ++c){
+                mm[b*4+r*2+c] = m[r][c+b*2];
+            }
+        }
+    }
+
+
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, dsorted[0], mm);
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, dsorted[0], mm);
+            }
+        }
+    }
+#else
+    intptr_t zero = 0;
+    intptr_t dmask = dsorted[0];
+
+    if (ctrlmask == 0){
+        auto thrdFnc = [&](size_t dsorted[], intptr_t& dmask, intptr_t& zero, V& psi, M const& m) {
+            return [&](unsigned i) {
+                if ((i & dmask) == zero)
+                    kernel_core(psi, i, dsorted[0], m);
+            };
+        };
+        pl::async_par_for(0, n, thrdFnc(dsorted, dmask, zero, psi, m));
+    } else {
+        auto thrdFnc = [&](size_t dsorted[], size_t& ctrlmask, intptr_t& dmask, intptr_t& zero, V& psi, M const& m) {
+            return [&](unsigned i) {
+                if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                    kernel_core(psi, i, dsorted[0], m);
+            };
+        };
+        pl::async_par_for(0, n, thrdFnc(dsorted, ctrlmask, dmask, zero, psi, m));
+    }
+#endif
+}
+using rowtype = vector<complex<double>, AlignedAlloc<complex<double>, 64>>;
+using matrixtype = vector<rowtype>;
+
+int main(int argc, char *argv[]){
+    assert(argc > 2);
+    size_t N = 1ULL << atoi(argv[1]);
+    unsigned i0 = atoi(argv[2]);
+    matrixtype m(2, rowtype(2));
+    for (unsigned i = 0; i < 2; ++i)
+        for (unsigned j = 0; j < 2; ++j)
+            m[i][j] = drand48();
+
+    Timer t;
+    for (unsigned threads = 1; threads <= 24; ++threads){
+        omp_set_num_threads(threads);
+        rowtype psi(N);
+        #pragma omp parallel
+        {
+            #pragma omp for schedule(static)
+            for (size_t i = 0; i < psi.size(); ++i)
+                psi[i] = drand48();
+
+            #pragma omp single
+            t.start();
+            kernel(psi, N, i0, m, 0);
+            #pragma omp waitall
+            #pragma omp single
+            { cout << "threads: " << threads << ", time:" << t.stop()*1.e-6 << "\n"; }
+        }
+    }
+
+}
diff --git a/src/Simulation/Native/codegen/generate.ps1 b/src/Simulation/Native/codegen/generate.ps1
new file mode 100644
index 00000000000..1cc506d187b
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.ps1
@@ -0,0 +1,21 @@
+# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel
+# instead of using two matrices (which allows the number of operations to be reduced
+# by pre-computation)
+$onematrix=(0,0,0,0,0,0,1,1) # g++ best
+
+# unroll[i] determines whether to unroll loops
+$unroll=(1,1,1,1,1,1,0,0) # g++ best
+
+# register length to use: can be none, avx2, or avx512
+# avx=avx2
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+$b=(0,2,4,8,16,16,16,32) # gcc & icc best
+
+foreach ($i in 1..7) {
+    "Generating $i kernel with $($b[$i]) blocks."
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] none > nointrin/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx > avx/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx2 > avx2/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx512 > avx512/kernel$i.hpp
+}
\ No newline at end of file
diff --git a/src/Simulation/Native/codegen/generate.sh b/src/Simulation/Native/codegen/generate.sh
new file mode 100644
index 00000000000..2ec0557d571
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel
+# instead of using two matrices (which allows the number of operations to be reduced
+# by pre-computation)
+onematrix=(0 0 0 0 0 0 1 1) # g++ best
+#onematrix=(0 0 1 0 0 0 1 1) # icc best
+
+# unroll[i] determines whether to unroll loops
+unroll=(1 1 1 1 1 1 0 0) # g++ best
+#unroll=(1 1 1 0 0 1 0 0) # icc best
+#unroll=(0 0 0 0 0 0 0 0)
+
+# register length to use: can be none, avx2, or avx512
+avx=avx2
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+b=(0 2 4 8 16 16 16 32) # gcc & icc best
+
+for i in {1..7}
+do
+    echo "Generating $i kernel with ${b[$i]} blocks."
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} none > ../nointrin/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx > ../avx/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx2 > ../avx2/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx512 > ../avx512/kernel${i}.hpp
+done
diff --git a/src/Simulation/Native/src/config.hpp b/src/Simulation/Native/src/config.hpp
new file mode 100644
index 00000000000..363baa4d241
--- /dev/null
+++ b/src/Simulation/Native/src/config.hpp
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
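config.hpp, which begins here, funnels the CMake options into preprocessor symbols and binds the chosen backend class to a single SIMULATOR macro. A self-contained sketch of that dispatch pattern (the two stand-in structs are illustrative, not the simulator's real classes):

```cpp
// Self-contained sketch of the dispatch pattern config.hpp uses below: the
// preprocessor picks the widest available backend and binds it to one name.
// SimulatorGeneric/SimulatorAVX here are stand-ins, not the real classes.
#include <cstdio>

struct SimulatorGeneric { static constexpr const char* name = "generic"; };
struct SimulatorAVX     { static constexpr const char* name = "avx"; };

#ifdef HAVE_INTRINSICS
#define SIMULATOR SimulatorAVX
#else
#define SIMULATOR SimulatorGeneric
#endif

int main() {
    // Everything downstream refers only to SIMULATOR, so swapping backends
    // is a recompile, not a code change.
    std::printf("selected backend: %s\n", SIMULATOR::name);
    return 0;
}
```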
+ +#pragma once + +#include + +// check if we want to force single precision +/* #undef USE_SINGLE_PRECISION */ + +// check if we have AVX intrinsics +/* #undef HAVE_INTRINSICS */ + +// check if we have AVX-512 intrinsics +/* #undef HAVE_AVX512 */ + +// check if we want to use fused kernels +#define USE_GATE_FUSION + +#define BUILD_SHARED_LIBS + + +#if defined (_MSC_VER) && defined (BUILD_SHARED_LIBS) + +#ifdef BUILD_DLL +#define MICROSOFT_QUANTUM_DECL __declspec(dllexport) +#else +#define MICROSOFT_QUANTUM_DECL __declspec(dllimport) +#endif +#define MICROSOFT_QUANTUM_DECL_IMPORT __declspec(dllimport) +#else +#define MICROSOFT_QUANTUM_DECL +#define MICROSOFT_QUANTUM_DECL_IMPORT +#endif + +#ifdef HAVE_INTRINSICS +#ifdef HAVE_AVX512 +#define SIMULATOR SimulatorAVX512 +#else +#ifdef HAVE_FMA +#define SIMULATOR SimulatorAVX2 +#else +#define SIMULATOR SimulatorAVX +#endif +#endif +#else +#define SIMULATOR SimulatorGeneric +#endif + + diff --git a/src/Simulation/Native/src/external/avx/kernel1.hpp b/src/Simulation/Native/src/external/avx/kernel1.hpp index eac0cf47ea4..2bdcb04aea3 100644 --- a/src/Simulation/Native/src/external/avx/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx/kernel1.hpp @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx/kernel2.hpp b/src/Simulation/Native/src/external/avx/kernel2.hpp index 24f6c8ee13d..8012bf3638d 100644 --- a/src/Simulation/Native/src/external/avx/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx/kernel3.hpp b/src/Simulation/Native/src/external/avx/kernel3.hpp index a63dbc28693..bc7037c6004 100644 --- a/src/Simulation/Native/src/external/avx/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for 
collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel4.hpp b/src/Simulation/Native/src/external/avx/kernel4.hpp index 92f573a0255..e20a190af5c 100644 --- a/src/Simulation/Native/src/external/avx/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel5.hpp b/src/Simulation/Native/src/external/avx/kernel5.hpp index 0cdba4b89c5..8cf84656e68 100644 --- a/src/Simulation/Native/src/external/avx/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel6.hpp b/src/Simulation/Native/src/external/avx/kernel6.hpp index 343c6e3154c..087c4e83473 100644 --- a/src/Simulation/Native/src/external/avx/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) 
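The recurring hunk in these kernel headers swaps `#pragma omp for` for `#pragma omp parallel for`. An orphaned `omp for` with no enclosing parallel region binds to a team of one thread, so the old pragma ran the loop nest serially. A minimal demonstration (not part of the patch):

```cpp
// Minimal demonstration of the pragma change above: an orphaned
// "#pragma omp for" has no enclosing parallel region, so its binding team is
// just the encountering thread and the loop runs serially.
// "#pragma omp parallel for" creates the team and shares the loop in one step.
#include <omp.h>
#include <cstdio>

int main() {
    #pragma omp for schedule(static)           // no team: every iteration on thread 0
    for (int i = 0; i < 4; ++i)
        std::printf("orphaned for:  i=%d on thread %d\n", i, omp_get_thread_num());

    #pragma omp parallel for schedule(static)  // team created here: iterations spread out
    for (int i = 0; i < 4; ++i)
        std::printf("parallel for:  i=%d on thread %d\n", i, omp_get_thread_num());
    return 0;
}
```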
for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel7.hpp b/src/Simulation/Native/src/external/avx/kernel7.hpp index ecd0f45f3cf..197e800b208 100644 --- a/src/Simulation/Native/src/external/avx/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel1.hpp b/src/Simulation/Native/src/external/avx2/kernel1.hpp index eac0cf47ea4..2bdcb04aea3 100644 --- a/src/Simulation/Native/src/external/avx2/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel1.hpp @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx2/kernel2.hpp b/src/Simulation/Native/src/external/avx2/kernel2.hpp index 24f6c8ee13d..8012bf3638d 100644 --- a/src/Simulation/Native/src/external/avx2/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void 
kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx2/kernel3.hpp b/src/Simulation/Native/src/external/avx2/kernel3.hpp index a63dbc28693..bc7037c6004 100644 --- a/src/Simulation/Native/src/external/avx2/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel4.hpp b/src/Simulation/Native/src/external/avx2/kernel4.hpp index 92f573a0255..e20a190af5c 100644 --- a/src/Simulation/Native/src/external/avx2/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel5.hpp b/src/Simulation/Native/src/external/avx2/kernel5.hpp index 0cdba4b89c5..8cf84656e68 100644 --- a/src/Simulation/Native/src/external/avx2/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for 
collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel6.hpp b/src/Simulation/Native/src/external/avx2/kernel6.hpp index 343c6e3154c..087c4e83473 100644 --- a/src/Simulation/Native/src/external/avx2/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel7.hpp b/src/Simulation/Native/src/external/avx2/kernel7.hpp index ecd0f45f3cf..197e800b208 100644 --- a/src/Simulation/Native/src/external/avx2/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/cintrin.hpp b/src/Simulation/Native/src/external/cintrin.hpp index 8034408c1e8..b0eab16a79c 100644 --- a/src/Simulation/Native/src/external/cintrin.hpp +++ b/src/Simulation/Native/src/external/cintrin.hpp @@ -35,7 +35,10 @@ inline void permute_qubits_and_matrix(I *delta_list, unsigned n, M & matrix){ } inline std::complex fma(std::complex const& c1, std::complex const& c2, std::complex const& a){ - return c1*c2 + a; + // Expanded complex FMA to hard coded access (much faster) + double r = (c1.real() * c2.real() - c1.imag() * c2.imag()) + a.real(); + double i = (c1.real() * c2.imag() + c1.imag() * c2.real()) + a.imag(); + return std::complex(r, i); } inline __m256d fma(__m256d const& c1, __m256d const& c2, __m256d const& a){ diff --git a/src/Simulation/Native/src/external/fused.hpp b/src/Simulation/Native/src/external/fused.hpp index 6c1137ceb3b..235cf8ed15c 100644 --- 
a/src/Simulation/Native/src/external/fused.hpp
+++ b/src/Simulation/Native/src/external/fused.hpp
@@ -15,7 +15,7 @@
 #ifdef HAVE_FMA
 #include "external/avx2/kernels.hpp"
 #else
-#include "external/avx2/kernels.hpp"
+#include "external/avx/kernels.hpp"
 #endif
 #endif
@@ -30,7 +30,11 @@ namespace SIMULATOR
 class Fused
 {
   public:
-    Fused() {}
+    Fused() {
+        wfnCapacity = 0u;   // used to optimize runtime parameters
+        maxFusedSpan = -1;  // determine span to use at runtime
+        maxFusedDepth = 99; // determine max depth to use at runtime
+    }
 
     inline void reset()
     {
@@ -48,7 +52,7 @@ class Fused
         Fusion::IndexVector qs, cs;
         fusedgates.perform_fusion(m, qs, cs);
-        
+
         std::size_t cmask = 0;
         for (auto c : cs)
             cmask |= (1ull << c);
@@ -71,7 +75,7 @@ class Fused
             ::kernel(wfn, qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask);
             break;
         }
-        
+
         fusedgates = Fusion();
     }
 
@@ -98,20 +102,42 @@ class Fused
     template <class T, class A, class M>
     void apply_controlled(std::vector<T, A>& wfn, M const& mat, std::vector<unsigned> const& cs, unsigned q)
     {
-        if (fusedgates.num_qubits()+fusedgates.num_controls()+cs.size()>8 || fusedgates.size() > 15)
-            flush(wfn);
-        Fusion newgates = fusedgates;
-        newgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-
-        if (newgates.num_qubits() > 4)
-        {
-            flush(wfn);
-            fusedgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-        }
-        else
-            fusedgates = newgates;
+        // Major runtime logic change here
+
+        // Have to update capacity as the WFN grows
+        if (wfnCapacity != wfn.capacity()) {
+            wfnCapacity = wfn.capacity();
+            char* envNT = NULL;
+            size_t len;
+#ifdef _MSC_VER
+            errno_t err = _dupenv_s(&envNT, &len, "OMP_NUM_THREADS");
+#else
+            envNT = getenv("OMP_NUM_THREADS");
+#endif
+            if (envNT == NULL) { // If the user didn't force the number of threads, make an intelligent guess
+                int nMaxThrds = 4;
+                if (wfnCapacity < 1ul << 20) nMaxThrds = 3;
+                int nProcs = omp_get_num_procs();
+                if (nProcs < 3) nMaxThrds = nProcs;
+                omp_set_num_threads(nMaxThrds);
+            }
+
+            // This is now pretty much unlimited, could be set in the future
+            maxFusedDepth = 99;
+
+            // Default for large problems (optimized with benchmarks)
+            maxFusedSpan = 3;
+
+            // Reduce size for small problems (optimized with benchmarks)
+            if (wfnCapacity < 1ul << 20) maxFusedSpan = 2;
+        }
+
+        // New rules for when to stop fusing
+        Fusion::IndexVector qs = std::vector<unsigned>(1, q);
+        if (fusedgates.predict(qs, cs) > maxFusedSpan || fusedgates.size() >= maxFusedDepth) flush(wfn);
+        fusedgates.insert(convertMatrix(mat), qs, cs);
     }
-    
+
     template <class T, class A, class M>
     void apply(std::vector<T, A>& wfn, M const& mat, unsigned q)
     {
@@ -120,6 +146,11 @@ class Fused
     }
   private:
     mutable Fusion fusedgates;
+
+    //: New runtime optimization settings
+    mutable size_t wfnCapacity;
+    mutable int maxFusedSpan;
+    mutable int maxFusedDepth;
 };
diff --git a/src/Simulation/Native/src/external/fusion.hpp b/src/Simulation/Native/src/external/fusion.hpp
index 2e6b4bdb50d..f3224f79212 100644
--- a/src/Simulation/Native/src/external/fusion.hpp
+++ b/src/Simulation/Native/src/external/fusion.hpp
@@ -58,6 +58,18 @@ class Fusion{
         handle_controls(empty_matrix, empty_vec, {}); // remove all current control qubits (this is a GLOBAL factor)
     }
 
+
+    // This saves a class instance create/destroy on every gate insert
+    // Need a quick way to decide if we're going to grow too wide
+    int predict(IndexVector index_list, IndexVector const& ctrl_list = {}) {
+        int cnt = num_qubits() + num_controls();
+        for (auto idx : index_list)
+            if (set_.count(idx) == 0 && 
ctrl_set_.count(idx) == 0) cnt++; + return cnt; + } + void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){ for (auto idx : index_list) set_.emplace(idx); @@ -97,7 +109,8 @@ class Fusion{ for (unsigned i = 0; i < idx.size(); ++i) idx2mat[i] = static_cast(((std::equal_range(index_list.begin(), index_list.end(), idx[i])).first - index_list.begin())); - for (std::size_t k = 0; k < (1ULL< oldcol(1ULL< const& wfn, std::vector chunks; -#pragma omp parallel { #pragma omp single chunks = split_interval_in_chunks(max, omp_get_num_threads()); @@ -446,7 +445,6 @@ bool istensorproduct(std::vector const& wfn, std::size_t compl_st = compl_bits.to_ullong(); std::atomic go(true); -#pragma omp parallel { int thread_id = omp_get_thread_num(); if (thread_id < chunks.size() - 1) diff --git a/src/Simulation/Native/src/version.hpp b/src/Simulation/Native/src/version.hpp new file mode 100644 index 00000000000..b55afdfd213 --- /dev/null +++ b/src/Simulation/Native/src/version.hpp @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MAJOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MINOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_PATCH */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_STRING */ +#define MICROSOFT_QUANTUM_SIMULATOR_YEAR "2020"
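Fusion::predict, added above, estimates the qubit span a fused block would reach if one more gate were inserted, and Fused::apply_controlled flushes when that span would exceed maxFusedSpan. A simplified standalone sketch of that decision rule (stand-in types, not the library's API):

```cpp
// Simplified sketch (stand-in types, not the library API) of the flush rule
// built from Fusion::predict above: a gate is fused only if the union of the
// already-touched qubits, its targets, and its controls stays within the span.
#include <cstdio>
#include <set>
#include <vector>

static int predict_span(std::set<unsigned> const& touched,
                        std::vector<unsigned> const& targets,
                        std::vector<unsigned> const& controls) {
    int cnt = static_cast<int>(touched.size());
    for (unsigned q : targets)
        if (touched.count(q) == 0) ++cnt;  // new target qubit widens the block
    for (unsigned q : controls)
        if (touched.count(q) == 0) ++cnt;  // new control qubit widens it too
    return cnt;
}

int main() {
    std::set<unsigned> touched = {0, 1};   // fused block spans qubits 0 and 1 so far
    int maxFusedSpan = 3;

    // CNOT with control 1 and target 2 adds one new qubit: span 3, keep fusing.
    std::printf("span = %d -> %s\n", predict_span(touched, {2}, {1}),
                predict_span(touched, {2}, {1}) > maxFusedSpan ? "flush first" : "fuse");

    // CNOT with control 4 and target 5 adds two new qubits: span 4, flush first.
    std::printf("span = %d -> %s\n", predict_span(touched, {5}, {4}),
                predict_span(touched, {5}, {4}) > maxFusedSpan ? "flush first" : "fuse");
    return 0;
}
```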