diff --git a/src/Simulation/Native/CMakeLists.txt b/src/Simulation/Native/CMakeLists.txt index 0d5e14a053c..fd2d668f1c9 100644 --- a/src/Simulation/Native/CMakeLists.txt +++ b/src/Simulation/Native/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) ADD_DEFINITIONS(-D_SCL_SECURE_NO_WARNINGS) # Configuration options (choose one to turn on) -option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(ENABLE_OPENMP "Enable OpenMP Parallelization" ON) option(USE_SINGLE_PRECISION "Use single-precision floating point operations" OFF) option(HAVE_INTRINSICS "Have AVX intrinsics" OFF) diff --git a/src/Simulation/Native/codegen/codegen_fma.py b/src/Simulation/Native/codegen/codegen_fma.py new file mode 100644 index 00000000000..d54638b81ba --- /dev/null +++ b/src/Simulation/Native/codegen/codegen_fma.py @@ -0,0 +1,434 @@ +#!/usr/bin/env python3 +# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger +# Code generator for n-qubit gate + +import sys + + +def avx_type(complex_avx_len): + if complex_avx_len == 2: + return "__m256d" + elif complex_avx_len == 4: + return "__m512d" + elif complex_avx_len == 1: + return "std::complex" + else: + raise Exception("Unknown avx type.") + + +def avx_prefix(complex_avx_len): + if complex_avx_len == 2: + return "_mm256" + elif complex_avx_len == 4: + return "_mm512" + else: + raise Exception("Unknown avx type.") + + +def generate_kernel_core(N, n, kernelarray, blocks, only_one_matrix, unroll_loops, avx_len): + indent = 1 + + kernelarray.append("// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger\n\ntemplate \ninline void kernel_core(V& psi, std::size_t I") + for i in range(n): + kernelarray.append(", std::size_t d" + str(i)) + + kernelarray.append(", M const& m") + if not only_one_matrix: + kernelarray.append(", M const& mt") + kernelarray.append(")\n{\n") + + indices = [""]*N + for num in range(N): + tmp = "I" + for b in range(n): + if (num>>b) & 1: + tmp = tmp + " + d"+str(b) + indices[num] = tmp + + add = ["\t" + avx_type(avx_len) + " v[" + str(int(N/blocks)) + "];\n"] + for b in range(blocks): + if avx_len == 4: + x4 = "x4" + else: + x4 = "" + for num in range(int(N/blocks)*b, int(N/blocks)*(b+1)): + add.append("\n\tv[" + str(int(num % (N/blocks))) + "] = ") + if avx_len > 1: + add.append("load1" + x4 + "(&") + add.append("psi[" + indices[num] + "]") + if avx_len > 1: + add.append(")") + add.append(";") + add.append("\n") + if b == 0: + add.append("\n\t" + avx_type(avx_len) + " tmp[" + str(int(N/avx_len)) + "] = {") + for i in range(int(N/avx_len)): + if avx_len > 1: + add.append(avx_prefix(avx_len) + "_setzero_pd(), ") + else: + add.append("0., ") + add[-1] = add[-1][:-2] + "};\n" + + if unroll_loops: + inline_FMAs = False + miniblocks = N/avx_len/4 + miniblocks = max(miniblocks, 1) + for mb in range(int(miniblocks)): + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + + for i in range(int(N/blocks)): + if not inline_FMAs or avx_len == 1: + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)+i+r*int(N/blocks)) +"], ") + else: + add.append(avx_prefix(avx_len) + "_fmadd_pd(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and not 
only_one_matrix and not avx_len == 1: + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + add.append("\n\ttmp[" + str(r) + "] = ") + for i in range(int(N/blocks)): + add.append(avx_prefix(avx_len) + "_fmadd_pd(" + avx_prefix(avx_len) + "_permute_pd(v[" + str(i) + "], 5), mt[" + str(b*int(N/blocks)*int(N/avx_len)+r*int(N/blocks)+i) +"], ") + add.append("tmp[" + str(r) + "]") + add.append(")"*int(N/blocks)+";") + + if inline_FMAs and only_one_matrix and avx_len > 1: + raise Exception("Not implemented yet!") + + for rb in range(int(N/avx_len/miniblocks)): + r = int(mb*N/avx_len/miniblocks) + rb + if b == blocks-1: + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + else: + add.append("\tfor (unsigned i = 0; i < " + str(int(N/avx_len)) + "; ++i){\n\t\ttmp[i] = ") + for i in range(int(N/blocks)): + add.append("fma(v[" + str(i) + "], m[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * "+str(int(N/blocks)) + " + " + str(i) +"], ") + if not only_one_matrix: + add.append("mt[" + str(b*int(N/blocks)*int(N/avx_len)) + " + i * " + str(int(N/blocks)) + " + " + str(i) +"], ") + add.append("tmp[i]") + add.append(")"*int(N/blocks)+";") + add.append("\n\t}\n") + if b == blocks-1: + for r in range(int(N/avx_len)): + add.append("\n\t") + if avx_len > 1: + add.append("store(") + for i in range(avx_len): + if avx_len > 1: + add.append("(double*)&") + add.append("psi[" + indices[avx_len*r+avx_len-i-1] + "], ") + if avx_len == 1: + add[-1] = add[-1][:-2] + " = " + add.append("tmp[" + str(r) + "]);") + if avx_len == 1: + add[-1] = add[-1][:-2] + ";" + + add.append("\n") + kernelarray.append("".join(add)) + add=[""] + kernelarray.append("".join(add)) + kernelarray.append("\n}\n\n") + +def generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx_len): + kernel = "" + + N = 1<\n" + kernel = kernel + "void kernel(V& psi" + for i in range(n-1,-1,-1): + kernel = kernel + ", unsigned id"+str(i) + kernel = kernel + ", M const& matrix, std::size_t ctrlmask)\n{\n std::size_t n = psi.size();\n" + + for i in idx: + kernel = kernel + "\tstd::size_t d"+str(i)+" = 1ULL << id"+str(i)+";\n" + + kernel += ("\tauto m = matrix;\n" + "\tstd::size_t dsorted[] = {") + add = ["d0"] + for i in range(1,n): + add.append(", d" + str(i)) + add.append("};\n") + add.append("\tpermute_qubits_and_matrix(dsorted, " + str(n) + ", m);\n") + kernel += "".join(add) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mm[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loada") + if only_one_matrix: + add[-1] = add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mm[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmm[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = ") + if avx_len > 1: + add.append("loada") + if only_one_matrix: + add[-1] = 
add[-1]+"b" + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");" + else: + add.append("m[r][c+b*"+str(int(N/blocks))+"];") + add.append("\n\t\t\t}\n\t\t}\n\t}\n") + kernelarray.append("".join(add)) + + if False: + add = ["\n\t" + avx_type(avx_len) + " mmt[] = {"] + for b in range(blocks): + for r in range(int(N/avx_len)): + for c in range(int(N/blocks)): + add.append("loadbm") + add.append("(") + for i in range(avx_len): + add.append("&m["+str(avx_len*r+i)+"]["+str(c+b*int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + "), " + add[-1] = add[-1][:-2] + "};\n" + else: + add = ["\n\t" + avx_type(avx_len) + " mmt[" + str(N*int(N/avx_len)) + "];"] + add.append("\n\tfor (unsigned b = 0; b < " + str(blocks) + "; ++b){" + "\n\t\tfor (unsigned r = 0; r < " + str(int(N/avx_len)) + "; ++r){" + "\n\t\t\tfor (unsigned c = 0; c < " + str(int(N/blocks)) + "; ++c){" + "\n\t\t\t\tmmt[b*"+str(int(N/avx_len)*int(N/blocks))+"+r*"+str(int(N/blocks))+"+c]" + " = loadbm(") + for i in range(avx_len): + add.append("&m["+str(avx_len)+"*r+"+str(i)+"][c+b*"+str(int(N/blocks))+"], ") + add[-1] = add[-1][:-2] + ");\n\t\t\t}\n\t\t}\n\t}\n" + + if only_one_matrix: + add = [] + + add.append("\n\n") + kernelarray.append("".join(add)) + + nc = len(idx)-1 + add = [] + indent = 1 + kernelarray.append("#ifndef _MSC_VER\n") + kernelarray.append("\t"*indent + "if (ctrlmask == 0){\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + + kernelarray.append("\t"*indent + "kernel_core(psi, i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + # if controlmask != 0 + indent = 1 + kernelarray.append("\t"*indent + "else{\n") + indent += 1 + kernelarray.append("\t"*indent + "#pragma omp parallel for collapse(LOOP_COLLAPSE"+str(n)+") schedule(static)\n" + "\t"*indent + "for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){\n") + indent = indent + 1 + for i in range(1,nc+1): + kernelarray.append("\t"*indent + "for (std::size_t i"+str(i)+" = 0; i"+str(i)+" < dsorted["+str(i-1) + "]; i"+str(i)+" += 2 * dsorted["+str(i)+"]){\n") + indent = indent + 1 + + kernelarray.append("\t"*indent + "for (std::size_t i"+str(nc+1)+" = 0; i"+str(nc+1)+" < dsorted["+str(nc)+"]; ++i"+str(nc+1)+"){\n") + indent = indent + 1 + + # inner-most loop: call kernel core + + kernelarray.append("\t"*indent + "if (((i0") + add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + kernelarray.append(")&ctrlmask) == ctrlmask)\n") + kernelarray.append("\t"*(indent+1) + "kernel_core(psi, i0") + 
add = [] + for i in range(n): + add.append(" + i"+str(i+1)) + kernelarray.append("".join(add)) + for i in range(n): + kernelarray.append(", dsorted[" + str(n-1-i) + "]") + + if only_one_matrix: + kernelarray.append(", mm);\n") + else: + kernelarray.append(", mm, mmt);\n") + + #end for(s) and if + add = [""]*indent + for i in range(indent-1,0,-1): + add[indent-1-i] = "\t"*i+"}\n" + kernelarray.append("".join(add)) + + +################ Start of _MSC_VER code block ################## + kernelarray.append("#else\n") + kernelarray.append(" std::intptr_t zero = 0;\n") + kernelarray.append(" std::intptr_t dmask = dsorted[0]"); + for i in range(n-1): kernelarray.append(" + dsorted["+str(i+1)+"]") + kernelarray.append( ";\n") + kernelarray.append("\n"); + kernelarray.append(" if (ctrlmask == 0){\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + # if controlmask != 0 + kernelarray.append(" } else {\n") + kernelarray.append(" #pragma omp parallel for schedule(static)\n") + kernelarray.append(" for (std::intptr_t i = 0; i < static_cast(n); ++i)\n") + kernelarray.append(" if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)\n") + kernelarray.append(" kernel_core(psi, i") + for i in range(n): kernelarray.append(", dsorted[" + str(n-1-i) + "]") + if only_one_matrix: kernelarray.append(", mm);\n") + else: kernelarray.append(", mm, mmt);\n") + kernelarray.append(" }\n") + kernelarray.append("#endif\n") + + kernelarray.append("}\n") + kernel = "".join([kernel,"".join(kernelarray)]) + return kernel + +def generate_includes(N): + return "#include \n#include \n#include \n#include \n#include \n#include \n" + \ + "#include \"alignedallocator.hpp\"\n#include \"timing.hpp\"\n#include \"cintrin.hpp\"\n" + \ + "#include \n#include \n\n" + \ + "#include \"util/par_for.hpp\"\n" + \ + "using namespace std;\n#define LOOP_COLLAPSE" + str(N) + " " + str(N+1) + "\n" + +def generate_main(n): + N = str(1 << n) + text = "using rowtype = vector,aligned_allocator,64>>;\nusing matrixtype = vector;\n\nint main(int argc, char *argv[]){" + text = text + "\n\tassert(argc > "+str(1+n)+");" + text = text + "\n\tsize_t N = 1ULL << atoi(argv[1]);" + for i in range(n): + text = text + "\n\tunsigned i" + str(i) + " = atoi(argv[" + str(i+2) + "]);" + + text = text + "\n\tmatrixtype m("+N+", rowtype("+N+"));"; + text = text + "\n\tfor (unsigned i = 0; i < "+N+"; ++i)\n\t\tfor (unsigned j = 0; j < "+N+"; ++j)\n\t\t\tm[i][j] = drand48();\n" + + text = text + "\n\tTimer t;\n\tfor (unsigned threads = 1; threads <= 24; ++threads){" + text = text + "\n\t\tomp_set_num_threads(threads);" + text = text + "\n\t\trowtype psi(N);\n\t\t#pragma omp parallel\n\t\t{\n\t\t\t#pragma omp for schedule(static)\n\t\t\tfor (size_t i = 0; i < psi.size(); ++i)\n\t\t\t\tpsi[i] = drand48();\n" + text = text + "\n\t\t\t#pragma omp single\n\t\t\tt.start();" + text = text + "\n\t\t\tkernel(psi, N" + for i in range(n): + text = text + ", i" + str(i) + text = text + ", m, 0);" + text = text + "\n\t\t\t#pragma omp waitall\n\t\t\t#pragma omp single\n\t\t\t{ cout << \"threads: \" << threads << \", time:\" << t.stop()*1.e-6 << \"\\n\"; }" + text = text + "\n\t\t}" # end for + text = text + "\n\t}" # end for + 
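The _MSC_VER branch above trades the collapsed loop nest for one flat loop over all basis-state indices, filtering with bit masks: dmask has one bit set per target qubit, and ctrlmask one bit per control. A small standalone sketch of that selection logic, with made-up qubit positions:

```cpp
// Sketch (illustrative values, not from the patch) of the index selection in
// the _MSC_VER branch above: an index i is the base of a 2^n-amplitude block
// iff (i & dmask) == 0, and it participates in a controlled gate iff
// (i & ctrlmask) == ctrlmask.
#include <cstdint>
#include <cstdio>

int main() {
    std::intptr_t d0 = 1, d1 = 4;    // target qubits 0 and 2
    std::intptr_t dmask = d0 + d1;
    std::intptr_t ctrlmask = 1 << 1; // control on qubit 1

    for (std::intptr_t i = 0; i < 16; ++i)
        if ((i & ctrlmask) == ctrlmask && (i & dmask) == 0)
            std::printf("apply kernel_core at block base %lld\n",
                        static_cast<long long>(i));
    // Prints block bases 2 and 10: the only indices of a 4-qubit register
    // with qubit 1 set and qubits 0 and 2 clear.
    return 0;
}
```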
text = text + "\n\n}" # end main
+    return text
+
+
+#####################################################
+#                       MAIN                        #
+#####################################################
+
+if len(sys.argv) < 2:
+    print("Generates the code for an n-qubit gate.\nUsage:\n./codegen_fma.py [n_qubits] {n_blocks} {only one matrix?} {unroll loops?} {none|avx2|avx512}\n\n")
+    exit()
+
+n = int(sys.argv[1])  # number of qubits
+
+try:  # number of blocks
+    blocks = int(sys.argv[2])
+except Exception:
+    blocks = 1
+
+try:
+    only_one_matrix = int(sys.argv[3])
+except Exception:
+    only_one_matrix = False
+
+try:
+    unroll_loops = int(sys.argv[4])
+except Exception:
+    unroll_loops = False
+
+try:
+    avx = str(sys.argv[5])
+    if avx == "avx512":
+        avx = 4
+    elif avx == "avx2" or avx == "avx":
+        avx = 2
+    elif avx == "none":
+        avx = 1
+        only_one_matrix = True
+    else:
+        raise RuntimeError("Unknown avx type: {}".format(avx))
+except IndexError:
+    avx = 2
+
+while (1 << n)/blocks < 1:
+    blocks = int(blocks/2)
+
+if (1 << n) < avx:
+    avx = int(avx/2)
+
+kernel = generate_kernel(n, blocks, only_one_matrix, unroll_loops, avx)  # generate code for n-qubit gate
+
+# if the user wants a main (for testing), generate it as well:
+for a in sys.argv:
+    if str(a) == "gen_main":
+        kernel = generate_includes(n) + kernel + generate_main(n)
+
+print(kernel)
diff --git a/src/Simulation/Native/codegen/codegen_test.cpp b/src/Simulation/Native/codegen/codegen_test.cpp
new file mode 100644
index 00000000000..1ff778ce96d
--- /dev/null
+++ b/src/Simulation/Native/codegen/codegen_test.cpp
@@ -0,0 +1,127 @@
+#include <vector>
+#include <complex>
+#include <iostream>
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+#include "alignedalloc.hpp"
+//#include "timing.hpp"
+#include "cintrin.hpp"
+#include <omp.h>
+#include <cstdint>
+
+#include "util/par_for.hpp"
+using namespace std;
+#define LOOP_COLLAPSE1 2
+// (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+template <class V, class M>
+inline void kernel_core(V& psi, std::size_t I, std::size_t d0, M const& m)
+{
+    std::complex<double> v[2];
+
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+
+    std::complex<double> tmp[2] = {0., 0.};
+
+    tmp[0] = fma(v[0], m[0], fma(v[1], m[1], tmp[0]));
+    tmp[1] = fma(v[0], m[2], fma(v[1], m[3], tmp[1]));
+    psi[I] = tmp[0];
+    psi[I + d0] = tmp[1];
+
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1ULL << id0;
+    auto m = matrix;
+    std::size_t dsorted[] = {d0};
+    permute_qubits_and_matrix(dsorted, 1, m);
+
+    std::complex<double> mm[4];
+    for (unsigned b = 0; b < 1; ++b){
+        for (unsigned r = 0; r < 2; ++r){
+            for (unsigned c = 0; c < 2; ++c){
+                mm[b*4+r*2+c] = m[r][c+b*2];
+            }
+        }
+    }
+
+
+#ifndef _MSC_VER
+    if (ctrlmask == 0){
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, dsorted[0], mm);
+            }
+        }
+    }
+    else{
+        #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, dsorted[0], mm);
+            }
+        }
+    }
+#else
+    intptr_t zero = 0;
+    intptr_t dmask = dsorted[0];
+
+    if (ctrlmask == 0){
+        auto thrdFnc = [&](size_t dsorted[], intptr_t& dmask, intptr_t& zero, V& psi, M const& m) {
+            return [&](unsigned i) {
+                if ((i & dmask) == zero)
+                    kernel_core(psi, i, dsorted[0], m);
+            };
+        };
+        pl::async_par_for(0, n, thrdFnc(dsorted, dmask, zero, psi, m));
+    } else {
+        auto thrdFnc = [&](size_t dsorted[], size_t& ctrlmask, intptr_t& dmask, intptr_t& zero, V& psi, M const& m) {
+            return [&](unsigned i) {
+                if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                    kernel_core(psi, i, dsorted[0], m);
+            };
+        };
+        pl::async_par_for(0, n, thrdFnc(dsorted, ctrlmask, dmask, zero, psi, m));
+    }
+#endif
+}
+using rowtype = vector<complex<double>, AlignedAlloc<complex<double>, 64>>;
+using matrixtype = vector<rowtype>;
+
+int main(int argc, char *argv[]){
+    assert(argc > 2);
+    size_t N = 1ULL << atoi(argv[1]);
+    unsigned i0 = atoi(argv[2]);
+    matrixtype m(2, rowtype(2));
+    for (unsigned i = 0; i < 2; ++i)
+        for (unsigned j = 0; j < 2; ++j)
+            m[i][j] = drand48();
+
+    Timer t;
+    for (unsigned threads = 1; threads <= 24; ++threads){
+        omp_set_num_threads(threads);
+        rowtype psi(N);
+        #pragma omp parallel
+        {
+            #pragma omp for schedule(static)
+            for (size_t i = 0; i < psi.size(); ++i)
+                psi[i] = drand48();
+
+            #pragma omp single
+            t.start();
+            kernel(psi, N, i0, m, 0);
+            #pragma omp waitall
+            #pragma omp single
+            { cout << "threads: " << threads << ", time:" << t.stop()*1.e-6 << "\n"; }
+        }
+    }
+
+}
diff --git a/src/Simulation/Native/codegen/generate.ps1 b/src/Simulation/Native/codegen/generate.ps1
new file mode 100644
index 00000000000..1cc506d187b
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.ps1
@@ -0,0 +1,21 @@
+# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel
+# instead of using two matrices (which allows the number of operations to be reduced
+# by pre-computation)
+$onematrix=(0,0,0,0,0,0,1,1) # g++ best
+
+# unroll[i] determines whether to unroll loops
+$unroll=(1,1,1,1,1,1,0,0) # g++ best
+
+# register length to use: can be none, avx2, or avx512
+# avx=avx2
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+$b=(0,2,4,8,16,16,16,32) # gcc & icc best
+
+foreach ($i in 1..7) {
+    "Generating $i kernel with $($b[$i]) blocks."
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] none > nointrin/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx > avx/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx2 > avx2/kernel$i.hpp
+    python codegen_fma.py $i $b[$i] $onematrix[$i] $unroll[$i] avx512 > avx512/kernel$i.hpp
+}
\ No newline at end of file
diff --git a/src/Simulation/Native/codegen/generate.sh b/src/Simulation/Native/codegen/generate.sh
new file mode 100644
index 00000000000..2ec0557d571
--- /dev/null
+++ b/src/Simulation/Native/codegen/generate.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# (C) 2018 ETH Zurich, ITP, Thomas Häner and Damian Steiger
+
+# onematrix[i] determines whether to use a single gate matrix for the i-qubit gate kernel
+# instead of using two matrices (which allows the number of operations to be reduced
+# by pre-computation)
+onematrix=(0 0 0 0 0 0 1 1) # g++ best
+#onematrix=(0 0 1 0 0 0 1 1) # icc best
+
+# unroll[i] determines whether to unroll loops
+unroll=(1 1 1 1 1 1 0 0) # g++ best
+#unroll=(1 1 1 0 0 1 0 0) # icc best
+#unroll=(0 0 0 0 0 0 0 0)
+
+# register length to use: can be none, avx2, or avx512
+avx=avx2
+
+# blocking: must be a power of two and at most 2^k for a k-qubit gate
+b=(0 2 4 8 16 16 16 32) # gcc & icc best
+
+for i in {1..7}
+do
+    echo "Generating $i kernel with ${b[$i]} blocks."
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} none > ../nointrin/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx > ../avx/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx2 > ../avx2/kernel${i}.hpp
+    ./codegen_fma.py $i ${b[$i]} ${onematrix[$i]} ${unroll[$i]} avx512 > ../avx512/kernel${i}.hpp
+done
diff --git a/src/Simulation/Native/src/config.hpp b/src/Simulation/Native/src/config.hpp
new file mode 100644
index 00000000000..363baa4d241
--- /dev/null
+++ b/src/Simulation/Native/src/config.hpp
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
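config.hpp, which begins here, funnels the CMake options into preprocessor symbols and binds the chosen backend class to a single SIMULATOR macro. A self-contained sketch of that dispatch pattern (the two stand-in structs are illustrative, not the simulator's real classes):

```cpp
// Self-contained sketch of the dispatch pattern config.hpp uses below: the
// preprocessor picks the widest available backend and binds it to one name.
// SimulatorGeneric/SimulatorAVX here are stand-ins, not the real classes.
#include <cstdio>

struct SimulatorGeneric { static constexpr const char* name = "generic"; };
struct SimulatorAVX     { static constexpr const char* name = "avx"; };

#ifdef HAVE_INTRINSICS
#define SIMULATOR SimulatorAVX
#else
#define SIMULATOR SimulatorGeneric
#endif

int main() {
    // Everything downstream refers only to SIMULATOR, so swapping backends
    // is a recompile, not a code change.
    std::printf("selected backend: %s\n", SIMULATOR::name);
    return 0;
}
```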
+ +#pragma once + +#include + +// check if we want to force single precision +/* #undef USE_SINGLE_PRECISION */ + +// check if we have AVX intrinsics +/* #undef HAVE_INTRINSICS */ + +// check if we have AVX-512 intrinsics +/* #undef HAVE_AVX512 */ + +// check if we want to use fused kernels +#define USE_GATE_FUSION + +#define BUILD_SHARED_LIBS + + +#if defined (_MSC_VER) && defined (BUILD_SHARED_LIBS) + +#ifdef BUILD_DLL +#define MICROSOFT_QUANTUM_DECL __declspec(dllexport) +#else +#define MICROSOFT_QUANTUM_DECL __declspec(dllimport) +#endif +#define MICROSOFT_QUANTUM_DECL_IMPORT __declspec(dllimport) +#else +#define MICROSOFT_QUANTUM_DECL +#define MICROSOFT_QUANTUM_DECL_IMPORT +#endif + +#ifdef HAVE_INTRINSICS +#ifdef HAVE_AVX512 +#define SIMULATOR SimulatorAVX512 +#else +#ifdef HAVE_FMA +#define SIMULATOR SimulatorAVX2 +#else +#define SIMULATOR SimulatorAVX +#endif +#endif +#else +#define SIMULATOR SimulatorGeneric +#endif + + diff --git a/src/Simulation/Native/src/external/avx/kernel1.hpp b/src/Simulation/Native/src/external/avx/kernel1.hpp index eac0cf47ea4..2bdcb04aea3 100644 --- a/src/Simulation/Native/src/external/avx/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx/kernel1.hpp @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx/kernel2.hpp b/src/Simulation/Native/src/external/avx/kernel2.hpp index 24f6c8ee13d..8012bf3638d 100644 --- a/src/Simulation/Native/src/external/avx/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx/kernel3.hpp b/src/Simulation/Native/src/external/avx/kernel3.hpp index a63dbc28693..bc7037c6004 100644 --- a/src/Simulation/Native/src/external/avx/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for 
collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel4.hpp b/src/Simulation/Native/src/external/avx/kernel4.hpp index 92f573a0255..e20a190af5c 100644 --- a/src/Simulation/Native/src/external/avx/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel5.hpp b/src/Simulation/Native/src/external/avx/kernel5.hpp index 0cdba4b89c5..8cf84656e68 100644 --- a/src/Simulation/Native/src/external/avx/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel6.hpp b/src/Simulation/Native/src/external/avx/kernel6.hpp index 343c6e3154c..087c4e83473 100644 --- a/src/Simulation/Native/src/external/avx/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) 
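The recurring hunk in these kernel headers swaps `#pragma omp for` for `#pragma omp parallel for`. An orphaned `omp for` with no enclosing parallel region binds to a team of one thread, so the old pragma ran the loop nest serially. A minimal demonstration (not part of the patch):

```cpp
// Minimal demonstration of the pragma change above: an orphaned
// "#pragma omp for" has no enclosing parallel region, so its binding team is
// just the encountering thread and the loop runs serially.
// "#pragma omp parallel for" creates the team and shares the loop in one step.
#include <omp.h>
#include <cstdio>

int main() {
    #pragma omp for schedule(static)           // no team: every iteration on thread 0
    for (int i = 0; i < 4; ++i)
        std::printf("orphaned for:  i=%d on thread %d\n", i, omp_get_thread_num());

    #pragma omp parallel for schedule(static)  // team created here: iterations spread out
    for (int i = 0; i < 4; ++i)
        std::printf("parallel for:  i=%d on thread %d\n", i, omp_get_thread_num());
    return 0;
}
```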
for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx/kernel7.hpp b/src/Simulation/Native/src/external/avx/kernel7.hpp index ecd0f45f3cf..197e800b208 100644 --- a/src/Simulation/Native/src/external/avx/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel1.hpp b/src/Simulation/Native/src/external/avx2/kernel1.hpp index eac0cf47ea4..2bdcb04aea3 100644 --- a/src/Simulation/Native/src/external/avx2/kernel1.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel1.hpp @@ -49,7 +49,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ kernel_core(psi, i0 + i1, dsorted[0], mm, mmt); @@ -57,7 +57,7 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask) } } else{ - #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE1) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ if (((i0 + i1)&ctrlmask) == ctrlmask) diff --git a/src/Simulation/Native/src/external/avx2/kernel2.hpp b/src/Simulation/Native/src/external/avx2/kernel2.hpp index 24f6c8ee13d..8012bf3638d 100644 --- a/src/Simulation/Native/src/external/avx2/kernel2.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel2.hpp @@ -63,7 +63,7 @@ void kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ @@ -73,7 +73,7 @@ void 
kernel(V& psi, unsigned id1, unsigned id0, M const& matrix, std::size_t ctr } } else{ - #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE2) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){ diff --git a/src/Simulation/Native/src/external/avx2/kernel3.hpp b/src/Simulation/Native/src/external/avx2/kernel3.hpp index a63dbc28693..bc7037c6004 100644 --- a/src/Simulation/Native/src/external/avx2/kernel3.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel3.hpp @@ -102,7 +102,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -114,7 +114,7 @@ void kernel(V& psi, unsigned id2, unsigned id1, unsigned id0, M const& matrix, s } } else{ - #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE3) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel4.hpp b/src/Simulation/Native/src/external/avx2/kernel4.hpp index 92f573a0255..e20a190af5c 100644 --- a/src/Simulation/Native/src/external/avx2/kernel4.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel4.hpp @@ -227,7 +227,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -241,7 +241,7 @@ void kernel(V& psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co } } else{ - #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE4) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel5.hpp b/src/Simulation/Native/src/external/avx2/kernel5.hpp index 0cdba4b89c5..8cf84656e68 100644 --- a/src/Simulation/Native/src/external/avx2/kernel5.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel5.hpp @@ -380,7 +380,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -396,7 +396,7 @@ void kernel(V& psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi } } else{ - #pragma omp for 
collapse(LOOP_COLLAPSE5) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE5) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel6.hpp b/src/Simulation/Native/src/external/avx2/kernel6.hpp index 343c6e3154c..087c4e83473 100644 --- a/src/Simulation/Native/src/external/avx2/kernel6.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel6.hpp @@ -212,7 +212,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -230,7 +230,7 @@ void kernel(V& psi, unsigned id5, unsigned id4, unsigned id3, unsigned id2, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE6) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE6) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/avx2/kernel7.hpp b/src/Simulation/Native/src/external/avx2/kernel7.hpp index ecd0f45f3cf..197e800b208 100644 --- a/src/Simulation/Native/src/external/avx2/kernel7.hpp +++ b/src/Simulation/Native/src/external/avx2/kernel7.hpp @@ -389,7 +389,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi #ifndef _MSC_VER if (ctrlmask == 0){ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ @@ -409,7 +409,7 @@ void kernel(V& psi, unsigned id6, unsigned id5, unsigned id4, unsigned id3, unsi } } else{ - #pragma omp for collapse(LOOP_COLLAPSE7) schedule(static) + #pragma omp parallel for collapse(LOOP_COLLAPSE7) schedule(static) for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){ for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){ diff --git a/src/Simulation/Native/src/external/cintrin.hpp b/src/Simulation/Native/src/external/cintrin.hpp index 8034408c1e8..b0eab16a79c 100644 --- a/src/Simulation/Native/src/external/cintrin.hpp +++ b/src/Simulation/Native/src/external/cintrin.hpp @@ -35,7 +35,10 @@ inline void permute_qubits_and_matrix(I *delta_list, unsigned n, M & matrix){ } inline std::complex fma(std::complex const& c1, std::complex const& c2, std::complex const& a){ - return c1*c2 + a; + // Expanded complex FMA to hard coded access (much faster) + double r = (c1.real() * c2.real() - c1.imag() * c2.imag()) + a.real(); + double i = (c1.real() * c2.imag() + c1.imag() * c2.real()) + a.imag(); + return std::complex(r, i); } inline __m256d fma(__m256d const& c1, __m256d const& c2, __m256d const& a){ diff --git a/src/Simulation/Native/src/external/fused.hpp b/src/Simulation/Native/src/external/fused.hpp index 6c1137ceb3b..235cf8ed15c 100644 --- 
a/src/Simulation/Native/src/external/fused.hpp
+++ b/src/Simulation/Native/src/external/fused.hpp
@@ -15,7 +15,7 @@
 #ifdef HAVE_FMA
 #include "external/avx2/kernels.hpp"
 #else
-#include "external/avx2/kernels.hpp"
+#include "external/avx/kernels.hpp"
 #endif
 #endif
@@ -30,7 +30,11 @@ namespace SIMULATOR
 class Fused
 {
   public:
-    Fused() {}
+    Fused() {
+        wfnCapacity = 0u;   // used to optimize runtime parameters
+        maxFusedSpan = -1;  // determine span to use at runtime
+        maxFusedDepth = 99; // determine max depth to use at runtime
+    }
 
     inline void reset()
     {
@@ -48,7 +52,7 @@ class Fused
         Fusion::IndexVector qs, cs;
         fusedgates.perform_fusion(m, qs, cs);
-        
+
         std::size_t cmask = 0;
         for (auto c : cs)
             cmask |= (1ull << c);
@@ -71,7 +75,7 @@ class Fused
             ::kernel(wfn, qs[4], qs[3], qs[2], qs[1], qs[0], m, cmask);
             break;
         }
-        
+
         fusedgates = Fusion();
     }
 
@@ -98,20 +102,42 @@ class Fused
     template <class T, class A, class M>
     void apply_controlled(std::vector<T, A>& wfn, M const& mat, std::vector<unsigned> const& cs, unsigned q)
     {
-        if (fusedgates.num_qubits()+fusedgates.num_controls()+cs.size()>8 || fusedgates.size() > 15)
-            flush(wfn);
-        Fusion newgates = fusedgates;
-        newgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-
-        if (newgates.num_qubits() > 4)
-        {
-            flush(wfn);
-            fusedgates.insert(convertMatrix(mat), std::vector<unsigned>(1, q), cs);
-        }
-        else
-            fusedgates = newgates;
+        // Major runtime logic change here
+
+        // Have to update capacity as the WFN grows
+        if (wfnCapacity != wfn.capacity()) {
+            wfnCapacity = wfn.capacity();
+            char* envNT = NULL;
+            size_t len;
+#ifdef _MSC_VER
+            errno_t err = _dupenv_s(&envNT, &len, "OMP_NUM_THREADS");
+#else
+            envNT = getenv("OMP_NUM_THREADS");
+#endif
+            if (envNT == NULL) { // If the user didn't force the number of threads, make an intelligent guess
+                int nMaxThrds = 4;
+                if (wfnCapacity < 1ul << 20) nMaxThrds = 3;
+                int nProcs = omp_get_num_procs();
+                if (nProcs < 3) nMaxThrds = nProcs;
+                omp_set_num_threads(nMaxThrds);
+            }
+
+            // This is now pretty much unlimited, could be set in the future
+            maxFusedDepth = 99;
+
+            // Default for large problems (optimized with benchmarks)
+            maxFusedSpan = 3;
+
+            // Reduce size for small problems (optimized with benchmarks)
+            if (wfnCapacity < 1ul << 20) maxFusedSpan = 2;
+        }
+
+        // New rules for when to stop fusing
+        Fusion::IndexVector qs = std::vector<unsigned>(1, q);
+        if (fusedgates.predict(qs, cs) > maxFusedSpan || fusedgates.size() >= maxFusedDepth) flush(wfn);
+        fusedgates.insert(convertMatrix(mat), qs, cs);
     }
-    
+
     template <class T, class A, class M>
     void apply(std::vector<T, A>& wfn, M const& mat, unsigned q)
     {
@@ -120,6 +146,11 @@ class Fused
     }
   private:
     mutable Fusion fusedgates;
+
+    //: New runtime optimization settings
+    mutable size_t wfnCapacity;
+    mutable int maxFusedSpan;
+    mutable int maxFusedDepth;
 };
diff --git a/src/Simulation/Native/src/external/fusion.hpp b/src/Simulation/Native/src/external/fusion.hpp
index 2e6b4bdb50d..f3224f79212 100644
--- a/src/Simulation/Native/src/external/fusion.hpp
+++ b/src/Simulation/Native/src/external/fusion.hpp
@@ -58,6 +58,18 @@ class Fusion{
         handle_controls(empty_matrix, empty_vec, {}); // remove all current control qubits (this is a GLOBAL factor)
     }
 
+
+    // This saves a class instance create/destroy on every gate insert
+    // Need a quick way to decide if we're going to grow too wide
+    int predict(IndexVector index_list, IndexVector const& ctrl_list = {}) {
+        int cnt = num_qubits() + num_controls();
+        for (auto idx : index_list)
+            if (set_.count(idx) == 0 && 
ctrl_set_.count(idx) == 0) cnt++; + return cnt; + } + void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){ for (auto idx : index_list) set_.emplace(idx); @@ -97,7 +109,8 @@ class Fusion{ for (unsigned i = 0; i < idx.size(); ++i) idx2mat[i] = static_cast(((std::equal_range(index_list.begin(), index_list.end(), idx[i])).first - index_list.begin())); - for (std::size_t k = 0; k < (1ULL< oldcol(1ULL< const& wfn, std::vector chunks; -#pragma omp parallel { #pragma omp single chunks = split_interval_in_chunks(max, omp_get_num_threads()); @@ -446,7 +445,6 @@ bool istensorproduct(std::vector const& wfn, std::size_t compl_st = compl_bits.to_ullong(); std::atomic go(true); -#pragma omp parallel { int thread_id = omp_get_thread_num(); if (thread_id < chunks.size() - 1) diff --git a/src/Simulation/Native/src/version.hpp b/src/Simulation/Native/src/version.hpp new file mode 100644 index 00000000000..b55afdfd213 --- /dev/null +++ b/src/Simulation/Native/src/version.hpp @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MAJOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_MINOR */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_PATCH */ +/* #undef MICROSOFT_QUANTUM_SIMULATOR_VERSION_STRING */ +#define MICROSOFT_QUANTUM_SIMULATOR_YEAR "2020"
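Fusion::predict, added above, estimates the qubit span a fused block would reach if one more gate were inserted, and Fused::apply_controlled flushes when that span would exceed maxFusedSpan. A simplified standalone sketch of that decision rule (stand-in types, not the library's API):

```cpp
// Simplified sketch (stand-in types, not the library API) of the flush rule
// built from Fusion::predict above: a gate is fused only if the union of the
// already-touched qubits, its targets, and its controls stays within the span.
#include <cstdio>
#include <set>
#include <vector>

static int predict_span(std::set<unsigned> const& touched,
                        std::vector<unsigned> const& targets,
                        std::vector<unsigned> const& controls) {
    int cnt = static_cast<int>(touched.size());
    for (unsigned q : targets)
        if (touched.count(q) == 0) ++cnt;  // new target qubit widens the block
    for (unsigned q : controls)
        if (touched.count(q) == 0) ++cnt;  // new control qubit widens it too
    return cnt;
}

int main() {
    std::set<unsigned> touched = {0, 1};   // fused block spans qubits 0 and 1 so far
    int maxFusedSpan = 3;

    // CNOT with control 1 and target 2 adds one new qubit: span 3, keep fusing.
    std::printf("span = %d -> %s\n", predict_span(touched, {2}, {1}),
                predict_span(touched, {2}, {1}) > maxFusedSpan ? "flush first" : "fuse");

    // CNOT with control 4 and target 5 adds two new qubits: span 4, flush first.
    std::printf("span = %d -> %s\n", predict_span(touched, {5}, {4}),
                predict_span(touched, {5}, {4}) > maxFusedSpan ? "flush first" : "fuse");
    return 0;
}
```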