diff --git a/src/Simulation/Native/CMakeSettings.json b/src/Simulation/Native/CMakeSettings.json
new file mode 100644
index 00000000000..ee45e8257c1
--- /dev/null
+++ b/src/Simulation/Native/CMakeSettings.json
@@ -0,0 +1,28 @@
+{
+  "configurations": [
+    {
+      "name": "x64-Debug",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "variables": []
+    },
+    {
+      "name": "x64-Release",
+      "generator": "Ninja",
+      "configurationType": "RelWithDebInfo",
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/src/Simulation/Native/src/external/fused.hpp b/src/Simulation/Native/src/external/fused.hpp
index 235cf8ed15c..f1c7ac49e68 100644
--- a/src/Simulation/Native/src/external/fused.hpp
+++ b/src/Simulation/Native/src/external/fused.hpp
@@ -41,7 +41,14 @@ class Fused
       fusedgates = Fusion();
     }
 
+    const Fusion& get_fusedgates() const { // read-only access to the Fusion object held by this Fused instance
+        return fusedgates;
+    }
     
+    void set_fusedgates(Fusion newFusedGates) const { // callable on a const Fused only because fusedgates is declared mutable
+        fusedgates = std::move(newFusedGates); // by-value sink parameter: move it in rather than copying it a second time
+    }
+
     template <class T, class A>
     void flush(std::vector<T, A>& wfn) const
     {
@@ -79,16 +86,6 @@ class Fused
       fusedgates = Fusion();
     }
     
-    template <class T, class A1, class A2>
-    bool subsytemwavefunction(std::vector<T, A1>& wfn,
-                              std::vector<unsigned> const& qs,
-                              std::vector<T, A2>& qubitswfn,
-                              double tolerance)
-    {
-      flush(wfn); // we have to flush before we can extract the state
-      return kernels::subsytemwavefunction(wfn, qs, qubitswfn, tolerance);
-    }
-    
     template <class M>
     Fusion::Matrix convertMatrix(M const& m)
     {
@@ -102,11 +99,25 @@ class Fused
     template <class T, class A, class M>
     void apply_controlled(std::vector<T, A>& wfn, M const& mat, std::vector<unsigned> const& cs, unsigned q)
     {
-      // Major runtime logic change here
+        Fusion::IndexVector qs = std::vector<unsigned>(1, q); // one-element target list holding q
+        fusedgates.insert(convertMatrix(mat), qs, cs); // only queues the gate; wfn is not read or written in this body
+    }
 
-        // Have to update capacity as the WFN grows
+    template <class T, class A, class M>
+    void apply(std::vector<T, A>& wfn, M const& mat, unsigned q)
+    {
+      std::vector<unsigned> cs; // empty control list, i.e. an uncontrolled gate
+      apply_controlled(wfn, mat, cs, q); // reuse the controlled path with no controls
+    }
+
+    template <class T, class A>
+    bool shouldFlush(std::vector<T, A>& wfn, std::vector<unsigned> const& cs, unsigned q)
+    {
+        // Major runtime logic change here
+
+          // Have to update capacity as the WFN grows
         if (wfnCapacity != wfn.capacity()) {
-            wfnCapacity     = wfn.capacity();
+            wfnCapacity = wfn.capacity();
             char* envNT = NULL;
             size_t len;
 #ifdef _MSC_VER
@@ -133,16 +144,9 @@ class Fused
         }
 
         // New rules of when to stop fusing
-        Fusion::IndexVector qs      = std::vector<unsigned>(1, q);
-        if (fusedgates.predict(qs, cs) > maxFusedSpan || fusedgates.size() >= maxFusedDepth)  flush(wfn);
-        fusedgates.insert(convertMatrix(mat), qs, cs);
-    }
+        Fusion::IndexVector qs = std::vector<unsigned>(1, q);
 
-    template <class T, class A, class M>
-    void apply(std::vector<T, A>& wfn, M const& mat, unsigned q)
-    {
-      std::vector<unsigned> cs;
-      apply_controlled(wfn, mat, cs, q);
+        return (fusedgates.predict(qs, cs) > maxFusedSpan || fusedgates.size() >= maxFusedDepth);
     }
   private:
     mutable Fusion fusedgates;
diff --git a/src/Simulation/Native/src/external/fusion.hpp b/src/Simulation/Native/src/external/fusion.hpp
index f3224f79212..f89471a1e88 100644
--- a/src/Simulation/Native/src/external/fusion.hpp
+++ b/src/Simulation/Native/src/external/fusion.hpp
@@ -10,6 +10,7 @@
 #include <iostream>
 #include <cassert>
 #include "util/alignedalloc.hpp"
+#include <unordered_map>
 
 class Item{
 public:  
@@ -17,14 +18,20 @@ class Item{
 	using IndexVector = std::vector<Index>;
 	using Complex = std::complex<double>;
 	using Matrix = std::vector<std::vector<Complex, Microsoft::Quantum::Simulator::AlignedAlloc<Complex, 64>>>;
-	Item(Matrix mat, IndexVector idx) : mat_(mat), idx_(idx) {}
+	Item(Matrix mat, IndexVector idx) : mat_(std::move(mat)), idx_(std::move(idx)) {} // both by-value arguments are moved into the members
 	Matrix& get_matrix() { return mat_; }
-	IndexVector& get_indices() { return idx_; }
+	IndexVector& get_indices() const { return idx_; } // returns a writable reference even on a const Item, because idx_ is mutable
+	void remap_idx(const std::unordered_map<unsigned, unsigned>& elemDict) const { // const-ref: the map is no longer copied on every call
+		for (auto& idx : idx_) { // idx_ is mutable, so it can be rewritten from this const method
+			if (elemDict.count(idx) != 0) idx = elemDict.at(idx); // an index with no entry stays unchanged instead of becoming 0 via operator[]
+		}
+	}
 private:
 	Matrix mat_;
-	IndexVector idx_;
+	mutable IndexVector idx_;
 };
 
+// Class handling the fusion of gates
 class Fusion{
 public:
 	using Index = unsigned;
@@ -37,7 +44,7 @@ class Fusion{
 	Fusion() : global_factor_(1.) {}
 	
 	Index num_qubits() const {
-		return static_cast<Index>(set_.size());
+		return static_cast<Index>(target_set_.size()); // size of the target set only; controls are counted by num_controls()
 	}
 
 	Index num_controls() const {
@@ -58,21 +65,58 @@ class Fusion{
 		handle_controls(empty_matrix, empty_vec, {}); // remove all current control qubits (this is a GLOBAL factor)
 	}
 	
+	const IndexSet& get_target_set() const { // read-only view of target_set_
+		return target_set_;
+	}
+
+	const ItemVector& get_items() const { // read-only view of items_
+		return items_;
+	}
+
+	const IndexSet& get_ctrl_set() const { // read-only view of ctrl_set_
+		return ctrl_set_;
+	}
+
+	const Complex& get_global_factor() const { // read-only view of global_factor_, which the constructor initialises to 1
+		return global_factor_;
+	}
+
+	static void remap_qubits(std::set<Index>& qubits, const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) {
+		std::set<Index> remapped; // filled separately and swapped in, so qubits is never edited while it is iterated
+		for (const unsigned oldLoc : qubits) {
+			const auto entry = mapFromOldLocToNewLoc.find(oldLoc);
+			if (entry == mapFromOldLocToNewLoc.end()) continue; // a qubit without a mapping is dropped from the result
+			remapped.insert(entry->second);
+		}
+		qubits.swap(remapped);
+	}
+
+	void remap_target_set(const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) const { // const, yet it rewrites target_set_ (a mutable member)
+		remap_qubits(target_set_, mapFromOldLocToNewLoc);
+	}
+
+	void remap_ctrl_set(const std::unordered_map<unsigned, unsigned>& mapFromOldLocToNewLoc) const { // const, yet it rewrites ctrl_set_ (a mutable member)
+		remap_qubits(ctrl_set_, mapFromOldLocToNewLoc);
+	}
+	
+	void set_items(ItemVector&& newItems) { // contents are exchanged: after the call newItems holds the previous items_
+		items_.swap(newItems);
+	}
 
 	// This saves a class instance create/destroy on every gate insert
 	// Need a quick way to decide if we're going to grow too wide
 	int predict(IndexVector index_list, IndexVector const& ctrl_list = {}) {
 		int cnt = num_qubits() + num_controls();
 		for (auto idx : index_list)
-			if (set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++;
+			if (target_set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++; // target present in neither set: it would add one qubit
 		for (auto idx : ctrl_list)
-			if (set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++;
+			if (target_set_.count(idx) == 0 && ctrl_set_.count(idx) == 0) cnt++; // control present in neither set: it would add one qubit
 		return cnt;
 	}
 
 	void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){
 		for (auto idx : index_list)
-			set_.emplace(idx);
+			target_set_.emplace(idx);
 		
 		if (global_factor_ != 1. && ctrl_list.size() > 0){
 			assert(ctrl_set_.size() == 0);
@@ -85,7 +129,7 @@ class Fusion{
 	}
 	
 	void get_indices(IndexVector &indices) const{
-		for (auto idx : set_)
+		for (auto idx : target_set_) // appends every target index to indices; existing contents of indices are kept
 			indices.push_back(idx);
 	}
 	
@@ -93,7 +137,7 @@ class Fusion{
 		if (global_factor_ != 1.)
 			assert(ctrl_set_.size() == 0);
 		
-		for (auto idx : set_)
+		for (auto idx : target_set_)
 			index_list.push_back(idx);
 		
 		unsigned N = num_qubits();
@@ -167,7 +211,7 @@ class Fusion{
 			if (ctrl_set_.count(ctrlIdx) == 0){ // need to either add it to the list or to the command
 				if (items_.size() > 0){ // add it to the command
 					add_controls(matrix, indexList, {ctrlIdx});
-					set_.insert(ctrlIdx);
+					target_set_.insert(ctrlIdx);
 				}
 				else // add it to the list
 					ctrl_set_.emplace(ctrlIdx);
@@ -183,17 +227,17 @@ class Fusion{
 			for (auto idx : unhandled_ctrl){
 				new_ctrls.push_back(idx);
 				ctrl_set_.erase(idx);
-				set_.insert(idx);
+				target_set_.insert(idx);
 			}
 			for (auto &item : items_)
 				add_controls(item.get_matrix(), item.get_indices(), new_ctrls);
 		}
 	}
 	
-	IndexSet set_;
-	ItemVector items_;
-	IndexSet ctrl_set_;
-	Complex global_factor_;
+	mutable IndexSet target_set_; //set of qubits being acted on
+	mutable ItemVector items_; //queue of gates to be fused
+	mutable IndexSet ctrl_set_; //set of controls
+	mutable Complex global_factor_;
 };
 
 #endif
diff --git a/src/Simulation/Native/src/external/nointrin/kernel1.hpp b/src/Simulation/Native/src/external/nointrin/kernel1.hpp
index 015e7e9d227..5173b58d8a8 100644
--- a/src/Simulation/Native/src/external/nointrin/kernel1.hpp
+++ b/src/Simulation/Native/src/external/nointrin/kernel1.hpp
@@ -60,20 +60,20 @@ void kernel(V& psi, unsigned id0, M const& matrix, std::size_t ctrlmask)
 		}
 	}
 #else
-	std::intptr_t zero = 0;
-	std::intptr_t dmask = dsorted[0];
+    std::intptr_t zero = 0;
+    std::intptr_t dmask = dsorted[0];
 
-	if (ctrlmask == 0){
-		#pragma omp parallel for schedule(static)
-		for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
-			if ((i & dmask) == zero)
-				kernel_core(psi, i, dsorted[0], mm);
-	} else {
-		#pragma omp parallel for schedule(static)
-		for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
-			if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
-				kernel_core(psi, i, dsorted[0], mm);
-	}
+    if (ctrlmask == 0){
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm);
+     } else {
+        #pragma omp parallel for schedule(static)
+        for (std::intptr_t i = 0; i < static_cast<std::intptr_t>(n); ++i)
+            if ((i & ctrlmask) == ctrlmask && (i & dmask) == zero)
+                kernel_core(psi, i, dsorted[0], mm);
+     }
 #endif
 }
 
diff --git a/src/Simulation/Native/src/simulator/capi_test.cpp b/src/Simulation/Native/src/simulator/capi_test.cpp
index feb725db2fa..4a686291619 100644
--- a/src/Simulation/Native/src/simulator/capi_test.cpp
+++ b/src/Simulation/Native/src/simulator/capi_test.cpp
@@ -107,24 +107,22 @@ void test_gates()
     allocateQubit(sim_id, 0);
     allocateQubit(sim_id, 1);
 
-     CRx(sim_id, 1.0, 0, 1);
+    CRx(sim_id, 1.0, 0, 1);
 
-    assert(M(sim_id, 1)==false);
+    assert(M(sim_id, 1) == false);
 
     X(sim_id, 0);
-     CRx(sim_id, 1.0, 0, 1);
+    CRx(sim_id, 1.0, 0, 1);
 
     H(sim_id, 1);
     CRx(sim_id, -1.0, 0, 1);
     H(sim_id, 1);
 
-    assert(M(sim_id, 1)==false);
+    assert(M(sim_id, 1) == false);
 
     X(sim_id, 1);
 
-    assert(M(sim_id, 1)==true);
-
-    X(sim_id, 1);
+    assert(M(sim_id, 1) == true);
 
     release(sim_id, 0);
     release(sim_id, 1);
@@ -132,7 +130,6 @@ void test_gates()
     destroy(sim_id);
 }
 
-
 void test_allocate()
 {
     auto sim_id = init();
diff --git a/src/Simulation/Native/src/simulator/wavefunction.hpp b/src/Simulation/Native/src/simulator/wavefunction.hpp
index 1d3a390642c..bb9fdae21bf 100644
--- a/src/Simulation/Native/src/simulator/wavefunction.hpp
+++ b/src/Simulation/Native/src/simulator/wavefunction.hpp
@@ -12,6 +12,7 @@
 #include <limits>
 #include <random>
 #include <vector>
+#include <unordered_set>
 
 #include "types.hpp"
 #include "gates.hpp"
@@ -90,6 +91,50 @@ class Wavefunction
 
     void flush() const
     {
+        // Move every qubit used by the queued gates to the lowest physical positions,
+        // then rewrite the queued gates to refer to those positions before applying them.
+        const Fusion& fg = fused_.get_fusedgates();
+        const auto& itemsToFuse = fg.get_items();
+        if (!itemsToFuse.empty()) {
+            // Collect each qubit once, in first-seen order (item indices, then controls);
+            // the set exists only to reject duplicates.
+            std::vector<unsigned> unionOfAllQubitsInUse;
+            std::unordered_set<unsigned> indicesSet;
+            for (const auto& item : itemsToFuse) {
+                for (unsigned index : item.get_indices()) {
+                    if (indicesSet.insert(index).second) {
+                        unionOfAllQubitsInUse.push_back(index);
+                    }
+                }
+            }
+            for (unsigned index : fg.get_ctrl_set()) {
+                if (indicesSet.insert(index).second) {
+                    unionOfAllQubitsInUse.push_back(index);
+                }
+            }
+
+            // Swap the i-th collected qubit into physical position i. Each swap changes
+            // qubitmap_, so the positions are looked up again after every step.
+            std::vector<qubit_t> indexLocs = qubits(unionOfAllQubitsInUse);
+            for (unsigned i = 0; i < indexLocs.size(); i++) {
+                reorder_wavefunction(indexLocs[i], i);
+                indexLocs = qubits(unionOfAllQubitsInUse);
+            }
+
+            // Map each collected qubit to its position after the reordering.
+            std::unordered_map<unsigned, unsigned> old2newDict;
+            for (unsigned i = 0; i < unionOfAllQubitsInUse.size(); i++) {
+                old2newDict[unionOfAllQubitsInUse[i]] = indexLocs[i];
+            }
+
+            // The remap_* members are const, so they can be called through these const references.
+            for (const auto& item : itemsToFuse) {
+                item.remap_idx(old2newDict);
+            }
+            fg.remap_target_set(old2newDict);
+            fg.remap_ctrl_set(old2newDict);
+        }
+        
         fused_.flush(wfn_);
     }
 
@@ -134,7 +179,7 @@ class Wavefunction
     /// \pre the qubit has to be in a classical state in the computational basis
     void release(qubit_t q)
     {
-        unsigned p = qubit(q);
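+        unsigned p = qubit(q); //returns qubitmap_[q]; NOTE(review): read before flush(), which can now reorder qubitmap_ - confirm p is still valid below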
         flush();
         kernels::collapse(wfn_, p, getvalue(q), true);
         for (int i = 0; i < qubitmap_.size(); ++i)
@@ -238,19 +283,53 @@ class Wavefunction
         rng_.seed(s);
     }
 
+    void reorder_wavefunction(unsigned qubitLoc, unsigned newPos) const
+    {
+        // Exchanges the physical bit positions qubitLoc and newPos: first in the
+        // amplitude vector, then in qubitmap_. Equal positions are a no-op.
+        if (newPos == qubitLoc) return;
+
+        const std::size_t fromBit = 1ull << qubitLoc;
+        const std::size_t toBit = 1ull << newPos;
+        for (std::size_t idx = 0ull; idx < wfn_.size(); idx++)
+        {
+            // Only basis indices whose two bits differ have a distinct partner.
+            if (((idx & fromBit) != 0) == ((idx & toBit) != 0)) continue;
+            const std::size_t partner = idx ^ (fromBit | toBit);
+            // Each pair is visited twice; swap only from its smaller index.
+            if (partner > idx)
+            {
+                std::iter_swap(wfn_.begin() + idx, wfn_.begin() + partner);
+            }
+        }
+        // Find the qubitmap_ entries holding the two positions and exchange them.
+        auto origQubitLocItr = std::find(qubitmap_.begin(), qubitmap_.end(), qubitLoc);
+        auto newQubitLocItr = std::find(qubitmap_.begin(), qubitmap_.end(), newPos);
+        assert(newQubitLocItr != qubitmap_.end());
+        assert(origQubitLocItr != qubitmap_.end());
+        std::iter_swap(origQubitLocItr, newQubitLocItr);
+    }
+
     /// generic application of a gate
     template <class Gate>
     void apply(Gate const& g)
     {
-        fused_.apply(wfn_, g.matrix(), qubit(g));
+        // Flush the pending fused gates first if shouldFlush() reports that this gate should not join them.
+        if (fused_.shouldFlush(wfn_, std::vector<qubit_t>{}, g.qubit())) { // empty vector: no control qubits
+            flush();
+        }
+        fused_.apply(wfn_, g.matrix(), g.qubit()); // queued under g.qubit() as given, not under a qubit(g) lookup
     }
-
+    
     /// generic application of a multiply controlled gate
     template <class Gate>
     void apply_controlled(std::vector<qubit_t> cs, Gate const& g)
     {
         std::vector<qubit_t> pcs = qubits(cs);
-        fused_.apply_controlled(wfn_, g.matrix(), pcs, qubit(g));
+        if (fused_.shouldFlush(wfn_, cs, g.qubit())) { // NOTE(review): pcs above is no longer read by any line of this body
+            flush(); // apply the pending fused gates before queuing this one
+        }
+        fused_.apply_controlled(wfn_, g.matrix(), cs, g.qubit()); // controls and target are passed as given, not as qubits()/qubit() results
     }
 
     /// generic application of a controlled gate
@@ -274,7 +353,8 @@ class Wavefunction
     template <class A>
     bool subsytemwavefunction(std::vector<unsigned> const& qs, std::vector<T, A>& qubitswfn, double tolerance)
     {
-        return fused_.subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance);
+        flush(); // we have to flush before we can extract the state
+        return kernels::subsytemwavefunction(wfn_, qubits(qs), qubitswfn, tolerance); // qubits(qs) is evaluated only after the flush above
     }
 
 
@@ -338,7 +418,7 @@ class Wavefunction
   private:
     unsigned num_qubits_;             // for convenience
     mutable WavefunctionStorage wfn_; // storing the wave function
-    std::vector<qubit_t> qubitmap_;   // mapping of logical to physical qubits
+    mutable std::vector<qubit_t> qubitmap_;   // mapping of logical to physical qubits
 	  int usage_;
 
     // randomness support