diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 9c1ace7c..ece754f4 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -21,7 +21,17 @@ jobs:
     - uses: actions/checkout@v3
 
     - name: Install APT packages
-      run: sudo apt install clang libboost-exception-dev libbenchmark-dev -y
+      run: |
+        sudo apt-get update
+        sudo apt install libboost-exception-dev libbenchmark-dev -y
+
+    - name: Install LLVM and Clang 18
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y wget gnupg lsb-release
+        wget https://apt.llvm.org/llvm.sh
+        chmod +x llvm.sh
+        sudo ./llvm.sh 18
 
     - name: Install Blaze
       run: |
@@ -36,6 +46,13 @@ jobs:
         -DCMAKE_INSTALL_PREFIX=/usr/local/ . \
         && sudo make install
 
+    - name: Install xsimd
+      run: |
+        git clone https://github.com/xtensor-stack/xsimd.git
+        cd xsimd
+        cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ .
+        sudo cmake --build build --target install
+
     - name: Install GTest
       run: |
         git clone https://github.com/google/googletest.git
@@ -47,7 +64,7 @@ jobs:
       run: |
         cmake -B ${{github.workspace}}/build \
         -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \
-        -DCMAKE_CXX_COMPILER=clang++ \
+        -DCMAKE_CXX_COMPILER=clang++-18 \
         -DBLAST_WITH_BENCHMARK=ON \
         -DBLAST_WITH_TEST=ON
 
@@ -60,4 +77,3 @@ jobs:
       # Execute tests defined by the CMake configuration.
       # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
       run: ctest -C ${{env.BUILD_TYPE}}
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index faffe2fd..3b0ad928 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ set(CMAKE_DEBUG_POSTFIX d)
 
 find_package(Boost REQUIRED COMPONENTS exception)
 find_package(blaze REQUIRED)
+find_package(xsimd REQUIRED)
 
 add_library(blast INTERFACE)
 
@@ -35,6 +36,7 @@ target_include_directories(blast INTERFACE
 
 target_link_libraries(blast
     INTERFACE blaze::blaze
+    INTERFACE xsimd
 )
 
 target_compile_options(blast
diff --git a/Dockerfile b/Dockerfile
index 1ccfab49..fb47dea6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib
 RUN mkdir -p blast/build && cd blast/build \
     && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \
         -DCMAKE_CXX_COMPILER="clang++-15" \
-        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \
+        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH='fma3<avx2>'" \
         -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \
         -DBLAST_WITH_TEST=ON \
         -DBLAST_WITH_BENCHMARK=ON \
@@ -79,4 +79,3 @@ CMD mkdir -p blast/bench_result/data \
     && mkdir -p blast/bench_result/image \
     && cd blast \
     && make -j 1 bench_result/image/dgemm_performance.png bench_result/image/dgemm_performance_ratio.png
-
diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp
index 3c8944c4..851ef7ef 100644
--- a/include/blast/math/RegisterMatrix.hpp
+++ b/include/blast/math/RegisterMatrix.hpp
@@ -7,6 +7,3 @@
 
 #include <blast/math/register_matrix/RegisterMatrix.hpp>
 #include <blast/math/register_matrix/DynamicRegisterMatrix.hpp>
-#include <blast/math/register_matrix/double_4_4_4.hpp>
-#include <blast/math/register_matrix/double_8_4_4.hpp>
-#include <blast/math/register_matrix/double_12_4_4.hpp>
diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp
index e13c311f..ae7a8b5a 100644
--- a/include/blast/math/RowColumnVectorPointer.hpp
+++ b/include/blast/math/RowColumnVectorPointer.hpp
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <blast/math/typetraits/MatrixPointer.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/util/Assert.hpp>
 
@@ -28,9 +28,9 @@ namespace blast
     {
     public:
         using ElementType = typename MP::ElementType;
-        using IntrinsicType = typename Simd<std::remove_cv_t<ElementType>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<ElementType>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<ElementType>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<ElementType>>;
 
         static TransposeFlag constexpr transposeFlag = TF;
         static bool constexpr aligned = MP::aligned;
@@ -176,7 +176,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<ElementType>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         MP ptr_;
     };
@@ -244,4 +244,4 @@ namespace blast
     {
         return RowColumnVectorPointer<MP, rowVector> {std::move(p)};
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp
new file mode 100644
index 00000000..912948ce
--- /dev/null
+++ b/include/blast/math/Simd.hpp
@@ -0,0 +1,12 @@
+// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#pragma once
+
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/SimdSize.hpp>
+#include <blast/math/simd/IsSimdAligned.hpp>
+#include <blast/math/simd/RegisterCapacity.hpp>
diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp
index e6540576..5610b1c9 100644
--- a/include/blast/math/dense/DynamicMatrixPointer.hpp
+++ b/include/blast/math/dense/DynamicMatrixPointer.hpp
@@ -8,14 +8,11 @@
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/IsSimdAligned.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
-
 #include <blaze/util/Exception.h>
 
 
-
 namespace blast
 {
     template <typename T, bool SO, bool AF, bool PF>
@@ -23,9 +20,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -83,9 +80,9 @@ namespace blast
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
@@ -187,7 +184,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -253,4 +250,4 @@ namespace blast
     {
         return {p, spacing};
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp
index b026d205..7638119a 100644
--- a/include/blast/math/dense/DynamicVectorPointer.hpp
+++ b/include/blast/math/dense/DynamicVectorPointer.hpp
@@ -4,8 +4,7 @@
 
 #pragma once
 
-
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
 
 #include <type_traits>
@@ -18,9 +17,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr transposeFlag = TF;
         static bool constexpr aligned = AF;
@@ -40,7 +39,7 @@ namespace blast
         ,   spacing_ {spacing}
         {
             BLAST_USER_ASSERT(spacing > 0, "Vector element spacing must be positive.");
-            BLAST_USER_ASSERT(!AF || reinterpret_cast<ptrdiff_t>(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned");
+            BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned");
         }
 
 
@@ -51,6 +50,7 @@ namespace blast
         SimdVecType load() const noexcept
         {
             // Non-optimized
+            // TODO: use gather()
             IntrinsicType v;
             for (size_t i = 0; i < SS; ++i)
                 v[i] = ptr_[spacing_ * i];
@@ -62,18 +62,18 @@ namespace blast
         SimdVecType load(MaskType mask) const noexcept
         {
             // Non-optimized
-            IntrinsicType v = blast::setzero<std::remove_cv_t<ElementType>, SS>();
+            // TODO: use gather()
+            T v[SS];
             for (size_t i = 0; i < SS; ++i)
-                if (mask[i])
-                    v[i] = ptr_[spacing_ * i];
+                v[i] = mask[i] ? ptr_[spacing_ * i] : T {};
 
-            return SimdVecType {v};
+            return SimdVecType {v, false};
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
@@ -172,7 +172,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
 
         T * ptrOffset(ptrdiff_t i) const noexcept
@@ -191,4 +191,4 @@ namespace blast
     {
         return p.trans();
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp
index 6b03a36a..530df07b 100644
--- a/include/blast/math/dense/Iamax.hpp
+++ b/include/blast/math/dense/Iamax.hpp
@@ -17,7 +17,6 @@
 
 #include <blast/util/Exception.hpp>
 #include <blast/math/dense/VectorPointer.hpp>
-#include <blast/math/simd/Avx256.hpp>
 
 #include <cmath>
 #include <tuple>
@@ -177,4 +176,4 @@ namespace blast
 
         return iamax(N, ptr(*x));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp
index 2db833b8..961b5e01 100644
--- a/include/blast/math/dense/StaticMatrixPointer.hpp
+++ b/include/blast/math/dense/StaticMatrixPointer.hpp
@@ -6,8 +6,8 @@
 
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TransposeFlag.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/math/TypeTraits.hpp>
 #include <blast/util/Assert.hpp>
@@ -24,9 +24,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -188,7 +188,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -225,4 +225,4 @@ namespace blast
     {
         return trans(ptr<AF>(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp
index 6608ddd0..6017dafa 100644
--- a/include/blast/math/dense/StaticVectorPointer.hpp
+++ b/include/blast/math/dense/StaticVectorPointer.hpp
@@ -5,7 +5,7 @@
 #pragma once
 
 
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
 
 
@@ -16,9 +16,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr transposeFlag = TF;
         static bool constexpr aligned = AF;
@@ -35,7 +35,7 @@ namespace blast
         constexpr StaticVectorPointer(T * ptr) noexcept
         :   ptr_ {ptr}
         {
-            BLAST_USER_ASSERT(!AF || reinterpret_cast<ptrdiff_t>(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned");
+            BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned");
         }
 
 
@@ -70,27 +70,27 @@ namespace blast
             else
             {
                 // Non-optimized
-                IntrinsicType v = blast::setzero<ElementType, SS>();
+                // TODO: use gather
+                T v[SS];
                 for (size_t i = 0; i < SS; ++i)
-                    if (mask[i])
-                        v[i] = ptr_[S * i];
+                    v[i] = mask[i] ? ptr_[S * i] : T {};
 
-                return SimdVecType {v};
+                return SimdVecType {v, false};
             }
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
-        void store(IntrinsicType val) const noexcept
+        void store(SimdVecType val) const noexcept
         {
             if constexpr (S == 1)
             {
-                blast::store<AF>(ptr_, val);
+                val.store(ptr_, AF);
             }
             else
             {
@@ -105,7 +105,7 @@ namespace blast
         {
             if constexpr (S == 1)
             {
-                blast::maskstore(ptr_, mask, val);
+                val.store(ptr_, mask);
             }
             else
             {
@@ -195,7 +195,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
 
         T * ptrOffset(ptrdiff_t i) const noexcept
@@ -213,4 +213,4 @@ namespace blast
     {
         return p.trans();
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp
index fae79ffc..4ad4526f 100644
--- a/include/blast/math/expressions/PMatTransExpr.hpp
+++ b/include/blast/math/expressions/PMatTransExpr.hpp
@@ -12,7 +12,6 @@
 #include <blaze/math/constraints/StorageOrder.h>
 #include <blaze/math/Exception.h>
 #include <blaze/math/expressions/Computation.h>
-#include <blaze/math/expressions/DMatTransposer.h>
 #include <blaze/math/expressions/Forward.h>
 #include <blaze/math/expressions/MatTransExpr.h>
 #include <blaze/math/expressions/Transformation.h>
@@ -213,4 +212,4 @@ namespace blast
         using ReturnType = const PMatTransExpr<MT, !SO>;
         return ReturnType(*pm);
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp
index 4a556a33..d0494887 100644
--- a/include/blast/math/expressions/PanelMatrix.hpp
+++ b/include/blast/math/expressions/PanelMatrix.hpp
@@ -5,9 +5,12 @@
 #pragma once
 
 #include <blast/math/typetraits/IsPanelMatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdSize.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
 #include <blast/math/panel/PanelSize.hpp>
-//#include <blast/math/RegisterMatrix.hpp>
+#include <blast/math/panel/MatrixPointer.hpp>
+#include <blast/math/dense/MatrixPointer.hpp>
 
 #include <blaze/math/ReductionFlag.h>
 #include <blaze/math/Matrix.h>
@@ -91,13 +94,12 @@ namespace blast
 
         using ET1 = ElementType_t<MT1>;
         using ET2 = ElementType_t<MT2>;
-        static size_t constexpr SS = Simd<ET2>::size;
+        static size_t constexpr SS = SimdSize_v<ET2>;
         static size_t constexpr PANEL_SIZE = PanelSize_v<ET2>;
         static_assert(PANEL_SIZE % SS == 0);
 
-        using MaskType = typename Simd<ET2>::MaskType;
-        using IntType = typename Simd<ET2>::IntType;
-        using SIMD = Simd<ET2>;
+        using MaskType = SimdMask<ET2, xsimd::default_arch>;
+        using IntType = typename SimdIndex<ET2>::value_type;
 
 
         if constexpr (SO1 == columnMajor && SO2 == columnMajor)
@@ -108,22 +110,22 @@ namespace blast
 
 			for (size_t i = 0; i + SS <= m; i += SS)
             {
-                ET2 const * pr = &(*rhs)(i, 0);
-                ET1 * pl = data(lhs) + i;
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    store<aligned>(pl + s * j, load<aligned, SS>(pr + PANEL_SIZE * j));
+                    pl(0, j).store(pr(0, j).load());
             }
 
             if (IntType const rem = m % SS)
             {
-                MaskType const mask = SIMD::index() < rem;
+                MaskType const mask = indexSequence<ET2>() < rem;
                 size_t const i = m - rem;
-                ET2 const * pr = &(*rhs)(i, 0);
-                ET1 * pl = data(lhs) + i;
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    maskstore(pl + s * j, mask, load<aligned, SS>(pr + PANEL_SIZE * j));
+                    pl(0, j).store(pr(0, j).load(), mask);
             }
 
 		#if 0
@@ -185,22 +187,22 @@ namespace blast
 
         using ET1 = ElementType_t<MT1>;
         using ET2 = ElementType_t<MT2>;
-        static size_t constexpr SS = Simd<ET2>::size;
+        static size_t constexpr SS = SimdSize_v<ET2>;
         static size_t constexpr PANEL_SIZE = PanelSize_v<ET2>;
         static_assert(PANEL_SIZE % SS == 0);
 
-        using MaskType = typename Simd<ET2>::MaskType;
-        using IntType = typename Simd<ET2>::IntType;
+        using MaskType = SimdMask<ET2, xsimd::default_arch>;
+        using IntType = typename SimdIndex<ET2>::value_type;
 
         if constexpr (SO1 == columnMajor && SO2 == columnMajor)
         {
 			for (size_t i = 0; i + SS <= m; i += SS)
             {
-                ET2 const * pr = data(rhs) + i;
-                ET1 * pl = &(*lhs)(i, 0);
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    store<aligned>(pl + PANEL_SIZE * j, load<aligned, SS>(pr + s * j));
+                    pl(0, j).store(pr(0, j).load());
             }
 
             if (IntType const rem = m % SS)
@@ -243,4 +245,4 @@ namespace blast
                     (*lhs)(i, j) = (*rhs)(i, j);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
index f965f4e1..1e3f1233 100644
--- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
@@ -5,10 +5,9 @@
 #pragma once
 
 #include <blast/math/TransposeFlag.hpp>
-#include <blast/math/simd/SimdVec.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/expressions/PanelMatrix.hpp>
 #include <blast/math/expressions/PMatTransExpr.hpp>
 #include <blast/util/Assert.hpp>
@@ -23,9 +22,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -211,7 +210,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -275,4 +274,4 @@ namespace blast
     {
         return trans(ptr(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp
index 027d5c63..6bcf6bbf 100644
--- a/include/blast/math/panel/PanelSize.hpp
+++ b/include/blast/math/panel/PanelSize.hpp
@@ -8,11 +8,12 @@
 // Includes
 //*************************************************************************************************
 
+#include <blast/math/simd/SimdSize.hpp>
 #include <blast/math/simd/Simd.hpp>
 
 
 namespace blast
 {
-    template <typename T>
-    size_t constexpr PanelSize_v = Simd<T>::size;
+    template <typename T, typename Arch = xsimd::default_arch>
+    size_t constexpr PanelSize_v = SimdSize_v<T, Arch>;
 }
diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp
index cfeda753..585b6f6b 100644
--- a/include/blast/math/panel/StaticPanelMatrix.hpp
+++ b/include/blast/math/panel/StaticPanelMatrix.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/Forward.hpp>
 #include <blast/math/panel/Gemm.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/system/CacheLine.hpp>
 
@@ -160,45 +159,6 @@ namespace blast
         }
 
 
-        Type * ptr(size_t i, size_t j)
-        {
-            // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary");
-            return v_ + elementIndex(i, j);
-        }
-
-
-        Type const * ptr(size_t i, size_t j) const
-        {
-            // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary");
-            return v_ + elementIndex(i, j);
-        }
-
-
-        template <size_t SS>
-        auto load(size_t i, size_t j) const
-        {
-            BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index");
-            BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index");
-            BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary");
-            BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary");
-
-            return blast::load<aligned, SS>(v_ + elementIndex(i, j));
-        }
-
-
-        template <typename T>
-        void store(size_t i, size_t j, T val)
-        {
-            BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index");
-            BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index");
-            BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary");
-            BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary");
-
-            // We never use maskstore here because we have padding
-            blast::store<aligned>(v_ + elementIndex(i, j), val);
-        }
-
-
     private:
         static size_t constexpr panelSize_ = PanelSize_v<Type>;
         static size_t constexpr tileRows_ = M / panelSize_ + (M % panelSize_ > 0);
@@ -329,4 +289,4 @@ namespace blaze
     {
         using Type = blast::StaticPanelMatrix<T, M, N, SO>;
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
index 50544d02..aaadbda9 100644
--- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
@@ -7,8 +7,7 @@
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/SimdSize.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/expressions/PanelMatrix.hpp>
 #include <blast/math/expressions/PMatTransExpr.hpp>
 #include <blast/util/Assert.hpp>
@@ -21,9 +20,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -204,7 +203,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -267,4 +266,4 @@ namespace blast
     {
         return trans(ptr(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
index a1b857cc..9cd7f826 100644
--- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
@@ -4,8 +4,7 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/Side.hpp>
 #include <blast/math/UpLo.hpp>
@@ -18,8 +17,6 @@
 #include <blaze/util/Exception.h>
 #include <blaze/util/StaticAssert.h>
 
-#include <cmath>
-
 
 namespace blast
 {
@@ -121,7 +118,7 @@ namespace blast
         {
             for (size_t i = 0; i < RM; ++i)
                 for (size_t j = 0; j < N; ++j)
-                    v_[i][j] = setzero<T, SS>();
+                    v_[i][j].reset();
         }
 
 
@@ -250,14 +247,14 @@ namespace blast
 
 
     private:
-        using SIMD = Simd<T>;
-        using IntrinsicType = typename SIMD::IntrinsicType;
-        using MaskType = typename SIMD::MaskType;
-        using IntType = typename SIMD::IntType;
-        using SimdVecType = SimdVec<T>;
+        using Arch = xsimd::default_arch;
+        using SimdVecType = SimdVec<T, Arch>;
+        using IntrinsicType = typename SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<T, Arch>;
+        using IntType = typename SimdIndex<T, Arch>::value_type;
 
         // SIMD size
-        static size_t constexpr SS = Simd<T>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         // Numberf of SIMD registers required to store a single column of the matrix.
         static size_t constexpr RM = M / SS;
@@ -266,9 +263,9 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive");
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
-        BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a DynamicRegisterMatrix");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a DynamicRegisterMatrix");
 
-        IntrinsicType v_[RM][RN];
+        SimdVecType v_[RM][RN];
         size_t const m_;
         size_t const n_;
 
@@ -329,7 +326,7 @@ namespace blast
 
         if (IntType const rem = m_ % SS)
         {
-            MaskType const mask = SIMD::index() < rem;
+            MaskType const mask = indexSequence<T>() < rem;
             size_t const i = m_ / SS;
 
             for (size_t j = 0; j < n_ && j < columns(); ++j)
@@ -349,10 +346,10 @@ namespace blast
             {
                 IntType const skip = j - ri * SS;
                 IntType const rem = m_ - ri * SS;
-                MaskType mask = SIMD::index() < rem;
+                MaskType mask = indexSequence<T>() < rem;
 
                 if (skip > 0)
-                    mask &= SIMD::index() >= skip;
+                    mask &= indexSequence<T>() >= skip;
 
                 p(SS * ri, j).store(v_[ri][j], mask);
             }
@@ -490,7 +487,7 @@ namespace blast
     //             ax[i] = alpha * a.load(i * SS, 0);
 
     //         if (rem)
-    //             ax[ii] = alpha * a.load(ii * SS, 0, SIMD::index() < rem);
+    //             ax[ii] = alpha * a.load(ii * SS, 0, indexSequence<T>() < rem);
 
     //         #pragma unroll
     //         for (size_t j = 0; j < N; ++j)
@@ -598,4 +595,4 @@ namespace blast
         ker.axpy(beta, c);
         ker.store(d);
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp
index d4309043..631b530d 100644
--- a/include/blast/math/register_matrix/RegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/RegisterMatrix.hpp
@@ -4,8 +4,7 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/typetraits/VectorPointer.hpp>
 #include <blast/math/RowColumnVectorPointer.hpp>
@@ -20,8 +19,6 @@
 #include <blaze/util/Assert.h>
 #include <blaze/util/StaticAssert.h>
 
-#include <cmath>
-
 
 namespace blast
 {
@@ -117,7 +114,7 @@ namespace blast
         {
             for (size_t i = 0; i < RM; ++i)
                 for (size_t j = 0; j < N; ++j)
-                    v_[i][j] = setzero<T, SS>();
+                    v_[i][j].reset();
         }
 
 
@@ -360,14 +357,14 @@ namespace blast
 
 
     private:
-        using SIMD = Simd<T>;
-        using IntrinsicType = typename SIMD::IntrinsicType;
-        using MaskType = typename SIMD::MaskType;
-        using IntType = typename SIMD::IntType;
-        using SimdVecType = SimdVec<T>;
+        using Arch = xsimd::default_arch;
+        using SimdVecType = SimdVec<T, Arch>;
+        using IntrinsicType = typename SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<T, Arch>;
+        using IntType = typename SimdIndex<T, Arch>::value_type;
 
         // SIMD size
-        static size_t constexpr SS = Simd<T>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         // Numberf of SIMD registers required to store a single column of the matrix.
         static size_t constexpr RM = M / SS;
@@ -376,9 +373,9 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive");
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
-        BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a RegisterMatrix");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a RegisterMatrix");
 
-        IntrinsicType v_[RM][RN];
+        SimdVecType v_[RM][RN];
 
 
         /// @brief Reference to the matrix element at row \a i and column \a j
@@ -461,7 +458,7 @@ namespace blast
                     v_[i][j] = beta * p(SS * i, j).load();
 
                 if (size_t const rem = m % SS)
-                    v_[m / SS][j] = beta * p(m - rem, j).load(SIMD::index() < rem);
+                    v_[m / SS][j] = beta * p(m - rem, j).load(indexSequence<T, Arch>() < rem);
             }
         }
     }
@@ -493,7 +490,7 @@ namespace blast
 
         if (IntType const rem = m % SS)
         {
-            MaskType const mask = SIMD::index() < rem;
+            MaskType const mask = indexSequence<T, Arch>() < rem;
             size_t const i = m / SS;
 
             for (size_t j = 0; j < n && j < columns(); ++j)
@@ -514,7 +511,7 @@ namespace blast
 
             if (skip && ri < RM)
             {
-                MaskType const mask = SIMD::index() >= skip;
+                MaskType const mask = indexSequence<T, Arch>() >= skip;
                 p(SS * ri, j).store(v_[ri][j], mask);
                 ++ri;
             }
@@ -536,10 +533,10 @@ namespace blast
             {
                 IntType const skip = j - ri * SS;
                 IntType const rem = m - ri * SS;
-                MaskType mask = SIMD::index() < rem;
+                MaskType mask = indexSequence<T, Arch>() < rem;
 
                 if (skip > 0)
-                    mask &= SIMD::index() >= skip;
+                    mask &= indexSequence<T, Arch>() >= skip;
 
                 p(SS * ri, j).store(v_[ri][j], mask);
             }
@@ -562,7 +559,7 @@ namespace blast
                     #pragma unroll
                     for (size_t k = 0; k < j; ++k)
                     {
-                        IntrinsicType const a_kj = (~A)(k, j).broadcast();
+                        SimdVecType const a_kj = (~A)(k, j).broadcast();
 
                         #pragma unroll
                         for (size_t i = 0; i < RM; ++i)
@@ -596,7 +593,7 @@ namespace blast
         VectorPointer<PB, T> && (PB::transposeFlag == rowVector)
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::ger(T alpha, PA a, PB b) noexcept
     {
-        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v<T>), "Not enough registers for ger()");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()");
 
         SimdVecType ax[RM];
 
@@ -623,7 +620,7 @@ namespace blast
         VectorPointer<PB, T> && (PB::transposeFlag == rowVector)
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::ger(PA a, PB b) noexcept
     {
-        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v<T>), "Not enough registers for ger()");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()");
 
         SimdVecType ax[RM];
 
@@ -697,7 +694,7 @@ namespace blast
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::potrf() noexcept
     {
         static_assert(M >= N, "potrf() not implemented for register matrices with columns more than rows");
-        static_assert(RM * RN + 2 <= RegisterCapacity_v<T>, "Not enough registers");
+        static_assert(RM * RN + 2 <= registerCapacity(Arch {}), "Not enough registers");
 
         #pragma unroll
         for (size_t k = 0; k < N; ++k)
@@ -705,11 +702,11 @@ namespace blast
             #pragma unroll
             for (size_t j = 0; j < k; ++j)
             {
-                T const a_kj = v_[k / SS][j][k % SS];
+                SimdVecType const a_kj = v_[k / SS][j][k % SS];
 
                 #pragma unroll
                 for (size_t i = 0; i < RM; ++i) if (i >= k / SS)
-                    v_[i][k] = fnmadd(set1<SS>(a_kj), v_[i][j], v_[i][k]);
+                    v_[i][k] = fnmadd(a_kj, v_[i][j], v_[i][k]);
             }
 
             T const sqrt_a_kk = std::sqrt(v_[k / SS][k][k % SS]);
@@ -718,7 +715,7 @@ namespace blast
             for (size_t i = 0; i < RM; ++i)
             {
                 if (i < k / SS)
-                    v_[i][k] = setzero<T, SS>();
+                    v_[i][k].reset();
                 else
                     v_[i][k] /= sqrt_a_kk;
             }
@@ -745,7 +742,7 @@ namespace blast
                 ax[i] = alpha * a(i * SS, 0).load();
 
             if (rem)
-                ax[ii] = alpha * a(ii * SS, 0).load(SIMD::index() < rem);
+                ax[ii] = alpha * a(ii * SS, 0).load(indexSequence<T, Arch>() < rem);
 
             #pragma unroll
             for (size_t j = 0; j < N; ++j)
@@ -825,4 +822,4 @@ namespace blast
     {
         return rm == m;
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/register_matrix/double_12_4_4.hpp b/include/blast/math/register_matrix/double_12_4_4.hpp
deleted file mode 100644
index a640d2c1..00000000
--- a/include/blast/math/register_matrix/double_12_4_4.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <blaze/util/Exception.h>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 12, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-        {
-            v_[0][0] = beta * ptr.load();
-            v_[1][0] = beta * ptr(SS, 0).load();
-            v_[2][0] = beta * ptr(2 * SS, 0).load();
-        }
-
-        if (n > 1)
-        {
-            v_[0][1] = beta * ptr(0, 1).load();
-            v_[1][1] = beta * ptr(SS, 1).load();
-            v_[2][1] = beta * ptr(2 * SS, 1).load();
-        }
-
-        if (n > 2)
-        {
-            v_[0][2] = beta * ptr(0, 2).load();
-            v_[1][2] = beta * ptr(SS, 2).load();
-            v_[2][2] = beta * ptr(2 * SS, 2).load();
-        }
-
-        if (n > 3)
-        {
-            v_[0][3] = beta * ptr(0, 3).load();
-            v_[1][3] = beta * ptr(SS, 3).load();
-            v_[2][3] = beta * ptr(2 * SS, 3).load();
-        }
-    }
-
-
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 12, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        #pragma unroll
-        for (size_t i = 0; i < 3; ++i)
-        {
-            if (m >= 4 * i + 4)
-            {
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0]);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1]);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2]);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3]);
-            }
-            else if (m > 4 * i)
-            {
-                __m256i const mask = _mm256_set_epi64x(
-                    m > 4 * i + 3 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 2 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 1 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 0 ? 0x8000000000000000ULL : 0);
-
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0], mask);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1], mask);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2], mask);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3], mask);
-            }
-        }
-    }
-
-
-#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER
-    template <>
-    template <typename PB, typename PA>
-        requires MatrixPointer<PB, double> && (PB::storageOrder == columnMajor) && MatrixPointer<PA, double>
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 12, 4, columnMajor>::trmmRightLower(double alpha, PB b, PA a) noexcept
-    {
-        IntrinsicType bx[3], ax;
-
-        // k == 0
-        bx[0] = alpha * b.load(0 * SS, 0);
-        bx[1] = alpha * b.load(1 * SS, 0);
-        bx[2] = alpha * b.load(2 * SS, 0);
-        ax = a.broadcast(0, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-
-        // k == 1
-        bx[0] = alpha * b.load(0 * SS, 1);
-        bx[1] = alpha * b.load(1 * SS, 1);
-        bx[2] = alpha * b.load(2 * SS, 1);
-        ax = a.broadcast(1, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(1, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-
-        // k == 2
-        bx[0] = alpha * b.load(0 * SS, 2);
-        bx[1] = alpha * b.load(1 * SS, 2);
-        bx[2] = alpha * b.load(2 * SS, 2);
-        ax = a.broadcast(2, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(2, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-        ax = a.broadcast(2, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        v_[2][2] = fmadd(bx[2], ax, v_[2][2]);
-
-        // k == 3
-        bx[0] = alpha * b.load(0 * SS, 3);
-        bx[1] = alpha * b.load(1 * SS, 3);
-        bx[2] = alpha * b.load(2 * SS, 3);
-        ax = a.broadcast(3, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(3, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-        ax = a.broadcast(3, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        v_[2][2] = fmadd(bx[2], ax, v_[2][2]);
-        ax = a.broadcast(3, 3);
-        v_[0][3] = fmadd(bx[0], ax, v_[0][3]);
-        v_[1][3] = fmadd(bx[1], ax, v_[1][3]);
-        v_[2][3] = fmadd(bx[2], ax, v_[2][3]);
-    }
-#endif
-}
\ No newline at end of file
diff --git a/include/blast/math/register_matrix/double_4_4_4.hpp b/include/blast/math/register_matrix/double_4_4_4.hpp
deleted file mode 100644
index 1c73605c..00000000
--- a/include/blast/math/register_matrix/double_4_4_4.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <cmath>
-
-
-namespace blast
-{
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 4, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-            v_[0][0] = beta * ptr.load();
-
-        if (n > 1)
-            v_[0][1] = beta * ptr(0, 1).load();
-
-        if (n > 2)
-            v_[0][2] = beta * ptr(0, 2).load();
-
-        if (n > 3)
-            v_[0][3] = beta * ptr(0, 3).load();
-    }
-
-
-#if 1
-    /// Magically, this function specialization is slightly faster than the default implementation of RegisterMatrix<>::store.
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 4, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        if (m >= 4)
-        {
-            if (n > 0)
-                ptr.store(v_[0][0]);
-
-            if (n > 1)
-                ptr(0, 1).store(v_[0][1]);
-
-            if (n > 2)
-                ptr(0, 2).store(v_[0][2]);
-
-            if (n > 3)
-                ptr(0, 3).store(v_[0][3]);
-        }
-        else if (m > 0)
-        {
-            // Magically, the code below is significantly faster than this:
-            // __m256i const mask = _mm256_cmpgt_epi64(_mm256_set_epi64x(m, m, m, m), _mm256_set_epi64x(3, 2, 1, 0));
-            __m256i const mask = _mm256_set_epi64x(
-                m > 3 ? 0x8000000000000000ULL : 0,
-                m > 2 ? 0x8000000000000000ULL : 0,
-                m > 1 ? 0x8000000000000000ULL : 0,
-                m > 0 ? 0x8000000000000000ULL : 0);
-
-            if (n > 0)
-                ptr.store(v_[0][0], mask);
-
-            if (n > 1)
-                ptr(0, 1).store(v_[0][1], mask);
-
-            if (n > 2)
-                ptr(0, 2).store(v_[0][2], mask);
-
-            if (n > 3)
-                ptr(0, 3).store(v_[0][3], mask);
-        }
-    }
-#endif
-}
\ No newline at end of file
diff --git a/include/blast/math/register_matrix/double_8_4_4.hpp b/include/blast/math/register_matrix/double_8_4_4.hpp
deleted file mode 100644
index 4ec3d416..00000000
--- a/include/blast/math/register_matrix/double_8_4_4.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <blaze/util/Exception.h>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template<>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 8, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-        {
-            v_[0][0] = beta * ptr.load();
-            v_[1][0] = beta * ptr(SS, 0).load();
-        }
-
-        if (n > 1)
-        {
-            v_[0][1] = beta * ptr(0, 1).load();
-            v_[1][1] = beta * ptr(SS, 1).load();
-        }
-
-        if (n > 2)
-        {
-            v_[0][2] = beta * ptr(0, 2).load();
-            v_[1][2] = beta * ptr(SS, 2).load();
-        }
-
-        if (n > 3)
-        {
-            v_[0][3] = beta * ptr(0, 3).load();
-            v_[1][3] = beta * ptr(SS, 3).load();
-        }
-    }
-
-
-#if 1
-    template<>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 8, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        #pragma unroll
-        for (size_t i = 0; i < 2; ++i)
-        {
-            if (m >= 4 * i + 4)
-            {
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0]);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1]);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2]);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3]);
-            }
-            else if (m > 4 * i)
-            {
-                __m256i const mask = _mm256_set_epi64x(
-                    m > 4 * i + 3 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 2 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 1 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 0 ? 0x8000000000000000ULL : 0);
-
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0], mask);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1], mask);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2], mask);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3], mask);
-            }
-        }
-    }
-#endif
-
-
-#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER
-    template <>
-    template <typename PB, typename PA>
-        requires MatrixPointer<PB, double> && (PB::storageOrder == columnMajor) && MatrixPointer<PA, double>
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 8, 4, columnMajor>::trmmRightLower(double alpha, PB b, PA a) noexcept
-    {
-        IntrinsicType bx[2], ax;
-
-        // k == 0
-        bx[0] = alpha * b.load(0 * SS, 0);
-        bx[1] = alpha * b.load(1 * SS, 0);
-        ax = a.broadcast(0, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-
-        // k == 1
-        bx[0] = alpha * b.load(0 * SS, 1);
-        bx[1] = alpha * b.load(1 * SS, 1);
-        ax = a.broadcast(1, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(1, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-
-        // k == 2
-        bx[0] = alpha * b.load(0 * SS, 2);
-        bx[1] = alpha * b.load(1 * SS, 2);
-        ax = a.broadcast(2, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(2, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        ax = a.broadcast(2, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-
-        // k == 3
-        bx[0] = alpha * b.load(0 * SS, 3);
-        bx[1] = alpha * b.load(1 * SS, 3);
-        ax = a.broadcast(3, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(3, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        ax = a.broadcast(3, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        ax = a.broadcast(3, 3);
-        v_[0][3] = fmadd(bx[0], ax, v_[0][3]);
-        v_[1][3] = fmadd(bx[1], ax, v_[1][3]);
-    }
-#endif
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp
deleted file mode 100644
index 8e34d3b3..00000000
--- a/include/blast/math/simd/Avx256.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/avx256/SimdSize.hpp>
-#include <blast/math/simd/avx256/SimdVecFloat32.hpp>
-#include <blast/math/simd/avx256/SimdVecFloat64.hpp>
-#include <blast/math/simd/avx256/SimdVecInt32.hpp>
-#include <blast/math/simd/avx256/SimdVecInt64.hpp>
-// #include <blast/math/simd/avx256/SimdMask.hpp>
-#include <blast/math/simd/avx256/IntScalarType.hpp>
-
-#include <blast/math/simd/IntVecType.hpp>
diff --git a/include/blast/math/simd/Hsum.hpp b/include/blast/math/simd/Hsum.hpp
deleted file mode 100644
index 9c7b5f4b..00000000
--- a/include/blast/math/simd/Hsum.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    /// dst[127:64] = sum(v1)
-    /// dst[63:0] = sum(v0)
-    inline __m128d hsum(__m256d v0, __m256d v1)
-    {
-        __m256d v01 = _mm256_hadd_pd(v0, v1);
-        return _mm_add_pd(_mm256_extractf128_pd(v01, 1), _mm256_castpd256_pd128(v01));
-    }
-
-
-    /// dst[255:192] = sum(v3)
-    /// dst[191:128] = sum(v2)
-    /// dst[127:64] = sum(v1)
-    /// dst[63:0] = sum(v0)
-    inline __m256d hsum(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
-    {
-        __m128d v01 = hsum(v0, v1);
-        __m128d v23 = hsum(v2, v3);
-        return _mm256_insertf128_pd(_mm256_castpd128_pd256(v01), v23, 1);
-    }
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/IntVecType.hpp b/include/blast/math/simd/IntVecType.hpp
deleted file mode 100644
index 0f8faab7..00000000
--- a/include/blast/math/simd/IntVecType.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/IntScalarType.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-
-namespace blast
-{
-    template <size_t N>
-    using IntVecType_t = SimdVec<IntScalarType_t<N>>;
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/IntScalarType.hpp b/include/blast/math/simd/RegisterCapacity.hpp
similarity index 62%
rename from include/blast/math/simd/IntScalarType.hpp
rename to include/blast/math/simd/RegisterCapacity.hpp
index 3fb119ac..35b2ac7e 100644
--- a/include/blast/math/simd/IntScalarType.hpp
+++ b/include/blast/math/simd/RegisterCapacity.hpp
@@ -1,29 +1,32 @@
-// Copyright 2023 Mikhail Katliar
+// Copyright 2024 Mikhail Katliar
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     http://www.apache.org/licenses/LICENSE-2.0
+//     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #pragma once
 
+#include <blast/math/simd/Simd.hpp>
+
 #include <cstdlib>
-#include <cstdint>
 
 
 namespace blast
 {
-    template <size_t N>
-    struct IntScalarType;
-
-
-    template <size_t N>
-    using IntScalarType_t = typename IntScalarType<N>::type;
-}
\ No newline at end of file
+    /**
+     * @brief Number of available SIMD registers.
+     *
+     * @return Number of SIMD registers for AVX2
+     */
+    std::size_t constexpr registerCapacity(xsimd::avx2)
+    {
+        return 16;
+    }
+}
diff --git a/include/blast/math/simd/SequenceTag.hpp b/include/blast/math/simd/SequenceTag.hpp
deleted file mode 100644
index 8f37c552..00000000
--- a/include/blast/math/simd/SequenceTag.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-
-namespace blast :: simd
-{
-    struct SequenceTag {};
-
-    inline SequenceTag constexpr sequenceTag;
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
index 040f4ffe..b5bd8873 100644
--- a/include/blast/math/simd/Simd.hpp
+++ b/include/blast/math/simd/Simd.hpp
@@ -1,459 +1,20 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #pragma once
 
-#include <blast/math/simd/Avx256.hpp>
-
-#include <blaze/util/Types.h>
-#include <blaze/system/Inline.h>
-#include <blaze/math/AlignmentFlag.h>
-
-#include <immintrin.h>
-
-#include <cstdint>
-#include <type_traits>
-
-
-namespace blast
-{
-    using namespace blaze;
-
-
-    template <typename T>
-    struct Simd;
-
-
-    template <>
-    struct Simd<double>
-    {
-    private:
-        class Index;
-
-
-    public:
-        using IntrinsicType = __m256d;
-        using MaskType = __m256i;
-        using IntType = long long;
-
-        static size_t constexpr size = 4;
-        static size_t constexpr registerCapacity = 16;
-
-
-        //*******************************************************
-        //
-        // CUSTOM
-        //
-        //*******************************************************
-        static Index index() noexcept
-        {
-            return Index {};
-        }
-
-
-    private:
-        class Index
-        {
-        public:
-            MaskType operator<(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi64(_mm256_set1_epi64x(n), val_);
-            }
-
-
-            MaskType operator>(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi64(val_, _mm256_set1_epi64x(n));
-            }
-
-
-            MaskType operator<=(IntType n) const noexcept
-            {
-                return *this < n + 1;
-            }
-
-
-            MaskType operator>=(IntType n) const noexcept
-            {
-                return *this > n - 1;
-            }
-
-
-        private:
-            MaskType val_ = _mm256_set_epi64x(3, 2, 1, 0);
-        };
-    };
-
-
-    template <>
-    struct Simd<float>
-    {
-    private:
-        class Index;
-
-
-    public:
-        using IntrinsicType = __m256;
-        using MaskType = __m256i;
-        using IntType = int;
-
-        static size_t constexpr size = 8;
-        static size_t constexpr registerCapacity = 16;
-
-
-        //*******************************************************
-        //
-        // CUSTOM
-        //
-        //*******************************************************
-        static Index index() noexcept
-        {
-            return Index {};
-        }
-
-
-    private:
-        class Index
-        {
-        public:
-            MaskType operator<(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi32(_mm256_set1_epi32(n), val_);
-            }
-
-
-            MaskType operator>(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi32(val_, _mm256_set1_epi32(n));
-            }
-
-
-            MaskType operator<=(IntType n) const noexcept
-            {
-                return *this < n + 1;
-            }
-
-
-            MaskType operator>=(IntType n) const noexcept
-            {
-                return *this > n - 1;
-            }
-
-
-        private:
-            MaskType val_ = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-        };
-    };
-
-
-    template <typename T>
-    using IntrinsicType_t = typename Simd<T>::IntrinsicType;
-
-
-    template <typename T>
-    struct SimdTraits;
-
-
-    template <>
-    struct SimdTraits<__m256d>
-    {
-        using ScalarType = double;
-        using MaskType = __m256i;
-        using IntType = long long;
-        static size_t constexpr size = 4;
-    };
-
-
-    template <>
-    struct SimdTraits<__m256>
-    {
-        using ScalarType = float;
-        using MaskType = __m256i;
-        using IntType = int;
-        static size_t constexpr size = 8;
-    };
-
-
-    template <typename T>
-    using ScalarType_t = typename SimdTraits<T>::ScalarType;
-
-
-    template <typename T>
-    using MaskType_t = typename SimdTraits<T>::MaskType;
-
-    template <typename T>
-    using IntType_t = typename SimdTraits<T>::IntType;
-
-
-    template <typename T>
-    size_t constexpr RegisterCapacity_v = Simd<T>::registerCapacity;
-
-
-    //*******************************************************
-    //
-    // LOAD
-    //
-    //*******************************************************
-
-    template <bool AF, size_t SS, typename T>
-    auto load(T const * ptr);
-
-
-    template <bool AF, typename T>
-    inline auto load(T const * ptr)
-    {
-        return load<AF, Simd<T>::size>(ptr);
-    }
-
-
-    template <size_t SS, typename T>
-    auto broadcast(T const * ptr);
-
-
-    template <>
-    inline auto load<aligned, 4, double>(double const * ptr)
-    {
-        return _mm256_load_pd(ptr);
-    }
-
-
-    template <>
-    inline auto load<unaligned, 4, double>(double const * ptr)
-    {
-        return _mm256_loadu_pd(ptr);
-    }
-
-
-    template <>
-    inline auto load<aligned, 8, float>(float const * ptr)
-    {
-        return _mm256_load_ps(ptr);
-    }
-
-
-    template <>
-    inline auto load<unaligned, 8, float>(float const * ptr)
-    {
-        return _mm256_loadu_ps(ptr);
-    }
-
-
-    inline auto maskload(double const * ptr, __m256i mask)
-    {
-        return _mm256_maskload_pd(ptr, mask);
-    }
-
-
-    inline auto maskload(float const * ptr, __m256i mask)
-    {
-        return _mm256_maskload_ps(ptr, mask);
-    }
-
-
-    template <>
-    inline auto broadcast<4, double>(double const * ptr)
-    {
-        return _mm256_broadcast_sd(ptr);
-    }
-
-
-    template <>
-    inline auto broadcast<8, float>(float const * ptr)
-    {
-        return _mm256_broadcast_ss(ptr);
-    }
-
-
-    //*******************************************************
-    //
-    // STORE
-    //
-    //*******************************************************
-    template <bool AF>
-    void store(double * ptr, __m256d a);
-
-
-    template <>
-    inline void store<aligned>(double * ptr, __m256d a)
-    {
-        _mm256_store_pd(ptr, a);
-    }
-
-
-    template <>
-    inline void store<unaligned>(double * ptr, __m256d a)
-    {
-        _mm256_storeu_pd(ptr, a);
-    }
-
-
-    template <bool AF>
-    void store(float * ptr, __m256 a);
-
-
-    template <>
-    inline void store<aligned>(float * ptr, __m256 a)
-    {
-        _mm256_store_ps(ptr, a);
-    }
-
-
-    template <>
-    inline void store<unaligned>(float * ptr, __m256 a)
-    {
-        _mm256_storeu_ps(ptr, a);
-    }
-
-
-    inline void maskstore(double * ptr, __m256i m, __m256d a)
-    {
-        _mm256_maskstore_pd(ptr, m, a);
-    }
-
-
-    inline void maskstore(float * ptr, __m256i m, __m256 a)
-    {
-        _mm256_maskstore_ps(ptr, m, a);
-    }
-
-
-    //*******************************************************
-    //
-    // SET
-    //
-    //*******************************************************
-
-    inline auto set(double a3, double a2, double a1, double a0)
-    {
-        return _mm256_set_pd(a3, a2, a1, a0);
-    }
-
-
-    inline auto set(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
-    {
-        return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
-    }
-
-
-    inline auto set(long long a3, long long a2, long long a1, long long a0)
-    {
-        return _mm256_set_epi64x(a3, a2, a1, a0);
-    }
-
-
-    template <size_t N, typename T>
-    auto set1(T a);
-
-
-    inline auto set1(double a)
-    {
-        return _mm256_set1_pd(a);
-    }
-
-
-    inline auto set1(float a)
-    {
-        return _mm256_set1_ps(a);
-    }
-
-
-    template <>
-    inline auto set1<4, double>(double a)
-    {
-        return _mm256_set1_pd(a);
-    }
-
-
-    template <>
-    inline auto set1<8, float>(float a)
-    {
-        return _mm256_set1_ps(a);
-    }
-
-
-    template <>
-    inline auto set1<8, int>(int val)
-    {
-        return _mm256_set1_epi32(val);
-    }
-
-
-    template <>
-    inline auto set1<4, long long>(long long val)
-    {
-        return _mm256_set1_epi64x(val);
-    }
-
-
-    template <typename T, size_t N>
-    auto setzero();
-
-
-    template <>
-    inline auto setzero<double, 4>()
-    {
-        return _mm256_setzero_pd();
-    }
-
-
-    template <>
-    inline auto setzero<float, 8>()
-    {
-        return _mm256_setzero_ps();
-    }
-
-
-    //*******************************************************
-    //
-    // ARITHMETIC
-    //
-    //*******************************************************
-
-    inline auto fnmadd(__m256d a, __m256d b, __m256d c)
-    {
-        return _mm256_fnmadd_pd(a, b, c);
-    }
-
-
-    inline auto fnmadd(__m256 a, __m256 b, __m256 c)
-    {
-        return _mm256_fnmadd_ps(a, b, c);
-    }
-
-
-    inline auto abs(__m256d a)
-    {
-        return _mm256_andnot_pd(set1(-0.), a);
-    }
-
-
-    inline auto abs(__m256 a)
-    {
-        return _mm256_andnot_ps(set1(-0.f), a);
-    }
-
-
-    //*******************************************************
-    //
-    // COMPARE
-    //
-    //*******************************************************
-
-    template <size_t N, typename MM>
-    auto cmpgt(MM a, MM b);
-
-
-    template <>
-    inline auto cmpgt<4>(__m256i a, __m256i b)
-    {
-        return _mm256_cmpgt_epi64(a, b);
-    }
-
+#include <xsimd/xsimd.hpp>
 
-    template <>
-    inline auto cmpgt<8>(__m256i a, __m256i b)
-    {
-        return _mm256_cmpgt_epi32(a, b);
-    }
-}
\ No newline at end of file
+#if XSIMD_WITH_AVX2
+    #include <blast/math/simd/arch/Avx2.hpp>
+#endif
diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp
new file mode 100644
index 00000000..3d7ca7b2
--- /dev/null
+++ b/include/blast/math/simd/SimdIndex.hpp
@@ -0,0 +1,98 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma  once
+
+#include <blast/math/simd/Simd.hpp>
+
+#include <cstdint>
+#include <type_traits>
+
+
+namespace blast
+{
+    namespace detail
+    {
+        /// @brief Deduces integer type of given size, sized or unsigned.
+        ///
+        /// @tparam N type size in bytes
+        /// @tparam S true for signed type, false for unsigned
+        ///
+        template <std::size_t N, bool S>
+        struct IntOfSize;
+
+        template <std::size_t N, bool S>
+        using IntOfSize_t = typename IntOfSize<N, S>::Type;
+
+        template <>
+        struct IntOfSize<4, true>
+        {
+            using Type = std::int32_t;
+        };
+
+        template <>
+        struct IntOfSize<4, false>
+        {
+            using Type = std::uint32_t;
+        };
+
+        template <>
+        struct IntOfSize<8, true>
+        {
+            using Type = std::int64_t;
+        };
+
+        template <>
+        struct IntOfSize<8, false>
+        {
+            using Type = std::uint64_t;
+        };
+
+        template <typename T, typename Arch>
+        requires (xsimd::batch<T, Arch>::size == 4) && std::is_integral_v<T>
+        inline xsimd::batch<T, Arch> indexSequence(T start) noexcept
+        {
+            return {start, start + 1, start + 2, start + 3};
+        }
+
+        template <typename T, typename Arch = xsimd::default_arch>
+        requires (xsimd::batch<T, Arch>::size == 8) && std::is_integral_v<T>
+        inline xsimd::batch<T, Arch> indexSequence(T start) noexcept
+        {
+            return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7};
+        }
+    }
+
+
+    /// @brief Integer SIMD type that can be used for indexing or gather-scatter operations.
+    ///
+    /// @tparam T the deduced type will be the same size as @a T
+    /// @tparam Arch instruction set architecture
+    ///
+    template <typename T, typename Arch = xsimd::default_arch>
+    using SimdIndex = xsimd::batch<detail::IntOfSize_t<sizeof(T), true>, Arch>;
+
+
+    /// @brief Construct an integer index sequence
+    ///
+    /// @param start start of the sequence
+    ///
+    /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ]
+    ///   where N = SimdIndex<T, Arch>::size
+    ///
+    template <typename T, typename Arch = xsimd::default_arch>
+    inline SimdIndex<T, Arch> indexSequence(typename SimdIndex<T, Arch>::value_type start = 0) noexcept
+    {
+        return detail::indexSequence<detail::IntOfSize_t<sizeof(T), true>, Arch>(start);
+    }
+}
diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp
index 7e65c867..bdb26082 100644
--- a/include/blast/math/simd/SimdMask.hpp
+++ b/include/blast/math/simd/SimdMask.hpp
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <blast/math/simd/Simd.hpp>
+
 
 namespace blast
 {
@@ -22,7 +24,79 @@ namespace blast
      * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter.
      *
      * @tparam T the element type simd_mask applies on
+     * @tparam Arch instruction set architecture
      */
-    template <typename T>
-    class SimdMask;
-}
\ No newline at end of file
+    template <typename T, typename Arch = xsimd::default_arch>
+    class SimdMask
+    :   public xsimd::batch_bool<T, Arch>
+    {
+    public:
+        using XSimdType = xsimd::batch_bool<T, Arch>;
+        using IntrinsicType = typename XSimdType::register_type;
+
+        /**
+         * @brief Construct from an intrinsic register type
+         *
+         * @param v the value to construct from
+         */
+        SimdMask(IntrinsicType v) noexcept
+        :   XSimdType {v}
+        {
+        }
+
+        /**
+         * @brief Construct from an @a xsimd::batch_bool of the same type
+         *
+         * @param v the value to construct from
+         */
+        SimdMask(XSimdType const& v) noexcept
+        :   XSimdType {v}
+        {
+        }
+
+        /**
+         * @brief Construct from an @a xsimd::batch_bool of a different type
+         *
+         * We need to allow conversion from batch_bool<U, Arch> with U != T,
+         * because we need e.g. the following code to work:
+         *
+         * int n;
+         * MaskType<double> mask = indexSequence<double>() < n;
+         *
+         * In the code above, the result of indexSequence<double>() is a batch<int64_t>,
+         * and the result of indexSequence<double>() < n is a batch_bool<int64_t>,
+         * which can not be directly assigned to a batch_bool<double>,
+         * although their underlying register_type's are identical.
+         *
+         * @param v the value to construct from
+         */
+        template <typename U>
+        SimdMask(xsimd::batch_bool<U, Arch> const& v) noexcept
+        :   XSimdType {xsimd::batch_bool_cast<T>(v)}
+        {
+        }
+
+        /**
+         * @brief In-place logical and.
+         *
+         * We need to define this operator to make the following code work:
+         *
+         * MaskType<double> mask;
+         * int n;
+         * mask &= indexSequence<double>() >= n;
+         *
+         * In the code above, the result of indexSequence<double>() is a batch<int64_t>,
+         * and the result of indexSequence<double>() >= n is a batch_bool<int64_t>,
+         * which can not be directly used as a right-hand side operand of batch_bool<double>::operator&=().
+         * Defining operator&=(SimdMask const&) allows the right-hand side to be implicitly converted to SimdMask.
+         *
+         * @param v
+         * @return SimdMask&
+         */
+        SimdMask& operator&=(SimdMask const& v) noexcept
+        {
+            XSimdType::operator&=(v);
+            return *this;
+        }
+    };
+}
diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp
index 4e862f72..01a64825 100644
--- a/include/blast/math/simd/SimdSize.hpp
+++ b/include/blast/math/simd/SimdSize.hpp
@@ -1,4 +1,4 @@
-// Copyright 2023 Mikhail Katliar
+// Copyright 2023-2024 Mikhail Katliar
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,24 +14,20 @@
 
 #pragma once
 
+#include <blast/math/simd/Simd.hpp>
+
 #include <cstdlib>
+#include <type_traits>
 
 
 namespace blast
 {
-    template <typename T>
-    struct SimdSize;
-
-
-    template <typename T>
-    struct SimdSize<T const> : SimdSize<T> {};
-
-
     /**
-     * @brief Number of elements in a SimdVec<T> object.
+     * @brief Size of a SIMD register conatining scalars of a given type.
      *
      * @tparam T scalar type
+     * @tparam Arch instruction set architecture
      */
-    template <typename T>
-    size_t constexpr SimdSize_v = SimdSize<T>::value;
-}
\ No newline at end of file
+    template <typename T, typename Arch = xsimd::default_arch>
+    std::size_t constexpr SimdSize_v = xsimd::batch<std::remove_cv_t<T>, Arch>::size;
+}
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index a3195ff7..140516ef 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -14,15 +14,377 @@
 
 #pragma once
 
+#include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/Simd.hpp>
+
+#include <tuple>
+
 
 namespace blast
 {
+    template <typename T, typename Arch = xsimd::default_arch>
+    class SimdVec;
+
+    /**
+    * @brief Fused negative multiply-add
+    *
+    * Calculate -a * b + c
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    * @param c addendum
+    *
+    * @return @a a * @a b + @a c element-wise
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> fnmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept;
+
+    /**
+    * @brief Fused multiply-add
+    *
+    * Calculate a * b + c
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    * @param c addendum
+    *
+    * @return @a a * @a b + @a c element-wise
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> fmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept;
+
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdMask<T, Arch> const& mask) noexcept;
+
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> abs(SimdVec<T, Arch> const& a) noexcept;
+
+    /**
+    * @brief Vertical max (across two vectors)
+    *
+    * @param a first vector
+    * @param b second vector
+    *
+    * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> max(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Horizontal max (across all elements)
+    *
+    * @param a pack of 32-bit floating point numbers
+    *
+    * @return max(a[0], a[1], ..., a[N-1])
+    */
+    template <typename T, typename Arch>
+    T max(SimdVec<T, Arch> const& x) noexcept;
+
+    template <typename T, typename Arch>
+    SimdMask<T, Arch> operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Multiplication
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Left multiplication with a scalar
+    *
+    * @param a scalar multiplier
+    * @param b batch multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(T const& a, SimdVec<T, Arch> const& b) noexcept;
+
+
     /**
-     * @brief Data-parallel type with the element type bool.
-     * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter.
+    * @brief Right multiplication with a scalar
+    *
+    * @param a batch multiplier
+    * @param b scalar multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, T const& b) noexcept;
+
+
+    /**
+     * @brief Data-parallel type with a given element type.
      *
-     * @tparam T the element type simd_mask applies on
+     * @tparam T element type
      */
-    template <typename T>
-    class SimdVec;
-}
\ No newline at end of file
+    template <typename T, typename Arch>
+    class SimdVec
+    {
+    public:
+        using ValueType = T;
+        using XSimdType = xsimd::batch<T, Arch>;
+        using IntrinsicType = typename XSimdType::register_type;
+        using MaskType = SimdMask<T, Arch>;
+
+
+        /**
+         * @brief Set to [0, 0, 0, ...]
+         */
+        SimdVec() noexcept
+        :   value_ {T {}}
+        {
+        }
+
+
+        SimdVec(SimdVec const&) noexcept = default;
+
+
+        SimdVec(IntrinsicType value) noexcept
+        :   value_ {value}
+        {
+        }
+
+
+        SimdVec(XSimdType value) noexcept
+        :   value_ {value}
+        {
+        }
+
+
+        /**
+         * @brief Set to [value, value, ...]
+         *
+         * @param value value for each component of SIMD vector
+         */
+        SimdVec(ValueType value) noexcept
+        :   value_ {value}
+        {
+        }
+
+
+        /**
+         * @brief Load from location
+         *
+         * @param src memory location to load from
+         * @param aligned true indicates that an aligned read instruction should be used
+         */
+        explicit SimdVec(ValueType const * src, bool aligned) noexcept
+        :   value_ {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)}
+        {
+        }
+
+
+        /**
+         * @brief Masked load from location
+         *
+         * @param src memory location to load from
+         * @param mask load mask
+         * @param aligned true if @a src is SIMD-aligned
+         */
+        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
+        :   value_ {maskload(src, mask)}
+        {
+        }
+
+
+        /**
+         * @brief Number of elements in SIMD pack
+         */
+        static size_t constexpr size()
+        {
+            return XSimdType::size;
+        }
+
+
+        /**
+         * @brief Set to 0
+         */
+        void reset() noexcept
+        {
+            value_ = ValueType {};
+        }
+
+
+        operator IntrinsicType() const noexcept
+        {
+            return value_;
+        }
+
+
+        /**
+         * @brief Access single element
+         *
+         * @param i element index
+         *
+         * @return element value
+         */
+        ValueType operator[](size_t i) const noexcept
+        {
+            return value_.get(i);
+        }
+
+
+        /**
+         * @brief Store to memory
+         *
+         * @param dst memory location to store to
+         * @param aligned true if @a dst is SIMD-aligned
+         */
+        void store(ValueType * dst, bool aligned) const noexcept
+        {
+            if (aligned)
+                xsimd::store_aligned(dst, value_);
+            else
+                xsimd::store_unaligned(dst, value_);
+        }
+
+
+        /**
+         * @brief Masked store to memory
+         *
+         * @param dst memory location to store to
+         * @param mask store mask
+         * @param aligned true if @a dst is SIMD-aligned
+         */
+        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
+        {
+            maskstore(value_, dst, mask);
+        }
+
+
+        /**
+         * @brief In-place multiplication
+         *
+         * @param a multiplier
+         *
+         * @return @a *this after multiplication with @a a
+         */
+        SimdVec& operator*=(SimdVec const& a) noexcept
+        {
+            value_ *= a.value_;
+            return *this;
+        }
+
+
+        /**
+         * @brief In-place division
+         *
+         * @param a divisor
+         *
+         * @return @a *this after division by @a a
+         */
+        SimdVec& operator/=(SimdVec const& a) noexcept
+        {
+            value_ /= a.value_;
+            return *this;
+        }
+
+
+        friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
+        friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
+        friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType const& mask) noexcept;
+        friend SimdVec abs<>(SimdVec const& a) noexcept;
+        friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend ValueType max<>(SimdVec const& x) noexcept;
+        friend MaskType operator><>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept;
+
+        /**
+         * @brief Maximum element of a batch and its index.
+         *
+         * For example, imax([-1., 0., 4., 3.], [1, 2, -1, 42]) == ([4., 4., 4., 4.], [-1, -1, -1, -1])
+         *
+         * @param v batch of values
+         * @param idx batch of indices
+         *
+         * @return a tuple {A, I} such that A == [a, a, ..., a] and I == [idx[i], idx[i], ..., idx[i]],
+         *   where a == max_j(v[j]), i = argmax_j(v[j])
+         */
+        friend std::tuple<SimdVec, SimdIndex<T, Arch>> imax(SimdVec const& v, SimdIndex<T, Arch> const& idx) noexcept
+        {
+            return imax(v.value_, idx);
+        }
+
+    private:
+        XSimdType value_;
+    };
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> fmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept
+    {
+        return xsimd::fma(a.value_, b.value_, c.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> fnmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept
+    {
+        return xsimd::fnma(a.value_, b.value_, c.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdMask<T, Arch> const& mask) noexcept
+    {
+        return xsimd::select(mask, a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> abs(SimdVec<T, Arch> const& a) noexcept
+    {
+        return xsimd::abs(a.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdMask<T, Arch> operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::gt(a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::mul(a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(T const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::mul(a, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, T const& b) noexcept
+    {
+        return xsimd::mul(a.value_, b);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> max(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::max(a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline T max(SimdVec<T, Arch> const& x) noexcept
+    {
+        return xsimd::reduce_max(x.value_);
+    }
+}
diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp
deleted file mode 100644
index 365258f2..00000000
--- a/include/blast/math/simd/SimdVecBase.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <cstdlib>
-
-
-namespace blast
-{
-    template <typename T, typename I>
-    class SimdVecBase
-    {
-    public:
-        using ValueType = T;
-        using IntrinsicType = I;
-
-
-        /**
-         * @brief Number of elements in SIMD pack
-         */
-        static size_t constexpr size()
-        {
-            return SimdSize_v<T>;
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return value_;
-        }
-
-    protected:
-        SimdVecBase(IntrinsicType value)
-        :   value_ {value}
-        {
-        }
-
-        IntrinsicType value_;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp
new file mode 100644
index 00000000..b5aed8c1
--- /dev/null
+++ b/include/blast/math/simd/arch/Avx2.hpp
@@ -0,0 +1,123 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <xsimd/xsimd.hpp>
+
+#include <type_traits>
+
+
+namespace blast
+{
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<float, Arch> maskload(float const * src, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_ps(src, _mm256_castps_si256(mask));
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<double, Arch> maskload(double const * src, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_pd(src, _mm256_castpd_si256(mask));
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_ps(dst, _mm256_castps_si256(mask), v);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_pd(dst, _mm256_castpd_si256(mask), v);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<float, Arch>, xsimd::batch<std::int32_t, Arch>> imax(xsimd::batch<float, Arch> const& v1, xsimd::batch<std::int32_t, Arch> const& idx) noexcept
+    {
+        /* v2 = [G H E F | C D A B]                                                                         */
+        __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
+        __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
+
+        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
+        /* v3 = [W W Z Z | Y Y X X]                                                                         */
+        // __m256 v3 = _mm256_max_ps(v1, v2);
+        __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ);
+        __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3);
+        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3));
+
+        /* v4 = [Z Z W W | X X Y Y]                                                                         */
+        __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
+        __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
+
+        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
+        /* v5 = [J J J J | I I I I]                                                                         */
+        // __m256 v5 = _mm256_max_ps(v3, v4);
+        __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ);
+        __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5);
+        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5));
+
+        /* v6 = [I I I I | J J J J]                                                                         */
+        __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
+        __m256i const iv6 = _mm256_castps_si256(
+            _mm256_permute2f128_ps(
+                _mm256_castsi256_ps(iv5),
+                _mm256_castsi256_ps(iv5),
+                0b0000'0001
+            )
+        );
+
+        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
+        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
+        __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ);
+        __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7);
+        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7));
+
+        return {v7, iv7};
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<double, Arch>, xsimd::batch<std::int64_t, Arch>> imax(xsimd::batch<double, Arch> const& x, xsimd::batch<std::int64_t, Arch> const& idx) noexcept
+    {
+        __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
+        __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1);
+
+        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
+        __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ);
+        __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1);
+        __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1));
+
+        __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
+        __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
+
+        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
+        __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ);
+        __m256d const m = _mm256_blendv_pd(m1, m2, mask_m);
+        __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m));
+
+        return {m, im};
+    }
+}
diff --git a/include/blast/math/simd/avx256/IntScalarType.hpp b/include/blast/math/simd/avx256/IntScalarType.hpp
deleted file mode 100644
index 59785c85..00000000
--- a/include/blast/math/simd/avx256/IntScalarType.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/IntScalarType.hpp>
-
-
-namespace blast
-{
-    template <>
-    struct IntScalarType<4>
-    {
-        using type = std::int64_t;
-    };
-
-
-    template <>
-    struct IntScalarType<8>
-    {
-        using type = std::int32_t;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdMask.hpp b/include/blast/math/simd/avx256/SimdMask.hpp
deleted file mode 100644
index 8dbcea64..00000000
--- a/include/blast/math/simd/avx256/SimdMask.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <typename T>
-    class SimdMask
-    {
-        static size_t constexpr SS = SimdSize_v<T>;
-
-    public:
-        using ValueType = IntElementType_t<SS>;
-        using IntrinsicType = __m256i;
-        using MaskType = __m256i;
-
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdMask() noexcept
-        :   value_ {_mm256_setzero_pd()}
-        {
-        }
-
-
-        SimdMask(IntrinsicType value) noexcept
-        :   value_ {value}
-        {
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return value_;
-        }
-
-
-        /**
-         * @brief Number of elements in SIMD pack
-         */
-        static size_t constexpr size()
-        {
-            return SS;
-        }
-
-
-        /**
-         * @brief Access specified element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_[i];
-        }
-
-
-    private:
-        IntrinsicType value_;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdSize.hpp b/include/blast/math/simd/avx256/SimdSize.hpp
deleted file mode 100644
index 2fa30b2f..00000000
--- a/include/blast/math/simd/avx256/SimdSize.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <>
-    struct SimdSize<float>
-    {
-        static size_t constexpr value = 8;
-    };
-
-
-    template <>
-    struct SimdSize<double>
-    {
-        static size_t constexpr value = 4;
-    };
-
-
-    template <>
-    struct SimdSize<std::int32_t>
-    {
-        static size_t constexpr value = 8;
-    };
-
-
-    template <>
-    struct SimdSize<std::int64_t>
-    {
-        static size_t constexpr value = 4;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
deleted file mode 100644
index 34709629..00000000
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <tuple>
-
-
-namespace blast
-{
-    template <>
-    class SimdVec<float>
-    :   public SimdVecBase<float, __m256>
-    {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_ps()}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        /**
-         * @brief Set to [value, value, ...]
-         *
-         * TODO: add "explicit"
-         *
-         * @param value value for each component of SIMD vector
-         */
-        SimdVec(ValueType value) noexcept
-        :   SimdVecBase {_mm256_set1_ps(value)}
-        {
-        }
-
-
-        /**
-         * @brief Load from location
-         *
-         * @param src memory location to load from
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, bool aligned) noexcept
-        :   SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
-        {
-        }
-
-
-        /**
-         * @brief Masked load from location
-         *
-         * @param src memory location to load from
-         * @param mask load mask
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-        :   SimdVecBase {_mm256_maskload_ps(src, mask)}
-        {
-        }
-
-
-        /**
-         * @brief Store to memory
-         *
-         * @param dst memory location to store to
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, bool aligned) const noexcept
-        {
-            if (aligned)
-                _mm256_store_ps(dst, value_);
-            else
-                _mm256_storeu_ps(dst, value_);
-        }
-
-
-        /**
-         * @brief Masked store to memory
-         *
-         * @param dst memory location to store to
-         * @param mask store mask
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-        {
-            _mm256_maskstore_ps(dst, mask, value_);
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ));
-        }
-
-
-        /**
-         * @brief Multiplication
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         *
-         * @return product @a a * @a b
-         */
-        friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_mul_ps(a.value_, b.value_);
-        }
-
-
-        /**
-         * @brief Fused multiply-add
-         *
-         * Calculate a * b + c
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         * @param c addendum
-         *
-         * @return @a a * @a b + @a c element-wise
-         */
-        friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept
-        {
-            return _mm256_fmadd_ps(a.value_, b.value_, c.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask));
-        }
-
-
-        friend SimdVec abs(SimdVec const& a) noexcept
-        {
-            return _mm256_andnot_ps(SimdVec {-0.f}, a.value_);
-        }
-
-
-        /**
-        * @brief Vertical max (across two vectors)
-        *
-        * @param a first vector
-        * @param b second vector
-        *
-        * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
-        */
-        friend SimdVec max(SimdVec const& a, SimdVec const& b)
-        {
-            return _mm256_max_ps(a, b);
-        }
-
-
-        /**
-        * @brief Horizontal max (across all elements)
-        *
-        * The implementation is based on
-        * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
-        *
-        * @param a pack of 32-bit floating point numbers
-        *
-        * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7])
-        */
-        friend ValueType max(SimdVec const& x)
-        {
-            __m256 v1 = x.value_;                                          /* v1 = [H G F E | D C B A]                                                                         */
-            __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01);       /* v2 = [G H E F | C D A B]                                                                         */
-            __m256 v3 = _mm256_max_ps(v1, v2);                      /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-                                                                    /* v3 = [W W Z Z | Y Y X X]                                                                         */
-            __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10);       /* v4 = [Z Z W W | X X Y Y]                                                                         */
-            __m256 v5 = _mm256_max_ps(v3, v4);                      /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-                                                                    /* v5 = [J J J J | I I I I]                                                                         */
-            __m128 v6 = _mm256_extractf128_ps(v5, 1);               /* v6 = [- - - - | J J J J]                                                                         */
-            __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)]                                     */
-
-            return v7[0];
-        }
-
-
-        template <typename Index>
-        requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index>> imax(SimdVec const& v1, SimdVec<Index> const& idx)
-        {
-            /* v2 = [G H E F | C D A B]                                                                         */
-            SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
-            SimdVec<Index> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
-
-            /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-            /* v3 = [W W Z Z | Y Y X X]                                                                         */
-            // __m256 v3 = _mm256_max_ps(v1, v2);
-            MaskType const mask_v3 = v2 > v1;
-            SimdVec const v3 = blend(v1, v2, mask_v3);
-            SimdVec<Index> const iv3 = blend(idx, iv2, mask_v3);
-
-            /* v4 = [Z Z W W | X X Y Y]                                                                         */
-            SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
-            SimdVec<Index> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
-
-            /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-            /* v5 = [J J J J | I I I I]                                                                         */
-            // __m256 v5 = _mm256_max_ps(v3, v4);
-            MaskType const mask_v5 = v4 > v3;
-            SimdVec const v5 = blend(v3, v4, mask_v5);
-            SimdVec<Index> const iv5 = blend(iv3, iv4, mask_v5);
-
-            /* v6 = [I I I I | J J J J]                                                                         */
-            SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
-            SimdVec<Index> const iv6 = _mm256_castps_si256(
-                _mm256_permute2f128_ps(
-                    _mm256_castsi256_ps(iv5),
-                    _mm256_castsi256_ps(iv5),
-                    0b0000'0001
-                )
-            );
-
-            /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
-            // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
-            MaskType const mask_v7 = v6 > v5;
-            SimdVec const v7 = blend(v5, v6, mask_v7);
-            SimdVec<Index> const iv7 = blend(iv5, iv6, mask_v7);
-
-            return std::make_tuple(v7, iv7);
-        }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_[i];
-        }
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
deleted file mode 100644
index a5277ab6..00000000
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <tuple>
-
-
-namespace blast
-{
-    template <>
-    class SimdVec<double>
-    :   public SimdVecBase<double, __m256d>
-    {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_pd()}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        /**
-         * @brief Set to [value, value, ...]
-         *
-         * TODO: add "explicit"
-         *
-         * @param value value for each component of SIMD vector
-         */
-        SimdVec(ValueType value) noexcept
-        :   SimdVecBase {_mm256_set1_pd(value)}
-        {
-        }
-
-
-        /**
-         * @brief Load from location
-         *
-         * @param src memory location to load from
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, bool aligned) noexcept
-        :   SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
-        {
-        }
-
-
-        /**
-         * @brief Masked load from location
-         *
-         * @param src memory location to load from
-         * @param mask load mask
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-        :   SimdVecBase {_mm256_maskload_pd(src, mask)}
-        {
-        }
-
-
-        /**
-         * @brief Store to memory
-         *
-         * @param dst memory location to store to
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, bool aligned) const noexcept
-        {
-            if (aligned)
-                _mm256_store_pd(dst, value_);
-            else
-                _mm256_storeu_pd(dst, value_);
-        }
-
-
-        /**
-         * @brief Masked store to memory
-         *
-         * @param dst memory location to store to
-         * @param mask store mask
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-        {
-            _mm256_maskstore_pd(dst, mask, value_);
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ));
-        }
-
-
-        /**
-         * @brief Multiplication
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         *
-         * @return product @a a * @a b
-         */
-        friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_mul_pd(a.value_, b.value_);
-        }
-
-
-        /**
-         * @brief Fused multiply-add
-         *
-         * Calculate a * b + c
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         * @param c addendum
-         *
-         * @return @a a * @a b + @a c element-wise
-         */
-        friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept
-        {
-            return _mm256_fmadd_pd(a.value_, b.value_, c.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask));
-        }
-
-
-        friend SimdVec abs(SimdVec const& a) noexcept
-        {
-            return _mm256_andnot_pd(SimdVec {-0.}, a.value_);
-        }
-
-
-        /**
-        * @brief Vertical max (across two vectors)
-        *
-        * @param a first vector
-        * @param b second vector
-        *
-        * @return [max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
-        */
-        friend SimdVec max(SimdVec const& a, SimdVec const& b)
-        {
-            return _mm256_max_pd(a, b);
-        }
-
-
-        /**
-        * @brief Horizontal max (across all alements)
-        *
-        * The implementation is based on
-        * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
-        *
-        * @return max(x[0], x[1], x[2], x[3])
-        */
-        friend ValueType max(SimdVec x) noexcept
-        {
-            __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-            __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-            __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-            __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-
-            return m[0];
-        }
-
-
-        template <typename Index>
-        requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index>> imax(SimdVec const& x, SimdVec<Index> const& idx)
-        {
-            SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-            SimdVec<Index> const iy = _mm256_permute2f128_si256(idx, idx, 1);
-
-            // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-            MaskType const mask_m1 = y > x;
-            SimdVec const m1 = blend(x, y, mask_m1);
-            SimdVec<Index> const im1 = blend(idx, iy, mask_m1);
-
-            SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-            SimdVec<Index> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
-
-            // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-            MaskType const mask_m = m2 > m1;
-            SimdVec const m = blend(m1, m2, mask_m);
-            SimdVec<Index> const im = blend(im1, im2, mask_m);
-
-            return std::make_tuple(m, im);
-        }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_[i];
-        }
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp
deleted file mode 100644
index 8854f50f..00000000
--- a/include/blast/math/simd/avx256/SimdVecInt32.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/SequenceTag.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <>
-    class SimdVec<std::int32_t>
-    :   public SimdVecBase<std::int32_t, __m256i>
-    {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0)
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_si256()}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a, a, a, a, a, a, a, a)
-         */
-        SimdVec(ValueType a) noexcept
-        :   SimdVecBase {_mm256_set1_epi32(a)}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n)
-         *
-         * @param n
-         */
-        SimdVec(simd::SequenceTag, ValueType n = 0)
-        :   SimdVec {n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a7, a6, a5, a4, a3, a2, a1, a0),
-         * where a0 corresponds to the lower bits.
-         */
-        SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4,
-            ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        SimdVec& operator+=(ValueType x) noexcept
-        {
-            value_ = _mm256_add_epi32(value_, _mm256_set1_epi32(x));
-            return *this;
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_cmpgt_epi32(a.value_, b.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_epi8(a.value_, b.value_, mask);
-        }
-
-
-        friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept
-        {
-            return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n));
-        }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            alignas(IntrinsicType) ValueType v[size()];
-            _mm256_storeu_si256(reinterpret_cast<IntrinsicType *>(v), value_);
-
-            return v[i];
-        }
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp
deleted file mode 100644
index 6edf2863..00000000
--- a/include/blast/math/simd/avx256/SimdVecInt64.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/SequenceTag.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <>
-    class SimdVec<std::int64_t>
-    :   public SimdVecBase<std::int64_t, __m256i>
-    {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Initialize to (0, 0, 0, 0)
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_si256()}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a, a, a, a)
-         */
-        SimdVec(ValueType a) noexcept
-        :   SimdVecBase {_mm256_set1_epi64x(a)}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (n + 3, n + 2, n + 1, n)
-         *
-         * @param n
-         */
-        SimdVec(simd::SequenceTag, ValueType n = 0)
-        :   SimdVec {n + 3, n + 2, n + 1, n}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to [a3, a2, a1, a0],
-         * where a0 corresponds to the lower bits.
-         */
-        SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return value_;
-        }
-
-
-        SimdVec& operator+=(ValueType x) noexcept
-        {
-            value_ = _mm256_add_epi64(value_, _mm256_set1_epi64x(x));
-            return *this;
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_cmpgt_epi64(a.value_, b.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_epi8(a.value_, b.value_, mask);
-        }
-
-
-        friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept
-        {
-            return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n));
-        }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_[i];
-        }
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/views/row/Panel.hpp b/include/blast/math/views/row/Panel.hpp
index 5daadc0e..55c3f9f4 100644
--- a/include/blast/math/views/row/Panel.hpp
+++ b/include/blast/math/views/row/Panel.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/PanelMatrix.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
 #include <blast/math/constraints/Submatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/math/constraints/PanelMatrix.hpp>
 
@@ -17,9 +16,6 @@
 #include <blaze/util/constraints/Pointer.h>
 #include <blaze/util/constraints/Reference.h>
 
-#include <algorithm>
-#include <iterator>
-
 
 namespace blast
 {
@@ -195,4 +191,4 @@ namespace blast
         BLAST_CONSTRAINT_MUST_NOT_BE_PANEL_SUBMATRIX_TYPE(MT);
         //**********************************************************************************************
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/views/submatrix/Panel.hpp b/include/blast/math/views/submatrix/Panel.hpp
index c922bba7..50ba6a79 100644
--- a/include/blast/math/views/submatrix/Panel.hpp
+++ b/include/blast/math/views/submatrix/Panel.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/PanelMatrix.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
 #include <blast/math/constraints/Submatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/math/constraints/PanelMatrix.hpp>
 #include <blast/util/Assert.hpp>
@@ -25,9 +24,6 @@
 #include <blaze/util/Exception.h>
 #include <blaze/util/Assert.h>
 
-#include <algorithm>
-#include <iterator>
-
 
 namespace blast
 {
@@ -283,4 +279,4 @@ namespace blast
     {
         return PanelSubmatrix<MT, SO>(matrix.operand(), matrix.row() + row, matrix.column() + column, m, n);
     }
-}
\ No newline at end of file
+}
diff --git a/test/blast/CMakeLists.txt b/test/blast/CMakeLists.txt
index 5edafda1..fa479125 100644
--- a/test/blast/CMakeLists.txt
+++ b/test/blast/CMakeLists.txt
@@ -13,7 +13,6 @@ find_package(LAPACK REQUIRED)
 add_executable(test-blast
     math/simd/RegisterMatrixTest.cpp
     math/simd/DynamicRegisterMatrixTest.cpp
-    math/simd/HsumTest.cpp
     math/simd/SimdVecTest.cpp
 
     math/dense/StaticVectorPointerTest.cpp
diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp
index 13fd564e..c319a422 100644
--- a/test/blast/math/panel/StaticPanelMatrixTest.cpp
+++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp
@@ -3,9 +3,12 @@
 // license that can be found in the LICENSE file.
 
 #include <blast/math/StaticPanelMatrix.hpp>
+#include <blast/math/panel/MatrixPointer.hpp>
+#include <blast/math/simd/SimdVec.hpp>
 
 #include <blaze/Math.h>
 
+#include <blaze/math/AlignmentFlag.h>
 #include <test/Testing.hpp>
 #include <test/Randomize.hpp>
 
@@ -98,7 +101,7 @@ namespace blast :: testing
         for (size_t i = 0; i < M; i += SS)
             for (size_t j = 0; j < N; ++j)
             {
-                auto const xmm = A.template load<SS>(i, j);
+                auto const xmm = ptr<aligned>(A, i, j).load();
 
                 for (size_t k = 0; k < SS; ++k)
                     ASSERT_EQ(xmm[k], A(i + k, j)) << "element mismatch at i,j,k=" << i << "," << j << "," << k;
@@ -113,16 +116,18 @@ namespace blast :: testing
         size_t constexpr N = 3 * SS + 2;
 
         StaticPanelMatrix<TypeParam, M, N, columnMajor> A;
-        IntrinsicType_t<TypeParam> val;
 
+        std::vector<TypeParam> vec(SS);
         for (size_t i = 0; i < SS; ++i)
-            val[i] = TypeParam(i + 1);
+            vec[i] = i + 1;
+
+        SimdVec<TypeParam> val {vec.data(), false};
 
         for (size_t i = 0; i < M; i += SS)
             for (size_t j = 0; j < N; ++j)
             {
                 A = TypeParam(0.);
-                A.store(i, j, val);
+                ptr<aligned>(A, i, j).store(val);
 
                 for (size_t k = 0; k < SS && i + k < rows(A); ++k)
                     ASSERT_EQ(A(i + k, j), val[k]) << "element mismatch at i,j,k=" << i << "," << j << "," << k;
@@ -201,4 +206,4 @@ namespace blast :: testing
 
     INSTANTIATE_TYPED_TEST_SUITE_P(double, StaticPanelMatrixTest, double);
     INSTANTIATE_TYPED_TEST_SUITE_P(float, StaticPanelMatrixTest, float);
-}
\ No newline at end of file
+}
diff --git a/test/blast/math/simd/HsumTest.cpp b/test/blast/math/simd/HsumTest.cpp
deleted file mode 100644
index 3153e042..00000000
--- a/test/blast/math/simd/HsumTest.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <blast/math/simd/Hsum.hpp>
-
-#include <test/Testing.hpp>
-#include <test/Randomize.hpp>
-
-#include <numeric>
-
-using namespace blaze;
-
-
-namespace blast :: testing
-{
-    TEST(HsumTest, test4x256d)
-    {
-        std::array<double, 4> a, b, c, d, r;
-        randomize(a);
-        randomize(b);
-        randomize(c);
-        randomize(d);
-
-        __m256d mma = _mm256_loadu_pd(a.data());
-        __m256d mmb = _mm256_loadu_pd(b.data());
-        __m256d mmc = _mm256_loadu_pd(c.data());
-        __m256d mmd = _mm256_loadu_pd(d.data());
-
-        __m256d mmr = hsum(mma, mmb, mmc, mmd);
-        _mm256_storeu_pd(r.data(), mmr);
-
-        EXPECT_NEAR(r[0], std::accumulate(begin(a), end(a), 0.), 1.e-10);
-        EXPECT_NEAR(r[1], std::accumulate(begin(b), end(b), 0.), 1.e-10);
-        EXPECT_NEAR(r[2], std::accumulate(begin(c), end(c), 0.), 1.e-10);
-        EXPECT_NEAR(r[3], std::accumulate(begin(d), end(d), 0.), 1.e-10);
-    }
-}
\ No newline at end of file
diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp
index 90be9d1c..f5b2fa04 100644
--- a/test/blast/math/simd/SimdVecTest.cpp
+++ b/test/blast/math/simd/SimdVecTest.cpp
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <blast/math/simd/Avx256.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
 
 #include <blaze/Math.h>
 
@@ -64,9 +64,10 @@ namespace blast :: testing
     TYPED_TEST(SimdVecTest, testImax)
     {
         using Scalar = TypeParam;
-        size_t constexpr SS = SimdSize_v<Scalar>;
+        using SimdType = SimdVec<Scalar>;
+        size_t constexpr SS = SimdType::size();
 
-        IntVecType_t<SS> idx {simd::sequenceTag};
+        SimdIndex<Scalar> const idx = indexSequence<Scalar>();
 
         for (int i = 0; i < 100; ++i)
         {
@@ -75,15 +76,15 @@ namespace blast :: testing
             SimdVec<Scalar> const v {a.data(), false};
 
             SimdVec<Scalar> v_max;
-            IntVecType_t<SS> idx_max;
+            SimdIndex<Scalar> idx_max;
             std::tie(v_max, idx_max) = imax(v, idx);
 
             auto const max_el = std::max_element(a.begin(), a.end());
-            for (size_t i = 0; i < idx_max.size(); ++i)
-                ASSERT_EQ(idx_max[i], std::distance(a.begin(), max_el)) << "Error at element " << i;
+            for (size_t i = 0; i < idx_max.size; ++i)
+                ASSERT_EQ(idx_max.get(i), std::distance(a.begin(), max_el)) << "Error at element " << i;
 
             for (size_t i = 0; i < v_max.size(); ++i)
                 ASSERT_EQ(v_max[i], *max_el) << "Error at element " << i;
         }
     }
-}
\ No newline at end of file
+}