From 0b9550a3b998b856097e4b80f8c71996b4edc995 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Tue, 23 Jul 2024 16:05:39 +0200
Subject: [PATCH 01/27] Removed load() and store() from StaticPanelMatrix

---
 .../blast/math/panel/StaticPanelMatrix.hpp    | 39 -------------------
 .../math/panel/StaticPanelMatrixTest.cpp      |  6 ++-
 2 files changed, 4 insertions(+), 41 deletions(-)
diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp
index cfeda753..b1d2a624 100644
--- a/include/blast/math/panel/StaticPanelMatrix.hpp
+++ b/include/blast/math/panel/StaticPanelMatrix.hpp
@@ -160,45 +160,6 @@ namespace blast
         }
 
 
-        Type * ptr(size_t i, size_t j)
-        {
-            // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary");
-            return v_ + elementIndex(i, j);
-        }
-
-
-        Type const * ptr(size_t i, size_t j) const
-        {
-            // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary");
-            return v_ + elementIndex(i, j);
-        }
-
-
-        template <size_t SS>
-        auto load(size_t i, size_t j) const
-        {
-            BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index");
-            BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index");
-            BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary");
-            BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary");
-
-            return blast::load<aligned, SS>(v_ + elementIndex(i, j));
-        }
-
-
-        template <typename T>
-        void store(size_t i, size_t j, T val)
-        {
-            BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index");
-            BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index");
-            BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary");
-            BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary");
-
-            // We never use maskstore here because we have padding
-            blast::store<aligned>(v_ + elementIndex(i, j), val);
-        }
-
-
     private:
         static size_t constexpr panelSize_ = PanelSize_v<Type>;
         static size_t constexpr tileRows_ = M / panelSize_ + (M % panelSize_ > 0);
diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp
index 13fd564e..c5db87f3 100644
--- a/test/blast/math/panel/StaticPanelMatrixTest.cpp
+++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp
@@ -3,9 +3,11 @@
 // license that can be found in the LICENSE file.
 
 #include <blast/math/StaticPanelMatrix.hpp>
+#include <blast/math/panel/MatrixPointer.hpp>
 
 #include <blaze/Math.h>
 
+#include <blaze/math/AlignmentFlag.h>
 #include <test/Testing.hpp>
 #include <test/Randomize.hpp>
 
@@ -98,7 +100,7 @@ namespace blast :: testing
         for (size_t i = 0; i < M; i += SS)
             for (size_t j = 0; j < N; ++j)
             {
-                auto const xmm = A.template load<SS>(i, j);
+                auto const xmm = ptr<aligned>(A, i, j).load();
 
                 for (size_t k = 0; k < SS; ++k)
                     ASSERT_EQ(xmm[k], A(i + k, j)) << "element mismatch at i,j,k=" << i << "," << j << "," << k;
@@ -122,7 +124,7 @@ namespace blast :: testing
             for (size_t j = 0; j < N; ++j)
             {
                 A = TypeParam(0.);
-                A.store(i, j, val);
+                ptr<aligned>(A, i, j).store(val);
 
                 for (size_t k = 0; k < SS && i + k < rows(A); ++k)
                     ASSERT_EQ(A(i + k, j), val[k]) << "element mismatch at i,j,k=" << i << "," << j << "," << k;

From b6eedca6d70d4ccef1cc7b2d087aaf2c6d1e0154 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Tue, 23 Jul 2024 17:16:12 +0200
Subject: [PATCH 02/27] Removed load() functions from Simd.hpp

---
 .../blast/math/expressions/PMatTransExpr.hpp  |  3 +-
 .../blast/math/expressions/PanelMatrix.hpp    | 22 +++++-----
 include/blast/math/simd/Simd.hpp              | 40 +------------------
 3 files changed, 14 insertions(+), 51 deletions(-)

diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp
index fae79ffc..4ad4526f 100644
--- a/include/blast/math/expressions/PMatTransExpr.hpp
+++ b/include/blast/math/expressions/PMatTransExpr.hpp
@@ -12,7 +12,6 @@
 #include <blaze/math/constraints/StorageOrder.h>
 #include <blaze/math/Exception.h>
 #include <blaze/math/expressions/Computation.h>
-#include <blaze/math/expressions/DMatTransposer.h>
 #include <blaze/math/expressions/Forward.h>
 #include <blaze/math/expressions/MatTransExpr.h>
 #include <blaze/math/expressions/Transformation.h>
@@ -213,4 +212,4 @@ namespace blast
         using ReturnType = const PMatTransExpr<MT, !SO>;
         return ReturnType(*pm);
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp
index 4a556a33..1df9b97d 100644
--- a/include/blast/math/expressions/PanelMatrix.hpp
+++ b/include/blast/math/expressions/PanelMatrix.hpp
@@ -7,6 +7,8 @@
 #include <blast/math/typetraits/IsPanelMatrix.hpp>
 #include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
+#include <blast/math/panel/MatrixPointer.hpp>
+#include <blast/math/dense/MatrixPointer.hpp>
 //#include <blast/math/RegisterMatrix.hpp>
 
 #include <blaze/math/ReductionFlag.h>
@@ -108,22 +110,22 @@ namespace blast
 
 			for (size_t i = 0; i + SS <= m; i += SS)
             {
-                ET2 const * pr = &(*rhs)(i, 0);
-                ET1 * pl = data(lhs) + i;
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    store<aligned>(pl + s * j, load<aligned, SS>(pr + PANEL_SIZE * j));
+                    pl(0, j).store(pr(0, j).load());
             }
 
             if (IntType const rem = m % SS)
             {
                 MaskType const mask = SIMD::index() < rem;
                 size_t const i = m - rem;
-                ET2 const * pr = &(*rhs)(i, 0);
-                ET1 * pl = data(lhs) + i;
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    maskstore(pl + s * j, mask, load<aligned, SS>(pr + PANEL_SIZE * j));
+                    pl(0, j).store(pr(0, j).load(), mask);
             }
 
 		#if 0
@@ -196,11 +198,11 @@ namespace blast
         {
 			for (size_t i = 0; i + SS <= m; i += SS)
             {
-                ET2 const * pr = data(rhs) + i;
-                ET1 * pl = &(*lhs)(i, 0);
+                auto pr = ptr<aligned>(*rhs, i, 0);
+                auto pl = ptr<aligned>(*lhs, i, 0);
 
                 for (size_t j = 0; j < n; ++j)
-                    store<aligned>(pl + PANEL_SIZE * j, load<aligned, SS>(pr + s * j));
+                    pl(0, j).store(pr(0, j).load());
             }
 
             if (IntType const rem = m % SS)
@@ -243,4 +245,4 @@ namespace blast
                     (*lhs)(i, j) = (*rhs)(i, j);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
index 040f4ffe..c5d48bc7 100644
--- a/include/blast/math/simd/Simd.hpp
+++ b/include/blast/math/simd/Simd.hpp
@@ -196,49 +196,11 @@ namespace blast
     //
     //*******************************************************
 
-    template <bool AF, size_t SS, typename T>
-    auto load(T const * ptr);
-
-
-    template <bool AF, typename T>
-    inline auto load(T const * ptr)
-    {
-        return load<AF, Simd<T>::size>(ptr);
-    }
-
 
     template <size_t SS, typename T>
     auto broadcast(T const * ptr);
 
 
-    template <>
-    inline auto load<aligned, 4, double>(double const * ptr)
-    {
-        return _mm256_load_pd(ptr);
-    }
-
-
-    template <>
-    inline auto load<unaligned, 4, double>(double const * ptr)
-    {
-        return _mm256_loadu_pd(ptr);
-    }
-
-
-    template <>
-    inline auto load<aligned, 8, float>(float const * ptr)
-    {
-        return _mm256_load_ps(ptr);
-    }
-
-
-    template <>
-    inline auto load<unaligned, 8, float>(float const * ptr)
-    {
-        return _mm256_loadu_ps(ptr);
-    }
-
-
     inline auto maskload(double const * ptr, __m256i mask)
     {
         return _mm256_maskload_pd(ptr, mask);
@@ -456,4 +418,4 @@ namespace blast
     {
         return _mm256_cmpgt_epi32(a, b);
     }
-}
\ No newline at end of file
+}

From 231d1759fa788e2e439ea43bfe6f033d699e3836 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 25 Jul 2024 23:05:37 +0200
Subject: [PATCH 03/27] SimdVec implementation using xsimd

---
 CMakeLists.txt                                |  6 +++++
 include/blast/math/simd/Simd.hpp              | 19 +++++++--------
 include/blast/math/simd/SimdVec.hpp           | 11 +++++----
 include/blast/math/simd/SimdVecBase.hpp       | 14 +++++++----
 .../blast/math/simd/avx256/SimdVecFloat32.hpp | 24 ++++++++++---------
 .../blast/math/simd/avx256/SimdVecFloat64.hpp | 18 +++++++-------
 .../blast/math/simd/avx256/SimdVecInt32.hpp   | 11 ++++-----
 .../blast/math/simd/avx256/SimdVecInt64.hpp   |  8 +++----
 8 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index faffe2fd..aa1db7b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ set(CMAKE_DEBUG_POSTFIX d)
 
 find_package(Boost REQUIRED COMPONENTS exception)
 find_package(blaze REQUIRED)
+find_package(xsimd REQUIRED)
 
 add_library(blast INTERFACE)
 
@@ -35,6 +36,7 @@ target_include_directories(blast INTERFACE
 
 target_link_libraries(blast
     INTERFACE blaze::blaze
+    INTERFACE xsimd
 )
 
 target_compile_options(blast
@@ -44,6 +46,10 @@ target_compile_options(blast
     INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4"
 )
 
+target_compile_definitions(blast
+    INTERFACE "XSIMD_DEFAULT_ARCH=avx2"
+)
+
 # BLAST_WITH_BLASFEO
 option(BLAST_WITH_BLASFEO "Build blasfeo C++ interface")
 
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
index c5d48bc7..841f7571 100644
--- a/include/blast/math/simd/Simd.hpp
+++ b/include/blast/math/simd/Simd.hpp
@@ -10,10 +10,9 @@
 #include <blaze/system/Inline.h>
 #include <blaze/math/AlignmentFlag.h>
 
-#include <immintrin.h>
+#include <xsimd/xsimd.hpp>
 
-#include <cstdint>
-#include <type_traits>
+#include <immintrin.h>
 
 
 namespace blast
@@ -21,12 +20,12 @@ namespace blast
     using namespace blaze;
 
 
-    template <typename T>
+    template <typename T, typename Arch = xsimd::default_arch>
     struct Simd;
 
 
     template <>
-    struct Simd<double>
+    struct Simd<double, xsimd::avx2>
     {
     private:
         class Index;
@@ -87,7 +86,7 @@ namespace blast
 
 
     template <>
-    struct Simd<float>
+    struct Simd<float, xsimd::avx2>
     {
     private:
         class Index;
@@ -147,8 +146,8 @@ namespace blast
     };
 
 
-    template <typename T>
-    using IntrinsicType_t = typename Simd<T>::IntrinsicType;
+    template <typename T, typename Arch = xsimd::default_arch>
+    using IntrinsicType_t = typename Simd<T, Arch>::IntrinsicType;
 
 
     template <typename T>
@@ -186,8 +185,8 @@ namespace blast
     using IntType_t = typename SimdTraits<T>::IntType;
 
 
-    template <typename T>
-    size_t constexpr RegisterCapacity_v = Simd<T>::registerCapacity;
+    template <typename T, typename Arch = xsimd::default_arch>
+    size_t constexpr RegisterCapacity_v = Simd<T, Arch>::registerCapacity;
 
 
     //*******************************************************
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index a3195ff7..f65969f9 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -14,15 +14,16 @@
 
 #pragma once
 
+#include <xsimd/xsimd.hpp>
+
 
 namespace blast
 {
     /**
-     * @brief Data-parallel type with the element type bool.
-     * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter.
+     * @brief Data-parallel type with a given element type.
      *
-     * @tparam T the element type simd_mask applies on
+     * @tparam T element type
      */
-    template <typename T>
+    template <typename T, typename Arch = xsimd::default_arch>
     class SimdVec;
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp
index 365258f2..b041fc66 100644
--- a/include/blast/math/simd/SimdVecBase.hpp
+++ b/include/blast/math/simd/SimdVecBase.hpp
@@ -16,17 +16,21 @@
 
 #include <blast/math/simd/SimdSize.hpp>
 
+#include <xsimd/xsimd.hpp>
+
 #include <cstdlib>
 
 
 namespace blast
 {
-    template <typename T, typename I>
+    template <typename T, typename Arch>
     class SimdVecBase
     {
+        using XSimdType = xsimd::batch<T, Arch>;
+
     public:
         using ValueType = T;
-        using IntrinsicType = I;
+        using IntrinsicType = typename XSimdType::register_type;
 
 
         /**
@@ -34,7 +38,7 @@ namespace blast
          */
         static size_t constexpr size()
         {
-            return SimdSize_v<T>;
+            return XSimdType::size;
         }
 
 
@@ -49,6 +53,6 @@ namespace blast
         {
         }
 
-        IntrinsicType value_;
+        XSimdType value_;
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index 34709629..74e1bbf0 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -26,8 +26,8 @@
 namespace blast
 {
     template <>
-    class SimdVec<float>
-    :   public SimdVecBase<float, __m256>
+    class SimdVec<float, xsimd::avx2>
+    :   public SimdVecBase<float, xsimd::avx2>
     {
     public:
         using MaskType = __m256i;
@@ -204,33 +204,33 @@ namespace blast
 
         template <typename Index>
         requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index>> imax(SimdVec const& v1, SimdVec<Index> const& idx)
+        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& v1, SimdVec<Index, xsimd::avx2> const& idx)
         {
             /* v2 = [G H E F | C D A B]                                                                         */
             SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
-            SimdVec<Index> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
+            SimdVec<Index, xsimd::avx2> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
 
             /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
             /* v3 = [W W Z Z | Y Y X X]                                                                         */
             // __m256 v3 = _mm256_max_ps(v1, v2);
             MaskType const mask_v3 = v2 > v1;
             SimdVec const v3 = blend(v1, v2, mask_v3);
-            SimdVec<Index> const iv3 = blend(idx, iv2, mask_v3);
+            SimdVec<Index, xsimd::avx2> const iv3 = blend(idx, iv2, mask_v3);
 
             /* v4 = [Z Z W W | X X Y Y]                                                                         */
             SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
-            SimdVec<Index> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
+            SimdVec<Index, xsimd::avx2> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
 
             /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
             /* v5 = [J J J J | I I I I]                                                                         */
             // __m256 v5 = _mm256_max_ps(v3, v4);
             MaskType const mask_v5 = v4 > v3;
             SimdVec const v5 = blend(v3, v4, mask_v5);
-            SimdVec<Index> const iv5 = blend(iv3, iv4, mask_v5);
+            SimdVec<Index, xsimd::avx2> const iv5 = blend(iv3, iv4, mask_v5);
 
             /* v6 = [I I I I | J J J J]                                                                         */
             SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
-            SimdVec<Index> const iv6 = _mm256_castps_si256(
+            SimdVec<Index, xsimd::avx2> const iv6 = _mm256_castps_si256(
                 _mm256_permute2f128_ps(
                     _mm256_castsi256_ps(iv5),
                     _mm256_castsi256_ps(iv5),
@@ -242,7 +242,7 @@ namespace blast
             // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
             MaskType const mask_v7 = v6 > v5;
             SimdVec const v7 = blend(v5, v6, mask_v7);
-            SimdVec<Index> const iv7 = blend(iv5, iv6, mask_v7);
+            SimdVec<Index, xsimd::avx2> const iv7 = blend(iv5, iv6, mask_v7);
 
             return std::make_tuple(v7, iv7);
         }
@@ -251,13 +251,15 @@ namespace blast
         /**
          * @brief Access single element
          *
+         * TODO: move to SimdVecBase
+         *
          * @param i element index
          *
          * @return element value
          */
         ValueType operator[](size_t i) const noexcept
         {
-            return value_[i];
+            return value_.get(i);
         }
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index a5277ab6..9d85640f 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -26,8 +26,8 @@
 namespace blast
 {
     template <>
-    class SimdVec<double>
-    :   public SimdVecBase<double, __m256d>
+    class SimdVec<double, xsimd::avx2>
+    :   public SimdVecBase<double, xsimd::avx2>
     {
     public:
         using MaskType = __m256i;
@@ -197,23 +197,23 @@ namespace blast
 
         template <typename Index>
         requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index>> imax(SimdVec const& x, SimdVec<Index> const& idx)
+        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& x, SimdVec<Index, xsimd::avx2> const& idx)
         {
             SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-            SimdVec<Index> const iy = _mm256_permute2f128_si256(idx, idx, 1);
+            SimdVec<Index, xsimd::avx2> const iy = _mm256_permute2f128_si256(idx, idx, 1);
 
             // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
             MaskType const mask_m1 = y > x;
             SimdVec const m1 = blend(x, y, mask_m1);
-            SimdVec<Index> const im1 = blend(idx, iy, mask_m1);
+            SimdVec<Index, xsimd::avx2> const im1 = blend(idx, iy, mask_m1);
 
             SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-            SimdVec<Index> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
+            SimdVec<Index, xsimd::avx2> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
 
             // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
             MaskType const mask_m = m2 > m1;
             SimdVec const m = blend(m1, m2, mask_m);
-            SimdVec<Index> const im = blend(im1, im2, mask_m);
+            SimdVec<Index, xsimd::avx2> const im = blend(im1, im2, mask_m);
 
             return std::make_tuple(m, im);
         }
@@ -228,7 +228,7 @@ namespace blast
          */
         ValueType operator[](size_t i) const noexcept
         {
-            return value_[i];
+            return value_.get(i);
         }
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp
index 8854f50f..af2c932e 100644
--- a/include/blast/math/simd/avx256/SimdVecInt32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp
@@ -27,8 +27,8 @@
 namespace blast
 {
     template <>
-    class SimdVec<std::int32_t>
-    :   public SimdVecBase<std::int32_t, __m256i>
+    class SimdVec<std::int32_t, xsimd::avx2>
+    :   public SimdVecBase<std::int32_t, xsimd::avx2>
     {
     public:
         using MaskType = __m256i;
@@ -113,10 +113,7 @@ namespace blast
          */
         ValueType operator[](size_t i) const noexcept
         {
-            alignas(IntrinsicType) ValueType v[size()];
-            _mm256_storeu_si256(reinterpret_cast<IntrinsicType *>(v), value_);
-
-            return v[i];
+            return value_.get(i);
         }
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp
index 6edf2863..26f75fa2 100644
--- a/include/blast/math/simd/avx256/SimdVecInt64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp
@@ -27,8 +27,8 @@
 namespace blast
 {
     template <>
-    class SimdVec<std::int64_t>
-    :   public SimdVecBase<std::int64_t, __m256i>
+    class SimdVec<std::int64_t, xsimd::avx2>
+    :   public SimdVecBase<std::int64_t, xsimd::avx2>
     {
     public:
         using MaskType = __m256i;
@@ -118,7 +118,7 @@ namespace blast
          */
         ValueType operator[](size_t i) const noexcept
         {
-            return value_[i];
+            return value_.get(i);
         }
     };
-}
\ No newline at end of file
+}

From be5b0fd55303609044fe283282f70084a128e62a Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 25 Jul 2024 23:22:13 +0200
Subject: [PATCH 04/27] Moved operator[] to SimdVecBase

---
 include/blast/math/simd/SimdVecBase.hpp           | 14 ++++++++++++++
 include/blast/math/simd/avx256/SimdVecFloat32.hpp | 15 ---------------
 include/blast/math/simd/avx256/SimdVecFloat64.hpp | 13 -------------
 include/blast/math/simd/avx256/SimdVecInt32.hpp   | 13 -------------
 include/blast/math/simd/avx256/SimdVecInt64.hpp   | 13 -------------
 5 files changed, 14 insertions(+), 54 deletions(-)

diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp
index b041fc66..4ee90c5a 100644
--- a/include/blast/math/simd/SimdVecBase.hpp
+++ b/include/blast/math/simd/SimdVecBase.hpp
@@ -47,6 +47,20 @@ namespace blast
             return value_;
         }
 
+
+        /**
+         * @brief Access single element
+         *
+         * @param i element index
+         *
+         * @return element value
+         */
+        ValueType operator[](size_t i) const noexcept
+        {
+            return value_.get(i);
+        }
+
+
     protected:
         SimdVecBase(IntrinsicType value)
         :   value_ {value}
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index 74e1bbf0..7e32e985 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -246,20 +246,5 @@ namespace blast
 
             return std::make_tuple(v7, iv7);
         }
-
-
-        /**
-         * @brief Access single element
-         *
-         * TODO: move to SimdVecBase
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_.get(i);
-        }
     };
 }
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index 9d85640f..a17b36e1 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -217,18 +217,5 @@ namespace blast
 
             return std::make_tuple(m, im);
         }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_.get(i);
-        }
     };
 }
diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp
index af2c932e..15db3b4b 100644
--- a/include/blast/math/simd/avx256/SimdVecInt32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp
@@ -102,18 +102,5 @@ namespace blast
         {
             return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n));
         }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_.get(i);
-        }
     };
 }
diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp
index 26f75fa2..f383016d 100644
--- a/include/blast/math/simd/avx256/SimdVecInt64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp
@@ -107,18 +107,5 @@ namespace blast
         {
             return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n));
         }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_.get(i);
-        }
     };
 }

From e27037d18ca3550d9ac780be9fe176a2b5f29e8b Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 26 Jul 2024 14:56:52 +0200
Subject: [PATCH 05/27] Generic member functions in SimdVec

---
 .../register_matrix/DynamicRegisterMatrix.hpp |   4 +-
 .../math/register_matrix/RegisterMatrix.hpp   |   4 +-
 include/blast/math/simd/SimdVec.hpp           | 289 ++++++++++++-
 include/blast/math/simd/SimdVecBase.hpp       |  10 +-
 .../blast/math/simd/avx256/SimdVecFloat32.hpp | 386 ++++++++----------
 .../blast/math/simd/avx256/SimdVecFloat64.hpp | 317 ++++++--------
 6 files changed, 591 insertions(+), 419 deletions(-)

diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
index a1b857cc..ed59f67e 100644
--- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
@@ -268,7 +268,7 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a DynamicRegisterMatrix");
 
-        IntrinsicType v_[RM][RN];
+        SimdVecType v_[RM][RN];
         size_t const m_;
         size_t const n_;
 
@@ -598,4 +598,4 @@ namespace blast
         ker.axpy(beta, c);
         ker.store(d);
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp
index d4309043..68641769 100644
--- a/include/blast/math/register_matrix/RegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/RegisterMatrix.hpp
@@ -378,7 +378,7 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a RegisterMatrix");
 
-        IntrinsicType v_[RM][RN];
+        SimdVecType v_[RM][RN];
 
 
         /// @brief Reference to the matrix element at row \a i and column \a j
@@ -825,4 +825,4 @@ namespace blast
     {
         return rm == m;
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index f65969f9..8568961e 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -14,16 +14,301 @@
 
 #pragma once
 
+#include <blast/math/simd/SimdVecBase.hpp>
+
 #include <xsimd/xsimd.hpp>
 
+#include <tuple>
+
 
 namespace blast
 {
+    template <typename T, typename Arch = xsimd::default_arch>
+    class SimdVec;
+
+
+    /**
+    * @brief Fused multiply-add
+    *
+    * Calculate a * b + c
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    * @param c addendum
+    *
+    * @return @a a * @a b + @a c element-wise
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> fmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept;
+
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, typename SimdVec<T, Arch>::MaskType mask) noexcept;
+
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> abs(SimdVec<T, Arch> const& a) noexcept;
+
+    /**
+    * @brief Vertical max (across two vectors)
+    *
+    * @param a first vector
+    * @param b second vector
+    *
+    * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> max(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Horizontal max (across all elements)
+    *
+    * The implementation is based on
+    * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
+    *
+    * @param a pack of 32-bit floating point numbers
+    *
+    * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7])
+    */
+    template <typename T, typename Arch>
+    T max(SimdVec<T, Arch> const& x) noexcept;
+
+    template <typename T, typename Arch>
+    SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Multiplication
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+
+    /**
+    * @brief Left multiplication with a scalar
+    *
+    * @param a scalar multiplier
+    * @param b batch multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(T const& a, SimdVec<T, Arch> const& b) noexcept;
+
+
+    /**
+    * @brief Right multiplication with a scalar
+    *
+    * @param a batch multiplier
+    * @param b scalar multiplier
+    *
+    * @return product @a a * @a b
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, T const& b) noexcept;
+
+
     /**
      * @brief Data-parallel type with a given element type.
      *
      * @tparam T element type
      */
-    template <typename T, typename Arch = xsimd::default_arch>
-    class SimdVec;
+    template <typename T, typename Arch>
+    class SimdVec
+    :   public SimdVecBase<T, Arch>
+    {
+    public:
+        using typename SimdVecBase<T, Arch>::ValueType;
+        using typename SimdVecBase<T, Arch>::IntrinsicType;
+        using typename SimdVecBase<T, Arch>::XSimdType;
+        using MaskType = xsimd::batch_bool<T, Arch>;
+
+
+        /**
+         * @brief Set to [0, 0, 0, ...]
+         */
+        SimdVec() noexcept
+        :   SimdVecBase<T, Arch> {T {}}
+        {
+        }
+
+
+        SimdVec(SimdVec const&) noexcept = default;
+
+
+        SimdVec(IntrinsicType value) noexcept
+        :   SimdVecBase<T, Arch> {value}
+        {
+        }
+
+
+        SimdVec(XSimdType value) noexcept
+        :   SimdVecBase<T, Arch> {value}
+        {
+        }
+
+
+        /**
+         * @brief Set to [value, value, ...]
+         *
+         * @param value value for each component of SIMD vector
+         */
+        SimdVec(ValueType value) noexcept
+        :   SimdVecBase<T, Arch> {value}
+        {
+        }
+
+
+        /**
+         * @brief Load from location
+         *
+         * @param src memory location to load from
+         * @param aligned true indicates that an aligned read instruction should be used
+         */
+        explicit SimdVec(ValueType const * src, bool aligned) noexcept
+        :   SimdVecBase<T, Arch> {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)}
+        {
+        }
+
+
+        /**
+         * @brief Masked load from location
+         *
+         * @param src memory location to load from
+         * @param mask load mask
+         * @param aligned true if @a src is SIMD-aligned
+         */
+        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept;
+
+
+        /**
+         * @brief Store to memory
+         *
+         * @param dst memory location to store to
+         * @param aligned true if @a dst is SIMD-aligned
+         */
+        void store(ValueType * dst, bool aligned) const noexcept
+        {
+            if (aligned)
+                xsimd::store_aligned(dst, this->value_);
+            else
+                xsimd::store_unaligned(dst, this->value_);
+        }
+
+
+        /**
+         * @brief Masked store to memory
+         *
+         * @param dst memory location to store to
+         * @param mask store mask
+         * @param aligned true if @a dst is SIMD-aligned
+         */
+        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept;
+
+
+        /**
+         * @brief In-place multiplication
+         *
+         * @param a multiplier
+         *
+         * @return @a *this after multiplication with @a a
+         */
+        SimdVec& operator*=(SimdVec const& a) noexcept
+        {
+            this->value_ *= a.value_;
+            return *this;
+        }
+
+
+        /**
+         * @brief In-place division
+         *
+         * @param a divisor
+         *
+         * @return @a *this after division by @a a
+         */
+        SimdVec& operator/=(SimdVec const& a) noexcept
+        {
+            this->value_ /= a.value_;
+            return *this;
+        }
+
+
+        friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
+        friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept;
+        friend SimdVec abs<>(SimdVec const& a) noexcept;
+        friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend ValueType max<>(SimdVec const& x) noexcept;
+        friend MaskType operator><>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(SimdVec const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept;
+        friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept;
+
+        template <typename Index>
+        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& v1, SimdVec<Index, xsimd::avx2> const& idx);
+    };
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> fmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept
+    {
+        return xsimd::fma(a.value_, b.value_, c.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, typename SimdVec<T, Arch>::MaskType mask) noexcept
+    {
+        return xsimd::select(mask, a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> abs(SimdVec<T, Arch> const& a) noexcept
+    {
+        return xsimd::abs(a.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::gt(a.value_, b.value_);
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return a.value_ * b.value_;
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(T const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return SimdVec<T, Arch> {a} * b;
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, T const& b) noexcept
+    {
+        return a * SimdVec<T, Arch> {b};
+    }
+
+
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> max(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    {
+        return xsimd::max(a, b);
+    }
+
+
+    template <typename T, typename Arch>
+    inline T max(SimdVec<T, Arch> const& x) noexcept
+    {
+        return xsimd::reduce_max(x.value_);
+    }
 }
diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp
index 4ee90c5a..53ef4ba4 100644
--- a/include/blast/math/simd/SimdVecBase.hpp
+++ b/include/blast/math/simd/SimdVecBase.hpp
@@ -26,10 +26,9 @@ namespace blast
     template <typename T, typename Arch>
     class SimdVecBase
     {
-        using XSimdType = xsimd::batch<T, Arch>;
-
     public:
         using ValueType = T;
+        using XSimdType = xsimd::batch<T, Arch>;
         using IntrinsicType = typename XSimdType::register_type;
 
 
@@ -67,6 +66,13 @@ namespace blast
         {
         }
 
+
+        SimdVecBase(ValueType const& value)
+        :   value_ {value}
+        {
+        }
+
+
         XSimdType value_;
     };
 }
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index 7e32e985..bcaeb281 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -26,225 +26,171 @@
 namespace blast
 {
     template <>
-    class SimdVec<float, xsimd::avx2>
-    :   public SimdVecBase<float, xsimd::avx2>
+    inline SimdVec<float, xsimd::avx2>::SimdVec() noexcept
+    :   SimdVecBase {_mm256_setzero_ps()}
     {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_ps()}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        /**
-         * @brief Set to [value, value, ...]
-         *
-         * TODO: add "explicit"
-         *
-         * @param value value for each component of SIMD vector
-         */
-        SimdVec(ValueType value) noexcept
-        :   SimdVecBase {_mm256_set1_ps(value)}
-        {
-        }
-
-
-        /**
-         * @brief Load from location
-         *
-         * @param src memory location to load from
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, bool aligned) noexcept
-        :   SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
-        {
-        }
-
-
-        /**
-         * @brief Masked load from location
-         *
-         * @param src memory location to load from
-         * @param mask load mask
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-        :   SimdVecBase {_mm256_maskload_ps(src, mask)}
-        {
-        }
-
-
-        /**
-         * @brief Store to memory
-         *
-         * @param dst memory location to store to
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, bool aligned) const noexcept
-        {
-            if (aligned)
-                _mm256_store_ps(dst, value_);
-            else
-                _mm256_storeu_ps(dst, value_);
-        }
-
-
-        /**
-         * @brief Masked store to memory
-         *
-         * @param dst memory location to store to
-         * @param mask store mask
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-        {
-            _mm256_maskstore_ps(dst, mask, value_);
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ));
-        }
-
-
-        /**
-         * @brief Multiplication
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         *
-         * @return product @a a * @a b
-         */
-        friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_mul_ps(a.value_, b.value_);
-        }
-
-
-        /**
-         * @brief Fused multiply-add
-         *
-         * Calculate a * b + c
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         * @param c addendum
-         *
-         * @return @a a * @a b + @a c element-wise
-         */
-        friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept
-        {
-            return _mm256_fmadd_ps(a.value_, b.value_, c.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask));
-        }
-
-
-        friend SimdVec abs(SimdVec const& a) noexcept
-        {
-            return _mm256_andnot_ps(SimdVec {-0.f}, a.value_);
-        }
-
-
-        /**
-        * @brief Vertical max (across two vectors)
-        *
-        * @param a first vector
-        * @param b second vector
-        *
-        * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
-        */
-        friend SimdVec max(SimdVec const& a, SimdVec const& b)
-        {
-            return _mm256_max_ps(a, b);
-        }
-
-
-        /**
-        * @brief Horizontal max (across all elements)
-        *
-        * The implementation is based on
-        * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
-        *
-        * @param a pack of 32-bit floating point numbers
-        *
-        * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7])
-        */
-        friend ValueType max(SimdVec const& x)
-        {
-            __m256 v1 = x.value_;                                          /* v1 = [H G F E | D C B A]                                                                         */
-            __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01);       /* v2 = [G H E F | C D A B]                                                                         */
-            __m256 v3 = _mm256_max_ps(v1, v2);                      /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-                                                                    /* v3 = [W W Z Z | Y Y X X]                                                                         */
-            __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10);       /* v4 = [Z Z W W | X X Y Y]                                                                         */
-            __m256 v5 = _mm256_max_ps(v3, v4);                      /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-                                                                    /* v5 = [J J J J | I I I I]                                                                         */
-            __m128 v6 = _mm256_extractf128_ps(v5, 1);               /* v6 = [- - - - | J J J J]                                                                         */
-            __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)]                                     */
-
-            return v7[0];
-        }
-
-
-        template <typename Index>
-        requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& v1, SimdVec<Index, xsimd::avx2> const& idx)
-        {
-            /* v2 = [G H E F | C D A B]                                                                         */
-            SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
-            SimdVec<Index, xsimd::avx2> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
-
-            /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-            /* v3 = [W W Z Z | Y Y X X]                                                                         */
-            // __m256 v3 = _mm256_max_ps(v1, v2);
-            MaskType const mask_v3 = v2 > v1;
-            SimdVec const v3 = blend(v1, v2, mask_v3);
-            SimdVec<Index, xsimd::avx2> const iv3 = blend(idx, iv2, mask_v3);
-
-            /* v4 = [Z Z W W | X X Y Y]                                                                         */
-            SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
-            SimdVec<Index, xsimd::avx2> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
-
-            /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-            /* v5 = [J J J J | I I I I]                                                                         */
-            // __m256 v5 = _mm256_max_ps(v3, v4);
-            MaskType const mask_v5 = v4 > v3;
-            SimdVec const v5 = blend(v3, v4, mask_v5);
-            SimdVec<Index, xsimd::avx2> const iv5 = blend(iv3, iv4, mask_v5);
-
-            /* v6 = [I I I I | J J J J]                                                                         */
-            SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
-            SimdVec<Index, xsimd::avx2> const iv6 = _mm256_castps_si256(
-                _mm256_permute2f128_ps(
-                    _mm256_castsi256_ps(iv5),
-                    _mm256_castsi256_ps(iv5),
-                    0b0000'0001
-                )
-            );
-
-            /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
-            // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
-            MaskType const mask_v7 = v6 > v5;
-            SimdVec const v7 = blend(v5, v6, mask_v7);
-            SimdVec<Index, xsimd::avx2> const iv7 = blend(iv5, iv6, mask_v7);
-
-            return std::make_tuple(v7, iv7);
-        }
-    };
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
+    :   SimdVecBase {value}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType value) noexcept
+    :   SimdVecBase {_mm256_set1_ps(value)}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
+    :   SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
+    :   SimdVecBase {_mm256_maskload_ps(src, mask)}
+    {
+    }
+
+
+    template <>
+    inline void SimdVec<float, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
+    {
+        if (aligned)
+            _mm256_store_ps(dst, value_);
+        else
+            _mm256_storeu_ps(dst, value_);
+    }
+
+
+    template <>
+    inline void SimdVec<float, xsimd::avx2>::store(ValueType * dst, MaskType mask, bool aligned) const noexcept
+    {
+        _mm256_maskstore_ps(dst, mask, value_);
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2>::MaskType operator>(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ));
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2> operator*(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_mul_ps(a.value_, b.value_);
+    }
+
+
+    /*
+    NOTE: without this specialization some tests fail, e.g.:
+
+    [ RUN      ] RegisterMatrixTest/5.testPartialGerNT @ ./test/blast/math/simd/RegisterMatrixTest.cpp:386
+    │ Failure @ ./test/blast/math/simd/RegisterMatrixTest.cpp:420
+    │ │ Expected equality of these values:
+    │ │   ker(i, j)
+    │ │     Which is: 0.482636
+    │ │   i < m && j < n ? D_ref(i, j) : 0.
+    │ │     Which is: 0.482636
+    │ element mismatch at (0, 1), store size = 1x2
+    [  FAILED  ] RegisterMatrixTest/5.testPartialGerNT, where TypeParam = blast::RegisterMatrix<float, 8ul, 4ul, true> (1 ms)
+    */
+    template <>
+    inline SimdVec<float, xsimd::avx2> fmadd(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2> const& c) noexcept
+    {
+        return _mm256_fmadd_ps(a.value_, b.value_, c.value_);
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2> blend(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2>::MaskType mask) noexcept
+    {
+        return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask));
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2> abs(SimdVec<float, xsimd::avx2> const& a) noexcept
+    {
+        return _mm256_andnot_ps(SimdVec<float, xsimd::avx2> {-0.f}, a.value_);
+    }
+
+
+    template <>
+    inline SimdVec<float, xsimd::avx2> max(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_max_ps(a, b);
+    }
+
+
+    template <>
+    inline float max(SimdVec<float, xsimd::avx2> const& x) noexcept
+    {
+        __m256 v1 = x.value_;                                          /* v1 = [H G F E | D C B A]                                                                         */
+        __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01);       /* v2 = [G H E F | C D A B]                                                                         */
+        __m256 v3 = _mm256_max_ps(v1, v2);                      /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
+                                                                /* v3 = [W W Z Z | Y Y X X]                                                                         */
+        __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10);       /* v4 = [Z Z W W | X X Y Y]                                                                         */
+        __m256 v5 = _mm256_max_ps(v3, v4);                      /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
+                                                                /* v5 = [J J J J | I I I I]                                                                         */
+        __m128 v6 = _mm256_extractf128_ps(v5, 1);               /* v6 = [- - - - | J J J J]                                                                         */
+        __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)]                                     */
+
+        return v7[0];
+    }
+
+
+    template <typename Index>
+    requires (SimdSize_v<Index> == SimdVec<float, xsimd::avx2>::size())
+    inline std::tuple<SimdVec<float, xsimd::avx2>, SimdVec<Index, xsimd::avx2>> imax(SimdVec<float, xsimd::avx2> const& v1, SimdVec<Index, xsimd::avx2> const& idx) noexcept
+    {
+        /* v2 = [G H E F | C D A B]                                                                         */
+        SimdVec<float, xsimd::avx2> const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
+        SimdVec<Index, xsimd::avx2> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
+
+        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
+        /* v3 = [W W Z Z | Y Y X X]                                                                         */
+        // __m256 v3 = _mm256_max_ps(v1, v2);
+        SimdVec<float, xsimd::avx2>::MaskType const mask_v3 = v2 > v1;
+        SimdVec<float, xsimd::avx2> const v3 = blend(v1, v2, mask_v3);
+        SimdVec<Index, xsimd::avx2> const iv3 = blend(idx, iv2, mask_v3);
+
+        /* v4 = [Z Z W W | X X Y Y]                                                                         */
+        SimdVec<float, xsimd::avx2> const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
+        SimdVec<Index, xsimd::avx2> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
+
+        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
+        /* v5 = [J J J J | I I I I]                                                                         */
+        // __m256 v5 = _mm256_max_ps(v3, v4);
+        SimdVec<float, xsimd::avx2>::MaskType const mask_v5 = v4 > v3;
+        SimdVec<float, xsimd::avx2> const v5 = blend(v3, v4, mask_v5);
+        SimdVec<Index, xsimd::avx2> const iv5 = blend(iv3, iv4, mask_v5);
+
+        /* v6 = [I I I I | J J J J]                                                                         */
+        SimdVec<float, xsimd::avx2> const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
+        SimdVec<Index, xsimd::avx2> const iv6 = _mm256_castps_si256(
+            _mm256_permute2f128_ps(
+                _mm256_castsi256_ps(iv5),
+                _mm256_castsi256_ps(iv5),
+                0b0000'0001
+            )
+        );
+
+        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
+        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
+        SimdVec<float, xsimd::avx2>::MaskType const mask_v7 = v6 > v5;
+        SimdVec<float, xsimd::avx2> const v7 = blend(v5, v6, mask_v7);
+        SimdVec<Index, xsimd::avx2> const iv7 = blend(iv5, iv6, mask_v7);
+
+        return std::make_tuple(v7, iv7);
+    }
 }
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index a17b36e1..52a3ed63 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -26,196 +26,131 @@
 namespace blast
 {
     template <>
-    class SimdVec<double, xsimd::avx2>
-    :   public SimdVecBase<double, xsimd::avx2>
+    inline SimdVec<double, xsimd::avx2>::SimdVec() noexcept
+    :   SimdVecBase {_mm256_setzero_pd()}
     {
-    public:
-        using MaskType = __m256i;
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_pd()}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
-        {
-        }
-
-
-        /**
-         * @brief Set to [value, value, ...]
-         *
-         * TODO: add "explicit"
-         *
-         * @param value value for each component of SIMD vector
-         */
-        SimdVec(ValueType value) noexcept
-        :   SimdVecBase {_mm256_set1_pd(value)}
-        {
-        }
-
-
-        /**
-         * @brief Load from location
-         *
-         * @param src memory location to load from
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, bool aligned) noexcept
-        :   SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
-        {
-        }
-
-
-        /**
-         * @brief Masked load from location
-         *
-         * @param src memory location to load from
-         * @param mask load mask
-         * @param aligned true if @a src is SIMD-aligned
-         */
-        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-        :   SimdVecBase {_mm256_maskload_pd(src, mask)}
-        {
-        }
-
-
-        /**
-         * @brief Store to memory
-         *
-         * @param dst memory location to store to
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, bool aligned) const noexcept
-        {
-            if (aligned)
-                _mm256_store_pd(dst, value_);
-            else
-                _mm256_storeu_pd(dst, value_);
-        }
-
-
-        /**
-         * @brief Masked store to memory
-         *
-         * @param dst memory location to store to
-         * @param mask store mask
-         * @param aligned true if @a dst is SIMD-aligned
-         */
-        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-        {
-            _mm256_maskstore_pd(dst, mask, value_);
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ));
-        }
-
-
-        /**
-         * @brief Multiplication
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         *
-         * @return product @a a * @a b
-         */
-        friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_mul_pd(a.value_, b.value_);
-        }
-
-
-        /**
-         * @brief Fused multiply-add
-         *
-         * Calculate a * b + c
-         *
-         * @param a first multiplier
-         * @param b second multiplier
-         * @param c addendum
-         *
-         * @return @a a * @a b + @a c element-wise
-         */
-        friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept
-        {
-            return _mm256_fmadd_pd(a.value_, b.value_, c.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask));
-        }
-
-
-        friend SimdVec abs(SimdVec const& a) noexcept
-        {
-            return _mm256_andnot_pd(SimdVec {-0.}, a.value_);
-        }
-
-
-        /**
-        * @brief Vertical max (across two vectors)
-        *
-        * @param a first vector
-        * @param b second vector
-        *
-        * @return [max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)]
-        */
-        friend SimdVec max(SimdVec const& a, SimdVec const& b)
-        {
-            return _mm256_max_pd(a, b);
-        }
-
-
-        /**
-        * @brief Horizontal max (across all alements)
-        *
-        * The implementation is based on
-        * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
-        *
-        * @return max(x[0], x[1], x[2], x[3])
-        */
-        friend ValueType max(SimdVec x) noexcept
-        {
-            __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-            __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-            __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-            __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-
-            return m[0];
-        }
-
-
-        template <typename Index>
-        requires (SimdSize_v<Index> == size())
-        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& x, SimdVec<Index, xsimd::avx2> const& idx)
-        {
-            SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-            SimdVec<Index, xsimd::avx2> const iy = _mm256_permute2f128_si256(idx, idx, 1);
-
-            // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-            MaskType const mask_m1 = y > x;
-            SimdVec const m1 = blend(x, y, mask_m1);
-            SimdVec<Index, xsimd::avx2> const im1 = blend(idx, iy, mask_m1);
-
-            SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-            SimdVec<Index, xsimd::avx2> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
-
-            // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-            MaskType const mask_m = m2 > m1;
-            SimdVec const m = blend(m1, m2, mask_m);
-            SimdVec<Index, xsimd::avx2> const im = blend(im1, im2, mask_m);
-
-            return std::make_tuple(m, im);
-        }
-    };
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
+    :   SimdVecBase {value}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType value) noexcept
+    :   SimdVecBase {_mm256_set1_pd(value)}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
+    :   SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
+    {
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
+    :   SimdVecBase {_mm256_maskload_pd(src, mask)}
+    {
+    }
+
+
+    template <>
+    inline void SimdVec<double, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
+    {
+        if (aligned)
+            _mm256_store_pd(dst, value_);
+        else
+            _mm256_storeu_pd(dst, value_);
+    }
+
+
+    template <>
+    inline void SimdVec<double, xsimd::avx2>::store(ValueType * dst, MaskType mask, bool aligned) const noexcept
+    {
+        _mm256_maskstore_pd(dst, mask, value_);
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2>::MaskType operator>(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ));
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2> operator*(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_mul_pd(a.value_, b.value_);
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2> fmadd(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2> const& c) noexcept
+    {
+        return _mm256_fmadd_pd(a.value_, b.value_, c.value_);
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2> blend(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2>::MaskType mask) noexcept
+    {
+        return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask));
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2> abs(SimdVec<double, xsimd::avx2> const& a) noexcept
+    {
+        return _mm256_andnot_pd(_mm256_set1_pd(-0.), a.value_);
+    }
+
+
+    template <>
+    inline SimdVec<double, xsimd::avx2> max(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
+    {
+        return _mm256_max_pd(a, b);
+    }
+
+
+    template <>
+    inline double max(SimdVec<double, xsimd::avx2> const& x) noexcept
+    {
+        __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
+        __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
+        __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
+        __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
+
+        return m[0];
+    }
+
+
+    template <typename Index>
+    requires (SimdSize_v<Index> == SimdVec<double, xsimd::avx2>::size())
+    inline std::tuple<SimdVec<double, xsimd::avx2>, SimdVec<Index, xsimd::avx2>> imax(SimdVec<double, xsimd::avx2> const& x, SimdVec<Index, xsimd::avx2> const& idx) noexcept
+    {
+        SimdVec<double, xsimd::avx2> const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
+        SimdVec<Index, xsimd::avx2> const iy = _mm256_permute2f128_si256(idx, idx, 1);
+
+        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
+        SimdVec<double, xsimd::avx2>::MaskType const mask_m1 = y > x;
+        SimdVec const m1 = blend(x, y, mask_m1);
+        SimdVec<Index, xsimd::avx2> const im1 = blend(idx, iy, mask_m1);
+
+        SimdVec<double, xsimd::avx2> const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
+        SimdVec<Index, xsimd::avx2> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
+
+        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
+        SimdVec<double, xsimd::avx2>::MaskType const mask_m = m2 > m1;
+        SimdVec const m = blend(m1, m2, mask_m);
+        SimdVec<Index, xsimd::avx2> const im = blend(im1, im2, mask_m);
+
+        return std::make_tuple(m, im);
+    }
 }

From 39856a3ceaebded1697232be1df232f60aaf63d8 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 26 Jul 2024 15:01:57 +0200
Subject: [PATCH 06/27] Install xsimd step

---
 .github/workflows/cmake.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 9c1ace7c..37a4eb65 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -36,6 +36,13 @@ jobs:
         -DCMAKE_INSTALL_PREFIX=/usr/local/ . \
         && sudo make install
 
+    - name: Install xsimd
+      run: |
+        git clone https://github.com/xtensor-stack/xsimd.git
+        cd xsimd \
+        && cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . \
+        && sudo cmake --build build --target install
+
     - name: Install GTest
       run: |
         git clone https://github.com/google/googletest.git
@@ -60,4 +67,3 @@ jobs:
       # Execute tests defined by the CMake configuration.
       # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
       run: ctest -C ${{env.BUILD_TYPE}}
-

From 7aa0563573c658e69b5d148ce93115ac6204bd65 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 26 Jul 2024 15:11:07 +0200
Subject: [PATCH 07/27] Add missing "typename " keyword

---
 include/blast/math/simd/SimdVec.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 8568961e..be6cb57c 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -72,7 +72,7 @@ namespace blast
     T max(SimdVec<T, Arch> const& x) noexcept;
 
     template <typename T, typename Arch>
-    SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+    typename SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
 
     /**
     * @brief Multiplication
@@ -272,7 +272,7 @@ namespace blast
 
 
     template <typename T, typename Arch>
-    inline SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    inline typename SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
     {
         return xsimd::gt(a.value_, b.value_);
     }

From d663683e86eb0cc33dcaae6abdb6de2f3ccb2482 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 26 Jul 2024 15:11:39 +0200
Subject: [PATCH 08/27] Update "Build xsimd" step

---
 .github/workflows/cmake.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 37a4eb65..2df1b1d3 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -39,9 +39,9 @@ jobs:
     - name: Install xsimd
       run: |
         git clone https://github.com/xtensor-stack/xsimd.git
-        cd xsimd \
-        && cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . \
-        && sudo cmake --build build --target install
+        cd xsimd
+        cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ .
+        sudo cmake --build build --target install
 
     - name: Install GTest
       run: |

From 8b9124bacadf20d102bf6df369d10ff98addb647 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Aug 2024 13:09:59 +0200
Subject: [PATCH 09/27] Simd<> specializations working for both avx2 and
 fma3<avx2>

---
 include/blast/math/simd/Simd.hpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
index 841f7571..6f4ca1d2 100644
--- a/include/blast/math/simd/Simd.hpp
+++ b/include/blast/math/simd/Simd.hpp
@@ -10,6 +10,7 @@
 #include <blaze/system/Inline.h>
 #include <blaze/math/AlignmentFlag.h>
 
+#include <type_traits>
 #include <xsimd/xsimd.hpp>
 
 #include <immintrin.h>
@@ -24,8 +25,9 @@ namespace blast
     struct Simd;
 
 
-    template <>
-    struct Simd<double, xsimd::avx2>
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    struct Simd<double, Arch>
     {
     private:
         class Index;
@@ -85,8 +87,9 @@ namespace blast
     };
 
 
-    template <>
-    struct Simd<float, xsimd::avx2>
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    struct Simd<float, Arch>
     {
     private:
         class Index;

From 17521ea0db48bf2e8b3419edfd2adf44b33d916f Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Aug 2024 13:21:49 +0200
Subject: [PATCH 10/27] Remove XSIMD_DEFAULT_ARCH setting from CMakeLists.txt

---
 CMakeLists.txt | 4 ----
 Dockerfile     | 3 +--
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa1db7b4..3b0ad928 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,10 +46,6 @@ target_compile_options(blast
     INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4"
 )
 
-target_compile_definitions(blast
-    INTERFACE "XSIMD_DEFAULT_ARCH=avx2"
-)
-
 # BLAST_WITH_BLASFEO
 option(BLAST_WITH_BLASFEO "Build blasfeo C++ interface")
 
diff --git a/Dockerfile b/Dockerfile
index 1ccfab49..598aebdd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib
 RUN mkdir -p blast/build && cd blast/build \
     && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \
         -DCMAKE_CXX_COMPILER="clang++-15" \
-        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \
+        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH=avx2" \
         -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \
         -DBLAST_WITH_TEST=ON \
         -DBLAST_WITH_BENCHMARK=ON \
@@ -79,4 +79,3 @@ CMD mkdir -p blast/bench_result/data \
     && mkdir -p blast/bench_result/image \
     && cd blast \
     && make -j 1 bench_result/image/dgemm_performance.png bench_result/image/dgemm_performance_ratio.png
-

From c3adc2a9e8c6fd88830603c20c7ec97f81adaec2 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Aug 2024 18:19:46 +0200
Subject: [PATCH 11/27] Code compiles for both avx2 and fma3<avx2>

---
 include/blast/math/simd/SimdVec.hpp           | 121 +++++++++++++++++-
 .../blast/math/simd/avx256/SimdVecFloat32.hpp |  60 ---------
 .../blast/math/simd/avx256/SimdVecFloat64.hpp |  38 ------
 .../blast/math/simd/avx256/SimdVecInt32.hpp   |  21 +--
 .../blast/math/simd/avx256/SimdVecInt64.hpp   |  23 ++--
 5 files changed, 143 insertions(+), 120 deletions(-)

diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index be6cb57c..69d1fa0b 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -27,6 +27,111 @@ namespace blast
     class SimdVec;
 
 
+    template <typename Arch, typename T>
+    xsimd::batch<T, Arch> maskload(T const * src, xsimd::batch_bool<T, Arch> const& mask) noexcept;
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<float, Arch> maskload(float const * src, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_ps(src, mask);
+    }
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<double, Arch> maskload(double const * src, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_pd(src, mask);
+    }
+
+    template <typename Arch, typename T>
+    void maskstore(xsimd::batch<T, Arch> const& v, T * dst, xsimd::batch_bool<T, Arch> const& mask) noexcept;
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_ps(dst, mask, v);
+    }
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_pd(dst, mask, v);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<float, Arch>, xsimd::batch<std::int32_t, Arch>> imax(xsimd::batch<float, Arch> const& v1, xsimd::batch<std::int32_t, Arch> const& idx) noexcept
+    {
+        /* v2 = [G H E F | C D A B]                                                                         */
+        __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
+        __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
+
+        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
+        /* v3 = [W W Z Z | Y Y X X]                                                                         */
+        // __m256 v3 = _mm256_max_ps(v1, v2);
+        __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ);
+        __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3);
+        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, mask_v3);
+
+        /* v4 = [Z Z W W | X X Y Y]                                                                         */
+        __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
+        __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
+
+        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
+        /* v5 = [J J J J | I I I I]                                                                         */
+        // __m256 v5 = _mm256_max_ps(v3, v4);
+        __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ);
+        __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5);
+        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, mask_v5);
+
+        /* v6 = [I I I I | J J J J]                                                                         */
+        __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
+        __m256i const iv6 = _mm256_castps_si256(
+            _mm256_permute2f128_ps(
+                _mm256_castsi256_ps(iv5),
+                _mm256_castsi256_ps(iv5),
+                0b0000'0001
+            )
+        );
+
+        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
+        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
+        __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ);
+        __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7);
+        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, mask_v7);
+
+        return {v7, iv7};
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<double, Arch>, xsimd::batch<std::int64_t, Arch>> imax(xsimd::batch<double, Arch> const& x, xsimd::batch<std::int64_t, Arch> const& idx) noexcept
+    {
+        __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
+        __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1);
+
+        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
+        __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ);
+        __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1);
+        __m256i const im1 = _mm256_blendv_epi8(idx, iy, mask_m1);
+
+        __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
+        __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
+
+        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
+        __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ);
+        __m256d const m = _mm256_blendv_pd(m1, m2, mask_m);
+        __m256i const im = _mm256_blendv_epi8(im1, im2, mask_m);
+
+        return {m, im};
+    }
+
+
     /**
     * @brief Fused multiply-add
     *
@@ -179,7 +284,10 @@ namespace blast
          * @param mask load mask
          * @param aligned true if @a src is SIMD-aligned
          */
-        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept;
+        explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
+        :   SimdVecBase<T, Arch> {maskload(src, mask)}
+        {
+        }
 
 
         /**
@@ -204,7 +312,10 @@ namespace blast
          * @param mask store mask
          * @param aligned true if @a dst is SIMD-aligned
          */
-        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept;
+        void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
+        {
+            maskstore(this->value_, dst, mask);
+        }
 
 
         /**
@@ -246,7 +357,11 @@ namespace blast
         friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept;
 
         template <typename Index>
-        friend std::tuple<SimdVec, SimdVec<Index, xsimd::avx2>> imax(SimdVec const& v1, SimdVec<Index, xsimd::avx2> const& idx);
+        friend std::tuple<SimdVec, SimdVec<Index, Arch>> imax(SimdVec const& v1, SimdVec<Index, Arch> const& idx) noexcept
+        {
+            auto const t = imax(v1.value_, xsimd::batch<Index, Arch>(idx));
+            return {get<0>(t), SimdVec<Index, Arch>(get<1>(t))};
+        }
     };
 
 
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index bcaeb281..4a8a97f4 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -53,13 +53,6 @@ namespace blast
     }
 
 
-    template <>
-    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-    :   SimdVecBase {_mm256_maskload_ps(src, mask)}
-    {
-    }
-
-
     template <>
     inline void SimdVec<float, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
     {
@@ -70,13 +63,6 @@ namespace blast
     }
 
 
-    template <>
-    inline void SimdVec<float, xsimd::avx2>::store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-    {
-        _mm256_maskstore_ps(dst, mask, value_);
-    }
-
-
     template <>
     inline SimdVec<float, xsimd::avx2>::MaskType operator>(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
     {
@@ -147,50 +133,4 @@ namespace blast
 
         return v7[0];
     }
-
-
-    template <typename Index>
-    requires (SimdSize_v<Index> == SimdVec<float, xsimd::avx2>::size())
-    inline std::tuple<SimdVec<float, xsimd::avx2>, SimdVec<Index, xsimd::avx2>> imax(SimdVec<float, xsimd::avx2> const& v1, SimdVec<Index, xsimd::avx2> const& idx) noexcept
-    {
-        /* v2 = [G H E F | C D A B]                                                                         */
-        SimdVec<float, xsimd::avx2> const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
-        SimdVec<Index, xsimd::avx2> const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
-
-        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-        /* v3 = [W W Z Z | Y Y X X]                                                                         */
-        // __m256 v3 = _mm256_max_ps(v1, v2);
-        SimdVec<float, xsimd::avx2>::MaskType const mask_v3 = v2 > v1;
-        SimdVec<float, xsimd::avx2> const v3 = blend(v1, v2, mask_v3);
-        SimdVec<Index, xsimd::avx2> const iv3 = blend(idx, iv2, mask_v3);
-
-        /* v4 = [Z Z W W | X X Y Y]                                                                         */
-        SimdVec<float, xsimd::avx2> const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
-        SimdVec<Index, xsimd::avx2> const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
-
-        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-        /* v5 = [J J J J | I I I I]                                                                         */
-        // __m256 v5 = _mm256_max_ps(v3, v4);
-        SimdVec<float, xsimd::avx2>::MaskType const mask_v5 = v4 > v3;
-        SimdVec<float, xsimd::avx2> const v5 = blend(v3, v4, mask_v5);
-        SimdVec<Index, xsimd::avx2> const iv5 = blend(iv3, iv4, mask_v5);
-
-        /* v6 = [I I I I | J J J J]                                                                         */
-        SimdVec<float, xsimd::avx2> const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
-        SimdVec<Index, xsimd::avx2> const iv6 = _mm256_castps_si256(
-            _mm256_permute2f128_ps(
-                _mm256_castsi256_ps(iv5),
-                _mm256_castsi256_ps(iv5),
-                0b0000'0001
-            )
-        );
-
-        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
-        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
-        SimdVec<float, xsimd::avx2>::MaskType const mask_v7 = v6 > v5;
-        SimdVec<float, xsimd::avx2> const v7 = blend(v5, v6, mask_v7);
-        SimdVec<Index, xsimd::avx2> const iv7 = blend(iv5, iv6, mask_v7);
-
-        return std::make_tuple(v7, iv7);
-    }
 }
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index 52a3ed63..31e067fc 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -53,13 +53,6 @@ namespace blast
     }
 
 
-    template <>
-    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-    :   SimdVecBase {_mm256_maskload_pd(src, mask)}
-    {
-    }
-
-
     template <>
     inline void SimdVec<double, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
     {
@@ -70,13 +63,6 @@ namespace blast
     }
 
 
-    template <>
-    inline void SimdVec<double, xsimd::avx2>::store(ValueType * dst, MaskType mask, bool aligned) const noexcept
-    {
-        _mm256_maskstore_pd(dst, mask, value_);
-    }
-
-
     template <>
     inline SimdVec<double, xsimd::avx2>::MaskType operator>(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
     {
@@ -129,28 +115,4 @@ namespace blast
 
         return m[0];
     }
-
-
-    template <typename Index>
-    requires (SimdSize_v<Index> == SimdVec<double, xsimd::avx2>::size())
-    inline std::tuple<SimdVec<double, xsimd::avx2>, SimdVec<Index, xsimd::avx2>> imax(SimdVec<double, xsimd::avx2> const& x, SimdVec<Index, xsimd::avx2> const& idx) noexcept
-    {
-        SimdVec<double, xsimd::avx2> const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
-        SimdVec<Index, xsimd::avx2> const iy = _mm256_permute2f128_si256(idx, idx, 1);
-
-        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-        SimdVec<double, xsimd::avx2>::MaskType const mask_m1 = y > x;
-        SimdVec const m1 = blend(x, y, mask_m1);
-        SimdVec<Index, xsimd::avx2> const im1 = blend(idx, iy, mask_m1);
-
-        SimdVec<double, xsimd::avx2> const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-        SimdVec<Index, xsimd::avx2> const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
-
-        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-        SimdVec<double, xsimd::avx2>::MaskType const mask_m = m2 > m1;
-        SimdVec const m = blend(m1, m2, mask_m);
-        SimdVec<Index, xsimd::avx2> const im = blend(im1, im2, mask_m);
-
-        return std::make_tuple(m, im);
-    }
 }
diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp
index 15db3b4b..db995573 100644
--- a/include/blast/math/simd/avx256/SimdVecInt32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp
@@ -17,27 +17,30 @@
 #include <blast/math/simd/SimdVec.hpp>
 #include <blast/math/simd/SimdVecBase.hpp>
 #include <blast/math/simd/SequenceTag.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
 
 #include <immintrin.h>
 
 #include <cstdint>
+#include <type_traits>
 
 
 namespace blast
 {
-    template <>
-    class SimdVec<std::int32_t, xsimd::avx2>
-    :   public SimdVecBase<std::int32_t, xsimd::avx2>
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    class SimdVec<std::int32_t, Arch>
+    :   public SimdVecBase<std::int32_t, Arch>
     {
     public:
         using MaskType = __m256i;
+        using typename SimdVecBase<std::int32_t, Arch>::ValueType;
+        using typename SimdVecBase<std::int32_t, Arch>::IntrinsicType;
 
         /**
          * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0)
          */
         SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_si256()}
+        :   SimdVecBase<std::int32_t, Arch> {_mm256_setzero_si256()}
         {
         }
 
@@ -46,7 +49,7 @@ namespace blast
          * @brief Initialize to (a, a, a, a, a, a, a, a)
          */
         SimdVec(ValueType a) noexcept
-        :   SimdVecBase {_mm256_set1_epi32(a)}
+        :   SimdVecBase<std::int32_t, Arch> {_mm256_set1_epi32(a)}
         {
         }
 
@@ -68,20 +71,20 @@ namespace blast
          */
         SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4,
             ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)}
+        :   SimdVecBase<std::int32_t, Arch> {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)}
         {
         }
 
 
         SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
+        :   SimdVecBase<std::int32_t, Arch> {value}
         {
         }
 
 
         SimdVec& operator+=(ValueType x) noexcept
         {
-            value_ = _mm256_add_epi32(value_, _mm256_set1_epi32(x));
+            this->value_ = _mm256_add_epi32(this->value_, _mm256_set1_epi32(x));
             return *this;
         }
 
diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp
index f383016d..e2adae8e 100644
--- a/include/blast/math/simd/avx256/SimdVecInt64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp
@@ -17,27 +17,30 @@
 #include <blast/math/simd/SimdVec.hpp>
 #include <blast/math/simd/SimdVecBase.hpp>
 #include <blast/math/simd/SequenceTag.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
 
 #include <immintrin.h>
 
 #include <cstdint>
+#include <type_traits>
 
 
 namespace blast
 {
-    template <>
-    class SimdVec<std::int64_t, xsimd::avx2>
-    :   public SimdVecBase<std::int64_t, xsimd::avx2>
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    class SimdVec<std::int64_t, Arch>
+    :   public SimdVecBase<std::int64_t, Arch>
     {
     public:
         using MaskType = __m256i;
+        using typename SimdVecBase<std::int64_t, Arch>::ValueType;
+        using typename SimdVecBase<std::int64_t, Arch>::IntrinsicType;
 
         /**
          * @brief Initialize to (0, 0, 0, 0)
          */
         SimdVec() noexcept
-        :   SimdVecBase {_mm256_setzero_si256()}
+        :   SimdVecBase<std::int64_t, Arch> {_mm256_setzero_si256()}
         {
         }
 
@@ -46,7 +49,7 @@ namespace blast
          * @brief Initialize to (a, a, a, a)
          */
         SimdVec(ValueType a) noexcept
-        :   SimdVecBase {_mm256_set1_epi64x(a)}
+        :   SimdVecBase<std::int64_t, Arch> {_mm256_set1_epi64x(a)}
         {
         }
 
@@ -67,26 +70,26 @@ namespace blast
          * where a0 corresponds to the lower bits.
          */
         SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)}
+        :   SimdVecBase<std::int64_t, Arch> {_mm256_set_epi64x(a3, a2, a1, a0)}
         {
         }
 
 
         SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase {value}
+        :   SimdVecBase<std::int64_t, Arch> {value}
         {
         }
 
 
         operator IntrinsicType() const noexcept
         {
-            return value_;
+            return this->value_;
         }
 
 
         SimdVec& operator+=(ValueType x) noexcept
         {
-            value_ = _mm256_add_epi64(value_, _mm256_set1_epi64x(x));
+            this->value_ = _mm256_add_epi64(this->value_, _mm256_set1_epi64x(x));
             return *this;
         }
 

From c21776f14c14b57acbc879847310f7c402e37d0b Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Aug 2024 20:22:19 +0200
Subject: [PATCH 12/27] SimdVec decoupled from SimdVecBase

---
 include/blast/math/simd/SimdVec.hpp           | 62 ++++++++++++++-----
 .../blast/math/simd/avx256/SimdVecFloat32.hpp | 12 ++--
 .../blast/math/simd/avx256/SimdVecFloat64.hpp | 12 ++--
 3 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 69d1fa0b..554fcd00 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -14,8 +14,6 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVecBase.hpp>
-
 #include <xsimd/xsimd.hpp>
 
 #include <tuple>
@@ -221,12 +219,11 @@ namespace blast
      */
     template <typename T, typename Arch>
     class SimdVec
-    :   public SimdVecBase<T, Arch>
     {
     public:
-        using typename SimdVecBase<T, Arch>::ValueType;
-        using typename SimdVecBase<T, Arch>::IntrinsicType;
-        using typename SimdVecBase<T, Arch>::XSimdType;
+        using ValueType = T;
+        using XSimdType = xsimd::batch<T, Arch>;
+        using IntrinsicType = typename XSimdType::register_type;
         using MaskType = xsimd::batch_bool<T, Arch>;
 
 
@@ -234,7 +231,7 @@ namespace blast
          * @brief Set to [0, 0, 0, ...]
          */
         SimdVec() noexcept
-        :   SimdVecBase<T, Arch> {T {}}
+        :   value_ {T {}}
         {
         }
 
@@ -243,13 +240,13 @@ namespace blast
 
 
         SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase<T, Arch> {value}
+        :   value_ {value}
         {
         }
 
 
         SimdVec(XSimdType value) noexcept
-        :   SimdVecBase<T, Arch> {value}
+        :   value_ {value}
         {
         }
 
@@ -260,7 +257,7 @@ namespace blast
          * @param value value for each component of SIMD vector
          */
         SimdVec(ValueType value) noexcept
-        :   SimdVecBase<T, Arch> {value}
+        :   value_ {value}
         {
         }
 
@@ -272,7 +269,7 @@ namespace blast
          * @param aligned true indicates that an aligned read instruction should be used
          */
         explicit SimdVec(ValueType const * src, bool aligned) noexcept
-        :   SimdVecBase<T, Arch> {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)}
+        :   value_ {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)}
         {
         }
 
@@ -285,8 +282,36 @@ namespace blast
          * @param aligned true if @a src is SIMD-aligned
          */
         explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept
-        :   SimdVecBase<T, Arch> {maskload(src, mask)}
+        :   value_ {maskload(src, mask)}
+        {
+        }
+
+
+        /**
+         * @brief Number of elements in SIMD pack
+         */
+        static size_t constexpr size()
+        {
+            return XSimdType::size;
+        }
+
+
+        operator IntrinsicType() const noexcept
+        {
+            return value_;
+        }
+
+
+        /**
+         * @brief Access single element
+         *
+         * @param i element index
+         *
+         * @return element value
+         */
+        ValueType operator[](size_t i) const noexcept
         {
+            return value_.get(i);
         }
 
 
@@ -299,9 +324,9 @@ namespace blast
         void store(ValueType * dst, bool aligned) const noexcept
         {
             if (aligned)
-                xsimd::store_aligned(dst, this->value_);
+                xsimd::store_aligned(dst, value_);
             else
-                xsimd::store_unaligned(dst, this->value_);
+                xsimd::store_unaligned(dst, value_);
         }
 
 
@@ -314,7 +339,7 @@ namespace blast
          */
         void store(ValueType * dst, MaskType mask, bool aligned) const noexcept
         {
-            maskstore(this->value_, dst, mask);
+            maskstore(value_, dst, mask);
         }
 
 
@@ -327,7 +352,7 @@ namespace blast
          */
         SimdVec& operator*=(SimdVec const& a) noexcept
         {
-            this->value_ *= a.value_;
+            value_ *= a.value_;
             return *this;
         }
 
@@ -341,7 +366,7 @@ namespace blast
          */
         SimdVec& operator/=(SimdVec const& a) noexcept
         {
-            this->value_ /= a.value_;
+            value_ /= a.value_;
             return *this;
         }
 
@@ -362,6 +387,9 @@ namespace blast
             auto const t = imax(v1.value_, xsimd::batch<Index, Arch>(idx));
             return {get<0>(t), SimdVec<Index, Arch>(get<1>(t))};
         }
+
+    private:
+        XSimdType value_;
     };
 
 
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index 4a8a97f4..47a3aaf1 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -15,40 +15,36 @@
 #pragma once
 
 #include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
 
 #include <immintrin.h>
 
-#include <tuple>
-
 
 namespace blast
 {
     template <>
     inline SimdVec<float, xsimd::avx2>::SimdVec() noexcept
-    :   SimdVecBase {_mm256_setzero_ps()}
+    :   value_ {_mm256_setzero_ps()}
     {
     }
 
 
     template <>
     inline SimdVec<float, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
-    :   SimdVecBase {value}
+    :   value_ {value}
     {
     }
 
 
     template <>
     inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType value) noexcept
-    :   SimdVecBase {_mm256_set1_ps(value)}
+    :   value_ {_mm256_set1_ps(value)}
     {
     }
 
 
     template <>
     inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
-    :   SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
+    :   value_ {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
     {
     }
 
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index 31e067fc..22d358ea 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -15,40 +15,36 @@
 #pragma once
 
 #include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/avx256/SimdSize.hpp>
 
 #include <immintrin.h>
 
-#include <tuple>
-
 
 namespace blast
 {
     template <>
     inline SimdVec<double, xsimd::avx2>::SimdVec() noexcept
-    :   SimdVecBase {_mm256_setzero_pd()}
+    :   value_ {_mm256_setzero_pd()}
     {
     }
 
 
     template <>
     inline SimdVec<double, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
-    :   SimdVecBase {value}
+    :   value_ {value}
     {
     }
 
 
     template <>
     inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType value) noexcept
-    :   SimdVecBase {_mm256_set1_pd(value)}
+    :   value_ {_mm256_set1_pd(value)}
     {
     }
 
 
     template <>
     inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
-    :   SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
+    :   value_ {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
     {
     }
 

From 7ff99d03b7cce6d8ec6b2e87188b0f8bc95c678b Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Aug 2024 20:36:53 +0200
Subject: [PATCH 13/27] SimdMask<> is an alias for xsimd::batch_bool<>

---
 include/blast/math/simd/Avx256.hpp          |  1 -
 include/blast/math/simd/SimdMask.hpp        |  9 ++-
 include/blast/math/simd/avx256/SimdMask.hpp | 83 ---------------------
 3 files changed, 6 insertions(+), 87 deletions(-)
 delete mode 100644 include/blast/math/simd/avx256/SimdMask.hpp

diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp
index 8e34d3b3..c1e7980a 100644
--- a/include/blast/math/simd/Avx256.hpp
+++ b/include/blast/math/simd/Avx256.hpp
@@ -19,7 +19,6 @@
 #include <blast/math/simd/avx256/SimdVecFloat64.hpp>
 #include <blast/math/simd/avx256/SimdVecInt32.hpp>
 #include <blast/math/simd/avx256/SimdVecInt64.hpp>
-// #include <blast/math/simd/avx256/SimdMask.hpp>
 #include <blast/math/simd/avx256/IntScalarType.hpp>
 
 #include <blast/math/simd/IntVecType.hpp>
diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp
index 7e65c867..31486d2f 100644
--- a/include/blast/math/simd/SimdMask.hpp
+++ b/include/blast/math/simd/SimdMask.hpp
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <xsimd/xsimd.hpp>
+
 
 namespace blast
 {
@@ -22,7 +24,8 @@ namespace blast
      * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter.
      *
      * @tparam T the element type simd_mask applies on
+     * @tparam Arch instruction set architecture
      */
-    template <typename T>
-    class SimdMask;
-}
\ No newline at end of file
+    template <typename T, typename Arch>
+    using SimdMask = xsimd::batch_bool<T, Arch>;
+}
diff --git a/include/blast/math/simd/avx256/SimdMask.hpp b/include/blast/math/simd/avx256/SimdMask.hpp
deleted file mode 100644
index 8dbcea64..00000000
--- a/include/blast/math/simd/avx256/SimdMask.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <typename T>
-    class SimdMask
-    {
-        static size_t constexpr SS = SimdSize_v<T>;
-
-    public:
-        using ValueType = IntElementType_t<SS>;
-        using IntrinsicType = __m256i;
-        using MaskType = __m256i;
-
-
-        /**
-         * @brief Set to [0, 0, 0, ...]
-         */
-        SimdMask() noexcept
-        :   value_ {_mm256_setzero_pd()}
-        {
-        }
-
-
-        SimdMask(IntrinsicType value) noexcept
-        :   value_ {value}
-        {
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return value_;
-        }
-
-
-        /**
-         * @brief Number of elements in SIMD pack
-         */
-        static size_t constexpr size()
-        {
-            return SS;
-        }
-
-
-        /**
-         * @brief Access specified element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_[i];
-        }
-
-
-    private:
-        IntrinsicType value_;
-    };
-}
\ No newline at end of file

From e7ed800f46ad25dc1fa66907d479a6aed2ab6c82 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 12:44:00 +0200
Subject: [PATCH 14/27] Added SimdIndex, removed some messy types.

---
 include/blast/math/simd/Avx256.hpp            |   6 -
 include/blast/math/simd/IntScalarType.hpp     |  29 -----
 include/blast/math/simd/IntVecType.hpp        |  24 ----
 include/blast/math/simd/SequenceTag.hpp       |  23 ----
 include/blast/math/simd/SimdIndex.hpp         |  98 +++++++++++++++
 include/blast/math/simd/SimdSize.hpp          |  22 ++--
 include/blast/math/simd/SimdVec.hpp           |   8 +-
 .../blast/math/simd/avx256/IntScalarType.hpp  |  34 ------
 include/blast/math/simd/avx256/SimdSize.hpp   |  50 --------
 .../blast/math/simd/avx256/SimdVecInt32.hpp   | 109 -----------------
 .../blast/math/simd/avx256/SimdVecInt64.hpp   | 114 ------------------
 test/blast/math/simd/SimdVecTest.cpp          |  14 ++-
 12 files changed, 119 insertions(+), 412 deletions(-)
 delete mode 100644 include/blast/math/simd/IntScalarType.hpp
 delete mode 100644 include/blast/math/simd/IntVecType.hpp
 delete mode 100644 include/blast/math/simd/SequenceTag.hpp
 create mode 100644 include/blast/math/simd/SimdIndex.hpp
 delete mode 100644 include/blast/math/simd/avx256/IntScalarType.hpp
 delete mode 100644 include/blast/math/simd/avx256/SimdSize.hpp
 delete mode 100644 include/blast/math/simd/avx256/SimdVecInt32.hpp
 delete mode 100644 include/blast/math/simd/avx256/SimdVecInt64.hpp

diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp
index c1e7980a..1fbc41c0 100644
--- a/include/blast/math/simd/Avx256.hpp
+++ b/include/blast/math/simd/Avx256.hpp
@@ -14,11 +14,5 @@
 
 #pragma once
 
-#include <blast/math/simd/avx256/SimdSize.hpp>
 #include <blast/math/simd/avx256/SimdVecFloat32.hpp>
 #include <blast/math/simd/avx256/SimdVecFloat64.hpp>
-#include <blast/math/simd/avx256/SimdVecInt32.hpp>
-#include <blast/math/simd/avx256/SimdVecInt64.hpp>
-#include <blast/math/simd/avx256/IntScalarType.hpp>
-
-#include <blast/math/simd/IntVecType.hpp>
diff --git a/include/blast/math/simd/IntScalarType.hpp b/include/blast/math/simd/IntScalarType.hpp
deleted file mode 100644
index 3fb119ac..00000000
--- a/include/blast/math/simd/IntScalarType.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdlib>
-#include <cstdint>
-
-
-namespace blast
-{
-    template <size_t N>
-    struct IntScalarType;
-
-
-    template <size_t N>
-    using IntScalarType_t = typename IntScalarType<N>::type;
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/IntVecType.hpp b/include/blast/math/simd/IntVecType.hpp
deleted file mode 100644
index 0f8faab7..00000000
--- a/include/blast/math/simd/IntVecType.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/IntScalarType.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-
-namespace blast
-{
-    template <size_t N>
-    using IntVecType_t = SimdVec<IntScalarType_t<N>>;
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/SequenceTag.hpp b/include/blast/math/simd/SequenceTag.hpp
deleted file mode 100644
index 8f37c552..00000000
--- a/include/blast/math/simd/SequenceTag.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-
-namespace blast :: simd
-{
-    struct SequenceTag {};
-
-    inline SequenceTag constexpr sequenceTag;
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp
new file mode 100644
index 00000000..bfbae1c5
--- /dev/null
+++ b/include/blast/math/simd/SimdIndex.hpp
@@ -0,0 +1,98 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma  once
+
+#include <xsimd/xsimd.hpp>
+
+#include <cstdint>
+#include <type_traits>
+
+
+namespace blast
+{
+    namespace detail
+    {
+        /// @brief Deduces integer type of given size, sized or unsigned.
+        ///
+        /// @tparam N type size in bytes
+        /// @tparam S true for signed type, false for unsigned
+        ///
+        template <std::size_t N, bool S>
+        struct IntOfSize;
+
+        template <std::size_t N, bool S>
+        using IntOfSize_t = typename IntOfSize<N, S>::Type;
+
+        template <>
+        struct IntOfSize<4, true>
+        {
+            using Type = std::int32_t;
+        };
+
+        template <>
+        struct IntOfSize<4, false>
+        {
+            using Type = std::uint32_t;
+        };
+
+        template <>
+        struct IntOfSize<8, true>
+        {
+            using Type = std::int64_t;
+        };
+
+        template <>
+        struct IntOfSize<8, false>
+        {
+            using Type = std::uint64_t;
+        };
+
+        template <typename T, typename Arch>
+        requires (xsimd::batch<T, Arch>::size == 4) && std::is_integral_v<T>
+        inline xsimd::batch<T, Arch> indexSequence(T start) noexcept
+        {
+            return {start, start + 1, start + 2, start + 3};
+        }
+
+        template <typename T, typename Arch = xsimd::default_arch>
+        requires (xsimd::batch<T, Arch>::size == 8) && std::is_integral_v<T>
+        inline xsimd::batch<T, Arch> indexSequence(T start) noexcept
+        {
+            return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7};
+        }
+    }
+
+
+    /// @brief Integer SIMD type that can be used for indexing or gather-scatter operations.
+    ///
+    /// @tparam T the deduced type will be the same size as @a T
+    /// @tparam Arch instruction set architecture
+    ///
+    template <typename T, typename Arch = xsimd::default_arch>
+    using SimdIndex = xsimd::batch<detail::IntOfSize_t<sizeof(T), true>, Arch>;
+
+
+    /// @brief Construct an integer index sequence
+    ///
+    /// @param start start of the sequence
+    ///
+    /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ]
+    ///   where N = SimdIndex<T, Arch>::size
+    ///
+    template <typename T, typename Arch = xsimd::default_arch>
+    inline SimdIndex<T, Arch> indexSequence(typename SimdIndex<T, Arch>::value_type start = 0) noexcept
+    {
+        return detail::indexSequence<detail::IntOfSize_t<sizeof(T), true>, Arch>(start);
+    }
+}
diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp
index 4e862f72..4a22339a 100644
--- a/include/blast/math/simd/SimdSize.hpp
+++ b/include/blast/math/simd/SimdSize.hpp
@@ -1,4 +1,4 @@
-// Copyright 2023 Mikhail Katliar
+// Copyright 2023-2024 Mikhail Katliar
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,24 +14,20 @@
 
 #pragma once
 
+#include <xsimd/xsimd.hpp>
+
 #include <cstdlib>
+#include <type_traits>
 
 
 namespace blast
 {
-    template <typename T>
-    struct SimdSize;
-
-
-    template <typename T>
-    struct SimdSize<T const> : SimdSize<T> {};
-
-
     /**
-     * @brief Number of elements in a SimdVec<T> object.
+     * @brief Size of a SIMD register conatining scalars of a given type.
      *
      * @tparam T scalar type
+     * @tparam Arch instruction set architecture
      */
-    template <typename T>
-    size_t constexpr SimdSize_v = SimdSize<T>::value;
-}
\ No newline at end of file
+    template <typename T, typename Arch = xsimd::default_arch>
+    std::size_t constexpr SimdSize_v = xsimd::batch<std::remove_cv_t<T>, Arch>::size;
+}
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 554fcd00..2aef5e5a 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <blast/math/simd/SimdIndex.hpp>
+
 #include <xsimd/xsimd.hpp>
 
 #include <tuple>
@@ -381,11 +383,9 @@ namespace blast
         friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept;
         friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept;
 
-        template <typename Index>
-        friend std::tuple<SimdVec, SimdVec<Index, Arch>> imax(SimdVec const& v1, SimdVec<Index, Arch> const& idx) noexcept
+        friend std::tuple<SimdVec, SimdIndex<T, Arch>> imax(SimdVec const& v1, SimdIndex<T, Arch> const& idx) noexcept
         {
-            auto const t = imax(v1.value_, xsimd::batch<Index, Arch>(idx));
-            return {get<0>(t), SimdVec<Index, Arch>(get<1>(t))};
+            return imax(v1.value_, idx);
         }
 
     private:
diff --git a/include/blast/math/simd/avx256/IntScalarType.hpp b/include/blast/math/simd/avx256/IntScalarType.hpp
deleted file mode 100644
index 59785c85..00000000
--- a/include/blast/math/simd/avx256/IntScalarType.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/IntScalarType.hpp>
-
-
-namespace blast
-{
-    template <>
-    struct IntScalarType<4>
-    {
-        using type = std::int64_t;
-    };
-
-
-    template <>
-    struct IntScalarType<8>
-    {
-        using type = std::int32_t;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdSize.hpp b/include/blast/math/simd/avx256/SimdSize.hpp
deleted file mode 100644
index 2fa30b2f..00000000
--- a/include/blast/math/simd/avx256/SimdSize.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <cstdint>
-
-
-namespace blast
-{
-    template <>
-    struct SimdSize<float>
-    {
-        static size_t constexpr value = 8;
-    };
-
-
-    template <>
-    struct SimdSize<double>
-    {
-        static size_t constexpr value = 4;
-    };
-
-
-    template <>
-    struct SimdSize<std::int32_t>
-    {
-        static size_t constexpr value = 8;
-    };
-
-
-    template <>
-    struct SimdSize<std::int64_t>
-    {
-        static size_t constexpr value = 4;
-    };
-}
\ No newline at end of file
diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp
deleted file mode 100644
index db995573..00000000
--- a/include/blast/math/simd/avx256/SimdVecInt32.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/SequenceTag.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-#include <type_traits>
-
-
-namespace blast
-{
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    class SimdVec<std::int32_t, Arch>
-    :   public SimdVecBase<std::int32_t, Arch>
-    {
-    public:
-        using MaskType = __m256i;
-        using typename SimdVecBase<std::int32_t, Arch>::ValueType;
-        using typename SimdVecBase<std::int32_t, Arch>::IntrinsicType;
-
-        /**
-         * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0)
-         */
-        SimdVec() noexcept
-        :   SimdVecBase<std::int32_t, Arch> {_mm256_setzero_si256()}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a, a, a, a, a, a, a, a)
-         */
-        SimdVec(ValueType a) noexcept
-        :   SimdVecBase<std::int32_t, Arch> {_mm256_set1_epi32(a)}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n)
-         *
-         * @param n
-         */
-        SimdVec(simd::SequenceTag, ValueType n = 0)
-        :   SimdVec {n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a7, a6, a5, a4, a3, a2, a1, a0),
-         * where a0 corresponds to the lower bits.
-         */
-        SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4,
-            ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase<std::int32_t, Arch> {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase<std::int32_t, Arch> {value}
-        {
-        }
-
-
-        SimdVec& operator+=(ValueType x) noexcept
-        {
-            this->value_ = _mm256_add_epi32(this->value_, _mm256_set1_epi32(x));
-            return *this;
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_cmpgt_epi32(a.value_, b.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_epi8(a.value_, b.value_, mask);
-        }
-
-
-        friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept
-        {
-            return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n));
-        }
-    };
-}
diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp
deleted file mode 100644
index e2adae8e..00000000
--- a/include/blast/math/simd/avx256/SimdVecInt64.hpp
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdVecBase.hpp>
-#include <blast/math/simd/SequenceTag.hpp>
-
-#include <immintrin.h>
-
-#include <cstdint>
-#include <type_traits>
-
-
-namespace blast
-{
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    class SimdVec<std::int64_t, Arch>
-    :   public SimdVecBase<std::int64_t, Arch>
-    {
-    public:
-        using MaskType = __m256i;
-        using typename SimdVecBase<std::int64_t, Arch>::ValueType;
-        using typename SimdVecBase<std::int64_t, Arch>::IntrinsicType;
-
-        /**
-         * @brief Initialize to (0, 0, 0, 0)
-         */
-        SimdVec() noexcept
-        :   SimdVecBase<std::int64_t, Arch> {_mm256_setzero_si256()}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (a, a, a, a)
-         */
-        SimdVec(ValueType a) noexcept
-        :   SimdVecBase<std::int64_t, Arch> {_mm256_set1_epi64x(a)}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to (n + 3, n + 2, n + 1, n)
-         *
-         * @param n
-         */
-        SimdVec(simd::SequenceTag, ValueType n = 0)
-        :   SimdVec {n + 3, n + 2, n + 1, n}
-        {
-        }
-
-
-        /**
-         * @brief Initialize to [a3, a2, a1, a0],
-         * where a0 corresponds to the lower bits.
-         */
-        SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept
-        :   SimdVecBase<std::int64_t, Arch> {_mm256_set_epi64x(a3, a2, a1, a0)}
-        {
-        }
-
-
-        SimdVec(IntrinsicType value) noexcept
-        :   SimdVecBase<std::int64_t, Arch> {value}
-        {
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return this->value_;
-        }
-
-
-        SimdVec& operator+=(ValueType x) noexcept
-        {
-            this->value_ = _mm256_add_epi64(this->value_, _mm256_set1_epi64x(x));
-            return *this;
-        }
-
-
-        friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept
-        {
-            return _mm256_cmpgt_epi64(a.value_, b.value_);
-        }
-
-
-        friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept
-        {
-            return _mm256_blendv_epi8(a.value_, b.value_, mask);
-        }
-
-
-        friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept
-        {
-            return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n));
-        }
-    };
-}
diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp
index 90be9d1c..91765526 100644
--- a/test/blast/math/simd/SimdVecTest.cpp
+++ b/test/blast/math/simd/SimdVecTest.cpp
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include <blast/math/simd/Avx256.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
 
 #include <blaze/Math.h>
 
@@ -64,9 +65,10 @@ namespace blast :: testing
     TYPED_TEST(SimdVecTest, testImax)
     {
         using Scalar = TypeParam;
-        size_t constexpr SS = SimdSize_v<Scalar>;
+        using SimdType = SimdVec<Scalar>;
+        size_t constexpr SS = SimdType::size();
 
-        IntVecType_t<SS> idx {simd::sequenceTag};
+        SimdIndex<Scalar> const idx = indexSequence<Scalar>();
 
         for (int i = 0; i < 100; ++i)
         {
@@ -75,15 +77,15 @@ namespace blast :: testing
             SimdVec<Scalar> const v {a.data(), false};
 
             SimdVec<Scalar> v_max;
-            IntVecType_t<SS> idx_max;
+            SimdIndex<Scalar> idx_max;
             std::tie(v_max, idx_max) = imax(v, idx);
 
             auto const max_el = std::max_element(a.begin(), a.end());
-            for (size_t i = 0; i < idx_max.size(); ++i)
-                ASSERT_EQ(idx_max[i], std::distance(a.begin(), max_el)) << "Error at element " << i;
+            for (size_t i = 0; i < idx_max.size; ++i)
+                ASSERT_EQ(idx_max.get(i), std::distance(a.begin(), max_el)) << "Error at element " << i;
 
             for (size_t i = 0; i < v_max.size(); ++i)
                 ASSERT_EQ(v_max[i], *max_el) << "Error at element " << i;
         }
     }
-}
\ No newline at end of file
+}

From 25538a094a97ad478a6aa204ab11acd0abb301eb Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 15:27:35 +0200
Subject: [PATCH 15/27] Got rid of Simd and SimdTraits classes

---
 include/blast/math/RowColumnVectorPointer.hpp |  10 +-
 .../blast/math/dense/DynamicMatrixPointer.hpp |   9 +-
 .../blast/math/dense/DynamicVectorPointer.hpp |  11 +-
 .../blast/math/dense/StaticMatrixPointer.hpp  |  10 +-
 .../blast/math/dense/StaticVectorPointer.hpp  |  13 +-
 .../blast/math/expressions/PanelMatrix.hpp    |  20 +-
 .../math/panel/DynamicPanelMatrixPointer.hpp  |   9 +-
 include/blast/math/panel/PanelSize.hpp        |   8 +-
 .../math/panel/StaticPanelMatrixPointer.hpp   |  10 +-
 .../register_matrix/DynamicRegisterMatrix.hpp |  29 +--
 .../math/register_matrix/RegisterMatrix.hpp   |  38 ++--
 include/blast/math/simd/RegisterCapacity.hpp  |  32 ++++
 include/blast/math/simd/Simd.hpp              | 171 ------------------
 include/blast/math/simd/SimdMask.hpp          |  75 +++++++-
 .../math/panel/StaticPanelMatrixTest.cpp      |   9 +-
 15 files changed, 201 insertions(+), 253 deletions(-)
 create mode 100644 include/blast/math/simd/RegisterCapacity.hpp

diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp
index e13c311f..ac009f04 100644
--- a/include/blast/math/RowColumnVectorPointer.hpp
+++ b/include/blast/math/RowColumnVectorPointer.hpp
@@ -16,6 +16,8 @@
 
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/util/Assert.hpp>
 
@@ -28,9 +30,9 @@ namespace blast
     {
     public:
         using ElementType = typename MP::ElementType;
-        using IntrinsicType = typename Simd<std::remove_cv_t<ElementType>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<ElementType>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<ElementType>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<ElementType>>;
 
         static TransposeFlag constexpr transposeFlag = TF;
         static bool constexpr aligned = MP::aligned;
@@ -176,7 +178,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<ElementType>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         MP ptr_;
     };
@@ -244,4 +246,4 @@ namespace blast
     {
         return RowColumnVectorPointer<MP, rowVector> {std::move(p)};
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp
index e6540576..2041e4b1 100644
--- a/include/blast/math/dense/DynamicMatrixPointer.hpp
+++ b/include/blast/math/dense/DynamicMatrixPointer.hpp
@@ -9,6 +9,7 @@
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/util/Assert.hpp>
 
@@ -23,9 +24,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -187,7 +188,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -253,4 +254,4 @@ namespace blast
     {
         return {p, spacing};
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp
index b026d205..dbdcb19c 100644
--- a/include/blast/math/dense/DynamicVectorPointer.hpp
+++ b/include/blast/math/dense/DynamicVectorPointer.hpp
@@ -4,8 +4,9 @@
 
 #pragma once
 
-
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/util/Assert.hpp>
 
 #include <type_traits>
@@ -18,9 +19,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr transposeFlag = TF;
         static bool constexpr aligned = AF;
@@ -172,7 +173,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
 
         T * ptrOffset(ptrdiff_t i) const noexcept
@@ -191,4 +192,4 @@ namespace blast
     {
         return p.trans();
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp
index 2db833b8..961b5e01 100644
--- a/include/blast/math/dense/StaticMatrixPointer.hpp
+++ b/include/blast/math/dense/StaticMatrixPointer.hpp
@@ -6,8 +6,8 @@
 
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TransposeFlag.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/math/TypeTraits.hpp>
 #include <blast/util/Assert.hpp>
@@ -24,9 +24,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -188,7 +188,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -225,4 +225,4 @@ namespace blast
     {
         return trans(ptr<AF>(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp
index 6608ddd0..8cd7e1af 100644
--- a/include/blast/math/dense/StaticVectorPointer.hpp
+++ b/include/blast/math/dense/StaticVectorPointer.hpp
@@ -6,6 +6,9 @@
 
 
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/util/Assert.hpp>
 
 
@@ -16,9 +19,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr transposeFlag = TF;
         static bool constexpr aligned = AF;
@@ -35,7 +38,7 @@ namespace blast
         constexpr StaticVectorPointer(T * ptr) noexcept
         :   ptr_ {ptr}
         {
-            BLAST_USER_ASSERT(!AF || reinterpret_cast<ptrdiff_t>(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned");
+            BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned");
         }
 
 
@@ -195,7 +198,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
 
         T * ptrOffset(ptrdiff_t i) const noexcept
@@ -213,4 +216,4 @@ namespace blast
     {
         return p.trans();
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp
index 1df9b97d..d0494887 100644
--- a/include/blast/math/expressions/PanelMatrix.hpp
+++ b/include/blast/math/expressions/PanelMatrix.hpp
@@ -5,11 +5,12 @@
 #pragma once
 
 #include <blast/math/typetraits/IsPanelMatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdSize.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/math/panel/MatrixPointer.hpp>
 #include <blast/math/dense/MatrixPointer.hpp>
-//#include <blast/math/RegisterMatrix.hpp>
 
 #include <blaze/math/ReductionFlag.h>
 #include <blaze/math/Matrix.h>
@@ -93,13 +94,12 @@ namespace blast
 
         using ET1 = ElementType_t<MT1>;
         using ET2 = ElementType_t<MT2>;
-        static size_t constexpr SS = Simd<ET2>::size;
+        static size_t constexpr SS = SimdSize_v<ET2>;
         static size_t constexpr PANEL_SIZE = PanelSize_v<ET2>;
         static_assert(PANEL_SIZE % SS == 0);
 
-        using MaskType = typename Simd<ET2>::MaskType;
-        using IntType = typename Simd<ET2>::IntType;
-        using SIMD = Simd<ET2>;
+        using MaskType = SimdMask<ET2, xsimd::default_arch>;
+        using IntType = typename SimdIndex<ET2>::value_type;
 
 
         if constexpr (SO1 == columnMajor && SO2 == columnMajor)
@@ -119,7 +119,7 @@ namespace blast
 
             if (IntType const rem = m % SS)
             {
-                MaskType const mask = SIMD::index() < rem;
+                MaskType const mask = indexSequence<ET2>() < rem;
                 size_t const i = m - rem;
                 auto pr = ptr<aligned>(*rhs, i, 0);
                 auto pl = ptr<aligned>(*lhs, i, 0);
@@ -187,12 +187,12 @@ namespace blast
 
         using ET1 = ElementType_t<MT1>;
         using ET2 = ElementType_t<MT2>;
-        static size_t constexpr SS = Simd<ET2>::size;
+        static size_t constexpr SS = SimdSize_v<ET2>;
         static size_t constexpr PANEL_SIZE = PanelSize_v<ET2>;
         static_assert(PANEL_SIZE % SS == 0);
 
-        using MaskType = typename Simd<ET2>::MaskType;
-        using IntType = typename Simd<ET2>::IntType;
+        using MaskType = SimdMask<ET2, xsimd::default_arch>;
+        using IntType = typename SimdIndex<ET2>::value_type;
 
         if constexpr (SO1 == columnMajor && SO2 == columnMajor)
         {
diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
index f965f4e1..840ff7e9 100644
--- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
@@ -6,6 +6,7 @@
 
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
 #include <blast/math/simd/Simd.hpp>
@@ -23,9 +24,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -211,7 +212,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -275,4 +276,4 @@ namespace blast
     {
         return trans(ptr(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp
index 027d5c63..9647a0cd 100644
--- a/include/blast/math/panel/PanelSize.hpp
+++ b/include/blast/math/panel/PanelSize.hpp
@@ -8,11 +8,13 @@
 // Includes
 //*************************************************************************************************
 
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdSize.hpp>
+
+#include <xsimd/xsimd.hpp>
 
 
 namespace blast
 {
-    template <typename T>
-    size_t constexpr PanelSize_v = Simd<T>::size;
+    template <typename T, typename Arch = xsimd::default_arch>
+    size_t constexpr PanelSize_v = SimdSize_v<T, Arch>;
 }
diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
index 50544d02..17e19194 100644
--- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
@@ -9,6 +9,8 @@
 #include <blast/math/TypeTraits.hpp>
 #include <blast/math/simd/SimdSize.hpp>
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/expressions/PanelMatrix.hpp>
 #include <blast/math/expressions/PMatTransExpr.hpp>
 #include <blast/util/Assert.hpp>
@@ -21,9 +23,9 @@ namespace blast
     {
     public:
         using ElementType = T;
-        using IntrinsicType = typename Simd<std::remove_cv_t<T>>::IntrinsicType;
-        using MaskType = typename Simd<std::remove_cv_t<T>>::MaskType;
         using SimdVecType = SimdVec<std::remove_cv_t<T>>;
+        using IntrinsicType = SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<std::remove_cv_t<T>>;
 
         static bool constexpr storageOrder = SO;
         static bool constexpr aligned = AF;
@@ -204,7 +206,7 @@ namespace blast
 
 
     private:
-        static size_t constexpr SS = Simd<std::remove_cv_t<T>>::size;
+        static size_t constexpr SS = SimdVecType::size();
         static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector;
 
 
@@ -267,4 +269,4 @@ namespace blast
     {
         return trans(ptr(m.operand(), j, i));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
index ed59f67e..10e135bd 100644
--- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
@@ -4,8 +4,11 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
 #include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/RegisterCapacity.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/Side.hpp>
 #include <blast/math/UpLo.hpp>
@@ -18,8 +21,6 @@
 #include <blaze/util/Exception.h>
 #include <blaze/util/StaticAssert.h>
 
-#include <cmath>
-
 
 namespace blast
 {
@@ -250,14 +251,14 @@ namespace blast
 
 
     private:
-        using SIMD = Simd<T>;
-        using IntrinsicType = typename SIMD::IntrinsicType;
-        using MaskType = typename SIMD::MaskType;
-        using IntType = typename SIMD::IntType;
-        using SimdVecType = SimdVec<T>;
+        using Arch = xsimd::default_arch;
+        using SimdVecType = SimdVec<T, Arch>;
+        using IntrinsicType = typename SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<T, Arch>;
+        using IntType = typename SimdIndex<T, Arch>::value_type;
 
         // SIMD size
-        static size_t constexpr SS = Simd<T>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         // Numberf of SIMD registers required to store a single column of the matrix.
         static size_t constexpr RM = M / SS;
@@ -266,7 +267,7 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive");
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
-        BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a DynamicRegisterMatrix");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a DynamicRegisterMatrix");
 
         SimdVecType v_[RM][RN];
         size_t const m_;
@@ -329,7 +330,7 @@ namespace blast
 
         if (IntType const rem = m_ % SS)
         {
-            MaskType const mask = SIMD::index() < rem;
+            MaskType const mask = indexSequence<T>() < rem;
             size_t const i = m_ / SS;
 
             for (size_t j = 0; j < n_ && j < columns(); ++j)
@@ -349,10 +350,10 @@ namespace blast
             {
                 IntType const skip = j - ri * SS;
                 IntType const rem = m_ - ri * SS;
-                MaskType mask = SIMD::index() < rem;
+                MaskType mask = indexSequence<T>() < rem;
 
                 if (skip > 0)
-                    mask &= SIMD::index() >= skip;
+                    mask &= indexSequence<T>() >= skip;
 
                 p(SS * ri, j).store(v_[ri][j], mask);
             }
@@ -490,7 +491,7 @@ namespace blast
     //             ax[i] = alpha * a.load(i * SS, 0);
 
     //         if (rem)
-    //             ax[ii] = alpha * a.load(ii * SS, 0, SIMD::index() < rem);
+    //             ax[ii] = alpha * a.load(ii * SS, 0, indexSequence<T>() < rem);
 
     //         #pragma unroll
     //         for (size_t j = 0; j < N; ++j)
diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp
index 68641769..5724a05b 100644
--- a/include/blast/math/register_matrix/RegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/RegisterMatrix.hpp
@@ -5,7 +5,9 @@
 #pragma once
 
 #include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/RegisterCapacity.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/typetraits/VectorPointer.hpp>
 #include <blast/math/RowColumnVectorPointer.hpp>
@@ -20,8 +22,6 @@
 #include <blaze/util/Assert.h>
 #include <blaze/util/StaticAssert.h>
 
-#include <cmath>
-
 
 namespace blast
 {
@@ -360,14 +360,14 @@ namespace blast
 
 
     private:
-        using SIMD = Simd<T>;
-        using IntrinsicType = typename SIMD::IntrinsicType;
-        using MaskType = typename SIMD::MaskType;
-        using IntType = typename SIMD::IntType;
-        using SimdVecType = SimdVec<T>;
+        using Arch = xsimd::default_arch;
+        using SimdVecType = SimdVec<T, Arch>;
+        using IntrinsicType = typename SimdVecType::IntrinsicType;
+        using MaskType = SimdMask<T, Arch>;
+        using IntType = typename SimdIndex<T, Arch>::value_type;
 
         // SIMD size
-        static size_t constexpr SS = Simd<T>::size;
+        static size_t constexpr SS = SimdVecType::size();
 
         // Numberf of SIMD registers required to store a single column of the matrix.
         static size_t constexpr RM = M / SS;
@@ -376,7 +376,7 @@ namespace blast
         BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size");
         BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive");
         BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size");
-        BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v<T>), "Not enough registers for a RegisterMatrix");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a RegisterMatrix");
 
         SimdVecType v_[RM][RN];
 
@@ -461,7 +461,7 @@ namespace blast
                     v_[i][j] = beta * p(SS * i, j).load();
 
                 if (size_t const rem = m % SS)
-                    v_[m / SS][j] = beta * p(m - rem, j).load(SIMD::index() < rem);
+                    v_[m / SS][j] = beta * p(m - rem, j).load(indexSequence<T, Arch>() < rem);
             }
         }
     }
@@ -493,7 +493,7 @@ namespace blast
 
         if (IntType const rem = m % SS)
         {
-            MaskType const mask = SIMD::index() < rem;
+            MaskType const mask = indexSequence<T, Arch>() < rem;
             size_t const i = m / SS;
 
             for (size_t j = 0; j < n && j < columns(); ++j)
@@ -514,7 +514,7 @@ namespace blast
 
             if (skip && ri < RM)
             {
-                MaskType const mask = SIMD::index() >= skip;
+                MaskType const mask = indexSequence<T, Arch>() >= skip;
                 p(SS * ri, j).store(v_[ri][j], mask);
                 ++ri;
             }
@@ -536,10 +536,10 @@ namespace blast
             {
                 IntType const skip = j - ri * SS;
                 IntType const rem = m - ri * SS;
-                MaskType mask = SIMD::index() < rem;
+                MaskType mask = indexSequence<T, Arch>() < rem;
 
                 if (skip > 0)
-                    mask &= SIMD::index() >= skip;
+                    mask &= indexSequence<T, Arch>() >= skip;
 
                 p(SS * ri, j).store(v_[ri][j], mask);
             }
@@ -596,7 +596,7 @@ namespace blast
         VectorPointer<PB, T> && (PB::transposeFlag == rowVector)
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::ger(T alpha, PA a, PB b) noexcept
     {
-        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v<T>), "Not enough registers for ger()");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()");
 
         SimdVecType ax[RM];
 
@@ -623,7 +623,7 @@ namespace blast
         VectorPointer<PB, T> && (PB::transposeFlag == rowVector)
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::ger(PA a, PB b) noexcept
     {
-        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v<T>), "Not enough registers for ger()");
+        BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()");
 
         SimdVecType ax[RM];
 
@@ -697,7 +697,7 @@ namespace blast
     BLAZE_ALWAYS_INLINE void RegisterMatrix<T, M, N, SO>::potrf() noexcept
     {
         static_assert(M >= N, "potrf() not implemented for register matrices with columns more than rows");
-        static_assert(RM * RN + 2 <= RegisterCapacity_v<T>, "Not enough registers");
+        static_assert(RM * RN + 2 <= registerCapacity(Arch {}), "Not enough registers");
 
         #pragma unroll
         for (size_t k = 0; k < N; ++k)
@@ -745,7 +745,7 @@ namespace blast
                 ax[i] = alpha * a(i * SS, 0).load();
 
             if (rem)
-                ax[ii] = alpha * a(ii * SS, 0).load(SIMD::index() < rem);
+                ax[ii] = alpha * a(ii * SS, 0).load(indexSequence<T, Arch>() < rem);
 
             #pragma unroll
             for (size_t j = 0; j < N; ++j)
diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp
new file mode 100644
index 00000000..213b6d80
--- /dev/null
+++ b/include/blast/math/simd/RegisterCapacity.hpp
@@ -0,0 +1,32 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <xsimd/xsimd.hpp>
+
+#include <cstdlib>
+
+
+namespace blast
+{
+    /**
+     * @brief Number of available SIMD registers.
+     *
+     * @return Number of SIMD registers for AVX2
+     */
+    std::size_t constexpr registerCapacity(xsimd::avx2)
+    {
+        return 16;
+    }
+}
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
index 6f4ca1d2..65f521c4 100644
--- a/include/blast/math/simd/Simd.hpp
+++ b/include/blast/math/simd/Simd.hpp
@@ -21,177 +21,6 @@ namespace blast
     using namespace blaze;
 
 
-    template <typename T, typename Arch = xsimd::default_arch>
-    struct Simd;
-
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    struct Simd<double, Arch>
-    {
-    private:
-        class Index;
-
-
-    public:
-        using IntrinsicType = __m256d;
-        using MaskType = __m256i;
-        using IntType = long long;
-
-        static size_t constexpr size = 4;
-        static size_t constexpr registerCapacity = 16;
-
-
-        //*******************************************************
-        //
-        // CUSTOM
-        //
-        //*******************************************************
-        static Index index() noexcept
-        {
-            return Index {};
-        }
-
-
-    private:
-        class Index
-        {
-        public:
-            MaskType operator<(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi64(_mm256_set1_epi64x(n), val_);
-            }
-
-
-            MaskType operator>(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi64(val_, _mm256_set1_epi64x(n));
-            }
-
-
-            MaskType operator<=(IntType n) const noexcept
-            {
-                return *this < n + 1;
-            }
-
-
-            MaskType operator>=(IntType n) const noexcept
-            {
-                return *this > n - 1;
-            }
-
-
-        private:
-            MaskType val_ = _mm256_set_epi64x(3, 2, 1, 0);
-        };
-    };
-
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    struct Simd<float, Arch>
-    {
-    private:
-        class Index;
-
-
-    public:
-        using IntrinsicType = __m256;
-        using MaskType = __m256i;
-        using IntType = int;
-
-        static size_t constexpr size = 8;
-        static size_t constexpr registerCapacity = 16;
-
-
-        //*******************************************************
-        //
-        // CUSTOM
-        //
-        //*******************************************************
-        static Index index() noexcept
-        {
-            return Index {};
-        }
-
-
-    private:
-        class Index
-        {
-        public:
-            MaskType operator<(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi32(_mm256_set1_epi32(n), val_);
-            }
-
-
-            MaskType operator>(IntType n) const noexcept
-            {
-                return _mm256_cmpgt_epi32(val_, _mm256_set1_epi32(n));
-            }
-
-
-            MaskType operator<=(IntType n) const noexcept
-            {
-                return *this < n + 1;
-            }
-
-
-            MaskType operator>=(IntType n) const noexcept
-            {
-                return *this > n - 1;
-            }
-
-
-        private:
-            MaskType val_ = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-        };
-    };
-
-
-    template <typename T, typename Arch = xsimd::default_arch>
-    using IntrinsicType_t = typename Simd<T, Arch>::IntrinsicType;
-
-
-    template <typename T>
-    struct SimdTraits;
-
-
-    template <>
-    struct SimdTraits<__m256d>
-    {
-        using ScalarType = double;
-        using MaskType = __m256i;
-        using IntType = long long;
-        static size_t constexpr size = 4;
-    };
-
-
-    template <>
-    struct SimdTraits<__m256>
-    {
-        using ScalarType = float;
-        using MaskType = __m256i;
-        using IntType = int;
-        static size_t constexpr size = 8;
-    };
-
-
-    template <typename T>
-    using ScalarType_t = typename SimdTraits<T>::ScalarType;
-
-
-    template <typename T>
-    using MaskType_t = typename SimdTraits<T>::MaskType;
-
-    template <typename T>
-    using IntType_t = typename SimdTraits<T>::IntType;
-
-
-    template <typename T, typename Arch = xsimd::default_arch>
-    size_t constexpr RegisterCapacity_v = Simd<T, Arch>::registerCapacity;
-
-
     //*******************************************************
     //
     // LOAD
diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp
index 31486d2f..325d9ee5 100644
--- a/include/blast/math/simd/SimdMask.hpp
+++ b/include/blast/math/simd/SimdMask.hpp
@@ -26,6 +26,77 @@ namespace blast
      * @tparam T the element type simd_mask applies on
      * @tparam Arch instruction set architecture
      */
-    template <typename T, typename Arch>
-    using SimdMask = xsimd::batch_bool<T, Arch>;
+    template <typename T, typename Arch = xsimd::default_arch>
+    class SimdMask
+    :   public xsimd::batch_bool<T, Arch>
+    {
+    public:
+        using XSimdType = xsimd::batch_bool<T, Arch>;
+        using IntrinsicType = typename XSimdType::register_type;
+
+        /**
+         * @brief Construct from an intrinsic register type
+         *
+         * @param v the value to construct from
+         */
+        SimdMask(IntrinsicType v) noexcept
+        :   XSimdType {v}
+        {
+        }
+
+        /**
+         * @brief Construct from an @a xsimd::batch_bool of the same type
+         *
+         * @param v the value to construct from
+         */
+        SimdMask(XSimdType const& v) noexcept
+        :   XSimdType {v}
+        {
+        }
+
+        /**
+         * @brief Construct from an @a xsimd::batch_bool of a different type
+         *
+         * We need to allow conversion from batch_bool<U, Arch> with U != T,
+         * because we need e.g. the following code to work:
+         *
+         * int n;
+         * MaskType<double> mask = indexSequence<double>() < n;
+         *
+         * In the code above, the result of indexSequence<double>() is a batch<int64_t>,
+         * and the result of indexSequence<double>() < n is a batch_bool<int64_t>,
+         * which can not be directly assigned to a batch_bool<double>,
+         * although their underlying register_type's are identical.
+         *
+         * @param v the value to construct from
+         */
+        template <typename U>
+        SimdMask(xsimd::batch_bool<U, Arch> const& v) noexcept
+        :   XSimdType {IntrinsicType(v)}
+        {
+        }
+
+        /**
+         * @brief In-place logical and.
+         *
+         * We need to define this operator to make the following code work:
+         *
+         * MaskType<double> mask;
+         * int n;
+         * mask &= indexSequence<double>() >= n;
+         *
+         * In the code above, the result of indexSequence<double>() is a batch<int64_t>,
+         * and the result of indexSequence<double>() >= n is a batch_bool<int64_t>,
+         * which can not be directly used as a right-hand side operand of batch_bool<double>::operator&=().
+         * Defining operator&=(SimdMask const&) allows the right-hand side to be implicitly converted to SimdMask.
+         *
+         * @param v
+         * @return SimdMask&
+         */
+        SimdMask& operator&=(SimdMask const& v) noexcept
+        {
+            XSimdType::operator&=(v);
+            return *this;
+        }
+    };
 }
diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp
index c5db87f3..c319a422 100644
--- a/test/blast/math/panel/StaticPanelMatrixTest.cpp
+++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp
@@ -4,6 +4,7 @@
 
 #include <blast/math/StaticPanelMatrix.hpp>
 #include <blast/math/panel/MatrixPointer.hpp>
+#include <blast/math/simd/SimdVec.hpp>
 
 #include <blaze/Math.h>
 
@@ -115,10 +116,12 @@ namespace blast :: testing
         size_t constexpr N = 3 * SS + 2;
 
         StaticPanelMatrix<TypeParam, M, N, columnMajor> A;
-        IntrinsicType_t<TypeParam> val;
 
+        std::vector<TypeParam> vec(SS);
         for (size_t i = 0; i < SS; ++i)
-            val[i] = TypeParam(i + 1);
+            vec[i] = i + 1;
+
+        SimdVec<TypeParam> val {vec.data(), false};
 
         for (size_t i = 0; i < M; i += SS)
             for (size_t j = 0; j < N; ++j)
@@ -203,4 +206,4 @@ namespace blast :: testing
 
     INSTANTIATE_TYPED_TEST_SUITE_P(double, StaticPanelMatrixTest, double);
     INSTANTIATE_TYPED_TEST_SUITE_P(float, StaticPanelMatrixTest, float);
-}
\ No newline at end of file
+}

From 87d75654171158b2f8a7ae0bcf1feefd6cdb7924 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 16:38:54 +0200
Subject: [PATCH 16/27] Got rid of the old Simd.hpp header

---
 include/blast/math/RowColumnVectorPointer.hpp |   4 +-
 include/blast/math/Simd.hpp                   |  13 +
 .../blast/math/dense/DynamicMatrixPointer.hpp |  10 +-
 .../blast/math/dense/DynamicVectorPointer.hpp |  19 +-
 .../blast/math/dense/StaticVectorPointer.hpp  |  23 +-
 .../math/panel/DynamicPanelMatrixPointer.hpp  |   4 +-
 .../blast/math/panel/StaticPanelMatrix.hpp    |   3 +-
 .../math/panel/StaticPanelMatrixPointer.hpp   |   5 +-
 .../register_matrix/DynamicRegisterMatrix.hpp |   8 +-
 .../math/register_matrix/RegisterMatrix.hpp   |  10 +-
 include/blast/math/simd/Simd.hpp              | 252 ------------------
 include/blast/math/simd/SimdVec.hpp           |  31 +++
 .../blast/math/simd/avx256/SimdVecFloat32.hpp |  20 +-
 .../blast/math/simd/avx256/SimdVecFloat64.hpp |   7 +
 include/blast/math/views/row/Panel.hpp        |   6 +-
 include/blast/math/views/submatrix/Panel.hpp  |   6 +-
 16 files changed, 93 insertions(+), 328 deletions(-)
 create mode 100644 include/blast/math/Simd.hpp
 delete mode 100644 include/blast/math/simd/Simd.hpp

diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp
index ac009f04..ae7a8b5a 100644
--- a/include/blast/math/RowColumnVectorPointer.hpp
+++ b/include/blast/math/RowColumnVectorPointer.hpp
@@ -15,9 +15,7 @@
 #pragma once
 
 #include <blast/math/typetraits/MatrixPointer.hpp>
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/util/Assert.hpp>
 
diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp
new file mode 100644
index 00000000..f971dbe4
--- /dev/null
+++ b/include/blast/math/Simd.hpp
@@ -0,0 +1,13 @@
+// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#pragma once
+
+#include <blast/math/simd/Avx256.hpp>
+#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/SimdSize.hpp>
+#include <blast/math/simd/IsSimdAligned.hpp>
+#include <blast/math/simd/RegisterCapacity.hpp>
diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp
index 2041e4b1..5610b1c9 100644
--- a/include/blast/math/dense/DynamicMatrixPointer.hpp
+++ b/include/blast/math/dense/DynamicMatrixPointer.hpp
@@ -8,15 +8,11 @@
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdMask.hpp>
-#include <blast/math/simd/IsSimdAligned.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
-
 #include <blaze/util/Exception.h>
 
 
-
 namespace blast
 {
     template <typename T, bool SO, bool AF, bool PF>
@@ -84,9 +80,9 @@ namespace blast
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp
index dbdcb19c..7638119a 100644
--- a/include/blast/math/dense/DynamicVectorPointer.hpp
+++ b/include/blast/math/dense/DynamicVectorPointer.hpp
@@ -4,9 +4,7 @@
 
 #pragma once
 
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
 
 #include <type_traits>
@@ -41,7 +39,7 @@ namespace blast
         ,   spacing_ {spacing}
         {
             BLAST_USER_ASSERT(spacing > 0, "Vector element spacing must be positive.");
-            BLAST_USER_ASSERT(!AF || reinterpret_cast<ptrdiff_t>(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned");
+            BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned");
         }
 
 
@@ -52,6 +50,7 @@ namespace blast
         SimdVecType load() const noexcept
         {
             // Non-optimized
+            // TODO: use gather()
             IntrinsicType v;
             for (size_t i = 0; i < SS; ++i)
                 v[i] = ptr_[spacing_ * i];
@@ -63,18 +62,18 @@ namespace blast
         SimdVecType load(MaskType mask) const noexcept
         {
             // Non-optimized
-            IntrinsicType v = blast::setzero<std::remove_cv_t<ElementType>, SS>();
+            // TODO: use gather()
+            T v[SS];
             for (size_t i = 0; i < SS; ++i)
-                if (mask[i])
-                    v[i] = ptr_[spacing_ * i];
+                v[i] = mask[i] ? ptr_[spacing_ * i] : T {};
 
-            return SimdVecType {v};
+            return SimdVecType {v, false};
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp
index 8cd7e1af..6017dafa 100644
--- a/include/blast/math/dense/StaticVectorPointer.hpp
+++ b/include/blast/math/dense/StaticVectorPointer.hpp
@@ -5,10 +5,7 @@
 #pragma once
 
 
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
-#include <blast/math/simd/IsSimdAligned.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/util/Assert.hpp>
 
 
@@ -73,27 +70,27 @@ namespace blast
             else
             {
                 // Non-optimized
-                IntrinsicType v = blast::setzero<ElementType, SS>();
+                // TODO: use gather
+                T v[SS];
                 for (size_t i = 0; i < SS; ++i)
-                    if (mask[i])
-                        v[i] = ptr_[S * i];
+                    v[i] = mask[i] ? ptr_[S * i] : T {};
 
-                return SimdVecType {v};
+                return SimdVecType {v, false};
             }
         }
 
 
-        IntrinsicType broadcast() const noexcept
+        SimdVecType broadcast() const noexcept
         {
-            return blast::broadcast<SS>(ptr_);
+            return *ptr_;
         }
 
 
-        void store(IntrinsicType val) const noexcept
+        void store(SimdVecType val) const noexcept
         {
             if constexpr (S == 1)
             {
-                blast::store<AF>(ptr_, val);
+                val.store(ptr_, AF);
             }
             else
             {
@@ -108,7 +105,7 @@ namespace blast
         {
             if constexpr (S == 1)
             {
-                blast::maskstore(ptr_, mask, val);
+                val.store(ptr_, mask);
             }
             else
             {
diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
index 840ff7e9..1e3f1233 100644
--- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp
@@ -5,11 +5,9 @@
 #pragma once
 
 #include <blast/math/TransposeFlag.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/Simd.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/expressions/PanelMatrix.hpp>
 #include <blast/math/expressions/PMatTransExpr.hpp>
 #include <blast/util/Assert.hpp>
diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp
index b1d2a624..585b6f6b 100644
--- a/include/blast/math/panel/StaticPanelMatrix.hpp
+++ b/include/blast/math/panel/StaticPanelMatrix.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/Forward.hpp>
 #include <blast/math/panel/Gemm.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/system/CacheLine.hpp>
 
@@ -290,4 +289,4 @@ namespace blaze
     {
         using Type = blast::StaticPanelMatrix<T, M, N, SO>;
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
index 17e19194..aaadbda9 100644
--- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp
+++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp
@@ -7,10 +7,7 @@
 #include <blast/math/TransposeFlag.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/TypeTraits.hpp>
-#include <blast/math/simd/SimdSize.hpp>
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/expressions/PanelMatrix.hpp>
 #include <blast/math/expressions/PMatTransExpr.hpp>
 #include <blast/util/Assert.hpp>
diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
index 10e135bd..9cd7f826 100644
--- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp
@@ -4,11 +4,7 @@
 
 #pragma once
 
-#include <blast/math/simd/Simd.hpp>
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
-#include <blast/math/simd/SimdIndex.hpp>
-#include <blast/math/simd/RegisterCapacity.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/Side.hpp>
 #include <blast/math/UpLo.hpp>
@@ -122,7 +118,7 @@ namespace blast
         {
             for (size_t i = 0; i < RM; ++i)
                 for (size_t j = 0; j < N; ++j)
-                    v_[i][j] = setzero<T, SS>();
+                    v_[i][j].reset();
         }
 
 
diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp
index 5724a05b..5c37dc64 100644
--- a/include/blast/math/register_matrix/RegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/RegisterMatrix.hpp
@@ -117,7 +117,7 @@ namespace blast
         {
             for (size_t i = 0; i < RM; ++i)
                 for (size_t j = 0; j < N; ++j)
-                    v_[i][j] = setzero<T, SS>();
+                    v_[i][j].reset();
         }
 
 
@@ -562,7 +562,7 @@ namespace blast
                     #pragma unroll
                     for (size_t k = 0; k < j; ++k)
                     {
-                        IntrinsicType const a_kj = (~A)(k, j).broadcast();
+                        SimdVecType const a_kj = (~A)(k, j).broadcast();
 
                         #pragma unroll
                         for (size_t i = 0; i < RM; ++i)
@@ -705,11 +705,11 @@ namespace blast
             #pragma unroll
             for (size_t j = 0; j < k; ++j)
             {
-                T const a_kj = v_[k / SS][j][k % SS];
+                SimdVecType const a_kj = v_[k / SS][j][k % SS];
 
                 #pragma unroll
                 for (size_t i = 0; i < RM; ++i) if (i >= k / SS)
-                    v_[i][k] = fnmadd(set1<SS>(a_kj), v_[i][j], v_[i][k]);
+                    v_[i][k] = fnmadd(a_kj, v_[i][j], v_[i][k]);
             }
 
             T const sqrt_a_kk = std::sqrt(v_[k / SS][k][k % SS]);
@@ -718,7 +718,7 @@ namespace blast
             for (size_t i = 0; i < RM; ++i)
             {
                 if (i < k / SS)
-                    v_[i][k] = setzero<T, SS>();
+                    v_[i][k].reset();
                 else
                     v_[i][k] /= sqrt_a_kk;
             }
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
deleted file mode 100644
index 65f521c4..00000000
--- a/include/blast/math/simd/Simd.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/simd/Avx256.hpp>
-
-#include <blaze/util/Types.h>
-#include <blaze/system/Inline.h>
-#include <blaze/math/AlignmentFlag.h>
-
-#include <type_traits>
-#include <xsimd/xsimd.hpp>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    using namespace blaze;
-
-
-    //*******************************************************
-    //
-    // LOAD
-    //
-    //*******************************************************
-
-
-    template <size_t SS, typename T>
-    auto broadcast(T const * ptr);
-
-
-    inline auto maskload(double const * ptr, __m256i mask)
-    {
-        return _mm256_maskload_pd(ptr, mask);
-    }
-
-
-    inline auto maskload(float const * ptr, __m256i mask)
-    {
-        return _mm256_maskload_ps(ptr, mask);
-    }
-
-
-    template <>
-    inline auto broadcast<4, double>(double const * ptr)
-    {
-        return _mm256_broadcast_sd(ptr);
-    }
-
-
-    template <>
-    inline auto broadcast<8, float>(float const * ptr)
-    {
-        return _mm256_broadcast_ss(ptr);
-    }
-
-
-    //*******************************************************
-    //
-    // STORE
-    //
-    //*******************************************************
-    template <bool AF>
-    void store(double * ptr, __m256d a);
-
-
-    template <>
-    inline void store<aligned>(double * ptr, __m256d a)
-    {
-        _mm256_store_pd(ptr, a);
-    }
-
-
-    template <>
-    inline void store<unaligned>(double * ptr, __m256d a)
-    {
-        _mm256_storeu_pd(ptr, a);
-    }
-
-
-    template <bool AF>
-    void store(float * ptr, __m256 a);
-
-
-    template <>
-    inline void store<aligned>(float * ptr, __m256 a)
-    {
-        _mm256_store_ps(ptr, a);
-    }
-
-
-    template <>
-    inline void store<unaligned>(float * ptr, __m256 a)
-    {
-        _mm256_storeu_ps(ptr, a);
-    }
-
-
-    inline void maskstore(double * ptr, __m256i m, __m256d a)
-    {
-        _mm256_maskstore_pd(ptr, m, a);
-    }
-
-
-    inline void maskstore(float * ptr, __m256i m, __m256 a)
-    {
-        _mm256_maskstore_ps(ptr, m, a);
-    }
-
-
-    //*******************************************************
-    //
-    // SET
-    //
-    //*******************************************************
-
-    inline auto set(double a3, double a2, double a1, double a0)
-    {
-        return _mm256_set_pd(a3, a2, a1, a0);
-    }
-
-
-    inline auto set(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
-    {
-        return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
-    }
-
-
-    inline auto set(long long a3, long long a2, long long a1, long long a0)
-    {
-        return _mm256_set_epi64x(a3, a2, a1, a0);
-    }
-
-
-    template <size_t N, typename T>
-    auto set1(T a);
-
-
-    inline auto set1(double a)
-    {
-        return _mm256_set1_pd(a);
-    }
-
-
-    inline auto set1(float a)
-    {
-        return _mm256_set1_ps(a);
-    }
-
-
-    template <>
-    inline auto set1<4, double>(double a)
-    {
-        return _mm256_set1_pd(a);
-    }
-
-
-    template <>
-    inline auto set1<8, float>(float a)
-    {
-        return _mm256_set1_ps(a);
-    }
-
-
-    template <>
-    inline auto set1<8, int>(int val)
-    {
-        return _mm256_set1_epi32(val);
-    }
-
-
-    template <>
-    inline auto set1<4, long long>(long long val)
-    {
-        return _mm256_set1_epi64x(val);
-    }
-
-
-    template <typename T, size_t N>
-    auto setzero();
-
-
-    template <>
-    inline auto setzero<double, 4>()
-    {
-        return _mm256_setzero_pd();
-    }
-
-
-    template <>
-    inline auto setzero<float, 8>()
-    {
-        return _mm256_setzero_ps();
-    }
-
-
-    //*******************************************************
-    //
-    // ARITHMETIC
-    //
-    //*******************************************************
-
-    inline auto fnmadd(__m256d a, __m256d b, __m256d c)
-    {
-        return _mm256_fnmadd_pd(a, b, c);
-    }
-
-
-    inline auto fnmadd(__m256 a, __m256 b, __m256 c)
-    {
-        return _mm256_fnmadd_ps(a, b, c);
-    }
-
-
-    inline auto abs(__m256d a)
-    {
-        return _mm256_andnot_pd(set1(-0.), a);
-    }
-
-
-    inline auto abs(__m256 a)
-    {
-        return _mm256_andnot_ps(set1(-0.f), a);
-    }
-
-
-    //*******************************************************
-    //
-    // COMPARE
-    //
-    //*******************************************************
-
-    template <size_t N, typename MM>
-    auto cmpgt(MM a, MM b);
-
-
-    template <>
-    inline auto cmpgt<4>(__m256i a, __m256i b)
-    {
-        return _mm256_cmpgt_epi64(a, b);
-    }
-
-
-    template <>
-    inline auto cmpgt<8>(__m256i a, __m256i b)
-    {
-        return _mm256_cmpgt_epi32(a, b);
-    }
-}
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 2aef5e5a..6cd34223 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -132,6 +132,20 @@ namespace blast
     }
 
 
+    /**
+    * @brief Fused negative multiply-add
+    *
+    * Calculate -a * b + c
+    *
+    * @param a first multiplier
+    * @param b second multiplier
+    * @param c addendum
+    *
+    * @return @a a * @a b + @a c element-wise
+    */
+    template <typename T, typename Arch>
+    SimdVec<T, Arch> fnmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept;
+
     /**
     * @brief Fused multiply-add
     *
@@ -298,6 +312,15 @@ namespace blast
         }
 
 
+        /**
+         * @brief Set to 0
+         */
+        void reset() noexcept
+        {
+            value_ = ValueType {};
+        }
+
+
         operator IntrinsicType() const noexcept
         {
             return value_;
@@ -374,6 +397,7 @@ namespace blast
 
 
         friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
+        friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
         friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept;
         friend SimdVec abs<>(SimdVec const& a) noexcept;
         friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept;
@@ -400,6 +424,13 @@ namespace blast
     }
 
 
+    template <typename T, typename Arch>
+    inline SimdVec<T, Arch> fnmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept
+    {
+        return xsimd::fnma(a.value_, b.value_, c.value_);
+    }
+
+
     template <typename T, typename Arch>
     inline SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, typename SimdVec<T, Arch>::MaskType mask) noexcept
     {
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index 47a3aaf1..cc175930 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -73,19 +73,6 @@ namespace blast
     }
 
 
-    /*
-    NOTE: without this specialization some tests fail, e.g.:
-
-    [ RUN      ] RegisterMatrixTest/5.testPartialGerNT @ ./test/blast/math/simd/RegisterMatrixTest.cpp:386
-    │ Failure @ ./test/blast/math/simd/RegisterMatrixTest.cpp:420
-    │ │ Expected equality of these values:
-    │ │   ker(i, j)
-    │ │     Which is: 0.482636
-    │ │   i < m && j < n ? D_ref(i, j) : 0.
-    │ │     Which is: 0.482636
-    │ element mismatch at (0, 1), store size = 1x2
-    [  FAILED  ] RegisterMatrixTest/5.testPartialGerNT, where TypeParam = blast::RegisterMatrix<float, 8ul, 4ul, true> (1 ms)
-    */
     template <>
     inline SimdVec<float, xsimd::avx2> fmadd(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2> const& c) noexcept
     {
@@ -93,6 +80,13 @@ namespace blast
     }
 
 
+    template <>
+    inline SimdVec<float, xsimd::avx2> fnmadd(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2> const& c) noexcept
+    {
+        return _mm256_fnmadd_ps(a.value_, b.value_, c.value_);
+    }
+
+
     template <>
     inline SimdVec<float, xsimd::avx2> blend(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2>::MaskType mask) noexcept
     {
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index 22d358ea..229fd666 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -80,6 +80,13 @@ namespace blast
     }
 
 
+    template <>
+    inline SimdVec<double, xsimd::avx2> fnmadd(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2> const& c) noexcept
+    {
+        return _mm256_fnmadd_pd(a.value_, b.value_, c.value_);
+    }
+
+
     template <>
     inline SimdVec<double, xsimd::avx2> blend(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2>::MaskType mask) noexcept
     {
diff --git a/include/blast/math/views/row/Panel.hpp b/include/blast/math/views/row/Panel.hpp
index 5daadc0e..55c3f9f4 100644
--- a/include/blast/math/views/row/Panel.hpp
+++ b/include/blast/math/views/row/Panel.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/PanelMatrix.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
 #include <blast/math/constraints/Submatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/math/constraints/PanelMatrix.hpp>
 
@@ -17,9 +16,6 @@
 #include <blaze/util/constraints/Pointer.h>
 #include <blaze/util/constraints/Reference.h>
 
-#include <algorithm>
-#include <iterator>
-
 
 namespace blast
 {
@@ -195,4 +191,4 @@ namespace blast
         BLAST_CONSTRAINT_MUST_NOT_BE_PANEL_SUBMATRIX_TYPE(MT);
         //**********************************************************************************************
     };
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/views/submatrix/Panel.hpp b/include/blast/math/views/submatrix/Panel.hpp
index c922bba7..50ba6a79 100644
--- a/include/blast/math/views/submatrix/Panel.hpp
+++ b/include/blast/math/views/submatrix/Panel.hpp
@@ -7,7 +7,6 @@
 #include <blast/math/PanelMatrix.hpp>
 #include <blast/math/views/submatrix/BaseTemplate.hpp>
 #include <blast/math/constraints/Submatrix.hpp>
-#include <blast/math/simd/Simd.hpp>
 #include <blast/math/panel/PanelSize.hpp>
 #include <blast/math/constraints/PanelMatrix.hpp>
 #include <blast/util/Assert.hpp>
@@ -25,9 +24,6 @@
 #include <blaze/util/Exception.h>
 #include <blaze/util/Assert.h>
 
-#include <algorithm>
-#include <iterator>
-
 
 namespace blast
 {
@@ -283,4 +279,4 @@ namespace blast
     {
         return PanelSubmatrix<MT, SO>(matrix.operand(), matrix.row() + row, matrix.column() + column, m, n);
     }
-}
\ No newline at end of file
+}

From f91233a5f31a83b0c08575974c91547f9d646dd7 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 16:41:55 +0200
Subject: [PATCH 17/27] Default architecture changed to fma3<avx2> in
 Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 598aebdd..fb47dea6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib
 RUN mkdir -p blast/build && cd blast/build \
     && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \
         -DCMAKE_CXX_COMPILER="clang++-15" \
-        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH=avx2" \
+        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH='fma3<avx2>'" \
         -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \
         -DBLAST_WITH_TEST=ON \
         -DBLAST_WITH_BENCHMARK=ON \

From 4b5068422a9aabf7e1d451da75468c31a6aa6288 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 16:54:40 +0200
Subject: [PATCH 18/27] Got rid of SimdVecBase

---
 .../math/register_matrix/RegisterMatrix.hpp   |  5 +-
 include/blast/math/simd/SimdVecBase.hpp       | 78 -------------------
 2 files changed, 1 insertion(+), 82 deletions(-)
 delete mode 100644 include/blast/math/simd/SimdVecBase.hpp

diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp
index 5c37dc64..631b530d 100644
--- a/include/blast/math/register_matrix/RegisterMatrix.hpp
+++ b/include/blast/math/register_matrix/RegisterMatrix.hpp
@@ -4,10 +4,7 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
-#include <blast/math/simd/SimdMask.hpp>
-#include <blast/math/simd/SimdIndex.hpp>
-#include <blast/math/simd/RegisterCapacity.hpp>
+#include <blast/math/Simd.hpp>
 #include <blast/math/typetraits/MatrixPointer.hpp>
 #include <blast/math/typetraits/VectorPointer.hpp>
 #include <blast/math/RowColumnVectorPointer.hpp>
diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp
deleted file mode 100644
index 53ef4ba4..00000000
--- a/include/blast/math/simd/SimdVecBase.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/SimdSize.hpp>
-
-#include <xsimd/xsimd.hpp>
-
-#include <cstdlib>
-
-
-namespace blast
-{
-    template <typename T, typename Arch>
-    class SimdVecBase
-    {
-    public:
-        using ValueType = T;
-        using XSimdType = xsimd::batch<T, Arch>;
-        using IntrinsicType = typename XSimdType::register_type;
-
-
-        /**
-         * @brief Number of elements in SIMD pack
-         */
-        static size_t constexpr size()
-        {
-            return XSimdType::size;
-        }
-
-
-        operator IntrinsicType() const noexcept
-        {
-            return value_;
-        }
-
-
-        /**
-         * @brief Access single element
-         *
-         * @param i element index
-         *
-         * @return element value
-         */
-        ValueType operator[](size_t i) const noexcept
-        {
-            return value_.get(i);
-        }
-
-
-    protected:
-        SimdVecBase(IntrinsicType value)
-        :   value_ {value}
-        {
-        }
-
-
-        SimdVecBase(ValueType const& value)
-        :   value_ {value}
-        {
-        }
-
-
-        XSimdType value_;
-    };
-}

From 2c563055dc7428a3c108fcd35fd97230c9fd8667 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 17:43:06 +0200
Subject: [PATCH 19/27] Made the code compile with gcc (except internal
 compiler errors)

---
 include/blast/math/Simd.hpp                   |  3 ++-
 include/blast/math/simd/SimdMask.hpp          |  2 +-
 include/blast/math/simd/SimdVec.hpp           | 21 ++++++++++---------
 .../blast/math/simd/avx256/SimdVecFloat32.hpp | 10 ++++-----
 .../blast/math/simd/avx256/SimdVecFloat64.hpp | 10 ++++-----
 5 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp
index f971dbe4..e6b4198f 100644
--- a/include/blast/math/Simd.hpp
+++ b/include/blast/math/Simd.hpp
@@ -4,10 +4,11 @@
 
 #pragma once
 
-#include <blast/math/simd/Avx256.hpp>
 #include <blast/math/simd/SimdVec.hpp>
 #include <blast/math/simd/SimdMask.hpp>
 #include <blast/math/simd/SimdIndex.hpp>
 #include <blast/math/simd/SimdSize.hpp>
 #include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/math/simd/RegisterCapacity.hpp>
+
+#include <blast/math/simd/Avx256.hpp>
diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp
index 325d9ee5..3cc7f3a8 100644
--- a/include/blast/math/simd/SimdMask.hpp
+++ b/include/blast/math/simd/SimdMask.hpp
@@ -72,7 +72,7 @@ namespace blast
          */
         template <typename U>
         SimdMask(xsimd::batch_bool<U, Arch> const& v) noexcept
-        :   XSimdType {IntrinsicType(v)}
+        :   XSimdType {xsimd::batch_bool_cast<T>(v)}
         {
         }
 
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 6cd34223..02b5c1c8 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <blast/math/simd/SimdIndex.hpp>
+#include <blast/math/simd/SimdMask.hpp>
 
 #include <xsimd/xsimd.hpp>
 
@@ -51,14 +52,14 @@ namespace blast
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
     {
-        _mm256_maskstore_ps(dst, mask, v);
+        _mm256_maskstore_ps(dst, xsimd::batch_bool_cast<std::int32_t>(mask), v);
     }
 
     template <typename Arch>
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
     {
-        _mm256_maskstore_pd(dst, mask, v);
+        _mm256_maskstore_pd(dst, xsimd::batch_bool_cast<std::int64_t>(mask), v);
     }
 
 
@@ -75,7 +76,7 @@ namespace blast
         // __m256 v3 = _mm256_max_ps(v1, v2);
         __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ);
         __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3);
-        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, mask_v3);
+        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3));
 
         /* v4 = [Z Z W W | X X Y Y]                                                                         */
         __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
@@ -86,7 +87,7 @@ namespace blast
         // __m256 v5 = _mm256_max_ps(v3, v4);
         __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ);
         __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5);
-        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, mask_v5);
+        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5));
 
         /* v6 = [I I I I | J J J J]                                                                         */
         __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
@@ -102,7 +103,7 @@ namespace blast
         // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
         __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ);
         __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7);
-        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, mask_v7);
+        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7));
 
         return {v7, iv7};
     }
@@ -118,7 +119,7 @@ namespace blast
         // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
         __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ);
         __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1);
-        __m256i const im1 = _mm256_blendv_epi8(idx, iy, mask_m1);
+        __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1));
 
         __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
         __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
@@ -126,7 +127,7 @@ namespace blast
         // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
         __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ);
         __m256d const m = _mm256_blendv_pd(m1, m2, mask_m);
-        __m256i const im = _mm256_blendv_epi8(im1, im2, mask_m);
+        __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m));
 
         return {m, im};
     }
@@ -191,7 +192,7 @@ namespace blast
     T max(SimdVec<T, Arch> const& x) noexcept;
 
     template <typename T, typename Arch>
-    typename SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
+    SimdMask<T, Arch> operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept;
 
     /**
     * @brief Multiplication
@@ -240,7 +241,7 @@ namespace blast
         using ValueType = T;
         using XSimdType = xsimd::batch<T, Arch>;
         using IntrinsicType = typename XSimdType::register_type;
-        using MaskType = xsimd::batch_bool<T, Arch>;
+        using MaskType = SimdMask<T, Arch>;
 
 
         /**
@@ -446,7 +447,7 @@ namespace blast
 
 
     template <typename T, typename Arch>
-    inline typename SimdVec<T, Arch>::MaskType operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
+    inline SimdMask<T, Arch> operator>(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
     {
         return xsimd::gt(a.value_, b.value_);
     }
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
index cc175930..b2059a7f 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/Simd.hpp>
 
 #include <immintrin.h>
 
@@ -60,9 +60,9 @@ namespace blast
 
 
     template <>
-    inline SimdVec<float, xsimd::avx2>::MaskType operator>(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
+    inline SimdMask<float, xsimd::avx2> operator>(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
     {
-        return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ));
+        return _mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ);
     }
 
 
@@ -88,9 +88,9 @@ namespace blast
 
 
     template <>
-    inline SimdVec<float, xsimd::avx2> blend(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2>::MaskType mask) noexcept
+    inline SimdVec<float, xsimd::avx2> blend(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdMask<float, xsimd::avx2> mask) noexcept
     {
-        return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask));
+        return _mm256_blendv_ps(a.value_, b.value_, mask);
     }
 
 
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
index 229fd666..8876909a 100644
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <blast/math/simd/SimdVec.hpp>
+#include <blast/math/Simd.hpp>
 
 #include <immintrin.h>
 
@@ -60,9 +60,9 @@ namespace blast
 
 
     template <>
-    inline SimdVec<double, xsimd::avx2>::MaskType operator>(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
+    inline SimdMask<double, xsimd::avx2> operator>(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
     {
-        return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ));
+        return _mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ);
     }
 
 
@@ -88,9 +88,9 @@ namespace blast
 
 
     template <>
-    inline SimdVec<double, xsimd::avx2> blend(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2>::MaskType mask) noexcept
+    inline SimdVec<double, xsimd::avx2> blend(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdMask<double, xsimd::avx2> mask) noexcept
     {
-        return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask));
+        return _mm256_blendv_pd(a.value_, b.value_, mask);
     }
 
 

From 7efe21bb2be19c65477124af603ad9067d483e97 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 17:45:40 +0200
Subject: [PATCH 20/27] Build with clang-16

---
 .github/workflows/cmake.yml           | 14 ++++++++++++--
 include/blast/math/RegisterMatrix.hpp |  6 +++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 2df1b1d3..cef30766 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -21,7 +21,17 @@ jobs:
     - uses: actions/checkout@v3
 
     - name: Install APT packages
-      run: sudo apt install clang libboost-exception-dev libbenchmark-dev -y
+      run: |
+        sudo apt-get update
+        sudo apt install clang-16 libboost-exception-dev libbenchmark-dev -y
+
+    # - name: Install LLVM and Clang 18
+    #   run: |
+    #     sudo apt-get update
+    #     sudo apt-get install -y wget gnupg lsb-release
+    #     wget https://apt.llvm.org/llvm.sh
+    #     chmod +x llvm.sh
+    #     sudo ./llvm.sh 18
 
     - name: Install Blaze
       run: |
@@ -54,7 +64,7 @@ jobs:
       run: |
         cmake -B ${{github.workspace}}/build \
         -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \
-        -DCMAKE_CXX_COMPILER=clang++ \
+        -DCMAKE_CXX_COMPILER=clang++-16 \
         -DBLAST_WITH_BENCHMARK=ON \
         -DBLAST_WITH_TEST=ON
 
diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp
index 3c8944c4..90b379c3 100644
--- a/include/blast/math/RegisterMatrix.hpp
+++ b/include/blast/math/RegisterMatrix.hpp
@@ -7,6 +7,6 @@
 
 #include <blast/math/register_matrix/RegisterMatrix.hpp>
 #include <blast/math/register_matrix/DynamicRegisterMatrix.hpp>
-#include <blast/math/register_matrix/double_4_4_4.hpp>
-#include <blast/math/register_matrix/double_8_4_4.hpp>
-#include <blast/math/register_matrix/double_12_4_4.hpp>
+// #include <blast/math/register_matrix/double_4_4_4.hpp>
+// #include <blast/math/register_matrix/double_8_4_4.hpp>
+// #include <blast/math/register_matrix/double_12_4_4.hpp>

From 84695b9d079c2cf83ebcdc2d6be48f3abc3a643e Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Fri, 2 Aug 2024 18:21:37 +0200
Subject: [PATCH 21/27] Build with clang-18

---
 .github/workflows/cmake.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index cef30766..ece754f4 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -23,15 +23,15 @@ jobs:
     - name: Install APT packages
       run: |
         sudo apt-get update
-        sudo apt install clang-16 libboost-exception-dev libbenchmark-dev -y
+        sudo apt install libboost-exception-dev libbenchmark-dev -y
 
-    # - name: Install LLVM and Clang 18
-    #   run: |
-    #     sudo apt-get update
-    #     sudo apt-get install -y wget gnupg lsb-release
-    #     wget https://apt.llvm.org/llvm.sh
-    #     chmod +x llvm.sh
-    #     sudo ./llvm.sh 18
+    - name: Install LLVM and Clang 18
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y wget gnupg lsb-release
+        wget https://apt.llvm.org/llvm.sh
+        chmod +x llvm.sh
+        sudo ./llvm.sh 18
 
     - name: Install Blaze
       run: |
@@ -64,7 +64,7 @@ jobs:
       run: |
         cmake -B ${{github.workspace}}/build \
         -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \
-        -DCMAKE_CXX_COMPILER=clang++-16 \
+        -DCMAKE_CXX_COMPILER=clang++-18 \
         -DBLAST_WITH_BENCHMARK=ON \
         -DBLAST_WITH_TEST=ON
 

From c3d5882a6d73a6a3ab181af41e8f3b24df824edd Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Sat, 3 Aug 2024 04:23:07 +0200
Subject: [PATCH 22/27] Removed RegisterMatrix specializations

---
 include/blast/math/RegisterMatrix.hpp         |   3 -
 .../math/register_matrix/double_12_4_4.hpp    | 167 ------------------
 .../math/register_matrix/double_4_4_4.hpp     |  79 ---------
 .../math/register_matrix/double_8_4_4.hpp     | 151 ----------------
 4 files changed, 400 deletions(-)
 delete mode 100644 include/blast/math/register_matrix/double_12_4_4.hpp
 delete mode 100644 include/blast/math/register_matrix/double_4_4_4.hpp
 delete mode 100644 include/blast/math/register_matrix/double_8_4_4.hpp

diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp
index 90b379c3..851ef7ef 100644
--- a/include/blast/math/RegisterMatrix.hpp
+++ b/include/blast/math/RegisterMatrix.hpp
@@ -7,6 +7,3 @@
 
 #include <blast/math/register_matrix/RegisterMatrix.hpp>
 #include <blast/math/register_matrix/DynamicRegisterMatrix.hpp>
-// #include <blast/math/register_matrix/double_4_4_4.hpp>
-// #include <blast/math/register_matrix/double_8_4_4.hpp>
-// #include <blast/math/register_matrix/double_12_4_4.hpp>
diff --git a/include/blast/math/register_matrix/double_12_4_4.hpp b/include/blast/math/register_matrix/double_12_4_4.hpp
deleted file mode 100644
index a640d2c1..00000000
--- a/include/blast/math/register_matrix/double_12_4_4.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <blaze/util/Exception.h>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 12, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-        {
-            v_[0][0] = beta * ptr.load();
-            v_[1][0] = beta * ptr(SS, 0).load();
-            v_[2][0] = beta * ptr(2 * SS, 0).load();
-        }
-
-        if (n > 1)
-        {
-            v_[0][1] = beta * ptr(0, 1).load();
-            v_[1][1] = beta * ptr(SS, 1).load();
-            v_[2][1] = beta * ptr(2 * SS, 1).load();
-        }
-
-        if (n > 2)
-        {
-            v_[0][2] = beta * ptr(0, 2).load();
-            v_[1][2] = beta * ptr(SS, 2).load();
-            v_[2][2] = beta * ptr(2 * SS, 2).load();
-        }
-
-        if (n > 3)
-        {
-            v_[0][3] = beta * ptr(0, 3).load();
-            v_[1][3] = beta * ptr(SS, 3).load();
-            v_[2][3] = beta * ptr(2 * SS, 3).load();
-        }
-    }
-
-
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 12, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        #pragma unroll
-        for (size_t i = 0; i < 3; ++i)
-        {
-            if (m >= 4 * i + 4)
-            {
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0]);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1]);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2]);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3]);
-            }
-            else if (m > 4 * i)
-            {
-                __m256i const mask = _mm256_set_epi64x(
-                    m > 4 * i + 3 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 2 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 1 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 0 ? 0x8000000000000000ULL : 0);
-
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0], mask);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1], mask);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2], mask);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3], mask);
-            }
-        }
-    }
-
-
-#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER
-    template <>
-    template <typename PB, typename PA>
-        requires MatrixPointer<PB, double> && (PB::storageOrder == columnMajor) && MatrixPointer<PA, double>
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 12, 4, columnMajor>::trmmRightLower(double alpha, PB b, PA a) noexcept
-    {
-        IntrinsicType bx[3], ax;
-
-        // k == 0
-        bx[0] = alpha * b.load(0 * SS, 0);
-        bx[1] = alpha * b.load(1 * SS, 0);
-        bx[2] = alpha * b.load(2 * SS, 0);
-        ax = a.broadcast(0, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-
-        // k == 1
-        bx[0] = alpha * b.load(0 * SS, 1);
-        bx[1] = alpha * b.load(1 * SS, 1);
-        bx[2] = alpha * b.load(2 * SS, 1);
-        ax = a.broadcast(1, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(1, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-
-        // k == 2
-        bx[0] = alpha * b.load(0 * SS, 2);
-        bx[1] = alpha * b.load(1 * SS, 2);
-        bx[2] = alpha * b.load(2 * SS, 2);
-        ax = a.broadcast(2, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(2, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-        ax = a.broadcast(2, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        v_[2][2] = fmadd(bx[2], ax, v_[2][2]);
-
-        // k == 3
-        bx[0] = alpha * b.load(0 * SS, 3);
-        bx[1] = alpha * b.load(1 * SS, 3);
-        bx[2] = alpha * b.load(2 * SS, 3);
-        ax = a.broadcast(3, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        v_[2][0] = fmadd(bx[2], ax, v_[2][0]);
-        ax = a.broadcast(3, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        v_[2][1] = fmadd(bx[2], ax, v_[2][1]);
-        ax = a.broadcast(3, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        v_[2][2] = fmadd(bx[2], ax, v_[2][2]);
-        ax = a.broadcast(3, 3);
-        v_[0][3] = fmadd(bx[0], ax, v_[0][3]);
-        v_[1][3] = fmadd(bx[1], ax, v_[1][3]);
-        v_[2][3] = fmadd(bx[2], ax, v_[2][3]);
-    }
-#endif
-}
\ No newline at end of file
diff --git a/include/blast/math/register_matrix/double_4_4_4.hpp b/include/blast/math/register_matrix/double_4_4_4.hpp
deleted file mode 100644
index 1c73605c..00000000
--- a/include/blast/math/register_matrix/double_4_4_4.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <cmath>
-
-
-namespace blast
-{
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 4, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-            v_[0][0] = beta * ptr.load();
-
-        if (n > 1)
-            v_[0][1] = beta * ptr(0, 1).load();
-
-        if (n > 2)
-            v_[0][2] = beta * ptr(0, 2).load();
-
-        if (n > 3)
-            v_[0][3] = beta * ptr(0, 3).load();
-    }
-
-
-#if 1
-    /// Magically, this function specialization is slightly faster than the default implementation of RegisterMatrix<>::store.
-    template <>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 4, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        if (m >= 4)
-        {
-            if (n > 0)
-                ptr.store(v_[0][0]);
-
-            if (n > 1)
-                ptr(0, 1).store(v_[0][1]);
-
-            if (n > 2)
-                ptr(0, 2).store(v_[0][2]);
-
-            if (n > 3)
-                ptr(0, 3).store(v_[0][3]);
-        }
-        else if (m > 0)
-        {
-            // Magically, the code below is significantly faster than this:
-            // __m256i const mask = _mm256_cmpgt_epi64(_mm256_set_epi64x(m, m, m, m), _mm256_set_epi64x(3, 2, 1, 0));
-            __m256i const mask = _mm256_set_epi64x(
-                m > 3 ? 0x8000000000000000ULL : 0,
-                m > 2 ? 0x8000000000000000ULL : 0,
-                m > 1 ? 0x8000000000000000ULL : 0,
-                m > 0 ? 0x8000000000000000ULL : 0);
-
-            if (n > 0)
-                ptr.store(v_[0][0], mask);
-
-            if (n > 1)
-                ptr(0, 1).store(v_[0][1], mask);
-
-            if (n > 2)
-                ptr(0, 2).store(v_[0][2], mask);
-
-            if (n > 3)
-                ptr(0, 3).store(v_[0][3], mask);
-        }
-    }
-#endif
-}
\ No newline at end of file
diff --git a/include/blast/math/register_matrix/double_8_4_4.hpp b/include/blast/math/register_matrix/double_8_4_4.hpp
deleted file mode 100644
index 4ec3d416..00000000
--- a/include/blast/math/register_matrix/double_8_4_4.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <blast/math/register_matrix/RegisterMatrix.hpp>
-#include <blast/util/Exception.hpp>
-
-#include <blaze/util/Exception.h>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template<>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 8, 4, columnMajor>::load(double beta, P ptr, size_t m, size_t n) noexcept
-    {
-        if (n > 0)
-        {
-            v_[0][0] = beta * ptr.load();
-            v_[1][0] = beta * ptr(SS, 0).load();
-        }
-
-        if (n > 1)
-        {
-            v_[0][1] = beta * ptr(0, 1).load();
-            v_[1][1] = beta * ptr(SS, 1).load();
-        }
-
-        if (n > 2)
-        {
-            v_[0][2] = beta * ptr(0, 2).load();
-            v_[1][2] = beta * ptr(SS, 2).load();
-        }
-
-        if (n > 3)
-        {
-            v_[0][3] = beta * ptr(0, 3).load();
-            v_[1][3] = beta * ptr(SS, 3).load();
-        }
-    }
-
-
-#if 1
-    template<>
-    template <typename P>
-    requires MatrixPointer<P, double> && (P::storageOrder == columnMajor)
-    inline void RegisterMatrix<double, 8, 4, columnMajor>::store(P ptr, size_t m, size_t n) const noexcept
-    {
-        #pragma unroll
-        for (size_t i = 0; i < 2; ++i)
-        {
-            if (m >= 4 * i + 4)
-            {
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0]);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1]);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2]);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3]);
-            }
-            else if (m > 4 * i)
-            {
-                __m256i const mask = _mm256_set_epi64x(
-                    m > 4 * i + 3 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 2 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 1 ? 0x8000000000000000ULL : 0,
-                    m > 4 * i + 0 ? 0x8000000000000000ULL : 0);
-
-                if (n > 0)
-                    ptr(SS * i, 0).store(v_[i][0], mask);
-
-                if (n > 1)
-                    ptr(SS * i, 1).store(v_[i][1], mask);
-
-                if (n > 2)
-                    ptr(SS * i, 2).store(v_[i][2], mask);
-
-                if (n > 3)
-                    ptr(SS * i, 3).store(v_[i][3], mask);
-            }
-        }
-    }
-#endif
-
-
-#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER
-    template <>
-    template <typename PB, typename PA>
-        requires MatrixPointer<PB, double> && (PB::storageOrder == columnMajor) && MatrixPointer<PA, double>
-    BLAZE_ALWAYS_INLINE void RegisterMatrix<double, 8, 4, columnMajor>::trmmRightLower(double alpha, PB b, PA a) noexcept
-    {
-        IntrinsicType bx[2], ax;
-
-        // k == 0
-        bx[0] = alpha * b.load(0 * SS, 0);
-        bx[1] = alpha * b.load(1 * SS, 0);
-        ax = a.broadcast(0, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-
-        // k == 1
-        bx[0] = alpha * b.load(0 * SS, 1);
-        bx[1] = alpha * b.load(1 * SS, 1);
-        ax = a.broadcast(1, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(1, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-
-        // k == 2
-        bx[0] = alpha * b.load(0 * SS, 2);
-        bx[1] = alpha * b.load(1 * SS, 2);
-        ax = a.broadcast(2, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(2, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        ax = a.broadcast(2, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-
-        // k == 3
-        bx[0] = alpha * b.load(0 * SS, 3);
-        bx[1] = alpha * b.load(1 * SS, 3);
-        ax = a.broadcast(3, 0);
-        v_[0][0] = fmadd(bx[0], ax, v_[0][0]);
-        v_[1][0] = fmadd(bx[1], ax, v_[1][0]);
-        ax = a.broadcast(3, 1);
-        v_[0][1] = fmadd(bx[0], ax, v_[0][1]);
-        v_[1][1] = fmadd(bx[1], ax, v_[1][1]);
-        ax = a.broadcast(3, 2);
-        v_[0][2] = fmadd(bx[0], ax, v_[0][2]);
-        v_[1][2] = fmadd(bx[1], ax, v_[1][2]);
-        ax = a.broadcast(3, 3);
-        v_[0][3] = fmadd(bx[0], ax, v_[0][3]);
-        v_[1][3] = fmadd(bx[1], ax, v_[1][3]);
-    }
-#endif
-}
\ No newline at end of file

From d789021c8abdcefb0de5da9975d783f9c5186a76 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Sat, 3 Aug 2024 04:39:24 +0200
Subject: [PATCH 23/27] Removed avx2 SimdVec specializations

---
 include/blast/math/Simd.hpp                   |   2 -
 include/blast/math/dense/Iamax.hpp            |   3 +-
 include/blast/math/simd/Avx256.hpp            |  18 ---
 .../blast/math/simd/avx256/SimdVecFloat32.hpp | 126 ------------------
 .../blast/math/simd/avx256/SimdVecFloat64.hpp | 121 -----------------
 test/blast/math/simd/SimdVecTest.cpp          |   1 -
 6 files changed, 1 insertion(+), 270 deletions(-)
 delete mode 100644 include/blast/math/simd/Avx256.hpp
 delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat32.hpp
 delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat64.hpp

diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp
index e6b4198f..912948ce 100644
--- a/include/blast/math/Simd.hpp
+++ b/include/blast/math/Simd.hpp
@@ -10,5 +10,3 @@
 #include <blast/math/simd/SimdSize.hpp>
 #include <blast/math/simd/IsSimdAligned.hpp>
 #include <blast/math/simd/RegisterCapacity.hpp>
-
-#include <blast/math/simd/Avx256.hpp>
diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp
index 6b03a36a..530df07b 100644
--- a/include/blast/math/dense/Iamax.hpp
+++ b/include/blast/math/dense/Iamax.hpp
@@ -17,7 +17,6 @@
 
 #include <blast/util/Exception.hpp>
 #include <blast/math/dense/VectorPointer.hpp>
-#include <blast/math/simd/Avx256.hpp>
 
 #include <cmath>
 #include <tuple>
@@ -177,4 +176,4 @@ namespace blast
 
         return iamax(N, ptr(*x));
     }
-}
\ No newline at end of file
+}
diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp
deleted file mode 100644
index 1fbc41c0..00000000
--- a/include/blast/math/simd/Avx256.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/simd/avx256/SimdVecFloat32.hpp>
-#include <blast/math/simd/avx256/SimdVecFloat64.hpp>
diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp
deleted file mode 100644
index b2059a7f..00000000
--- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/Simd.hpp>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template <>
-    inline SimdVec<float, xsimd::avx2>::SimdVec() noexcept
-    :   value_ {_mm256_setzero_ps()}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
-    :   value_ {value}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType value) noexcept
-    :   value_ {_mm256_set1_ps(value)}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
-    :   value_ {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)}
-    {
-    }
-
-
-    template <>
-    inline void SimdVec<float, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
-    {
-        if (aligned)
-            _mm256_store_ps(dst, value_);
-        else
-            _mm256_storeu_ps(dst, value_);
-    }
-
-
-    template <>
-    inline SimdMask<float, xsimd::avx2> operator>(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> operator*(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_mul_ps(a.value_, b.value_);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> fmadd(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2> const& c) noexcept
-    {
-        return _mm256_fmadd_ps(a.value_, b.value_, c.value_);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> fnmadd(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdVec<float, xsimd::avx2> const& c) noexcept
-    {
-        return _mm256_fnmadd_ps(a.value_, b.value_, c.value_);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> blend(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b, SimdMask<float, xsimd::avx2> mask) noexcept
-    {
-        return _mm256_blendv_ps(a.value_, b.value_, mask);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> abs(SimdVec<float, xsimd::avx2> const& a) noexcept
-    {
-        return _mm256_andnot_ps(SimdVec<float, xsimd::avx2> {-0.f}, a.value_);
-    }
-
-
-    template <>
-    inline SimdVec<float, xsimd::avx2> max(SimdVec<float, xsimd::avx2> const& a, SimdVec<float, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_max_ps(a, b);
-    }
-
-
-    template <>
-    inline float max(SimdVec<float, xsimd::avx2> const& x) noexcept
-    {
-        __m256 v1 = x.value_;                                          /* v1 = [H G F E | D C B A]                                                                         */
-        __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01);       /* v2 = [G H E F | C D A B]                                                                         */
-        __m256 v3 = _mm256_max_ps(v1, v2);                      /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-                                                                /* v3 = [W W Z Z | Y Y X X]                                                                         */
-        __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10);       /* v4 = [Z Z W W | X X Y Y]                                                                         */
-        __m256 v5 = _mm256_max_ps(v3, v4);                      /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-                                                                /* v5 = [J J J J | I I I I]                                                                         */
-        __m128 v6 = _mm256_extractf128_ps(v5, 1);               /* v6 = [- - - - | J J J J]                                                                         */
-        __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)]                                     */
-
-        return v7[0];
-    }
-}
diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp
deleted file mode 100644
index 8876909a..00000000
--- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2023 Mikhail Katliar
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <blast/math/Simd.hpp>
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    template <>
-    inline SimdVec<double, xsimd::avx2>::SimdVec() noexcept
-    :   value_ {_mm256_setzero_pd()}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2>::SimdVec(IntrinsicType value) noexcept
-    :   value_ {value}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType value) noexcept
-    :   value_ {_mm256_set1_pd(value)}
-    {
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2>::SimdVec(ValueType const * src, bool aligned) noexcept
-    :   value_ {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)}
-    {
-    }
-
-
-    template <>
-    inline void SimdVec<double, xsimd::avx2>::store(ValueType * dst, bool aligned) const noexcept
-    {
-        if (aligned)
-            _mm256_store_pd(dst, value_);
-        else
-            _mm256_storeu_pd(dst, value_);
-    }
-
-
-    template <>
-    inline SimdMask<double, xsimd::avx2> operator>(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> operator*(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_mul_pd(a.value_, b.value_);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> fmadd(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2> const& c) noexcept
-    {
-        return _mm256_fmadd_pd(a.value_, b.value_, c.value_);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> fnmadd(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdVec<double, xsimd::avx2> const& c) noexcept
-    {
-        return _mm256_fnmadd_pd(a.value_, b.value_, c.value_);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> blend(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b, SimdMask<double, xsimd::avx2> mask) noexcept
-    {
-        return _mm256_blendv_pd(a.value_, b.value_, mask);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> abs(SimdVec<double, xsimd::avx2> const& a) noexcept
-    {
-        return _mm256_andnot_pd(_mm256_set1_pd(-0.), a.value_);
-    }
-
-
-    template <>
-    inline SimdVec<double, xsimd::avx2> max(SimdVec<double, xsimd::avx2> const& a, SimdVec<double, xsimd::avx2> const& b) noexcept
-    {
-        return _mm256_max_pd(a, b);
-    }
-
-
-    template <>
-    inline double max(SimdVec<double, xsimd::avx2> const& x) noexcept
-    {
-        __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values
-        __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-        __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-        __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-
-        return m[0];
-    }
-}
diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp
index 91765526..f5b2fa04 100644
--- a/test/blast/math/simd/SimdVecTest.cpp
+++ b/test/blast/math/simd/SimdVecTest.cpp
@@ -2,7 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <blast/math/simd/Avx256.hpp>
 #include <blast/math/simd/SimdIndex.hpp>
 
 #include <blaze/Math.h>

From 01754af861e50b20fcdc085eb9af5d12ef375baa Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Sat, 3 Aug 2024 04:42:36 +0200
Subject: [PATCH 24/27] Removed avx2 hsum()

---
 include/blast/math/simd/Hsum.hpp  | 31 -------------------------
 test/blast/CMakeLists.txt         |  1 -
 test/blast/math/simd/HsumTest.cpp | 38 -------------------------------
 3 files changed, 70 deletions(-)
 delete mode 100644 include/blast/math/simd/Hsum.hpp
 delete mode 100644 test/blast/math/simd/HsumTest.cpp

diff --git a/include/blast/math/simd/Hsum.hpp b/include/blast/math/simd/Hsum.hpp
deleted file mode 100644
index 9c7b5f4b..00000000
--- a/include/blast/math/simd/Hsum.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#pragma once
-
-#include <immintrin.h>
-
-
-namespace blast
-{
-    /// dst[127:64] = sum(v1)
-    /// dst[63:0] = sum(v0)
-    inline __m128d hsum(__m256d v0, __m256d v1)
-    {
-        __m256d v01 = _mm256_hadd_pd(v0, v1);
-        return _mm_add_pd(_mm256_extractf128_pd(v01, 1), _mm256_castpd256_pd128(v01));
-    }
-
-
-    /// dst[255:192] = sum(v3)
-    /// dst[191:128] = sum(v2)
-    /// dst[127:64] = sum(v1)
-    /// dst[63:0] = sum(v0)
-    inline __m256d hsum(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
-    {
-        __m128d v01 = hsum(v0, v1);
-        __m128d v23 = hsum(v2, v3);
-        return _mm256_insertf128_pd(_mm256_castpd128_pd256(v01), v23, 1);
-    }
-}
\ No newline at end of file
diff --git a/test/blast/CMakeLists.txt b/test/blast/CMakeLists.txt
index 5edafda1..fa479125 100644
--- a/test/blast/CMakeLists.txt
+++ b/test/blast/CMakeLists.txt
@@ -13,7 +13,6 @@ find_package(LAPACK REQUIRED)
 add_executable(test-blast
     math/simd/RegisterMatrixTest.cpp
     math/simd/DynamicRegisterMatrixTest.cpp
-    math/simd/HsumTest.cpp
     math/simd/SimdVecTest.cpp
 
     math/dense/StaticVectorPointerTest.cpp
diff --git a/test/blast/math/simd/HsumTest.cpp b/test/blast/math/simd/HsumTest.cpp
deleted file mode 100644
index 3153e042..00000000
--- a/test/blast/math/simd/HsumTest.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <blast/math/simd/Hsum.hpp>
-
-#include <test/Testing.hpp>
-#include <test/Randomize.hpp>
-
-#include <numeric>
-
-using namespace blaze;
-
-
-namespace blast :: testing
-{
-    TEST(HsumTest, test4x256d)
-    {
-        std::array<double, 4> a, b, c, d, r;
-        randomize(a);
-        randomize(b);
-        randomize(c);
-        randomize(d);
-
-        __m256d mma = _mm256_loadu_pd(a.data());
-        __m256d mmb = _mm256_loadu_pd(b.data());
-        __m256d mmc = _mm256_loadu_pd(c.data());
-        __m256d mmd = _mm256_loadu_pd(d.data());
-
-        __m256d mmr = hsum(mma, mmb, mmc, mmd);
-        _mm256_storeu_pd(r.data(), mmr);
-
-        EXPECT_NEAR(r[0], std::accumulate(begin(a), end(a), 0.), 1.e-10);
-        EXPECT_NEAR(r[1], std::accumulate(begin(b), end(b), 0.), 1.e-10);
-        EXPECT_NEAR(r[2], std::accumulate(begin(c), end(c), 0.), 1.e-10);
-        EXPECT_NEAR(r[3], std::accumulate(begin(d), end(d), 0.), 1.e-10);
-    }
-}
\ No newline at end of file

From 1b8c62c4f0eaf4fdb3c4abf06b31b5da5a9c86f9 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Sat, 3 Aug 2024 05:19:27 +0200
Subject: [PATCH 25/27] Arch-specific code moved to blast/math/simd/arch

---
 include/blast/math/panel/PanelSize.hpp       |   3 +-
 include/blast/math/simd/RegisterCapacity.hpp |   2 +-
 include/blast/math/simd/Simd.hpp             |  20 +++
 include/blast/math/simd/SimdIndex.hpp        |   2 +-
 include/blast/math/simd/SimdMask.hpp         |   2 +-
 include/blast/math/simd/SimdSize.hpp         |   2 +-
 include/blast/math/simd/SimdVec.hpp          | 128 ++-----------------
 include/blast/math/simd/arch/Avx2.hpp        | 123 ++++++++++++++++++
 8 files changed, 157 insertions(+), 125 deletions(-)
 create mode 100644 include/blast/math/simd/Simd.hpp
 create mode 100644 include/blast/math/simd/arch/Avx2.hpp

diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp
index 9647a0cd..6bcf6bbf 100644
--- a/include/blast/math/panel/PanelSize.hpp
+++ b/include/blast/math/panel/PanelSize.hpp
@@ -9,8 +9,7 @@
 //*************************************************************************************************
 
 #include <blast/math/simd/SimdSize.hpp>
-
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 
 namespace blast
diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp
index 213b6d80..35b2ac7e 100644
--- a/include/blast/math/simd/RegisterCapacity.hpp
+++ b/include/blast/math/simd/RegisterCapacity.hpp
@@ -13,7 +13,7 @@
 // limitations under the License.
 #pragma once
 
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 #include <cstdlib>
 
diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp
new file mode 100644
index 00000000..b5bd8873
--- /dev/null
+++ b/include/blast/math/simd/Simd.hpp
@@ -0,0 +1,20 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <xsimd/xsimd.hpp>
+
+#if XSIMD_WITH_AVX2
+    #include <blast/math/simd/arch/Avx2.hpp>
+#endif
diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp
index bfbae1c5..3d7ca7b2 100644
--- a/include/blast/math/simd/SimdIndex.hpp
+++ b/include/blast/math/simd/SimdIndex.hpp
@@ -13,7 +13,7 @@
 // limitations under the License.
 #pragma  once
 
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 #include <cstdint>
 #include <type_traits>
diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp
index 3cc7f3a8..bdb26082 100644
--- a/include/blast/math/simd/SimdMask.hpp
+++ b/include/blast/math/simd/SimdMask.hpp
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 
 namespace blast
diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp
index 4a22339a..01a64825 100644
--- a/include/blast/math/simd/SimdSize.hpp
+++ b/include/blast/math/simd/SimdSize.hpp
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 #include <cstdlib>
 #include <type_traits>
diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 02b5c1c8..5d6afea1 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -16,8 +16,7 @@
 
 #include <blast/math/simd/SimdIndex.hpp>
 #include <blast/math/simd/SimdMask.hpp>
-
-#include <xsimd/xsimd.hpp>
+#include <blast/math/simd/Simd.hpp>
 
 #include <tuple>
 
@@ -27,112 +26,6 @@ namespace blast
     template <typename T, typename Arch = xsimd::default_arch>
     class SimdVec;
 
-
-    template <typename Arch, typename T>
-    xsimd::batch<T, Arch> maskload(T const * src, xsimd::batch_bool<T, Arch> const& mask) noexcept;
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline xsimd::batch<float, Arch> maskload(float const * src, xsimd::batch_bool<float, Arch> const& mask) noexcept
-    {
-        return _mm256_maskload_ps(src, mask);
-    }
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline xsimd::batch<double, Arch> maskload(double const * src, xsimd::batch_bool<double, Arch> const& mask) noexcept
-    {
-        return _mm256_maskload_pd(src, mask);
-    }
-
-    template <typename Arch, typename T>
-    void maskstore(xsimd::batch<T, Arch> const& v, T * dst, xsimd::batch_bool<T, Arch> const& mask) noexcept;
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
-    {
-        _mm256_maskstore_ps(dst, xsimd::batch_bool_cast<std::int32_t>(mask), v);
-    }
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
-    {
-        _mm256_maskstore_pd(dst, xsimd::batch_bool_cast<std::int64_t>(mask), v);
-    }
-
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline std::tuple<xsimd::batch<float, Arch>, xsimd::batch<std::int32_t, Arch>> imax(xsimd::batch<float, Arch> const& v1, xsimd::batch<std::int32_t, Arch> const& idx) noexcept
-    {
-        /* v2 = [G H E F | C D A B]                                                                         */
-        __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
-        __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
-
-        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
-        /* v3 = [W W Z Z | Y Y X X]                                                                         */
-        // __m256 v3 = _mm256_max_ps(v1, v2);
-        __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ);
-        __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3);
-        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3));
-
-        /* v4 = [Z Z W W | X X Y Y]                                                                         */
-        __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
-        __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
-
-        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
-        /* v5 = [J J J J | I I I I]                                                                         */
-        // __m256 v5 = _mm256_max_ps(v3, v4);
-        __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ);
-        __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5);
-        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5));
-
-        /* v6 = [I I I I | J J J J]                                                                         */
-        __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
-        __m256i const iv6 = _mm256_castps_si256(
-            _mm256_permute2f128_ps(
-                _mm256_castsi256_ps(iv5),
-                _mm256_castsi256_ps(iv5),
-                0b0000'0001
-            )
-        );
-
-        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
-        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
-        __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ);
-        __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7);
-        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7));
-
-        return {v7, iv7};
-    }
-
-
-    template <typename Arch>
-    requires std::is_base_of_v<xsimd::avx2, Arch>
-    inline std::tuple<xsimd::batch<double, Arch>, xsimd::batch<std::int64_t, Arch>> imax(xsimd::batch<double, Arch> const& x, xsimd::batch<std::int64_t, Arch> const& idx) noexcept
-    {
-        __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
-        __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1);
-
-        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
-        __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ);
-        __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1);
-        __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1));
-
-        __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
-        __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
-
-        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
-        __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ);
-        __m256d const m = _mm256_blendv_pd(m1, m2, mask_m);
-        __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m));
-
-        return {m, im};
-    }
-
-
     /**
     * @brief Fused negative multiply-add
     *
@@ -162,7 +55,7 @@ namespace blast
     SimdVec<T, Arch> fmadd(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdVec<T, Arch> const& c) noexcept;
 
     template <typename T, typename Arch>
-    SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, typename SimdVec<T, Arch>::MaskType mask) noexcept;
+    SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdMask<T, Arch> const& mask) noexcept;
 
     template <typename T, typename Arch>
     SimdVec<T, Arch> abs(SimdVec<T, Arch> const& a) noexcept;
@@ -181,12 +74,9 @@ namespace blast
     /**
     * @brief Horizontal max (across all elements)
     *
-    * The implementation is based on
-    * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector
-    *
     * @param a pack of 32-bit floating point numbers
     *
-    * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7])
+    * @return max(a[0], a[1], ..., a[N-1])
     */
     template <typename T, typename Arch>
     T max(SimdVec<T, Arch> const& x) noexcept;
@@ -399,7 +289,7 @@ namespace blast
 
         friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
         friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept;
-        friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept;
+        friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType const& mask) noexcept;
         friend SimdVec abs<>(SimdVec const& a) noexcept;
         friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept;
         friend ValueType max<>(SimdVec const& x) noexcept;
@@ -433,7 +323,7 @@ namespace blast
 
 
     template <typename T, typename Arch>
-    inline SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, typename SimdVec<T, Arch>::MaskType mask) noexcept
+    inline SimdVec<T, Arch> blend(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b, SimdMask<T, Arch> const& mask) noexcept
     {
         return xsimd::select(mask, a.value_, b.value_);
     }
@@ -456,28 +346,28 @@ namespace blast
     template <typename T, typename Arch>
     inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
     {
-        return a.value_ * b.value_;
+        return xsimd::mul(a.value_, b.value_);
     }
 
 
     template <typename T, typename Arch>
     inline SimdVec<T, Arch> operator*(T const& a, SimdVec<T, Arch> const& b) noexcept
     {
-        return SimdVec<T, Arch> {a} * b;
+        return xsimd::mul(a, b.value_);
     }
 
 
     template <typename T, typename Arch>
     inline SimdVec<T, Arch> operator*(SimdVec<T, Arch> const& a, T const& b) noexcept
     {
-        return a * SimdVec<T, Arch> {b};
+        return xsimd::mul(a.value_, b);
     }
 
 
     template <typename T, typename Arch>
     inline SimdVec<T, Arch> max(SimdVec<T, Arch> const& a, SimdVec<T, Arch> const& b) noexcept
     {
-        return xsimd::max(a, b);
+        return xsimd::max(a.value_, b.value_);
     }
 
 
diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp
new file mode 100644
index 00000000..a77c09c4
--- /dev/null
+++ b/include/blast/math/simd/arch/Avx2.hpp
@@ -0,0 +1,123 @@
+// Copyright 2024 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <xsimd/xsimd.hpp>
+
+#include <type_traits>
+
+
+namespace blast
+{
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<float, Arch> maskload(float const * src, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_ps(src, mask);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline xsimd::batch<double, Arch> maskload(double const * src, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        return _mm256_maskload_pd(src, mask);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_ps(dst, xsimd::batch_bool_cast<std::int32_t>(mask), v);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
+    {
+        _mm256_maskstore_pd(dst, xsimd::batch_bool_cast<std::int64_t>(mask), v);
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<float, Arch>, xsimd::batch<std::int32_t, Arch>> imax(xsimd::batch<float, Arch> const& v1, xsimd::batch<std::int32_t, Arch> const& idx) noexcept
+    {
+        /* v2 = [G H E F | C D A B]                                                                         */
+        __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01);
+        __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01));
+
+        /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */
+        /* v3 = [W W Z Z | Y Y X X]                                                                         */
+        // __m256 v3 = _mm256_max_ps(v1, v2);
+        __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ);
+        __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3);
+        __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3));
+
+        /* v4 = [Z Z W W | X X Y Y]                                                                         */
+        __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10);
+        __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10));
+
+        /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */
+        /* v5 = [J J J J | I I I I]                                                                         */
+        // __m256 v5 = _mm256_max_ps(v3, v4);
+        __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ);
+        __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5);
+        __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5));
+
+        /* v6 = [I I I I | J J J J]                                                                         */
+        __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001);
+        __m256i const iv6 = _mm256_castps_si256(
+            _mm256_permute2f128_ps(
+                _mm256_castsi256_ps(iv5),
+                _mm256_castsi256_ps(iv5),
+                0b0000'0001
+            )
+        );
+
+        /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */
+        // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6);
+        __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ);
+        __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7);
+        __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7));
+
+        return {v7, iv7};
+    }
+
+
+    template <typename Arch>
+    requires std::is_base_of_v<xsimd::avx2, Arch>
+    inline std::tuple<xsimd::batch<double, Arch>, xsimd::batch<std::int64_t, Arch>> imax(xsimd::batch<double, Arch> const& x, xsimd::batch<std::int64_t, Arch> const& idx) noexcept
+    {
+        __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values
+        __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1);
+
+        // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc.
+        __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ);
+        __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1);
+        __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1));
+
+        __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc.
+        __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5));
+
+        // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3])
+        __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ);
+        __m256d const m = _mm256_blendv_pd(m1, m2, mask_m);
+        __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m));
+
+        return {m, im};
+    }
+}

From 9ebf1ed27c312665f5420e6634e3dc655953330b Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Sat, 3 Aug 2024 05:26:11 +0200
Subject: [PATCH 26/27] Builds with gcc

---
 include/blast/math/simd/arch/Avx2.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp
index a77c09c4..b5aed8c1 100644
--- a/include/blast/math/simd/arch/Avx2.hpp
+++ b/include/blast/math/simd/arch/Avx2.hpp
@@ -24,7 +24,7 @@ namespace blast
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline xsimd::batch<float, Arch> maskload(float const * src, xsimd::batch_bool<float, Arch> const& mask) noexcept
     {
-        return _mm256_maskload_ps(src, mask);
+        return _mm256_maskload_ps(src, _mm256_castps_si256(mask));
     }
 
 
@@ -32,7 +32,7 @@ namespace blast
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline xsimd::batch<double, Arch> maskload(double const * src, xsimd::batch_bool<double, Arch> const& mask) noexcept
     {
-        return _mm256_maskload_pd(src, mask);
+        return _mm256_maskload_pd(src, _mm256_castpd_si256(mask));
     }
 
 
@@ -40,7 +40,7 @@ namespace blast
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline void maskstore(xsimd::batch<float, Arch> const& v, float * dst, xsimd::batch_bool<float, Arch> const& mask) noexcept
     {
-        _mm256_maskstore_ps(dst, xsimd::batch_bool_cast<std::int32_t>(mask), v);
+        _mm256_maskstore_ps(dst, _mm256_castps_si256(mask), v);
     }
 
 
@@ -48,7 +48,7 @@ namespace blast
     requires std::is_base_of_v<xsimd::avx2, Arch>
     inline void maskstore(xsimd::batch<double, Arch> const& v, double * dst, xsimd::batch_bool<double, Arch> const& mask) noexcept
     {
-        _mm256_maskstore_pd(dst, xsimd::batch_bool_cast<std::int64_t>(mask), v);
+        _mm256_maskstore_pd(dst, _mm256_castpd_si256(mask), v);
     }
 
 

From 2f757bbffa95f8c632ec97644f165665bfa10bba Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Mon, 5 Aug 2024 14:16:03 +0200
Subject: [PATCH 27/27] Documented iamax()

---
 include/blast/math/simd/SimdVec.hpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp
index 5d6afea1..140516ef 100644
--- a/include/blast/math/simd/SimdVec.hpp
+++ b/include/blast/math/simd/SimdVec.hpp
@@ -298,9 +298,20 @@ namespace blast
         friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept;
         friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept;
 
-        friend std::tuple<SimdVec, SimdIndex<T, Arch>> imax(SimdVec const& v1, SimdIndex<T, Arch> const& idx) noexcept
+        /**
+         * @brief Maximum element of a batch and its index.
+         *
+         * For example, imax([-1., 0., 4., 3.], [1, 2, -1, 42]) == ([4., 4., 4., 4.], [-1, -1, -1, -1])
+         *
+         * @param v batch of values
+         * @param idx batch of indices
+         *
+         * @return a tuple {A, I} such that A == [a, a, ..., a] and I == [idx[i], idx[i], ..., idx[i]],
+         *   where a == max_j(v[j]), i = argmax_j(v[j])
+         */
+        friend std::tuple<SimdVec, SimdIndex<T, Arch>> imax(SimdVec const& v, SimdIndex<T, Arch> const& idx) noexcept
         {
-            return imax(v1.value_, idx);
+            return imax(v.value_, idx);
         }
 
     private: