From 0b9550a3b998b856097e4b80f8c71996b4edc995 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Tue, 23 Jul 2024 16:05:39 +0200 Subject: [PATCH 01/27] Removed load() and store() from StaticPanelMatrix --- .../blast/math/panel/StaticPanelMatrix.hpp | 39 ------------------- .../math/panel/StaticPanelMatrixTest.cpp | 6 ++- 2 files changed, 4 insertions(+), 41 deletions(-) diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index cfeda753..b1d2a624 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -160,45 +160,6 @@ namespace blast } - Type * ptr(size_t i, size_t j) - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - Type const * ptr(size_t i, size_t j) const - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - template - auto load(size_t i, size_t j) const - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - return blast::load(v_ + elementIndex(i, j)); - } - - - template - void store(size_t i, size_t j, T val) - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - // We never use maskstore here because we have padding - blast::store(v_ + elementIndex(i, j), val); - } - - private: static size_t constexpr panelSize_ = PanelSize_v; static size_t constexpr tileRows_ = M / panelSize_ + (M % panelSize_ > 0); diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp index 13fd564e..c5db87f3 100644 --- a/test/blast/math/panel/StaticPanelMatrixTest.cpp +++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp @@ -3,9 +3,11 @@ // license that can be found in the LICENSE file. #include +#include #include +#include #include #include @@ -98,7 +100,7 @@ namespace blast :: testing for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) { - auto const xmm = A.template load(i, j); + auto const xmm = ptr(A, i, j).load(); for (size_t k = 0; k < SS; ++k) ASSERT_EQ(xmm[k], A(i + k, j)) << "element mismatch at i,j,k=" << i << "," << j << "," << k; @@ -122,7 +124,7 @@ namespace blast :: testing for (size_t j = 0; j < N; ++j) { A = TypeParam(0.); - A.store(i, j, val); + ptr(A, i, j).store(val); for (size_t k = 0; k < SS && i + k < rows(A); ++k) ASSERT_EQ(A(i + k, j), val[k]) << "element mismatch at i,j,k=" << i << "," << j << "," << k; From b6eedca6d70d4ccef1cc7b2d087aaf2c6d1e0154 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Tue, 23 Jul 2024 17:16:12 +0200 Subject: [PATCH 02/27] Removed load() functions from Simd.hpp --- .../blast/math/expressions/PMatTransExpr.hpp | 3 +- .../blast/math/expressions/PanelMatrix.hpp | 22 +++++----- include/blast/math/simd/Simd.hpp | 40 +------------------ 3 files changed, 14 insertions(+), 51 deletions(-) diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp index fae79ffc..4ad4526f 100644 --- a/include/blast/math/expressions/PMatTransExpr.hpp +++ b/include/blast/math/expressions/PMatTransExpr.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -213,4 +212,4 @@ namespace blast using ReturnType = const PMatTransExpr; return ReturnType(*pm); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp index 4a556a33..1df9b97d 100644 --- a/include/blast/math/expressions/PanelMatrix.hpp +++ b/include/blast/math/expressions/PanelMatrix.hpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include //#include #include @@ -108,22 +110,22 @@ namespace blast for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + s * j, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) { MaskType const mask = SIMD::index() < rem; size_t const i = m - rem; - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - maskstore(pl + s * j, mask, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load(), mask); } #if 0 @@ -196,11 +198,11 @@ namespace blast { for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = data(rhs) + i; - ET1 * pl = &(*lhs)(i, 0); + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + PANEL_SIZE * j, load(pr + s * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) @@ -243,4 +245,4 @@ namespace blast (*lhs)(i, j) = (*rhs)(i, j); } } -} \ No newline at end of file +} diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index 040f4ffe..c5d48bc7 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -196,49 +196,11 @@ namespace blast // //******************************************************* - template - auto load(T const * ptr); - - - template - inline auto load(T const * ptr) - { - return load::size>(ptr); - } - template auto broadcast(T const * ptr); - template <> - inline auto load(double const * ptr) - { - return _mm256_load_pd(ptr); - } - - - template <> - inline auto load(double const * ptr) - { - return _mm256_loadu_pd(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_load_ps(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_loadu_ps(ptr); - } - - inline auto maskload(double const * ptr, __m256i mask) { return _mm256_maskload_pd(ptr, mask); @@ -456,4 +418,4 @@ namespace blast { return _mm256_cmpgt_epi32(a, b); } -} \ No newline at end of file +} From 231d1759fa788e2e439ea43bfe6f033d699e3836 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 25 Jul 2024 23:05:37 +0200 Subject: [PATCH 03/27] SimdVec implementation using xsimd --- CMakeLists.txt | 6 +++++ include/blast/math/simd/Simd.hpp | 19 +++++++-------- include/blast/math/simd/SimdVec.hpp | 11 +++++---- include/blast/math/simd/SimdVecBase.hpp | 14 +++++++---- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 24 ++++++++++--------- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 18 +++++++------- .../blast/math/simd/avx256/SimdVecInt32.hpp | 11 ++++----- .../blast/math/simd/avx256/SimdVecInt64.hpp | 8 +++---- 8 files changed, 60 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index faffe2fd..aa1db7b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ set(CMAKE_DEBUG_POSTFIX d) find_package(Boost REQUIRED COMPONENTS exception) find_package(blaze REQUIRED) +find_package(xsimd REQUIRED) add_library(blast INTERFACE) @@ -35,6 +36,7 @@ target_include_directories(blast INTERFACE target_link_libraries(blast INTERFACE blaze::blaze + INTERFACE xsimd ) target_compile_options(blast @@ -44,6 +46,10 @@ target_compile_options(blast INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4" ) +target_compile_definitions(blast + INTERFACE "XSIMD_DEFAULT_ARCH=avx2" +) + # BLAST_WITH_BLASFEO option(BLAST_WITH_BLASFEO "Build blasfeo C++ interface") diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index c5d48bc7..841f7571 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -10,10 +10,9 @@ #include #include -#include +#include -#include -#include +#include namespace blast @@ -21,12 +20,12 @@ namespace blast using namespace blaze; - template + template struct Simd; template <> - struct Simd + struct Simd { private: class Index; @@ -87,7 +86,7 @@ namespace blast template <> - struct Simd + struct Simd { private: class Index; @@ -147,8 +146,8 @@ namespace blast }; - template - using IntrinsicType_t = typename Simd::IntrinsicType; + template + using IntrinsicType_t = typename Simd::IntrinsicType; template @@ -186,8 +185,8 @@ namespace blast using IntType_t = typename SimdTraits::IntType; - template - size_t constexpr RegisterCapacity_v = Simd::registerCapacity; + template + size_t constexpr RegisterCapacity_v = Simd::registerCapacity; //******************************************************* diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index a3195ff7..f65969f9 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,15 +14,16 @@ #pragma once +#include + namespace blast { /** - * @brief Data-parallel type with the element type bool. - * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. + * @brief Data-parallel type with a given element type. * - * @tparam T the element type simd_mask applies on + * @tparam T element type */ - template + template class SimdVec; -} \ No newline at end of file +} diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp index 365258f2..b041fc66 100644 --- a/include/blast/math/simd/SimdVecBase.hpp +++ b/include/blast/math/simd/SimdVecBase.hpp @@ -16,17 +16,21 @@ #include +#include + #include namespace blast { - template + template class SimdVecBase { + using XSimdType = xsimd::batch; + public: using ValueType = T; - using IntrinsicType = I; + using IntrinsicType = typename XSimdType::register_type; /** @@ -34,7 +38,7 @@ namespace blast */ static size_t constexpr size() { - return SimdSize_v; + return XSimdType::size; } @@ -49,6 +53,6 @@ namespace blast { } - IntrinsicType value_; + XSimdType value_; }; -} \ No newline at end of file +} diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index 34709629..74e1bbf0 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -26,8 +26,8 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; @@ -204,33 +204,33 @@ namespace blast template requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) + friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) { /* v2 = [G H E F | C D A B] */ SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ /* v3 = [W W Z Z | Y Y X X] */ // __m256 v3 = _mm256_max_ps(v1, v2); MaskType const mask_v3 = v2 > v1; SimdVec const v3 = blend(v1, v2, mask_v3); - SimdVec const iv3 = blend(idx, iv2, mask_v3); + SimdVec const iv3 = blend(idx, iv2, mask_v3); /* v4 = [Z Z W W | X X Y Y] */ SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ /* v5 = [J J J J | I I I I] */ // __m256 v5 = _mm256_max_ps(v3, v4); MaskType const mask_v5 = v4 > v3; SimdVec const v5 = blend(v3, v4, mask_v5); - SimdVec const iv5 = blend(iv3, iv4, mask_v5); + SimdVec const iv5 = blend(iv3, iv4, mask_v5); /* v6 = [I I I I | J J J J] */ SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - SimdVec const iv6 = _mm256_castps_si256( + SimdVec const iv6 = _mm256_castps_si256( _mm256_permute2f128_ps( _mm256_castsi256_ps(iv5), _mm256_castsi256_ps(iv5), @@ -242,7 +242,7 @@ namespace blast // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); MaskType const mask_v7 = v6 > v5; SimdVec const v7 = blend(v5, v6, mask_v7); - SimdVec const iv7 = blend(iv5, iv6, mask_v7); + SimdVec const iv7 = blend(iv5, iv6, mask_v7); return std::make_tuple(v7, iv7); } @@ -251,13 +251,15 @@ namespace blast /** * @brief Access single element * + * TODO: move to SimdVecBase + * * @param i element index * * @return element value */ ValueType operator[](size_t i) const noexcept { - return value_[i]; + return value_.get(i); } }; -} \ No newline at end of file +} diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index a5277ab6..9d85640f 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -26,8 +26,8 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; @@ -197,23 +197,23 @@ namespace blast template requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& x, SimdVec const& idx) + friend std::tuple> imax(SimdVec const& x, SimdVec const& idx) { SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); + SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. MaskType const mask_m1 = y > x; SimdVec const m1 = blend(x, y, mask_m1); - SimdVec const im1 = blend(idx, iy, mask_m1); + SimdVec const im1 = blend(idx, iy, mask_m1); SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) MaskType const mask_m = m2 > m1; SimdVec const m = blend(m1, m2, mask_m); - SimdVec const im = blend(im1, im2, mask_m); + SimdVec const im = blend(im1, im2, mask_m); return std::make_tuple(m, im); } @@ -228,7 +228,7 @@ namespace blast */ ValueType operator[](size_t i) const noexcept { - return value_[i]; + return value_.get(i); } }; -} \ No newline at end of file +} diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp index 8854f50f..af2c932e 100644 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp @@ -27,8 +27,8 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; @@ -113,10 +113,7 @@ namespace blast */ ValueType operator[](size_t i) const noexcept { - alignas(IntrinsicType) ValueType v[size()]; - _mm256_storeu_si256(reinterpret_cast(v), value_); - - return v[i]; + return value_.get(i); } }; -} \ No newline at end of file +} diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp index 6edf2863..26f75fa2 100644 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp @@ -27,8 +27,8 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; @@ -118,7 +118,7 @@ namespace blast */ ValueType operator[](size_t i) const noexcept { - return value_[i]; + return value_.get(i); } }; -} \ No newline at end of file +} From be5b0fd55303609044fe283282f70084a128e62a Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 25 Jul 2024 23:22:13 +0200 Subject: [PATCH 04/27] Moved operator[] to SimdVecBase --- include/blast/math/simd/SimdVecBase.hpp | 14 ++++++++++++++ include/blast/math/simd/avx256/SimdVecFloat32.hpp | 15 --------------- include/blast/math/simd/avx256/SimdVecFloat64.hpp | 13 ------------- include/blast/math/simd/avx256/SimdVecInt32.hpp | 13 ------------- include/blast/math/simd/avx256/SimdVecInt64.hpp | 13 ------------- 5 files changed, 14 insertions(+), 54 deletions(-) diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp index b041fc66..4ee90c5a 100644 --- a/include/blast/math/simd/SimdVecBase.hpp +++ b/include/blast/math/simd/SimdVecBase.hpp @@ -47,6 +47,20 @@ namespace blast return value_; } + + /** + * @brief Access single element + * + * @param i element index + * + * @return element value + */ + ValueType operator[](size_t i) const noexcept + { + return value_.get(i); + } + + protected: SimdVecBase(IntrinsicType value) : value_ {value} diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index 74e1bbf0..7e32e985 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -246,20 +246,5 @@ namespace blast return std::make_tuple(v7, iv7); } - - - /** - * @brief Access single element - * - * TODO: move to SimdVecBase - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_.get(i); - } }; } diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index 9d85640f..a17b36e1 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -217,18 +217,5 @@ namespace blast return std::make_tuple(m, im); } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_.get(i); - } }; } diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp index af2c932e..15db3b4b 100644 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp @@ -102,18 +102,5 @@ namespace blast { return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n)); } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_.get(i); - } }; } diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp index 26f75fa2..f383016d 100644 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp @@ -107,18 +107,5 @@ namespace blast { return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n)); } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_.get(i); - } }; } From e27037d18ca3550d9ac780be9fe176a2b5f29e8b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 26 Jul 2024 14:56:52 +0200 Subject: [PATCH 05/27] Generic member functions in SimdVec --- .../register_matrix/DynamicRegisterMatrix.hpp | 4 +- .../math/register_matrix/RegisterMatrix.hpp | 4 +- include/blast/math/simd/SimdVec.hpp | 289 ++++++++++++- include/blast/math/simd/SimdVecBase.hpp | 10 +- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 386 ++++++++---------- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 317 ++++++-------- 6 files changed, 591 insertions(+), 419 deletions(-) diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index a1b857cc..ed59f67e 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -268,7 +268,7 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a DynamicRegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; size_t const m_; size_t const n_; @@ -598,4 +598,4 @@ namespace blast ker.axpy(beta, c); ker.store(d); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index d4309043..68641769 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -378,7 +378,7 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a RegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; /// @brief Reference to the matrix element at row \a i and column \a j @@ -825,4 +825,4 @@ namespace blast { return rm == m; } -} \ No newline at end of file +} diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index f65969f9..8568961e 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,16 +14,301 @@ #pragma once +#include + #include +#include + namespace blast { + template + class SimdVec; + + + /** + * @brief Fused multiply-add + * + * Calculate a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + + template + SimdVec blend(SimdVec const& a, SimdVec const& b, typename SimdVec::MaskType mask) noexcept; + + template + SimdVec abs(SimdVec const& a) noexcept; + + /** + * @brief Vertical max (across two vectors) + * + * @param a first vector + * @param b second vector + * + * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] + */ + template + SimdVec max(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Horizontal max (across all elements) + * + * The implementation is based on + * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector + * + * @param a pack of 32-bit floating point numbers + * + * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]) + */ + template + T max(SimdVec const& x) noexcept; + + template + SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Multiplication + * + * @param a first multiplier + * @param b second multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Left multiplication with a scalar + * + * @param a scalar multiplier + * @param b batch multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(T const& a, SimdVec const& b) noexcept; + + + /** + * @brief Right multiplication with a scalar + * + * @param a batch multiplier + * @param b scalar multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, T const& b) noexcept; + + /** * @brief Data-parallel type with a given element type. * * @tparam T element type */ - template - class SimdVec; + template + class SimdVec + : public SimdVecBase + { + public: + using typename SimdVecBase::ValueType; + using typename SimdVecBase::IntrinsicType; + using typename SimdVecBase::XSimdType; + using MaskType = xsimd::batch_bool; + + + /** + * @brief Set to [0, 0, 0, ...] + */ + SimdVec() noexcept + : SimdVecBase {T {}} + { + } + + + SimdVec(SimdVec const&) noexcept = default; + + + SimdVec(IntrinsicType value) noexcept + : SimdVecBase {value} + { + } + + + SimdVec(XSimdType value) noexcept + : SimdVecBase {value} + { + } + + + /** + * @brief Set to [value, value, ...] + * + * @param value value for each component of SIMD vector + */ + SimdVec(ValueType value) noexcept + : SimdVecBase {value} + { + } + + + /** + * @brief Load from location + * + * @param src memory location to load from + * @param aligned true indicates that an aligned read instruction should be used + */ + explicit SimdVec(ValueType const * src, bool aligned) noexcept + : SimdVecBase {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)} + { + } + + + /** + * @brief Masked load from location + * + * @param src memory location to load from + * @param mask load mask + * @param aligned true if @a src is SIMD-aligned + */ + explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept; + + + /** + * @brief Store to memory + * + * @param dst memory location to store to + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, bool aligned) const noexcept + { + if (aligned) + xsimd::store_aligned(dst, this->value_); + else + xsimd::store_unaligned(dst, this->value_); + } + + + /** + * @brief Masked store to memory + * + * @param dst memory location to store to + * @param mask store mask + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, MaskType mask, bool aligned) const noexcept; + + + /** + * @brief In-place multiplication + * + * @param a multiplier + * + * @return @a *this after multiplication with @a a + */ + SimdVec& operator*=(SimdVec const& a) noexcept + { + this->value_ *= a.value_; + return *this; + } + + + /** + * @brief In-place division + * + * @param a divisor + * + * @return @a *this after division by @a a + */ + SimdVec& operator/=(SimdVec const& a) noexcept + { + this->value_ /= a.value_; + return *this; + } + + + friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept; + friend SimdVec abs<>(SimdVec const& a) noexcept; + friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept; + friend ValueType max<>(SimdVec const& x) noexcept; + friend MaskType operator><>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; + + template + friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx); + }; + + + template + inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fma(a.value_, b.value_, c.value_); + } + + + template + inline SimdVec blend(SimdVec const& a, SimdVec const& b, typename SimdVec::MaskType mask) noexcept + { + return xsimd::select(mask, a.value_, b.value_); + } + + + template + inline SimdVec abs(SimdVec const& a) noexcept + { + return xsimd::abs(a.value_); + } + + + template + inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::gt(a.value_, b.value_); + } + + + template + inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept + { + return a.value_ * b.value_; + } + + + template + inline SimdVec operator*(T const& a, SimdVec const& b) noexcept + { + return SimdVec {a} * b; + } + + + template + inline SimdVec operator*(SimdVec const& a, T const& b) noexcept + { + return a * SimdVec {b}; + } + + + template + inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::max(a, b); + } + + + template + inline T max(SimdVec const& x) noexcept + { + return xsimd::reduce_max(x.value_); + } } diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp index 4ee90c5a..53ef4ba4 100644 --- a/include/blast/math/simd/SimdVecBase.hpp +++ b/include/blast/math/simd/SimdVecBase.hpp @@ -26,10 +26,9 @@ namespace blast template class SimdVecBase { - using XSimdType = xsimd::batch; - public: using ValueType = T; + using XSimdType = xsimd::batch; using IntrinsicType = typename XSimdType::register_type; @@ -67,6 +66,13 @@ namespace blast { } + + SimdVecBase(ValueType const& value) + : value_ {value} + { + } + + XSimdType value_; }; } diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index 7e32e985..bcaeb281 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -26,225 +26,171 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + inline SimdVec::SimdVec() noexcept + : SimdVecBase {_mm256_setzero_ps()} { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_ps()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_ps(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_ps(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_ps(dst, value_); - else - _mm256_storeu_ps(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_ps(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_ps(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_ps(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_ps(SimdVec {-0.f}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_ps(a, b); - } - - - /** - * @brief Horizontal max (across all elements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @param a pack of 32-bit floating point numbers - * - * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]) - */ - friend ValueType max(SimdVec const& x) - { - __m256 v1 = x.value_; /* v1 = [H G F E | D C B A] */ - __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01); /* v2 = [G H E F | C D A B] */ - __m256 v3 = _mm256_max_ps(v1, v2); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10); /* v4 = [Z Z W W | X X Y Y] */ - __m256 v5 = _mm256_max_ps(v3, v4); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - __m128 v6 = _mm256_extractf128_ps(v5, 1); /* v6 = [- - - - | J J J J] */ - __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - - return v7[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) - { - /* v2 = [G H E F | C D A B] */ - SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); - - /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - // __m256 v3 = _mm256_max_ps(v1, v2); - MaskType const mask_v3 = v2 > v1; - SimdVec const v3 = blend(v1, v2, mask_v3); - SimdVec const iv3 = blend(idx, iv2, mask_v3); - - /* v4 = [Z Z W W | X X Y Y] */ - SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); - - /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - // __m256 v5 = _mm256_max_ps(v3, v4); - MaskType const mask_v5 = v4 > v3; - SimdVec const v5 = blend(v3, v4, mask_v5); - SimdVec const iv5 = blend(iv3, iv4, mask_v5); - - /* v6 = [I I I I | J J J J] */ - SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - SimdVec const iv6 = _mm256_castps_si256( - _mm256_permute2f128_ps( - _mm256_castsi256_ps(iv5), - _mm256_castsi256_ps(iv5), - 0b0000'0001 - ) - ); - - /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); - MaskType const mask_v7 = v6 > v5; - SimdVec const v7 = blend(v5, v6, mask_v7); - SimdVec const iv7 = blend(iv5, iv6, mask_v7); - - return std::make_tuple(v7, iv7); - } - }; + } + + + template <> + inline SimdVec::SimdVec(IntrinsicType value) noexcept + : SimdVecBase {value} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType value) noexcept + : SimdVecBase {_mm256_set1_ps(value)} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept + : SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept + : SimdVecBase {_mm256_maskload_ps(src, mask)} + { + } + + + template <> + inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept + { + if (aligned) + _mm256_store_ps(dst, value_); + else + _mm256_storeu_ps(dst, value_); + } + + + template <> + inline void SimdVec::store(ValueType * dst, MaskType mask, bool aligned) const noexcept + { + _mm256_maskstore_ps(dst, mask, value_); + } + + + template <> + inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ)); + } + + + template <> + inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_mul_ps(a.value_, b.value_); + } + + + /* + NOTE: without this specialization some tests fail, e.g.: + + [ RUN ] RegisterMatrixTest/5.testPartialGerNT @ ./test/blast/math/simd/RegisterMatrixTest.cpp:386 + │ Failure @ ./test/blast/math/simd/RegisterMatrixTest.cpp:420 + │ │ Expected equality of these values: + │ │ ker(i, j) + │ │ Which is: 0.482636 + │ │ i < m && j < n ? D_ref(i, j) : 0. + │ │ Which is: 0.482636 + │ element mismatch at (0, 1), store size = 1x2 + [ FAILED ] RegisterMatrixTest/5.testPartialGerNT, where TypeParam = blast::RegisterMatrix (1 ms) + */ + template <> + inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return _mm256_fmadd_ps(a.value_, b.value_, c.value_); + } + + + template <> + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept + { + return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask)); + } + + + template <> + inline SimdVec abs(SimdVec const& a) noexcept + { + return _mm256_andnot_ps(SimdVec {-0.f}, a.value_); + } + + + template <> + inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_max_ps(a, b); + } + + + template <> + inline float max(SimdVec const& x) noexcept + { + __m256 v1 = x.value_; /* v1 = [H G F E | D C B A] */ + __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01); /* v2 = [G H E F | C D A B] */ + __m256 v3 = _mm256_max_ps(v1, v2); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10); /* v4 = [Z Z W W | X X Y Y] */ + __m256 v5 = _mm256_max_ps(v3, v4); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + __m128 v6 = _mm256_extractf128_ps(v5, 1); /* v6 = [- - - - | J J J J] */ + __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + + return v7[0]; + } + + + template + requires (SimdSize_v == SimdVec::size()) + inline std::tuple, SimdVec> imax(SimdVec const& v1, SimdVec const& idx) noexcept + { + /* v2 = [G H E F | C D A B] */ + SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); + SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + + /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + // __m256 v3 = _mm256_max_ps(v1, v2); + SimdVec::MaskType const mask_v3 = v2 > v1; + SimdVec const v3 = blend(v1, v2, mask_v3); + SimdVec const iv3 = blend(idx, iv2, mask_v3); + + /* v4 = [Z Z W W | X X Y Y] */ + SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); + SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + + /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + // __m256 v5 = _mm256_max_ps(v3, v4); + SimdVec::MaskType const mask_v5 = v4 > v3; + SimdVec const v5 = blend(v3, v4, mask_v5); + SimdVec const iv5 = blend(iv3, iv4, mask_v5); + + /* v6 = [I I I I | J J J J] */ + SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); + SimdVec const iv6 = _mm256_castps_si256( + _mm256_permute2f128_ps( + _mm256_castsi256_ps(iv5), + _mm256_castsi256_ps(iv5), + 0b0000'0001 + ) + ); + + /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); + SimdVec::MaskType const mask_v7 = v6 > v5; + SimdVec const v7 = blend(v5, v6, mask_v7); + SimdVec const iv7 = blend(iv5, iv6, mask_v7); + + return std::make_tuple(v7, iv7); + } } diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index a17b36e1..52a3ed63 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -26,196 +26,131 @@ namespace blast { template <> - class SimdVec - : public SimdVecBase + inline SimdVec::SimdVec() noexcept + : SimdVecBase {_mm256_setzero_pd()} { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_pd()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_pd(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_pd(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_pd(dst, value_); - else - _mm256_storeu_pd(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_pd(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_pd(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_pd(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_pd(SimdVec {-0.}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_pd(a, b); - } - - - /** - * @brief Horizontal max (across all alements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @return max(x[0], x[1], x[2], x[3]) - */ - friend ValueType max(SimdVec x) noexcept - { - __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - - return m[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& x, SimdVec const& idx) - { - SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); - - // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - MaskType const mask_m1 = y > x; - SimdVec const m1 = blend(x, y, mask_m1); - SimdVec const im1 = blend(idx, iy, mask_m1); - - SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); - - // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - MaskType const mask_m = m2 > m1; - SimdVec const m = blend(m1, m2, mask_m); - SimdVec const im = blend(im1, im2, mask_m); - - return std::make_tuple(m, im); - } - }; + } + + + template <> + inline SimdVec::SimdVec(IntrinsicType value) noexcept + : SimdVecBase {value} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType value) noexcept + : SimdVecBase {_mm256_set1_pd(value)} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept + : SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} + { + } + + + template <> + inline SimdVec::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept + : SimdVecBase {_mm256_maskload_pd(src, mask)} + { + } + + + template <> + inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept + { + if (aligned) + _mm256_store_pd(dst, value_); + else + _mm256_storeu_pd(dst, value_); + } + + + template <> + inline void SimdVec::store(ValueType * dst, MaskType mask, bool aligned) const noexcept + { + _mm256_maskstore_pd(dst, mask, value_); + } + + + template <> + inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ)); + } + + + template <> + inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_mul_pd(a.value_, b.value_); + } + + + template <> + inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return _mm256_fmadd_pd(a.value_, b.value_, c.value_); + } + + + template <> + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept + { + return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask)); + } + + + template <> + inline SimdVec abs(SimdVec const& a) noexcept + { + return _mm256_andnot_pd(_mm256_set1_pd(-0.), a.value_); + } + + + template <> + inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept + { + return _mm256_max_pd(a, b); + } + + + template <> + inline double max(SimdVec const& x) noexcept + { + __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values + __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + + return m[0]; + } + + + template + requires (SimdSize_v == SimdVec::size()) + inline std::tuple, SimdVec> imax(SimdVec const& x, SimdVec const& idx) noexcept + { + SimdVec const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values + SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); + + // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + SimdVec::MaskType const mask_m1 = y > x; + SimdVec const m1 = blend(x, y, mask_m1); + SimdVec const im1 = blend(idx, iy, mask_m1); + + SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + + // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + SimdVec::MaskType const mask_m = m2 > m1; + SimdVec const m = blend(m1, m2, mask_m); + SimdVec const im = blend(im1, im2, mask_m); + + return std::make_tuple(m, im); + } } From 39856a3ceaebded1697232be1df232f60aaf63d8 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 26 Jul 2024 15:01:57 +0200 Subject: [PATCH 06/27] Install xsimd step --- .github/workflows/cmake.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 9c1ace7c..37a4eb65 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -36,6 +36,13 @@ jobs: -DCMAKE_INSTALL_PREFIX=/usr/local/ . \ && sudo make install + - name: Install xsimd + run: | + git clone https://github.com/xtensor-stack/xsimd.git + cd xsimd \ + && cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . \ + && sudo cmake --build build --target install + - name: Install GTest run: | git clone https://github.com/google/googletest.git @@ -60,4 +67,3 @@ jobs: # Execute tests defined by the CMake configuration. # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail run: ctest -C ${{env.BUILD_TYPE}} - From 7aa0563573c658e69b5d148ce93115ac6204bd65 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 26 Jul 2024 15:11:07 +0200 Subject: [PATCH 07/27] Add missing "typename " keyword --- include/blast/math/simd/SimdVec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 8568961e..be6cb57c 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -72,7 +72,7 @@ namespace blast T max(SimdVec const& x) noexcept; template - SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept; + typename SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept; /** * @brief Multiplication @@ -272,7 +272,7 @@ namespace blast template - inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + inline typename SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept { return xsimd::gt(a.value_, b.value_); } From d663683e86eb0cc33dcaae6abdb6de2f3ccb2482 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 26 Jul 2024 15:11:39 +0200 Subject: [PATCH 08/27] Update "Build xsimd" step --- .github/workflows/cmake.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 37a4eb65..2df1b1d3 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -39,9 +39,9 @@ jobs: - name: Install xsimd run: | git clone https://github.com/xtensor-stack/xsimd.git - cd xsimd \ - && cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . \ - && sudo cmake --build build --target install + cd xsimd + cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . + sudo cmake --build build --target install - name: Install GTest run: | From 8b9124bacadf20d102bf6df369d10ff98addb647 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Aug 2024 13:09:59 +0200 Subject: [PATCH 09/27] Simd<> specializations working for both avx2 and fma3 --- include/blast/math/simd/Simd.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index 841f7571..6f4ca1d2 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -24,8 +25,9 @@ namespace blast struct Simd; - template <> - struct Simd + template + requires std::is_base_of_v + struct Simd { private: class Index; @@ -85,8 +87,9 @@ namespace blast }; - template <> - struct Simd + template + requires std::is_base_of_v + struct Simd { private: class Index; From 17521ea0db48bf2e8b3419edfd2adf44b33d916f Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Aug 2024 13:21:49 +0200 Subject: [PATCH 10/27] Remove XSIMD_DEFAULT_ARCH setting from CMakeLists.txt --- CMakeLists.txt | 4 ---- Dockerfile | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa1db7b4..3b0ad928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,10 +46,6 @@ target_compile_options(blast INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4" ) -target_compile_definitions(blast - INTERFACE "XSIMD_DEFAULT_ARCH=avx2" -) - # BLAST_WITH_BLASFEO option(BLAST_WITH_BLASFEO "Build blasfeo C++ interface") diff --git a/Dockerfile b/Dockerfile index 1ccfab49..598aebdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib RUN mkdir -p blast/build && cd blast/build \ && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER="clang++-15" \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH=avx2" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \ -DBLAST_WITH_TEST=ON \ -DBLAST_WITH_BENCHMARK=ON \ @@ -79,4 +79,3 @@ CMD mkdir -p blast/bench_result/data \ && mkdir -p blast/bench_result/image \ && cd blast \ && make -j 1 bench_result/image/dgemm_performance.png bench_result/image/dgemm_performance_ratio.png - From c3adc2a9e8c6fd88830603c20c7ec97f81adaec2 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Aug 2024 18:19:46 +0200 Subject: [PATCH 11/27] Code compiles for both avx2 and fma3 --- include/blast/math/simd/SimdVec.hpp | 121 +++++++++++++++++- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 60 --------- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 38 ------ .../blast/math/simd/avx256/SimdVecInt32.hpp | 21 +-- .../blast/math/simd/avx256/SimdVecInt64.hpp | 23 ++-- 5 files changed, 143 insertions(+), 120 deletions(-) diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index be6cb57c..69d1fa0b 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -27,6 +27,111 @@ namespace blast class SimdVec; + template + xsimd::batch maskload(T const * src, xsimd::batch_bool const& mask) noexcept; + + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_ps(src, mask); + } + + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_pd(src, mask); + } + + template + void maskstore(xsimd::batch const& v, T * dst, xsimd::batch_bool const& mask) noexcept; + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_ps(dst, mask, v); + } + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_pd(dst, mask, v); + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) noexcept + { + /* v2 = [G H E F | C D A B] */ + __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); + __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + + /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + // __m256 v3 = _mm256_max_ps(v1, v2); + __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); + __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); + __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, mask_v3); + + /* v4 = [Z Z W W | X X Y Y] */ + __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); + __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + + /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + // __m256 v5 = _mm256_max_ps(v3, v4); + __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); + __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); + __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, mask_v5); + + /* v6 = [I I I I | J J J J] */ + __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); + __m256i const iv6 = _mm256_castps_si256( + _mm256_permute2f128_ps( + _mm256_castsi256_ps(iv5), + _mm256_castsi256_ps(iv5), + 0b0000'0001 + ) + ); + + /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); + __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); + __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); + __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, mask_v7); + + return {v7, iv7}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) noexcept + { + __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values + __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1); + + // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); + __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); + __m256i const im1 = _mm256_blendv_epi8(idx, iy, mask_m1); + + __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + + // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); + __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); + __m256i const im = _mm256_blendv_epi8(im1, im2, mask_m); + + return {m, im}; + } + + /** * @brief Fused multiply-add * @@ -179,7 +284,10 @@ namespace blast * @param mask load mask * @param aligned true if @a src is SIMD-aligned */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept; + explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept + : SimdVecBase {maskload(src, mask)} + { + } /** @@ -204,7 +312,10 @@ namespace blast * @param mask store mask * @param aligned true if @a dst is SIMD-aligned */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept; + void store(ValueType * dst, MaskType mask, bool aligned) const noexcept + { + maskstore(this->value_, dst, mask); + } /** @@ -246,7 +357,11 @@ namespace blast friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; template - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx); + friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) noexcept + { + auto const t = imax(v1.value_, xsimd::batch(idx)); + return {get<0>(t), SimdVec(get<1>(t))}; + } }; diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index bcaeb281..4a8a97f4 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -53,13 +53,6 @@ namespace blast } - template <> - inline SimdVec::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_ps(src, mask)} - { - } - - template <> inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept { @@ -70,13 +63,6 @@ namespace blast } - template <> - inline void SimdVec::store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_ps(dst, mask, value_); - } - - template <> inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept { @@ -147,50 +133,4 @@ namespace blast return v7[0]; } - - - template - requires (SimdSize_v == SimdVec::size()) - inline std::tuple, SimdVec> imax(SimdVec const& v1, SimdVec const& idx) noexcept - { - /* v2 = [G H E F | C D A B] */ - SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); - - /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - // __m256 v3 = _mm256_max_ps(v1, v2); - SimdVec::MaskType const mask_v3 = v2 > v1; - SimdVec const v3 = blend(v1, v2, mask_v3); - SimdVec const iv3 = blend(idx, iv2, mask_v3); - - /* v4 = [Z Z W W | X X Y Y] */ - SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); - - /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - // __m256 v5 = _mm256_max_ps(v3, v4); - SimdVec::MaskType const mask_v5 = v4 > v3; - SimdVec const v5 = blend(v3, v4, mask_v5); - SimdVec const iv5 = blend(iv3, iv4, mask_v5); - - /* v6 = [I I I I | J J J J] */ - SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - SimdVec const iv6 = _mm256_castps_si256( - _mm256_permute2f128_ps( - _mm256_castsi256_ps(iv5), - _mm256_castsi256_ps(iv5), - 0b0000'0001 - ) - ); - - /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); - SimdVec::MaskType const mask_v7 = v6 > v5; - SimdVec const v7 = blend(v5, v6, mask_v7); - SimdVec const iv7 = blend(iv5, iv6, mask_v7); - - return std::make_tuple(v7, iv7); - } } diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index 52a3ed63..31e067fc 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -53,13 +53,6 @@ namespace blast } - template <> - inline SimdVec::SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_pd(src, mask)} - { - } - - template <> inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept { @@ -70,13 +63,6 @@ namespace blast } - template <> - inline void SimdVec::store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_pd(dst, mask, value_); - } - - template <> inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept { @@ -129,28 +115,4 @@ namespace blast return m[0]; } - - - template - requires (SimdSize_v == SimdVec::size()) - inline std::tuple, SimdVec> imax(SimdVec const& x, SimdVec const& idx) noexcept - { - SimdVec const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values - SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); - - // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - SimdVec::MaskType const mask_m1 = y > x; - SimdVec const m1 = blend(x, y, mask_m1); - SimdVec const im1 = blend(idx, iy, mask_m1); - - SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); - - // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - SimdVec::MaskType const mask_m = m2 > m1; - SimdVec const m = blend(m1, m2, mask_m); - SimdVec const im = blend(im1, im2, mask_m); - - return std::make_tuple(m, im); - } } diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp index 15db3b4b..db995573 100644 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt32.hpp @@ -17,27 +17,30 @@ #include #include #include -#include #include #include +#include namespace blast { - template <> - class SimdVec - : public SimdVecBase + template + requires std::is_base_of_v + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; + using typename SimdVecBase::ValueType; + using typename SimdVecBase::IntrinsicType; /** * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0) */ SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} + : SimdVecBase {_mm256_setzero_si256()} { } @@ -46,7 +49,7 @@ namespace blast * @brief Initialize to (a, a, a, a, a, a, a, a) */ SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi32(a)} + : SimdVecBase {_mm256_set1_epi32(a)} { } @@ -68,20 +71,20 @@ namespace blast */ SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4, ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)} + : SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)} { } SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} + : SimdVecBase {value} { } SimdVec& operator+=(ValueType x) noexcept { - value_ = _mm256_add_epi32(value_, _mm256_set1_epi32(x)); + this->value_ = _mm256_add_epi32(this->value_, _mm256_set1_epi32(x)); return *this; } diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp index f383016d..e2adae8e 100644 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ b/include/blast/math/simd/avx256/SimdVecInt64.hpp @@ -17,27 +17,30 @@ #include #include #include -#include #include #include +#include namespace blast { - template <> - class SimdVec - : public SimdVecBase + template + requires std::is_base_of_v + class SimdVec + : public SimdVecBase { public: using MaskType = __m256i; + using typename SimdVecBase::ValueType; + using typename SimdVecBase::IntrinsicType; /** * @brief Initialize to (0, 0, 0, 0) */ SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} + : SimdVecBase {_mm256_setzero_si256()} { } @@ -46,7 +49,7 @@ namespace blast * @brief Initialize to (a, a, a, a) */ SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi64x(a)} + : SimdVecBase {_mm256_set1_epi64x(a)} { } @@ -67,26 +70,26 @@ namespace blast * where a0 corresponds to the lower bits. */ SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)} + : SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)} { } SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} + : SimdVecBase {value} { } operator IntrinsicType() const noexcept { - return value_; + return this->value_; } SimdVec& operator+=(ValueType x) noexcept { - value_ = _mm256_add_epi64(value_, _mm256_set1_epi64x(x)); + this->value_ = _mm256_add_epi64(this->value_, _mm256_set1_epi64x(x)); return *this; } From c21776f14c14b57acbc879847310f7c402e37d0b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Aug 2024 20:22:19 +0200 Subject: [PATCH 12/27] SimdVec decoupled from SimdVecBase --- include/blast/math/simd/SimdVec.hpp | 62 ++++++++++++++----- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 12 ++-- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 12 ++-- 3 files changed, 53 insertions(+), 33 deletions(-) diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 69d1fa0b..554fcd00 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,8 +14,6 @@ #pragma once -#include - #include #include @@ -221,12 +219,11 @@ namespace blast */ template class SimdVec - : public SimdVecBase { public: - using typename SimdVecBase::ValueType; - using typename SimdVecBase::IntrinsicType; - using typename SimdVecBase::XSimdType; + using ValueType = T; + using XSimdType = xsimd::batch; + using IntrinsicType = typename XSimdType::register_type; using MaskType = xsimd::batch_bool; @@ -234,7 +231,7 @@ namespace blast * @brief Set to [0, 0, 0, ...] */ SimdVec() noexcept - : SimdVecBase {T {}} + : value_ {T {}} { } @@ -243,13 +240,13 @@ namespace blast SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} + : value_ {value} { } SimdVec(XSimdType value) noexcept - : SimdVecBase {value} + : value_ {value} { } @@ -260,7 +257,7 @@ namespace blast * @param value value for each component of SIMD vector */ SimdVec(ValueType value) noexcept - : SimdVecBase {value} + : value_ {value} { } @@ -272,7 +269,7 @@ namespace blast * @param aligned true indicates that an aligned read instruction should be used */ explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)} + : value_ {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)} { } @@ -285,8 +282,36 @@ namespace blast * @param aligned true if @a src is SIMD-aligned */ explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {maskload(src, mask)} + : value_ {maskload(src, mask)} + { + } + + + /** + * @brief Number of elements in SIMD pack + */ + static size_t constexpr size() + { + return XSimdType::size; + } + + + operator IntrinsicType() const noexcept + { + return value_; + } + + + /** + * @brief Access single element + * + * @param i element index + * + * @return element value + */ + ValueType operator[](size_t i) const noexcept { + return value_.get(i); } @@ -299,9 +324,9 @@ namespace blast void store(ValueType * dst, bool aligned) const noexcept { if (aligned) - xsimd::store_aligned(dst, this->value_); + xsimd::store_aligned(dst, value_); else - xsimd::store_unaligned(dst, this->value_); + xsimd::store_unaligned(dst, value_); } @@ -314,7 +339,7 @@ namespace blast */ void store(ValueType * dst, MaskType mask, bool aligned) const noexcept { - maskstore(this->value_, dst, mask); + maskstore(value_, dst, mask); } @@ -327,7 +352,7 @@ namespace blast */ SimdVec& operator*=(SimdVec const& a) noexcept { - this->value_ *= a.value_; + value_ *= a.value_; return *this; } @@ -341,7 +366,7 @@ namespace blast */ SimdVec& operator/=(SimdVec const& a) noexcept { - this->value_ /= a.value_; + value_ /= a.value_; return *this; } @@ -362,6 +387,9 @@ namespace blast auto const t = imax(v1.value_, xsimd::batch(idx)); return {get<0>(t), SimdVec(get<1>(t))}; } + + private: + XSimdType value_; }; diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index 4a8a97f4..47a3aaf1 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -15,40 +15,36 @@ #pragma once #include -#include -#include #include -#include - namespace blast { template <> inline SimdVec::SimdVec() noexcept - : SimdVecBase {_mm256_setzero_ps()} + : value_ {_mm256_setzero_ps()} { } template <> inline SimdVec::SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} + : value_ {value} { } template <> inline SimdVec::SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_ps(value)} + : value_ {_mm256_set1_ps(value)} { } template <> inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} + : value_ {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} { } diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index 31e067fc..22d358ea 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -15,40 +15,36 @@ #pragma once #include -#include -#include #include -#include - namespace blast { template <> inline SimdVec::SimdVec() noexcept - : SimdVecBase {_mm256_setzero_pd()} + : value_ {_mm256_setzero_pd()} { } template <> inline SimdVec::SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} + : value_ {value} { } template <> inline SimdVec::SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_pd(value)} + : value_ {_mm256_set1_pd(value)} { } template <> inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} + : value_ {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} { } From 7ff99d03b7cce6d8ec6b2e87188b0f8bc95c678b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Aug 2024 20:36:53 +0200 Subject: [PATCH 13/27] SimdMask<> is an alias for xsimd::batch_bool<> --- include/blast/math/simd/Avx256.hpp | 1 - include/blast/math/simd/SimdMask.hpp | 9 ++- include/blast/math/simd/avx256/SimdMask.hpp | 83 --------------------- 3 files changed, 6 insertions(+), 87 deletions(-) delete mode 100644 include/blast/math/simd/avx256/SimdMask.hpp diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp index 8e34d3b3..c1e7980a 100644 --- a/include/blast/math/simd/Avx256.hpp +++ b/include/blast/math/simd/Avx256.hpp @@ -19,7 +19,6 @@ #include #include #include -// #include #include #include diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 7e65c867..31486d2f 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -14,6 +14,8 @@ #pragma once +#include + namespace blast { @@ -22,7 +24,8 @@ namespace blast * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. * * @tparam T the element type simd_mask applies on + * @tparam Arch instruction set architecture */ - template - class SimdMask; -} \ No newline at end of file + template + using SimdMask = xsimd::batch_bool; +} diff --git a/include/blast/math/simd/avx256/SimdMask.hpp b/include/blast/math/simd/avx256/SimdMask.hpp deleted file mode 100644 index 8dbcea64..00000000 --- a/include/blast/math/simd/avx256/SimdMask.hpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - -#include - - -namespace blast -{ - template - class SimdMask - { - static size_t constexpr SS = SimdSize_v; - - public: - using ValueType = IntElementType_t; - using IntrinsicType = __m256i; - using MaskType = __m256i; - - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdMask() noexcept - : value_ {_mm256_setzero_pd()} - { - } - - - SimdMask(IntrinsicType value) noexcept - : value_ {value} - { - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return SS; - } - - - /** - * @brief Access specified element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - - - private: - IntrinsicType value_; - }; -} \ No newline at end of file From e7ed800f46ad25dc1fa66907d479a6aed2ab6c82 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 12:44:00 +0200 Subject: [PATCH 14/27] Added SimdIndex, removed some messy types. --- include/blast/math/simd/Avx256.hpp | 6 - include/blast/math/simd/IntScalarType.hpp | 29 ----- include/blast/math/simd/IntVecType.hpp | 24 ---- include/blast/math/simd/SequenceTag.hpp | 23 ---- include/blast/math/simd/SimdIndex.hpp | 98 +++++++++++++++ include/blast/math/simd/SimdSize.hpp | 22 ++-- include/blast/math/simd/SimdVec.hpp | 8 +- .../blast/math/simd/avx256/IntScalarType.hpp | 34 ------ include/blast/math/simd/avx256/SimdSize.hpp | 50 -------- .../blast/math/simd/avx256/SimdVecInt32.hpp | 109 ----------------- .../blast/math/simd/avx256/SimdVecInt64.hpp | 114 ------------------ test/blast/math/simd/SimdVecTest.cpp | 14 ++- 12 files changed, 119 insertions(+), 412 deletions(-) delete mode 100644 include/blast/math/simd/IntScalarType.hpp delete mode 100644 include/blast/math/simd/IntVecType.hpp delete mode 100644 include/blast/math/simd/SequenceTag.hpp create mode 100644 include/blast/math/simd/SimdIndex.hpp delete mode 100644 include/blast/math/simd/avx256/IntScalarType.hpp delete mode 100644 include/blast/math/simd/avx256/SimdSize.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecInt32.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecInt64.hpp diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp index c1e7980a..1fbc41c0 100644 --- a/include/blast/math/simd/Avx256.hpp +++ b/include/blast/math/simd/Avx256.hpp @@ -14,11 +14,5 @@ #pragma once -#include #include #include -#include -#include -#include - -#include diff --git a/include/blast/math/simd/IntScalarType.hpp b/include/blast/math/simd/IntScalarType.hpp deleted file mode 100644 index 3fb119ac..00000000 --- a/include/blast/math/simd/IntScalarType.hpp +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - - -namespace blast -{ - template - struct IntScalarType; - - - template - using IntScalarType_t = typename IntScalarType::type; -} \ No newline at end of file diff --git a/include/blast/math/simd/IntVecType.hpp b/include/blast/math/simd/IntVecType.hpp deleted file mode 100644 index 0f8faab7..00000000 --- a/include/blast/math/simd/IntVecType.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -namespace blast -{ - template - using IntVecType_t = SimdVec>; -} \ No newline at end of file diff --git a/include/blast/math/simd/SequenceTag.hpp b/include/blast/math/simd/SequenceTag.hpp deleted file mode 100644 index 8f37c552..00000000 --- a/include/blast/math/simd/SequenceTag.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - - -namespace blast :: simd -{ - struct SequenceTag {}; - - inline SequenceTag constexpr sequenceTag; -} \ No newline at end of file diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp new file mode 100644 index 00000000..bfbae1c5 --- /dev/null +++ b/include/blast/math/simd/SimdIndex.hpp @@ -0,0 +1,98 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include + + +namespace blast +{ + namespace detail + { + /// @brief Deduces integer type of given size, sized or unsigned. + /// + /// @tparam N type size in bytes + /// @tparam S true for signed type, false for unsigned + /// + template + struct IntOfSize; + + template + using IntOfSize_t = typename IntOfSize::Type; + + template <> + struct IntOfSize<4, true> + { + using Type = std::int32_t; + }; + + template <> + struct IntOfSize<4, false> + { + using Type = std::uint32_t; + }; + + template <> + struct IntOfSize<8, true> + { + using Type = std::int64_t; + }; + + template <> + struct IntOfSize<8, false> + { + using Type = std::uint64_t; + }; + + template + requires (xsimd::batch::size == 4) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3}; + } + + template + requires (xsimd::batch::size == 8) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7}; + } + } + + + /// @brief Integer SIMD type that can be used for indexing or gather-scatter operations. + /// + /// @tparam T the deduced type will be the same size as @a T + /// @tparam Arch instruction set architecture + /// + template + using SimdIndex = xsimd::batch, Arch>; + + + /// @brief Construct an integer index sequence + /// + /// @param start start of the sequence + /// + /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ] + /// where N = SimdIndex::size + /// + template + inline SimdIndex indexSequence(typename SimdIndex::value_type start = 0) noexcept + { + return detail::indexSequence, Arch>(start); + } +} diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp index 4e862f72..4a22339a 100644 --- a/include/blast/math/simd/SimdSize.hpp +++ b/include/blast/math/simd/SimdSize.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Mikhail Katliar +// Copyright 2023-2024 Mikhail Katliar // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,24 +14,20 @@ #pragma once +#include + #include +#include namespace blast { - template - struct SimdSize; - - - template - struct SimdSize : SimdSize {}; - - /** - * @brief Number of elements in a SimdVec object. + * @brief Size of a SIMD register conatining scalars of a given type. * * @tparam T scalar type + * @tparam Arch instruction set architecture */ - template - size_t constexpr SimdSize_v = SimdSize::value; -} \ No newline at end of file + template + std::size_t constexpr SimdSize_v = xsimd::batch, Arch>::size; +} diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 554fcd00..2aef5e5a 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,6 +14,8 @@ #pragma once +#include + #include #include @@ -381,11 +383,9 @@ namespace blast friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept; friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; - template - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) noexcept + friend std::tuple> imax(SimdVec const& v1, SimdIndex const& idx) noexcept { - auto const t = imax(v1.value_, xsimd::batch(idx)); - return {get<0>(t), SimdVec(get<1>(t))}; + return imax(v1.value_, idx); } private: diff --git a/include/blast/math/simd/avx256/IntScalarType.hpp b/include/blast/math/simd/avx256/IntScalarType.hpp deleted file mode 100644 index 59785c85..00000000 --- a/include/blast/math/simd/avx256/IntScalarType.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace blast -{ - template <> - struct IntScalarType<4> - { - using type = std::int64_t; - }; - - - template <> - struct IntScalarType<8> - { - using type = std::int32_t; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdSize.hpp b/include/blast/math/simd/avx256/SimdSize.hpp deleted file mode 100644 index 2fa30b2f..00000000 --- a/include/blast/math/simd/avx256/SimdSize.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp deleted file mode 100644 index db995573..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include -#include - - -namespace blast -{ - template - requires std::is_base_of_v - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - using typename SimdVecBase::ValueType; - using typename SimdVecBase::IntrinsicType; - - /** - * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a, a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi32(a)} - { - } - - - /** - * @brief Initialize to (n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to (a7, a6, a5, a4, a3, a2, a1, a0), - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4, - ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - SimdVec& operator+=(ValueType x) noexcept - { - this->value_ = _mm256_add_epi32(this->value_, _mm256_set1_epi32(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi32(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n)); - } - }; -} diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp deleted file mode 100644 index e2adae8e..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include -#include - - -namespace blast -{ - template - requires std::is_base_of_v - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - using typename SimdVecBase::ValueType; - using typename SimdVecBase::IntrinsicType; - - /** - * @brief Initialize to (0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi64x(a)} - { - } - - - /** - * @brief Initialize to (n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to [a3, a2, a1, a0], - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - operator IntrinsicType() const noexcept - { - return this->value_; - } - - - SimdVec& operator+=(ValueType x) noexcept - { - this->value_ = _mm256_add_epi64(this->value_, _mm256_set1_epi64x(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi64(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n)); - } - }; -} diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp index 90be9d1c..91765526 100644 --- a/test/blast/math/simd/SimdVecTest.cpp +++ b/test/blast/math/simd/SimdVecTest.cpp @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include +#include #include @@ -64,9 +65,10 @@ namespace blast :: testing TYPED_TEST(SimdVecTest, testImax) { using Scalar = TypeParam; - size_t constexpr SS = SimdSize_v; + using SimdType = SimdVec; + size_t constexpr SS = SimdType::size(); - IntVecType_t idx {simd::sequenceTag}; + SimdIndex const idx = indexSequence(); for (int i = 0; i < 100; ++i) { @@ -75,15 +77,15 @@ namespace blast :: testing SimdVec const v {a.data(), false}; SimdVec v_max; - IntVecType_t idx_max; + SimdIndex idx_max; std::tie(v_max, idx_max) = imax(v, idx); auto const max_el = std::max_element(a.begin(), a.end()); - for (size_t i = 0; i < idx_max.size(); ++i) - ASSERT_EQ(idx_max[i], std::distance(a.begin(), max_el)) << "Error at element " << i; + for (size_t i = 0; i < idx_max.size; ++i) + ASSERT_EQ(idx_max.get(i), std::distance(a.begin(), max_el)) << "Error at element " << i; for (size_t i = 0; i < v_max.size(); ++i) ASSERT_EQ(v_max[i], *max_el) << "Error at element " << i; } } -} \ No newline at end of file +} From 25538a094a97ad478a6aa204ab11acd0abb301eb Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 15:27:35 +0200 Subject: [PATCH 15/27] Got rid of Simd and SimdTraits classes --- include/blast/math/RowColumnVectorPointer.hpp | 10 +- .../blast/math/dense/DynamicMatrixPointer.hpp | 9 +- .../blast/math/dense/DynamicVectorPointer.hpp | 11 +- .../blast/math/dense/StaticMatrixPointer.hpp | 10 +- .../blast/math/dense/StaticVectorPointer.hpp | 13 +- .../blast/math/expressions/PanelMatrix.hpp | 20 +- .../math/panel/DynamicPanelMatrixPointer.hpp | 9 +- include/blast/math/panel/PanelSize.hpp | 8 +- .../math/panel/StaticPanelMatrixPointer.hpp | 10 +- .../register_matrix/DynamicRegisterMatrix.hpp | 29 +-- .../math/register_matrix/RegisterMatrix.hpp | 38 ++-- include/blast/math/simd/RegisterCapacity.hpp | 32 ++++ include/blast/math/simd/Simd.hpp | 171 ------------------ include/blast/math/simd/SimdMask.hpp | 75 +++++++- .../math/panel/StaticPanelMatrixTest.cpp | 9 +- 15 files changed, 201 insertions(+), 253 deletions(-) create mode 100644 include/blast/math/simd/RegisterCapacity.hpp diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp index e13c311f..ac009f04 100644 --- a/include/blast/math/RowColumnVectorPointer.hpp +++ b/include/blast/math/RowColumnVectorPointer.hpp @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -28,9 +30,9 @@ namespace blast { public: using ElementType = typename MP::ElementType; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static TransposeFlag constexpr transposeFlag = TF; static bool constexpr aligned = MP::aligned; @@ -176,7 +178,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); MP ptr_; }; @@ -244,4 +246,4 @@ namespace blast { return RowColumnVectorPointer {std::move(p)}; } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp index e6540576..2041e4b1 100644 --- a/include/blast/math/dense/DynamicMatrixPointer.hpp +++ b/include/blast/math/dense/DynamicMatrixPointer.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -23,9 +24,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -187,7 +188,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -253,4 +254,4 @@ namespace blast { return {p, spacing}; } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp index b026d205..dbdcb19c 100644 --- a/include/blast/math/dense/DynamicVectorPointer.hpp +++ b/include/blast/math/dense/DynamicVectorPointer.hpp @@ -4,8 +4,9 @@ #pragma once - #include +#include +#include #include #include @@ -18,9 +19,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -172,7 +173,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -191,4 +192,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp index 2db833b8..961b5e01 100644 --- a/include/blast/math/dense/StaticMatrixPointer.hpp +++ b/include/blast/math/dense/StaticMatrixPointer.hpp @@ -6,8 +6,8 @@ #include #include -#include #include +#include #include #include #include @@ -24,9 +24,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -188,7 +188,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -225,4 +225,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp index 6608ddd0..8cd7e1af 100644 --- a/include/blast/math/dense/StaticVectorPointer.hpp +++ b/include/blast/math/dense/StaticVectorPointer.hpp @@ -6,6 +6,9 @@ #include +#include +#include +#include #include @@ -16,9 +19,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -35,7 +38,7 @@ namespace blast constexpr StaticVectorPointer(T * ptr) noexcept : ptr_ {ptr} { - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -195,7 +198,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -213,4 +216,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp index 1df9b97d..d0494887 100644 --- a/include/blast/math/expressions/PanelMatrix.hpp +++ b/include/blast/math/expressions/PanelMatrix.hpp @@ -5,11 +5,12 @@ #pragma once #include -#include +#include +#include +#include #include #include #include -//#include #include #include @@ -93,13 +94,12 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; - using SIMD = Simd; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) @@ -119,7 +119,7 @@ namespace blast if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m - rem; auto pr = ptr(*rhs, i, 0); auto pl = ptr(*lhs, i, 0); @@ -187,12 +187,12 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) { diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp index f965f4e1..840ff7e9 100644 --- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp +++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -23,9 +24,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -211,7 +212,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -275,4 +276,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 027d5c63..9647a0cd 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -8,11 +8,13 @@ // Includes //************************************************************************************************* -#include +#include + +#include namespace blast { - template - size_t constexpr PanelSize_v = Simd::size; + template + size_t constexpr PanelSize_v = SimdSize_v; } diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp index 50544d02..17e19194 100644 --- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp +++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,9 +23,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -204,7 +206,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -267,4 +269,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index ed59f67e..10e135bd 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -4,8 +4,11 @@ #pragma once -#include #include +#include +#include +#include +#include #include #include #include @@ -18,8 +21,6 @@ #include #include -#include - namespace blast { @@ -250,14 +251,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -266,7 +267,7 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a DynamicRegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a DynamicRegisterMatrix"); SimdVecType v_[RM][RN]; size_t const m_; @@ -329,7 +330,7 @@ namespace blast if (IntType const rem = m_ % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m_ / SS; for (size_t j = 0; j < n_ && j < columns(); ++j) @@ -349,10 +350,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m_ - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -490,7 +491,7 @@ namespace blast // ax[i] = alpha * a.load(i * SS, 0); // if (rem) - // ax[ii] = alpha * a.load(ii * SS, 0, SIMD::index() < rem); + // ax[ii] = alpha * a.load(ii * SS, 0, indexSequence() < rem); // #pragma unroll // for (size_t j = 0; j < N; ++j) diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index 68641769..5724a05b 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -5,7 +5,9 @@ #pragma once #include -#include +#include +#include +#include #include #include #include @@ -20,8 +22,6 @@ #include #include -#include - namespace blast { @@ -360,14 +360,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -376,7 +376,7 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a RegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a RegisterMatrix"); SimdVecType v_[RM][RN]; @@ -461,7 +461,7 @@ namespace blast v_[i][j] = beta * p(SS * i, j).load(); if (size_t const rem = m % SS) - v_[m / SS][j] = beta * p(m - rem, j).load(SIMD::index() < rem); + v_[m / SS][j] = beta * p(m - rem, j).load(indexSequence() < rem); } } } @@ -493,7 +493,7 @@ namespace blast if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m / SS; for (size_t j = 0; j < n && j < columns(); ++j) @@ -514,7 +514,7 @@ namespace blast if (skip && ri < RM) { - MaskType const mask = SIMD::index() >= skip; + MaskType const mask = indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); ++ri; } @@ -536,10 +536,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -596,7 +596,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(T alpha, PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -623,7 +623,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -697,7 +697,7 @@ namespace blast BLAZE_ALWAYS_INLINE void RegisterMatrix::potrf() noexcept { static_assert(M >= N, "potrf() not implemented for register matrices with columns more than rows"); - static_assert(RM * RN + 2 <= RegisterCapacity_v, "Not enough registers"); + static_assert(RM * RN + 2 <= registerCapacity(Arch {}), "Not enough registers"); #pragma unroll for (size_t k = 0; k < N; ++k) @@ -745,7 +745,7 @@ namespace blast ax[i] = alpha * a(i * SS, 0).load(); if (rem) - ax[ii] = alpha * a(ii * SS, 0).load(SIMD::index() < rem); + ax[ii] = alpha * a(ii * SS, 0).load(indexSequence() < rem); #pragma unroll for (size_t j = 0; j < N; ++j) diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp new file mode 100644 index 00000000..213b6d80 --- /dev/null +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -0,0 +1,32 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include + + +namespace blast +{ + /** + * @brief Number of available SIMD registers. + * + * @return Number of SIMD registers for AVX2 + */ + std::size_t constexpr registerCapacity(xsimd::avx2) + { + return 16; + } +} diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index 6f4ca1d2..65f521c4 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -21,177 +21,6 @@ namespace blast using namespace blaze; - template - struct Simd; - - - template - requires std::is_base_of_v - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256d; - using MaskType = __m256i; - using IntType = long long; - - static size_t constexpr size = 4; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(_mm256_set1_epi64x(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(val_, _mm256_set1_epi64x(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi64x(3, 2, 1, 0); - }; - }; - - - template - requires std::is_base_of_v - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256; - using MaskType = __m256i; - using IntType = int; - - static size_t constexpr size = 8; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(_mm256_set1_epi32(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(val_, _mm256_set1_epi32(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - }; - }; - - - template - using IntrinsicType_t = typename Simd::IntrinsicType; - - - template - struct SimdTraits; - - - template <> - struct SimdTraits<__m256d> - { - using ScalarType = double; - using MaskType = __m256i; - using IntType = long long; - static size_t constexpr size = 4; - }; - - - template <> - struct SimdTraits<__m256> - { - using ScalarType = float; - using MaskType = __m256i; - using IntType = int; - static size_t constexpr size = 8; - }; - - - template - using ScalarType_t = typename SimdTraits::ScalarType; - - - template - using MaskType_t = typename SimdTraits::MaskType; - - template - using IntType_t = typename SimdTraits::IntType; - - - template - size_t constexpr RegisterCapacity_v = Simd::registerCapacity; - - //******************************************************* // // LOAD diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 31486d2f..325d9ee5 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -26,6 +26,77 @@ namespace blast * @tparam T the element type simd_mask applies on * @tparam Arch instruction set architecture */ - template - using SimdMask = xsimd::batch_bool; + template + class SimdMask + : public xsimd::batch_bool + { + public: + using XSimdType = xsimd::batch_bool; + using IntrinsicType = typename XSimdType::register_type; + + /** + * @brief Construct from an intrinsic register type + * + * @param v the value to construct from + */ + SimdMask(IntrinsicType v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of the same type + * + * @param v the value to construct from + */ + SimdMask(XSimdType const& v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of a different type + * + * We need to allow conversion from batch_bool with U != T, + * because we need e.g. the following code to work: + * + * int n; + * MaskType mask = indexSequence() < n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() < n is a batch_bool, + * which can not be directly assigned to a batch_bool, + * although their underlying register_type's are identical. + * + * @param v the value to construct from + */ + template + SimdMask(xsimd::batch_bool const& v) noexcept + : XSimdType {IntrinsicType(v)} + { + } + + /** + * @brief In-place logical and. + * + * We need to define this operator to make the following code work: + * + * MaskType mask; + * int n; + * mask &= indexSequence() >= n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() >= n is a batch_bool, + * which can not be directly used as a right-hand side operand of batch_bool::operator&=(). + * Defining operator&=(SimdMask const&) allows the right-hand side to be implicitly converted to SimdMask. + * + * @param v + * @return SimdMask& + */ + SimdMask& operator&=(SimdMask const& v) noexcept + { + XSimdType::operator&=(v); + return *this; + } + }; } diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp index c5db87f3..c319a422 100644 --- a/test/blast/math/panel/StaticPanelMatrixTest.cpp +++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp @@ -4,6 +4,7 @@ #include #include +#include #include @@ -115,10 +116,12 @@ namespace blast :: testing size_t constexpr N = 3 * SS + 2; StaticPanelMatrix A; - IntrinsicType_t val; + std::vector vec(SS); for (size_t i = 0; i < SS; ++i) - val[i] = TypeParam(i + 1); + vec[i] = i + 1; + + SimdVec val {vec.data(), false}; for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) @@ -203,4 +206,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, StaticPanelMatrixTest, double); INSTANTIATE_TYPED_TEST_SUITE_P(float, StaticPanelMatrixTest, float); -} \ No newline at end of file +} From 87d75654171158b2f8a7ae0bcf1feefd6cdb7924 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 16:38:54 +0200 Subject: [PATCH 16/27] Got rid of the old Simd.hpp header --- include/blast/math/RowColumnVectorPointer.hpp | 4 +- include/blast/math/Simd.hpp | 13 + .../blast/math/dense/DynamicMatrixPointer.hpp | 10 +- .../blast/math/dense/DynamicVectorPointer.hpp | 19 +- .../blast/math/dense/StaticVectorPointer.hpp | 23 +- .../math/panel/DynamicPanelMatrixPointer.hpp | 4 +- .../blast/math/panel/StaticPanelMatrix.hpp | 3 +- .../math/panel/StaticPanelMatrixPointer.hpp | 5 +- .../register_matrix/DynamicRegisterMatrix.hpp | 8 +- .../math/register_matrix/RegisterMatrix.hpp | 10 +- include/blast/math/simd/Simd.hpp | 252 ------------------ include/blast/math/simd/SimdVec.hpp | 31 +++ .../blast/math/simd/avx256/SimdVecFloat32.hpp | 20 +- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 7 + include/blast/math/views/row/Panel.hpp | 6 +- include/blast/math/views/submatrix/Panel.hpp | 6 +- 16 files changed, 93 insertions(+), 328 deletions(-) create mode 100644 include/blast/math/Simd.hpp delete mode 100644 include/blast/math/simd/Simd.hpp diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp index ac009f04..ae7a8b5a 100644 --- a/include/blast/math/RowColumnVectorPointer.hpp +++ b/include/blast/math/RowColumnVectorPointer.hpp @@ -15,9 +15,7 @@ #pragma once #include -#include -#include -#include +#include #include #include diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp new file mode 100644 index 00000000..f971dbe4 --- /dev/null +++ b/include/blast/math/Simd.hpp @@ -0,0 +1,13 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp index 2041e4b1..5610b1c9 100644 --- a/include/blast/math/dense/DynamicMatrixPointer.hpp +++ b/include/blast/math/dense/DynamicMatrixPointer.hpp @@ -8,15 +8,11 @@ #include #include #include -#include -#include -#include +#include #include - #include - namespace blast { template @@ -84,9 +80,9 @@ namespace blast } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp index dbdcb19c..7638119a 100644 --- a/include/blast/math/dense/DynamicVectorPointer.hpp +++ b/include/blast/math/dense/DynamicVectorPointer.hpp @@ -4,9 +4,7 @@ #pragma once -#include -#include -#include +#include #include #include @@ -41,7 +39,7 @@ namespace blast , spacing_ {spacing} { BLAST_USER_ASSERT(spacing > 0, "Vector element spacing must be positive."); - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -52,6 +50,7 @@ namespace blast SimdVecType load() const noexcept { // Non-optimized + // TODO: use gather() IntrinsicType v; for (size_t i = 0; i < SS; ++i) v[i] = ptr_[spacing_ * i]; @@ -63,18 +62,18 @@ namespace blast SimdVecType load(MaskType mask) const noexcept { // Non-optimized - IntrinsicType v = blast::setzero, SS>(); + // TODO: use gather() + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[spacing_ * i]; + v[i] = mask[i] ? ptr_[spacing_ * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp index 8cd7e1af..6017dafa 100644 --- a/include/blast/math/dense/StaticVectorPointer.hpp +++ b/include/blast/math/dense/StaticVectorPointer.hpp @@ -5,10 +5,7 @@ #pragma once -#include -#include -#include -#include +#include #include @@ -73,27 +70,27 @@ namespace blast else { // Non-optimized - IntrinsicType v = blast::setzero(); + // TODO: use gather + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[S * i]; + v[i] = mask[i] ? ptr_[S * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } - void store(IntrinsicType val) const noexcept + void store(SimdVecType val) const noexcept { if constexpr (S == 1) { - blast::store(ptr_, val); + val.store(ptr_, AF); } else { @@ -108,7 +105,7 @@ namespace blast { if constexpr (S == 1) { - blast::maskstore(ptr_, mask, val); + val.store(ptr_, mask); } else { diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp index 840ff7e9..1e3f1233 100644 --- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp +++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp @@ -5,11 +5,9 @@ #pragma once #include -#include -#include #include #include -#include +#include #include #include #include diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index b1d2a624..585b6f6b 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -290,4 +289,4 @@ namespace blaze { using Type = blast::StaticPanelMatrix; }; -} \ No newline at end of file +} diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp index 17e19194..aaadbda9 100644 --- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp +++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp @@ -7,10 +7,7 @@ #include #include #include -#include -#include -#include -#include +#include #include #include #include diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index 10e135bd..9cd7f826 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -4,11 +4,7 @@ #pragma once -#include -#include -#include -#include -#include +#include #include #include #include @@ -122,7 +118,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index 5724a05b..5c37dc64 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -117,7 +117,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } @@ -562,7 +562,7 @@ namespace blast #pragma unroll for (size_t k = 0; k < j; ++k) { - IntrinsicType const a_kj = (~A)(k, j).broadcast(); + SimdVecType const a_kj = (~A)(k, j).broadcast(); #pragma unroll for (size_t i = 0; i < RM; ++i) @@ -705,11 +705,11 @@ namespace blast #pragma unroll for (size_t j = 0; j < k; ++j) { - T const a_kj = v_[k / SS][j][k % SS]; + SimdVecType const a_kj = v_[k / SS][j][k % SS]; #pragma unroll for (size_t i = 0; i < RM; ++i) if (i >= k / SS) - v_[i][k] = fnmadd(set1(a_kj), v_[i][j], v_[i][k]); + v_[i][k] = fnmadd(a_kj, v_[i][j], v_[i][k]); } T const sqrt_a_kk = std::sqrt(v_[k / SS][k][k % SS]); @@ -718,7 +718,7 @@ namespace blast for (size_t i = 0; i < RM; ++i) { if (i < k / SS) - v_[i][k] = setzero(); + v_[i][k].reset(); else v_[i][k] /= sqrt_a_kk; } diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp deleted file mode 100644 index 65f521c4..00000000 --- a/include/blast/math/simd/Simd.hpp +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include - -#include -#include -#include - -#include -#include - -#include - - -namespace blast -{ - using namespace blaze; - - - //******************************************************* - // - // LOAD - // - //******************************************************* - - - template - auto broadcast(T const * ptr); - - - inline auto maskload(double const * ptr, __m256i mask) - { - return _mm256_maskload_pd(ptr, mask); - } - - - inline auto maskload(float const * ptr, __m256i mask) - { - return _mm256_maskload_ps(ptr, mask); - } - - - template <> - inline auto broadcast<4, double>(double const * ptr) - { - return _mm256_broadcast_sd(ptr); - } - - - template <> - inline auto broadcast<8, float>(float const * ptr) - { - return _mm256_broadcast_ss(ptr); - } - - - //******************************************************* - // - // STORE - // - //******************************************************* - template - void store(double * ptr, __m256d a); - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_store_pd(ptr, a); - } - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_storeu_pd(ptr, a); - } - - - template - void store(float * ptr, __m256 a); - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_store_ps(ptr, a); - } - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_storeu_ps(ptr, a); - } - - - inline void maskstore(double * ptr, __m256i m, __m256d a) - { - _mm256_maskstore_pd(ptr, m, a); - } - - - inline void maskstore(float * ptr, __m256i m, __m256 a) - { - _mm256_maskstore_ps(ptr, m, a); - } - - - //******************************************************* - // - // SET - // - //******************************************************* - - inline auto set(double a3, double a2, double a1, double a0) - { - return _mm256_set_pd(a3, a2, a1, a0); - } - - - inline auto set(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) - { - return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0); - } - - - inline auto set(long long a3, long long a2, long long a1, long long a0) - { - return _mm256_set_epi64x(a3, a2, a1, a0); - } - - - template - auto set1(T a); - - - inline auto set1(double a) - { - return _mm256_set1_pd(a); - } - - - inline auto set1(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<4, double>(double a) - { - return _mm256_set1_pd(a); - } - - - template <> - inline auto set1<8, float>(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<8, int>(int val) - { - return _mm256_set1_epi32(val); - } - - - template <> - inline auto set1<4, long long>(long long val) - { - return _mm256_set1_epi64x(val); - } - - - template - auto setzero(); - - - template <> - inline auto setzero() - { - return _mm256_setzero_pd(); - } - - - template <> - inline auto setzero() - { - return _mm256_setzero_ps(); - } - - - //******************************************************* - // - // ARITHMETIC - // - //******************************************************* - - inline auto fnmadd(__m256d a, __m256d b, __m256d c) - { - return _mm256_fnmadd_pd(a, b, c); - } - - - inline auto fnmadd(__m256 a, __m256 b, __m256 c) - { - return _mm256_fnmadd_ps(a, b, c); - } - - - inline auto abs(__m256d a) - { - return _mm256_andnot_pd(set1(-0.), a); - } - - - inline auto abs(__m256 a) - { - return _mm256_andnot_ps(set1(-0.f), a); - } - - - //******************************************************* - // - // COMPARE - // - //******************************************************* - - template - auto cmpgt(MM a, MM b); - - - template <> - inline auto cmpgt<4>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi64(a, b); - } - - - template <> - inline auto cmpgt<8>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi32(a, b); - } -} diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 2aef5e5a..6cd34223 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -132,6 +132,20 @@ namespace blast } + /** + * @brief Fused negative multiply-add + * + * Calculate -a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + /** * @brief Fused multiply-add * @@ -298,6 +312,15 @@ namespace blast } + /** + * @brief Set to 0 + */ + void reset() noexcept + { + value_ = ValueType {}; + } + + operator IntrinsicType() const noexcept { return value_; @@ -374,6 +397,7 @@ namespace blast friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept; friend SimdVec abs<>(SimdVec const& a) noexcept; friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept; @@ -400,6 +424,13 @@ namespace blast } + template + inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fnma(a.value_, b.value_, c.value_); + } + + template inline SimdVec blend(SimdVec const& a, SimdVec const& b, typename SimdVec::MaskType mask) noexcept { diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index 47a3aaf1..cc175930 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -73,19 +73,6 @@ namespace blast } - /* - NOTE: without this specialization some tests fail, e.g.: - - [ RUN ] RegisterMatrixTest/5.testPartialGerNT @ ./test/blast/math/simd/RegisterMatrixTest.cpp:386 - │ Failure @ ./test/blast/math/simd/RegisterMatrixTest.cpp:420 - │ │ Expected equality of these values: - │ │ ker(i, j) - │ │ Which is: 0.482636 - │ │ i < m && j < n ? D_ref(i, j) : 0. - │ │ Which is: 0.482636 - │ element mismatch at (0, 1), store size = 1x2 - [ FAILED ] RegisterMatrixTest/5.testPartialGerNT, where TypeParam = blast::RegisterMatrix (1 ms) - */ template <> inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept { @@ -93,6 +80,13 @@ namespace blast } + template <> + inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return _mm256_fnmadd_ps(a.value_, b.value_, c.value_); + } + + template <> inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept { diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index 22d358ea..229fd666 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -80,6 +80,13 @@ namespace blast } + template <> + inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return _mm256_fnmadd_pd(a.value_, b.value_, c.value_); + } + + template <> inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept { diff --git a/include/blast/math/views/row/Panel.hpp b/include/blast/math/views/row/Panel.hpp index 5daadc0e..55c3f9f4 100644 --- a/include/blast/math/views/row/Panel.hpp +++ b/include/blast/math/views/row/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,9 +16,6 @@ #include #include -#include -#include - namespace blast { @@ -195,4 +191,4 @@ namespace blast BLAST_CONSTRAINT_MUST_NOT_BE_PANEL_SUBMATRIX_TYPE(MT); //********************************************************************************************** }; -} \ No newline at end of file +} diff --git a/include/blast/math/views/submatrix/Panel.hpp b/include/blast/math/views/submatrix/Panel.hpp index c922bba7..50ba6a79 100644 --- a/include/blast/math/views/submatrix/Panel.hpp +++ b/include/blast/math/views/submatrix/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -25,9 +24,6 @@ #include #include -#include -#include - namespace blast { @@ -283,4 +279,4 @@ namespace blast { return PanelSubmatrix(matrix.operand(), matrix.row() + row, matrix.column() + column, m, n); } -} \ No newline at end of file +} From f91233a5f31a83b0c08575974c91547f9d646dd7 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 16:41:55 +0200 Subject: [PATCH 17/27] Default architecture changed to fma3 in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 598aebdd..fb47dea6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib RUN mkdir -p blast/build && cd blast/build \ && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER="clang++-15" \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH=avx2" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH='fma3'" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \ -DBLAST_WITH_TEST=ON \ -DBLAST_WITH_BENCHMARK=ON \ From 4b5068422a9aabf7e1d451da75468c31a6aa6288 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 16:54:40 +0200 Subject: [PATCH 18/27] Got rid of SimdVecBase --- .../math/register_matrix/RegisterMatrix.hpp | 5 +- include/blast/math/simd/SimdVecBase.hpp | 78 ------------------- 2 files changed, 1 insertion(+), 82 deletions(-) delete mode 100644 include/blast/math/simd/SimdVecBase.hpp diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index 5c37dc64..631b530d 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -4,10 +4,7 @@ #pragma once -#include -#include -#include -#include +#include #include #include #include diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp deleted file mode 100644 index 53ef4ba4..00000000 --- a/include/blast/math/simd/SimdVecBase.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - -#include - - -namespace blast -{ - template - class SimdVecBase - { - public: - using ValueType = T; - using XSimdType = xsimd::batch; - using IntrinsicType = typename XSimdType::register_type; - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return XSimdType::size; - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_.get(i); - } - - - protected: - SimdVecBase(IntrinsicType value) - : value_ {value} - { - } - - - SimdVecBase(ValueType const& value) - : value_ {value} - { - } - - - XSimdType value_; - }; -} From 2c563055dc7428a3c108fcd35fd97230c9fd8667 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 17:43:06 +0200 Subject: [PATCH 19/27] Made the code compile with gcc (except internal compiler errors) --- include/blast/math/Simd.hpp | 3 ++- include/blast/math/simd/SimdMask.hpp | 2 +- include/blast/math/simd/SimdVec.hpp | 21 ++++++++++--------- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 10 ++++----- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 10 ++++----- 5 files changed, 24 insertions(+), 22 deletions(-) diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp index f971dbe4..e6b4198f 100644 --- a/include/blast/math/Simd.hpp +++ b/include/blast/math/Simd.hpp @@ -4,10 +4,11 @@ #pragma once -#include #include #include #include #include #include #include + +#include diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 325d9ee5..3cc7f3a8 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -72,7 +72,7 @@ namespace blast */ template SimdMask(xsimd::batch_bool const& v) noexcept - : XSimdType {IntrinsicType(v)} + : XSimdType {xsimd::batch_bool_cast(v)} { } diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 6cd34223..02b5c1c8 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -15,6 +15,7 @@ #pragma once #include +#include #include @@ -51,14 +52,14 @@ namespace blast requires std::is_base_of_v inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept { - _mm256_maskstore_ps(dst, mask, v); + _mm256_maskstore_ps(dst, xsimd::batch_bool_cast(mask), v); } template requires std::is_base_of_v inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept { - _mm256_maskstore_pd(dst, mask, v); + _mm256_maskstore_pd(dst, xsimd::batch_bool_cast(mask), v); } @@ -75,7 +76,7 @@ namespace blast // __m256 v3 = _mm256_max_ps(v1, v2); __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); - __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, mask_v3); + __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3)); /* v4 = [Z Z W W | X X Y Y] */ __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); @@ -86,7 +87,7 @@ namespace blast // __m256 v5 = _mm256_max_ps(v3, v4); __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); - __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, mask_v5); + __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5)); /* v6 = [I I I I | J J J J] */ __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); @@ -102,7 +103,7 @@ namespace blast // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); - __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, mask_v7); + __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7)); return {v7, iv7}; } @@ -118,7 +119,7 @@ namespace blast // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); - __m256i const im1 = _mm256_blendv_epi8(idx, iy, mask_m1); + __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1)); __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); @@ -126,7 +127,7 @@ namespace blast // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); - __m256i const im = _mm256_blendv_epi8(im1, im2, mask_m); + __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m)); return {m, im}; } @@ -191,7 +192,7 @@ namespace blast T max(SimdVec const& x) noexcept; template - typename SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept; + SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept; /** * @brief Multiplication @@ -240,7 +241,7 @@ namespace blast using ValueType = T; using XSimdType = xsimd::batch; using IntrinsicType = typename XSimdType::register_type; - using MaskType = xsimd::batch_bool; + using MaskType = SimdMask; /** @@ -446,7 +447,7 @@ namespace blast template - inline typename SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept { return xsimd::gt(a.value_, b.value_); } diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp index cc175930..b2059a7f 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat32.hpp @@ -14,7 +14,7 @@ #pragma once -#include +#include #include @@ -60,9 +60,9 @@ namespace blast template <> - inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept { - return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ)); + return _mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ); } @@ -88,9 +88,9 @@ namespace blast template <> - inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask mask) noexcept { - return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask)); + return _mm256_blendv_ps(a.value_, b.value_, mask); } diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp index 229fd666..8876909a 100644 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ b/include/blast/math/simd/avx256/SimdVecFloat64.hpp @@ -14,7 +14,7 @@ #pragma once -#include +#include #include @@ -60,9 +60,9 @@ namespace blast template <> - inline SimdVec::MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept + inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept { - return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ)); + return _mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ); } @@ -88,9 +88,9 @@ namespace blast template <> - inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdVec::MaskType mask) noexcept + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask mask) noexcept { - return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask)); + return _mm256_blendv_pd(a.value_, b.value_, mask); } From 7efe21bb2be19c65477124af603ad9067d483e97 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 17:45:40 +0200 Subject: [PATCH 20/27] Build with clang-16 --- .github/workflows/cmake.yml | 14 ++++++++++++-- include/blast/math/RegisterMatrix.hpp | 6 +++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 2df1b1d3..cef30766 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -21,7 +21,17 @@ jobs: - uses: actions/checkout@v3 - name: Install APT packages - run: sudo apt install clang libboost-exception-dev libbenchmark-dev -y + run: | + sudo apt-get update + sudo apt install clang-16 libboost-exception-dev libbenchmark-dev -y + + # - name: Install LLVM and Clang 18 + # run: | + # sudo apt-get update + # sudo apt-get install -y wget gnupg lsb-release + # wget https://apt.llvm.org/llvm.sh + # chmod +x llvm.sh + # sudo ./llvm.sh 18 - name: Install Blaze run: | @@ -54,7 +64,7 @@ jobs: run: | cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ - -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_COMPILER=clang++-16 \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp index 3c8944c4..90b379c3 100644 --- a/include/blast/math/RegisterMatrix.hpp +++ b/include/blast/math/RegisterMatrix.hpp @@ -7,6 +7,6 @@ #include #include -#include -#include -#include +// #include +// #include +// #include From 84695b9d079c2cf83ebcdc2d6be48f3abc3a643e Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 2 Aug 2024 18:21:37 +0200 Subject: [PATCH 21/27] Build with clang-18 --- .github/workflows/cmake.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index cef30766..ece754f4 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -23,15 +23,15 @@ jobs: - name: Install APT packages run: | sudo apt-get update - sudo apt install clang-16 libboost-exception-dev libbenchmark-dev -y + sudo apt install libboost-exception-dev libbenchmark-dev -y - # - name: Install LLVM and Clang 18 - # run: | - # sudo apt-get update - # sudo apt-get install -y wget gnupg lsb-release - # wget https://apt.llvm.org/llvm.sh - # chmod +x llvm.sh - # sudo ./llvm.sh 18 + - name: Install LLVM and Clang 18 + run: | + sudo apt-get update + sudo apt-get install -y wget gnupg lsb-release + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 - name: Install Blaze run: | @@ -64,7 +64,7 @@ jobs: run: | cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ - -DCMAKE_CXX_COMPILER=clang++-16 \ + -DCMAKE_CXX_COMPILER=clang++-18 \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON From c3d5882a6d73a6a3ab181af41e8f3b24df824edd Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Sat, 3 Aug 2024 04:23:07 +0200 Subject: [PATCH 22/27] Removed RegisterMatrix specializations --- include/blast/math/RegisterMatrix.hpp | 3 - .../math/register_matrix/double_12_4_4.hpp | 167 ------------------ .../math/register_matrix/double_4_4_4.hpp | 79 --------- .../math/register_matrix/double_8_4_4.hpp | 151 ---------------- 4 files changed, 400 deletions(-) delete mode 100644 include/blast/math/register_matrix/double_12_4_4.hpp delete mode 100644 include/blast/math/register_matrix/double_4_4_4.hpp delete mode 100644 include/blast/math/register_matrix/double_8_4_4.hpp diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp index 90b379c3..851ef7ef 100644 --- a/include/blast/math/RegisterMatrix.hpp +++ b/include/blast/math/RegisterMatrix.hpp @@ -7,6 +7,3 @@ #include #include -// #include -// #include -// #include diff --git a/include/blast/math/register_matrix/double_12_4_4.hpp b/include/blast/math/register_matrix/double_12_4_4.hpp deleted file mode 100644 index a640d2c1..00000000 --- a/include/blast/math/register_matrix/double_12_4_4.hpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - v_[2][0] = beta * ptr(2 * SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - v_[2][1] = beta * ptr(2 * SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - v_[2][2] = beta * ptr(2 * SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - v_[2][3] = beta * ptr(2 * SS, 3).load(); - } - } - - - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 3; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[3], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - bx[2] = alpha * b.load(2 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - bx[2] = alpha * b.load(2 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - bx[2] = alpha * b.load(2 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - bx[2] = alpha * b.load(2 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - v_[2][3] = fmadd(bx[2], ax, v_[2][3]); - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_4_4_4.hpp b/include/blast/math/register_matrix/double_4_4_4.hpp deleted file mode 100644 index 1c73605c..00000000 --- a/include/blast/math/register_matrix/double_4_4_4.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - v_[0][0] = beta * ptr.load(); - - if (n > 1) - v_[0][1] = beta * ptr(0, 1).load(); - - if (n > 2) - v_[0][2] = beta * ptr(0, 2).load(); - - if (n > 3) - v_[0][3] = beta * ptr(0, 3).load(); - } - - -#if 1 - /// Magically, this function specialization is slightly faster than the default implementation of RegisterMatrix<>::store. - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - if (m >= 4) - { - if (n > 0) - ptr.store(v_[0][0]); - - if (n > 1) - ptr(0, 1).store(v_[0][1]); - - if (n > 2) - ptr(0, 2).store(v_[0][2]); - - if (n > 3) - ptr(0, 3).store(v_[0][3]); - } - else if (m > 0) - { - // Magically, the code below is significantly faster than this: - // __m256i const mask = _mm256_cmpgt_epi64(_mm256_set_epi64x(m, m, m, m), _mm256_set_epi64x(3, 2, 1, 0)); - __m256i const mask = _mm256_set_epi64x( - m > 3 ? 0x8000000000000000ULL : 0, - m > 2 ? 0x8000000000000000ULL : 0, - m > 1 ? 0x8000000000000000ULL : 0, - m > 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr.store(v_[0][0], mask); - - if (n > 1) - ptr(0, 1).store(v_[0][1], mask); - - if (n > 2) - ptr(0, 2).store(v_[0][2], mask); - - if (n > 3) - ptr(0, 3).store(v_[0][3], mask); - } - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_8_4_4.hpp b/include/blast/math/register_matrix/double_8_4_4.hpp deleted file mode 100644 index 4ec3d416..00000000 --- a/include/blast/math/register_matrix/double_8_4_4.hpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - } - } - - -#if 1 - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 2; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } -#endif - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[2], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - } -#endif -} \ No newline at end of file From d789021c8abdcefb0de5da9975d783f9c5186a76 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Sat, 3 Aug 2024 04:39:24 +0200 Subject: [PATCH 23/27] Removed avx2 SimdVec specializations --- include/blast/math/Simd.hpp | 2 - include/blast/math/dense/Iamax.hpp | 3 +- include/blast/math/simd/Avx256.hpp | 18 --- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 126 ------------------ .../blast/math/simd/avx256/SimdVecFloat64.hpp | 121 ----------------- test/blast/math/simd/SimdVecTest.cpp | 1 - 6 files changed, 1 insertion(+), 270 deletions(-) delete mode 100644 include/blast/math/simd/Avx256.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat32.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat64.hpp diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp index e6b4198f..912948ce 100644 --- a/include/blast/math/Simd.hpp +++ b/include/blast/math/Simd.hpp @@ -10,5 +10,3 @@ #include #include #include - -#include diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 6b03a36a..530df07b 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -177,4 +176,4 @@ namespace blast return iamax(N, ptr(*x)); } -} \ No newline at end of file +} diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp deleted file mode 100644 index 1fbc41c0..00000000 --- a/include/blast/math/simd/Avx256.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp deleted file mode 100644 index b2059a7f..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template <> - inline SimdVec::SimdVec() noexcept - : value_ {_mm256_setzero_ps()} - { - } - - - template <> - inline SimdVec::SimdVec(IntrinsicType value) noexcept - : value_ {value} - { - } - - - template <> - inline SimdVec::SimdVec(ValueType value) noexcept - : value_ {_mm256_set1_ps(value)} - { - } - - - template <> - inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept - : value_ {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} - { - } - - - template <> - inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_ps(dst, value_); - else - _mm256_storeu_ps(dst, value_); - } - - - template <> - inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ); - } - - - template <> - inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_ps(a.value_, b.value_); - } - - - template <> - inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_ps(a.value_, b.value_, c.value_); - } - - - template <> - inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fnmadd_ps(a.value_, b.value_, c.value_); - } - - - template <> - inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask mask) noexcept - { - return _mm256_blendv_ps(a.value_, b.value_, mask); - } - - - template <> - inline SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_ps(SimdVec {-0.f}, a.value_); - } - - - template <> - inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_max_ps(a, b); - } - - - template <> - inline float max(SimdVec const& x) noexcept - { - __m256 v1 = x.value_; /* v1 = [H G F E | D C B A] */ - __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01); /* v2 = [G H E F | C D A B] */ - __m256 v3 = _mm256_max_ps(v1, v2); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10); /* v4 = [Z Z W W | X X Y Y] */ - __m256 v5 = _mm256_max_ps(v3, v4); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - __m128 v6 = _mm256_extractf128_ps(v5, 1); /* v6 = [- - - - | J J J J] */ - __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - - return v7[0]; - } -} diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp deleted file mode 100644 index 8876909a..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template <> - inline SimdVec::SimdVec() noexcept - : value_ {_mm256_setzero_pd()} - { - } - - - template <> - inline SimdVec::SimdVec(IntrinsicType value) noexcept - : value_ {value} - { - } - - - template <> - inline SimdVec::SimdVec(ValueType value) noexcept - : value_ {_mm256_set1_pd(value)} - { - } - - - template <> - inline SimdVec::SimdVec(ValueType const * src, bool aligned) noexcept - : value_ {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} - { - } - - - template <> - inline void SimdVec::store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_pd(dst, value_); - else - _mm256_storeu_pd(dst, value_); - } - - - template <> - inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ); - } - - - template <> - inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_pd(a.value_, b.value_); - } - - - template <> - inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_pd(a.value_, b.value_, c.value_); - } - - - template <> - inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fnmadd_pd(a.value_, b.value_, c.value_); - } - - - template <> - inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask mask) noexcept - { - return _mm256_blendv_pd(a.value_, b.value_, mask); - } - - - template <> - inline SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_pd(_mm256_set1_pd(-0.), a.value_); - } - - - template <> - inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_max_pd(a, b); - } - - - template <> - inline double max(SimdVec const& x) noexcept - { - __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - - return m[0]; - } -} diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp index 91765526..f5b2fa04 100644 --- a/test/blast/math/simd/SimdVecTest.cpp +++ b/test/blast/math/simd/SimdVecTest.cpp @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include #include #include From 01754af861e50b20fcdc085eb9af5d12ef375baa Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Sat, 3 Aug 2024 04:42:36 +0200 Subject: [PATCH 24/27] Removed avx2 hsum() --- include/blast/math/simd/Hsum.hpp | 31 ------------------------- test/blast/CMakeLists.txt | 1 - test/blast/math/simd/HsumTest.cpp | 38 ------------------------------- 3 files changed, 70 deletions(-) delete mode 100644 include/blast/math/simd/Hsum.hpp delete mode 100644 test/blast/math/simd/HsumTest.cpp diff --git a/include/blast/math/simd/Hsum.hpp b/include/blast/math/simd/Hsum.hpp deleted file mode 100644 index 9c7b5f4b..00000000 --- a/include/blast/math/simd/Hsum.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include - - -namespace blast -{ - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m128d hsum(__m256d v0, __m256d v1) - { - __m256d v01 = _mm256_hadd_pd(v0, v1); - return _mm_add_pd(_mm256_extractf128_pd(v01, 1), _mm256_castpd256_pd128(v01)); - } - - - /// dst[255:192] = sum(v3) - /// dst[191:128] = sum(v2) - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m256d hsum(__m256d v0, __m256d v1, __m256d v2, __m256d v3) - { - __m128d v01 = hsum(v0, v1); - __m128d v23 = hsum(v2, v3); - return _mm256_insertf128_pd(_mm256_castpd128_pd256(v01), v23, 1); - } -} \ No newline at end of file diff --git a/test/blast/CMakeLists.txt b/test/blast/CMakeLists.txt index 5edafda1..fa479125 100644 --- a/test/blast/CMakeLists.txt +++ b/test/blast/CMakeLists.txt @@ -13,7 +13,6 @@ find_package(LAPACK REQUIRED) add_executable(test-blast math/simd/RegisterMatrixTest.cpp math/simd/DynamicRegisterMatrixTest.cpp - math/simd/HsumTest.cpp math/simd/SimdVecTest.cpp math/dense/StaticVectorPointerTest.cpp diff --git a/test/blast/math/simd/HsumTest.cpp b/test/blast/math/simd/HsumTest.cpp deleted file mode 100644 index 3153e042..00000000 --- a/test/blast/math/simd/HsumTest.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include - -#include -#include - -#include - -using namespace blaze; - - -namespace blast :: testing -{ - TEST(HsumTest, test4x256d) - { - std::array a, b, c, d, r; - randomize(a); - randomize(b); - randomize(c); - randomize(d); - - __m256d mma = _mm256_loadu_pd(a.data()); - __m256d mmb = _mm256_loadu_pd(b.data()); - __m256d mmc = _mm256_loadu_pd(c.data()); - __m256d mmd = _mm256_loadu_pd(d.data()); - - __m256d mmr = hsum(mma, mmb, mmc, mmd); - _mm256_storeu_pd(r.data(), mmr); - - EXPECT_NEAR(r[0], std::accumulate(begin(a), end(a), 0.), 1.e-10); - EXPECT_NEAR(r[1], std::accumulate(begin(b), end(b), 0.), 1.e-10); - EXPECT_NEAR(r[2], std::accumulate(begin(c), end(c), 0.), 1.e-10); - EXPECT_NEAR(r[3], std::accumulate(begin(d), end(d), 0.), 1.e-10); - } -} \ No newline at end of file From 1b8c62c4f0eaf4fdb3c4abf06b31b5da5a9c86f9 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Sat, 3 Aug 2024 05:19:27 +0200 Subject: [PATCH 25/27] Arch-specific code moved to blast/math/simd/arch --- include/blast/math/panel/PanelSize.hpp | 3 +- include/blast/math/simd/RegisterCapacity.hpp | 2 +- include/blast/math/simd/Simd.hpp | 20 +++ include/blast/math/simd/SimdIndex.hpp | 2 +- include/blast/math/simd/SimdMask.hpp | 2 +- include/blast/math/simd/SimdSize.hpp | 2 +- include/blast/math/simd/SimdVec.hpp | 128 ++----------------- include/blast/math/simd/arch/Avx2.hpp | 123 ++++++++++++++++++ 8 files changed, 157 insertions(+), 125 deletions(-) create mode 100644 include/blast/math/simd/Simd.hpp create mode 100644 include/blast/math/simd/arch/Avx2.hpp diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 9647a0cd..6bcf6bbf 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -9,8 +9,7 @@ //************************************************************************************************* #include - -#include +#include namespace blast diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp index 213b6d80..35b2ac7e 100644 --- a/include/blast/math/simd/RegisterCapacity.hpp +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include +#include #include diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp new file mode 100644 index 00000000..b5bd8873 --- /dev/null +++ b/include/blast/math/simd/Simd.hpp @@ -0,0 +1,20 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#if XSIMD_WITH_AVX2 + #include +#endif diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp index bfbae1c5..3d7ca7b2 100644 --- a/include/blast/math/simd/SimdIndex.hpp +++ b/include/blast/math/simd/SimdIndex.hpp @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include +#include #include #include diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 3cc7f3a8..bdb26082 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -14,7 +14,7 @@ #pragma once -#include +#include namespace blast diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp index 4a22339a..01a64825 100644 --- a/include/blast/math/simd/SimdSize.hpp +++ b/include/blast/math/simd/SimdSize.hpp @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 02b5c1c8..5d6afea1 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -16,8 +16,7 @@ #include #include - -#include +#include #include @@ -27,112 +26,6 @@ namespace blast template class SimdVec; - - template - xsimd::batch maskload(T const * src, xsimd::batch_bool const& mask) noexcept; - - template - requires std::is_base_of_v - inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept - { - return _mm256_maskload_ps(src, mask); - } - - template - requires std::is_base_of_v - inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept - { - return _mm256_maskload_pd(src, mask); - } - - template - void maskstore(xsimd::batch const& v, T * dst, xsimd::batch_bool const& mask) noexcept; - - template - requires std::is_base_of_v - inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept - { - _mm256_maskstore_ps(dst, xsimd::batch_bool_cast(mask), v); - } - - template - requires std::is_base_of_v - inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept - { - _mm256_maskstore_pd(dst, xsimd::batch_bool_cast(mask), v); - } - - - template - requires std::is_base_of_v - inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) noexcept - { - /* v2 = [G H E F | C D A B] */ - __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); - - /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - // __m256 v3 = _mm256_max_ps(v1, v2); - __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); - __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); - __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3)); - - /* v4 = [Z Z W W | X X Y Y] */ - __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); - - /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - // __m256 v5 = _mm256_max_ps(v3, v4); - __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); - __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); - __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5)); - - /* v6 = [I I I I | J J J J] */ - __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - __m256i const iv6 = _mm256_castps_si256( - _mm256_permute2f128_ps( - _mm256_castsi256_ps(iv5), - _mm256_castsi256_ps(iv5), - 0b0000'0001 - ) - ); - - /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); - __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); - __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); - __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7)); - - return {v7, iv7}; - } - - - template - requires std::is_base_of_v - inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) noexcept - { - __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values - __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1); - - // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); - __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); - __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1)); - - __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); - - // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); - __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); - __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m)); - - return {m, im}; - } - - /** * @brief Fused negative multiply-add * @@ -162,7 +55,7 @@ namespace blast SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; template - SimdVec blend(SimdVec const& a, SimdVec const& b, typename SimdVec::MaskType mask) noexcept; + SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept; template SimdVec abs(SimdVec const& a) noexcept; @@ -181,12 +74,9 @@ namespace blast /** * @brief Horizontal max (across all elements) * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * * @param a pack of 32-bit floating point numbers * - * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]) + * @return max(a[0], a[1], ..., a[N-1]) */ template T max(SimdVec const& x) noexcept; @@ -399,7 +289,7 @@ namespace blast friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; - friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept; + friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType const& mask) noexcept; friend SimdVec abs<>(SimdVec const& a) noexcept; friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept; friend ValueType max<>(SimdVec const& x) noexcept; @@ -433,7 +323,7 @@ namespace blast template - inline SimdVec blend(SimdVec const& a, SimdVec const& b, typename SimdVec::MaskType mask) noexcept + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept { return xsimd::select(mask, a.value_, b.value_); } @@ -456,28 +346,28 @@ namespace blast template inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept { - return a.value_ * b.value_; + return xsimd::mul(a.value_, b.value_); } template inline SimdVec operator*(T const& a, SimdVec const& b) noexcept { - return SimdVec {a} * b; + return xsimd::mul(a, b.value_); } template inline SimdVec operator*(SimdVec const& a, T const& b) noexcept { - return a * SimdVec {b}; + return xsimd::mul(a.value_, b); } template inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept { - return xsimd::max(a, b); + return xsimd::max(a.value_, b.value_); } diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp new file mode 100644 index 00000000..a77c09c4 --- /dev/null +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -0,0 +1,123 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include + + +namespace blast +{ + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_ps(src, mask); + } + + + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_pd(src, mask); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_ps(dst, xsimd::batch_bool_cast(mask), v); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_pd(dst, xsimd::batch_bool_cast(mask), v); + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) noexcept + { + /* v2 = [G H E F | C D A B] */ + __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); + __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + + /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + // __m256 v3 = _mm256_max_ps(v1, v2); + __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); + __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); + __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3)); + + /* v4 = [Z Z W W | X X Y Y] */ + __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); + __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + + /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + // __m256 v5 = _mm256_max_ps(v3, v4); + __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); + __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); + __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5)); + + /* v6 = [I I I I | J J J J] */ + __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); + __m256i const iv6 = _mm256_castps_si256( + _mm256_permute2f128_ps( + _mm256_castsi256_ps(iv5), + _mm256_castsi256_ps(iv5), + 0b0000'0001 + ) + ); + + /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); + __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); + __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); + __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7)); + + return {v7, iv7}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) noexcept + { + __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values + __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1); + + // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); + __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); + __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1)); + + __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + + // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); + __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); + __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m)); + + return {m, im}; + } +} From 9ebf1ed27c312665f5420e6634e3dc655953330b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Sat, 3 Aug 2024 05:26:11 +0200 Subject: [PATCH 26/27] Builds with gcc --- include/blast/math/simd/arch/Avx2.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp index a77c09c4..b5aed8c1 100644 --- a/include/blast/math/simd/arch/Avx2.hpp +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -24,7 +24,7 @@ namespace blast requires std::is_base_of_v inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept { - return _mm256_maskload_ps(src, mask); + return _mm256_maskload_ps(src, _mm256_castps_si256(mask)); } @@ -32,7 +32,7 @@ namespace blast requires std::is_base_of_v inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept { - return _mm256_maskload_pd(src, mask); + return _mm256_maskload_pd(src, _mm256_castpd_si256(mask)); } @@ -40,7 +40,7 @@ namespace blast requires std::is_base_of_v inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept { - _mm256_maskstore_ps(dst, xsimd::batch_bool_cast(mask), v); + _mm256_maskstore_ps(dst, _mm256_castps_si256(mask), v); } @@ -48,7 +48,7 @@ namespace blast requires std::is_base_of_v inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept { - _mm256_maskstore_pd(dst, xsimd::batch_bool_cast(mask), v); + _mm256_maskstore_pd(dst, _mm256_castpd_si256(mask), v); } From 2f757bbffa95f8c632ec97644f165665bfa10bba Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Mon, 5 Aug 2024 14:16:03 +0200 Subject: [PATCH 27/27] Documented iamax() --- include/blast/math/simd/SimdVec.hpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index 5d6afea1..140516ef 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -298,9 +298,20 @@ namespace blast friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept; friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; - friend std::tuple> imax(SimdVec const& v1, SimdIndex const& idx) noexcept + /** + * @brief Maximum element of a batch and its index. + * + * For example, imax([-1., 0., 4., 3.], [1, 2, -1, 42]) == ([4., 4., 4., 4.], [-1, -1, -1, -1]) + * + * @param v batch of values + * @param idx batch of indices + * + * @return a tuple {A, I} such that A == [a, a, ..., a] and I == [idx[i], idx[i], ..., idx[i]], + * where a == max_j(v[j]), i = argmax_j(v[j]) + */ + friend std::tuple> imax(SimdVec const& v, SimdIndex const& idx) noexcept { - return imax(v1.value_, idx); + return imax(v.value_, idx); } private: