diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 9c1ace7c..ece754f4 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -21,7 +21,17 @@ jobs: - uses: actions/checkout@v3 - name: Install APT packages - run: sudo apt install clang libboost-exception-dev libbenchmark-dev -y + run: | + sudo apt-get update + sudo apt install libboost-exception-dev libbenchmark-dev -y + + - name: Install LLVM and Clang 18 + run: | + sudo apt-get update + sudo apt-get install -y wget gnupg lsb-release + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 - name: Install Blaze run: | @@ -36,6 +46,13 @@ jobs: -DCMAKE_INSTALL_PREFIX=/usr/local/ . \ && sudo make install + - name: Install xsimd + run: | + git clone https://github.com/xtensor-stack/xsimd.git + cd xsimd + cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . + sudo cmake --build build --target install + - name: Install GTest run: | git clone https://github.com/google/googletest.git @@ -47,7 +64,7 @@ jobs: run: | cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ - -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_COMPILER=clang++-18 \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON @@ -60,4 +77,3 @@ jobs: # Execute tests defined by the CMake configuration. # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail run: ctest -C ${{env.BUILD_TYPE}} - diff --git a/CMakeLists.txt b/CMakeLists.txt index faffe2fd..3b0ad928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ set(CMAKE_DEBUG_POSTFIX d) find_package(Boost REQUIRED COMPONENTS exception) find_package(blaze REQUIRED) +find_package(xsimd REQUIRED) add_library(blast INTERFACE) @@ -35,6 +36,7 @@ target_include_directories(blast INTERFACE target_link_libraries(blast INTERFACE blaze::blaze + INTERFACE xsimd ) target_compile_options(blast diff --git a/Dockerfile b/Dockerfile index 1ccfab49..fb47dea6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib RUN mkdir -p blast/build && cd blast/build \ && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER="clang++-15" \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH='fma3'" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \ -DBLAST_WITH_TEST=ON \ -DBLAST_WITH_BENCHMARK=ON \ @@ -79,4 +79,3 @@ CMD mkdir -p blast/bench_result/data \ && mkdir -p blast/bench_result/image \ && cd blast \ && make -j 1 bench_result/image/dgemm_performance.png bench_result/image/dgemm_performance_ratio.png - diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp index 3c8944c4..851ef7ef 100644 --- a/include/blast/math/RegisterMatrix.hpp +++ b/include/blast/math/RegisterMatrix.hpp @@ -7,6 +7,3 @@ #include #include -#include -#include -#include diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp index e13c311f..ae7a8b5a 100644 --- a/include/blast/math/RowColumnVectorPointer.hpp +++ b/include/blast/math/RowColumnVectorPointer.hpp @@ -15,7 +15,7 @@ #pragma once #include -#include +#include #include #include @@ -28,9 +28,9 @@ namespace blast { public: using ElementType = typename MP::ElementType; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static TransposeFlag constexpr transposeFlag = TF; static bool constexpr aligned = MP::aligned; @@ -176,7 +176,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); MP ptr_; }; @@ -244,4 +244,4 @@ namespace blast { return RowColumnVectorPointer {std::move(p)}; } -} \ No newline at end of file +} diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp new file mode 100644 index 00000000..912948ce --- /dev/null +++ b/include/blast/math/Simd.hpp @@ -0,0 +1,12 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include +#include diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp index e6540576..5610b1c9 100644 --- a/include/blast/math/dense/DynamicMatrixPointer.hpp +++ b/include/blast/math/dense/DynamicMatrixPointer.hpp @@ -8,14 +8,11 @@ #include #include #include -#include -#include +#include #include - #include - namespace blast { template @@ -23,9 +20,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -83,9 +80,9 @@ namespace blast } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } @@ -187,7 +184,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -253,4 +250,4 @@ namespace blast { return {p, spacing}; } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp index b026d205..7638119a 100644 --- a/include/blast/math/dense/DynamicVectorPointer.hpp +++ b/include/blast/math/dense/DynamicVectorPointer.hpp @@ -4,8 +4,7 @@ #pragma once - -#include +#include #include #include @@ -18,9 +17,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -40,7 +39,7 @@ namespace blast , spacing_ {spacing} { BLAST_USER_ASSERT(spacing > 0, "Vector element spacing must be positive."); - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -51,6 +50,7 @@ namespace blast SimdVecType load() const noexcept { // Non-optimized + // TODO: use gather() IntrinsicType v; for (size_t i = 0; i < SS; ++i) v[i] = ptr_[spacing_ * i]; @@ -62,18 +62,18 @@ namespace blast SimdVecType load(MaskType mask) const noexcept { // Non-optimized - IntrinsicType v = blast::setzero, SS>(); + // TODO: use gather() + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[spacing_ * i]; + v[i] = mask[i] ? ptr_[spacing_ * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } @@ -172,7 +172,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -191,4 +191,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 6b03a36a..530df07b 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -177,4 +176,4 @@ namespace blast return iamax(N, ptr(*x)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp index 2db833b8..961b5e01 100644 --- a/include/blast/math/dense/StaticMatrixPointer.hpp +++ b/include/blast/math/dense/StaticMatrixPointer.hpp @@ -6,8 +6,8 @@ #include #include -#include #include +#include #include #include #include @@ -24,9 +24,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -188,7 +188,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -225,4 +225,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp index 6608ddd0..6017dafa 100644 --- a/include/blast/math/dense/StaticVectorPointer.hpp +++ b/include/blast/math/dense/StaticVectorPointer.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include #include @@ -16,9 +16,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -35,7 +35,7 @@ namespace blast constexpr StaticVectorPointer(T * ptr) noexcept : ptr_ {ptr} { - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -70,27 +70,27 @@ namespace blast else { // Non-optimized - IntrinsicType v = blast::setzero(); + // TODO: use gather + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[S * i]; + v[i] = mask[i] ? ptr_[S * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } - void store(IntrinsicType val) const noexcept + void store(SimdVecType val) const noexcept { if constexpr (S == 1) { - blast::store(ptr_, val); + val.store(ptr_, AF); } else { @@ -105,7 +105,7 @@ namespace blast { if constexpr (S == 1) { - blast::maskstore(ptr_, mask, val); + val.store(ptr_, mask); } else { @@ -195,7 +195,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -213,4 +213,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp index fae79ffc..4ad4526f 100644 --- a/include/blast/math/expressions/PMatTransExpr.hpp +++ b/include/blast/math/expressions/PMatTransExpr.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -213,4 +212,4 @@ namespace blast using ReturnType = const PMatTransExpr; return ReturnType(*pm); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp index 4a556a33..d0494887 100644 --- a/include/blast/math/expressions/PanelMatrix.hpp +++ b/include/blast/math/expressions/PanelMatrix.hpp @@ -5,9 +5,12 @@ #pragma once #include -#include +#include +#include +#include #include -//#include +#include +#include #include #include @@ -91,13 +94,12 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; - using SIMD = Simd; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) @@ -108,22 +110,22 @@ namespace blast for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + s * j, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m - rem; - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - maskstore(pl + s * j, mask, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load(), mask); } #if 0 @@ -185,22 +187,22 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) { for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = data(rhs) + i; - ET1 * pl = &(*lhs)(i, 0); + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + PANEL_SIZE * j, load(pr + s * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) @@ -243,4 +245,4 @@ namespace blast (*lhs)(i, j) = (*rhs)(i, j); } } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp index f965f4e1..1e3f1233 100644 --- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp +++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp @@ -5,10 +5,9 @@ #pragma once #include -#include #include #include -#include +#include #include #include #include @@ -23,9 +22,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -211,7 +210,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -275,4 +274,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 027d5c63..6bcf6bbf 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -8,11 +8,12 @@ // Includes //************************************************************************************************* +#include #include namespace blast { - template - size_t constexpr PanelSize_v = Simd::size; + template + size_t constexpr PanelSize_v = SimdSize_v; } diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index cfeda753..585b6f6b 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -160,45 +159,6 @@ namespace blast } - Type * ptr(size_t i, size_t j) - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - Type const * ptr(size_t i, size_t j) const - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - template - auto load(size_t i, size_t j) const - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - return blast::load(v_ + elementIndex(i, j)); - } - - - template - void store(size_t i, size_t j, T val) - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - // We never use maskstore here because we have padding - blast::store(v_ + elementIndex(i, j), val); - } - - private: static size_t constexpr panelSize_ = PanelSize_v; static size_t constexpr tileRows_ = M / panelSize_ + (M % panelSize_ > 0); @@ -329,4 +289,4 @@ namespace blaze { using Type = blast::StaticPanelMatrix; }; -} \ No newline at end of file +} diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp index 50544d02..aaadbda9 100644 --- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp +++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp @@ -7,8 +7,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -21,9 +20,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -204,7 +203,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -267,4 +266,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index a1b857cc..9cd7f826 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -4,8 +4,7 @@ #pragma once -#include -#include +#include #include #include #include @@ -18,8 +17,6 @@ #include #include -#include - namespace blast { @@ -121,7 +118,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } @@ -250,14 +247,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -266,9 +263,9 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a DynamicRegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a DynamicRegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; size_t const m_; size_t const n_; @@ -329,7 +326,7 @@ namespace blast if (IntType const rem = m_ % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m_ / SS; for (size_t j = 0; j < n_ && j < columns(); ++j) @@ -349,10 +346,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m_ - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -490,7 +487,7 @@ namespace blast // ax[i] = alpha * a.load(i * SS, 0); // if (rem) - // ax[ii] = alpha * a.load(ii * SS, 0, SIMD::index() < rem); + // ax[ii] = alpha * a.load(ii * SS, 0, indexSequence() < rem); // #pragma unroll // for (size_t j = 0; j < N; ++j) @@ -598,4 +595,4 @@ namespace blast ker.axpy(beta, c); ker.store(d); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index d4309043..631b530d 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -4,8 +4,7 @@ #pragma once -#include -#include +#include #include #include #include @@ -20,8 +19,6 @@ #include #include -#include - namespace blast { @@ -117,7 +114,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } @@ -360,14 +357,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -376,9 +373,9 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a RegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a RegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; /// @brief Reference to the matrix element at row \a i and column \a j @@ -461,7 +458,7 @@ namespace blast v_[i][j] = beta * p(SS * i, j).load(); if (size_t const rem = m % SS) - v_[m / SS][j] = beta * p(m - rem, j).load(SIMD::index() < rem); + v_[m / SS][j] = beta * p(m - rem, j).load(indexSequence() < rem); } } } @@ -493,7 +490,7 @@ namespace blast if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m / SS; for (size_t j = 0; j < n && j < columns(); ++j) @@ -514,7 +511,7 @@ namespace blast if (skip && ri < RM) { - MaskType const mask = SIMD::index() >= skip; + MaskType const mask = indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); ++ri; } @@ -536,10 +533,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -562,7 +559,7 @@ namespace blast #pragma unroll for (size_t k = 0; k < j; ++k) { - IntrinsicType const a_kj = (~A)(k, j).broadcast(); + SimdVecType const a_kj = (~A)(k, j).broadcast(); #pragma unroll for (size_t i = 0; i < RM; ++i) @@ -596,7 +593,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(T alpha, PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -623,7 +620,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -697,7 +694,7 @@ namespace blast BLAZE_ALWAYS_INLINE void RegisterMatrix::potrf() noexcept { static_assert(M >= N, "potrf() not implemented for register matrices with columns more than rows"); - static_assert(RM * RN + 2 <= RegisterCapacity_v, "Not enough registers"); + static_assert(RM * RN + 2 <= registerCapacity(Arch {}), "Not enough registers"); #pragma unroll for (size_t k = 0; k < N; ++k) @@ -705,11 +702,11 @@ namespace blast #pragma unroll for (size_t j = 0; j < k; ++j) { - T const a_kj = v_[k / SS][j][k % SS]; + SimdVecType const a_kj = v_[k / SS][j][k % SS]; #pragma unroll for (size_t i = 0; i < RM; ++i) if (i >= k / SS) - v_[i][k] = fnmadd(set1(a_kj), v_[i][j], v_[i][k]); + v_[i][k] = fnmadd(a_kj, v_[i][j], v_[i][k]); } T const sqrt_a_kk = std::sqrt(v_[k / SS][k][k % SS]); @@ -718,7 +715,7 @@ namespace blast for (size_t i = 0; i < RM; ++i) { if (i < k / SS) - v_[i][k] = setzero(); + v_[i][k].reset(); else v_[i][k] /= sqrt_a_kk; } @@ -745,7 +742,7 @@ namespace blast ax[i] = alpha * a(i * SS, 0).load(); if (rem) - ax[ii] = alpha * a(ii * SS, 0).load(SIMD::index() < rem); + ax[ii] = alpha * a(ii * SS, 0).load(indexSequence() < rem); #pragma unroll for (size_t j = 0; j < N; ++j) @@ -825,4 +822,4 @@ namespace blast { return rm == m; } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/double_12_4_4.hpp b/include/blast/math/register_matrix/double_12_4_4.hpp deleted file mode 100644 index a640d2c1..00000000 --- a/include/blast/math/register_matrix/double_12_4_4.hpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - v_[2][0] = beta * ptr(2 * SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - v_[2][1] = beta * ptr(2 * SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - v_[2][2] = beta * ptr(2 * SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - v_[2][3] = beta * ptr(2 * SS, 3).load(); - } - } - - - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 3; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[3], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - bx[2] = alpha * b.load(2 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - bx[2] = alpha * b.load(2 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - bx[2] = alpha * b.load(2 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - bx[2] = alpha * b.load(2 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - v_[2][3] = fmadd(bx[2], ax, v_[2][3]); - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_4_4_4.hpp b/include/blast/math/register_matrix/double_4_4_4.hpp deleted file mode 100644 index 1c73605c..00000000 --- a/include/blast/math/register_matrix/double_4_4_4.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - v_[0][0] = beta * ptr.load(); - - if (n > 1) - v_[0][1] = beta * ptr(0, 1).load(); - - if (n > 2) - v_[0][2] = beta * ptr(0, 2).load(); - - if (n > 3) - v_[0][3] = beta * ptr(0, 3).load(); - } - - -#if 1 - /// Magically, this function specialization is slightly faster than the default implementation of RegisterMatrix<>::store. - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - if (m >= 4) - { - if (n > 0) - ptr.store(v_[0][0]); - - if (n > 1) - ptr(0, 1).store(v_[0][1]); - - if (n > 2) - ptr(0, 2).store(v_[0][2]); - - if (n > 3) - ptr(0, 3).store(v_[0][3]); - } - else if (m > 0) - { - // Magically, the code below is significantly faster than this: - // __m256i const mask = _mm256_cmpgt_epi64(_mm256_set_epi64x(m, m, m, m), _mm256_set_epi64x(3, 2, 1, 0)); - __m256i const mask = _mm256_set_epi64x( - m > 3 ? 0x8000000000000000ULL : 0, - m > 2 ? 0x8000000000000000ULL : 0, - m > 1 ? 0x8000000000000000ULL : 0, - m > 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr.store(v_[0][0], mask); - - if (n > 1) - ptr(0, 1).store(v_[0][1], mask); - - if (n > 2) - ptr(0, 2).store(v_[0][2], mask); - - if (n > 3) - ptr(0, 3).store(v_[0][3], mask); - } - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_8_4_4.hpp b/include/blast/math/register_matrix/double_8_4_4.hpp deleted file mode 100644 index 4ec3d416..00000000 --- a/include/blast/math/register_matrix/double_8_4_4.hpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - } - } - - -#if 1 - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 2; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } -#endif - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[2], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp deleted file mode 100644 index 8e34d3b3..00000000 --- a/include/blast/math/simd/Avx256.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -// #include -#include - -#include diff --git a/include/blast/math/simd/Hsum.hpp b/include/blast/math/simd/Hsum.hpp deleted file mode 100644 index 9c7b5f4b..00000000 --- a/include/blast/math/simd/Hsum.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include - - -namespace blast -{ - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m128d hsum(__m256d v0, __m256d v1) - { - __m256d v01 = _mm256_hadd_pd(v0, v1); - return _mm_add_pd(_mm256_extractf128_pd(v01, 1), _mm256_castpd256_pd128(v01)); - } - - - /// dst[255:192] = sum(v3) - /// dst[191:128] = sum(v2) - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m256d hsum(__m256d v0, __m256d v1, __m256d v2, __m256d v3) - { - __m128d v01 = hsum(v0, v1); - __m128d v23 = hsum(v2, v3); - return _mm256_insertf128_pd(_mm256_castpd128_pd256(v01), v23, 1); - } -} \ No newline at end of file diff --git a/include/blast/math/simd/IntVecType.hpp b/include/blast/math/simd/IntVecType.hpp deleted file mode 100644 index 0f8faab7..00000000 --- a/include/blast/math/simd/IntVecType.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -namespace blast -{ - template - using IntVecType_t = SimdVec>; -} \ No newline at end of file diff --git a/include/blast/math/simd/IntScalarType.hpp b/include/blast/math/simd/RegisterCapacity.hpp similarity index 62% rename from include/blast/math/simd/IntScalarType.hpp rename to include/blast/math/simd/RegisterCapacity.hpp index 3fb119ac..35b2ac7e 100644 --- a/include/blast/math/simd/IntScalarType.hpp +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -1,29 +1,32 @@ -// Copyright 2023 Mikhail Katliar +// Copyright 2024 Mikhail Katliar // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once +#include + #include -#include namespace blast { - template - struct IntScalarType; - - - template - using IntScalarType_t = typename IntScalarType::type; -} \ No newline at end of file + /** + * @brief Number of available SIMD registers. + * + * @return Number of SIMD registers for AVX2 + */ + std::size_t constexpr registerCapacity(xsimd::avx2) + { + return 16; + } +} diff --git a/include/blast/math/simd/SequenceTag.hpp b/include/blast/math/simd/SequenceTag.hpp deleted file mode 100644 index 8f37c552..00000000 --- a/include/blast/math/simd/SequenceTag.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - - -namespace blast :: simd -{ - struct SequenceTag {}; - - inline SequenceTag constexpr sequenceTag; -} \ No newline at end of file diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index 040f4ffe..b5bd8873 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -1,459 +1,20 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once -#include - -#include -#include -#include - -#include - -#include -#include - - -namespace blast -{ - using namespace blaze; - - - template - struct Simd; - - - template <> - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256d; - using MaskType = __m256i; - using IntType = long long; - - static size_t constexpr size = 4; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(_mm256_set1_epi64x(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(val_, _mm256_set1_epi64x(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi64x(3, 2, 1, 0); - }; - }; - - - template <> - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256; - using MaskType = __m256i; - using IntType = int; - - static size_t constexpr size = 8; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(_mm256_set1_epi32(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(val_, _mm256_set1_epi32(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - }; - }; - - - template - using IntrinsicType_t = typename Simd::IntrinsicType; - - - template - struct SimdTraits; - - - template <> - struct SimdTraits<__m256d> - { - using ScalarType = double; - using MaskType = __m256i; - using IntType = long long; - static size_t constexpr size = 4; - }; - - - template <> - struct SimdTraits<__m256> - { - using ScalarType = float; - using MaskType = __m256i; - using IntType = int; - static size_t constexpr size = 8; - }; - - - template - using ScalarType_t = typename SimdTraits::ScalarType; - - - template - using MaskType_t = typename SimdTraits::MaskType; - - template - using IntType_t = typename SimdTraits::IntType; - - - template - size_t constexpr RegisterCapacity_v = Simd::registerCapacity; - - - //******************************************************* - // - // LOAD - // - //******************************************************* - - template - auto load(T const * ptr); - - - template - inline auto load(T const * ptr) - { - return load::size>(ptr); - } - - - template - auto broadcast(T const * ptr); - - - template <> - inline auto load(double const * ptr) - { - return _mm256_load_pd(ptr); - } - - - template <> - inline auto load(double const * ptr) - { - return _mm256_loadu_pd(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_load_ps(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_loadu_ps(ptr); - } - - - inline auto maskload(double const * ptr, __m256i mask) - { - return _mm256_maskload_pd(ptr, mask); - } - - - inline auto maskload(float const * ptr, __m256i mask) - { - return _mm256_maskload_ps(ptr, mask); - } - - - template <> - inline auto broadcast<4, double>(double const * ptr) - { - return _mm256_broadcast_sd(ptr); - } - - - template <> - inline auto broadcast<8, float>(float const * ptr) - { - return _mm256_broadcast_ss(ptr); - } - - - //******************************************************* - // - // STORE - // - //******************************************************* - template - void store(double * ptr, __m256d a); - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_store_pd(ptr, a); - } - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_storeu_pd(ptr, a); - } - - - template - void store(float * ptr, __m256 a); - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_store_ps(ptr, a); - } - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_storeu_ps(ptr, a); - } - - - inline void maskstore(double * ptr, __m256i m, __m256d a) - { - _mm256_maskstore_pd(ptr, m, a); - } - - - inline void maskstore(float * ptr, __m256i m, __m256 a) - { - _mm256_maskstore_ps(ptr, m, a); - } - - - //******************************************************* - // - // SET - // - //******************************************************* - - inline auto set(double a3, double a2, double a1, double a0) - { - return _mm256_set_pd(a3, a2, a1, a0); - } - - - inline auto set(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) - { - return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0); - } - - - inline auto set(long long a3, long long a2, long long a1, long long a0) - { - return _mm256_set_epi64x(a3, a2, a1, a0); - } - - - template - auto set1(T a); - - - inline auto set1(double a) - { - return _mm256_set1_pd(a); - } - - - inline auto set1(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<4, double>(double a) - { - return _mm256_set1_pd(a); - } - - - template <> - inline auto set1<8, float>(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<8, int>(int val) - { - return _mm256_set1_epi32(val); - } - - - template <> - inline auto set1<4, long long>(long long val) - { - return _mm256_set1_epi64x(val); - } - - - template - auto setzero(); - - - template <> - inline auto setzero() - { - return _mm256_setzero_pd(); - } - - - template <> - inline auto setzero() - { - return _mm256_setzero_ps(); - } - - - //******************************************************* - // - // ARITHMETIC - // - //******************************************************* - - inline auto fnmadd(__m256d a, __m256d b, __m256d c) - { - return _mm256_fnmadd_pd(a, b, c); - } - - - inline auto fnmadd(__m256 a, __m256 b, __m256 c) - { - return _mm256_fnmadd_ps(a, b, c); - } - - - inline auto abs(__m256d a) - { - return _mm256_andnot_pd(set1(-0.), a); - } - - - inline auto abs(__m256 a) - { - return _mm256_andnot_ps(set1(-0.f), a); - } - - - //******************************************************* - // - // COMPARE - // - //******************************************************* - - template - auto cmpgt(MM a, MM b); - - - template <> - inline auto cmpgt<4>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi64(a, b); - } - +#include - template <> - inline auto cmpgt<8>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi32(a, b); - } -} \ No newline at end of file +#if XSIMD_WITH_AVX2 + #include +#endif diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp new file mode 100644 index 00000000..3d7ca7b2 --- /dev/null +++ b/include/blast/math/simd/SimdIndex.hpp @@ -0,0 +1,98 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include + + +namespace blast +{ + namespace detail + { + /// @brief Deduces integer type of given size, sized or unsigned. + /// + /// @tparam N type size in bytes + /// @tparam S true for signed type, false for unsigned + /// + template + struct IntOfSize; + + template + using IntOfSize_t = typename IntOfSize::Type; + + template <> + struct IntOfSize<4, true> + { + using Type = std::int32_t; + }; + + template <> + struct IntOfSize<4, false> + { + using Type = std::uint32_t; + }; + + template <> + struct IntOfSize<8, true> + { + using Type = std::int64_t; + }; + + template <> + struct IntOfSize<8, false> + { + using Type = std::uint64_t; + }; + + template + requires (xsimd::batch::size == 4) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3}; + } + + template + requires (xsimd::batch::size == 8) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7}; + } + } + + + /// @brief Integer SIMD type that can be used for indexing or gather-scatter operations. + /// + /// @tparam T the deduced type will be the same size as @a T + /// @tparam Arch instruction set architecture + /// + template + using SimdIndex = xsimd::batch, Arch>; + + + /// @brief Construct an integer index sequence + /// + /// @param start start of the sequence + /// + /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ] + /// where N = SimdIndex::size + /// + template + inline SimdIndex indexSequence(typename SimdIndex::value_type start = 0) noexcept + { + return detail::indexSequence, Arch>(start); + } +} diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 7e65c867..bdb26082 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -14,6 +14,8 @@ #pragma once +#include + namespace blast { @@ -22,7 +24,79 @@ namespace blast * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. * * @tparam T the element type simd_mask applies on + * @tparam Arch instruction set architecture */ - template - class SimdMask; -} \ No newline at end of file + template + class SimdMask + : public xsimd::batch_bool + { + public: + using XSimdType = xsimd::batch_bool; + using IntrinsicType = typename XSimdType::register_type; + + /** + * @brief Construct from an intrinsic register type + * + * @param v the value to construct from + */ + SimdMask(IntrinsicType v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of the same type + * + * @param v the value to construct from + */ + SimdMask(XSimdType const& v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of a different type + * + * We need to allow conversion from batch_bool with U != T, + * because we need e.g. the following code to work: + * + * int n; + * MaskType mask = indexSequence() < n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() < n is a batch_bool, + * which can not be directly assigned to a batch_bool, + * although their underlying register_type's are identical. + * + * @param v the value to construct from + */ + template + SimdMask(xsimd::batch_bool const& v) noexcept + : XSimdType {xsimd::batch_bool_cast(v)} + { + } + + /** + * @brief In-place logical and. + * + * We need to define this operator to make the following code work: + * + * MaskType mask; + * int n; + * mask &= indexSequence() >= n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() >= n is a batch_bool, + * which can not be directly used as a right-hand side operand of batch_bool::operator&=(). + * Defining operator&=(SimdMask const&) allows the right-hand side to be implicitly converted to SimdMask. + * + * @param v + * @return SimdMask& + */ + SimdMask& operator&=(SimdMask const& v) noexcept + { + XSimdType::operator&=(v); + return *this; + } + }; +} diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp index 4e862f72..01a64825 100644 --- a/include/blast/math/simd/SimdSize.hpp +++ b/include/blast/math/simd/SimdSize.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Mikhail Katliar +// Copyright 2023-2024 Mikhail Katliar // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,24 +14,20 @@ #pragma once +#include + #include +#include namespace blast { - template - struct SimdSize; - - - template - struct SimdSize : SimdSize {}; - - /** - * @brief Number of elements in a SimdVec object. + * @brief Size of a SIMD register conatining scalars of a given type. * * @tparam T scalar type + * @tparam Arch instruction set architecture */ - template - size_t constexpr SimdSize_v = SimdSize::value; -} \ No newline at end of file + template + std::size_t constexpr SimdSize_v = xsimd::batch, Arch>::size; +} diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index a3195ff7..140516ef 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,15 +14,377 @@ #pragma once +#include +#include +#include + +#include + namespace blast { + template + class SimdVec; + + /** + * @brief Fused negative multiply-add + * + * Calculate -a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + + /** + * @brief Fused multiply-add + * + * Calculate a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + + template + SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept; + + template + SimdVec abs(SimdVec const& a) noexcept; + + /** + * @brief Vertical max (across two vectors) + * + * @param a first vector + * @param b second vector + * + * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] + */ + template + SimdVec max(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Horizontal max (across all elements) + * + * @param a pack of 32-bit floating point numbers + * + * @return max(a[0], a[1], ..., a[N-1]) + */ + template + T max(SimdVec const& x) noexcept; + + template + SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Multiplication + * + * @param a first multiplier + * @param b second multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Left multiplication with a scalar + * + * @param a scalar multiplier + * @param b batch multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(T const& a, SimdVec const& b) noexcept; + + /** - * @brief Data-parallel type with the element type bool. - * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. + * @brief Right multiplication with a scalar + * + * @param a batch multiplier + * @param b scalar multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, T const& b) noexcept; + + + /** + * @brief Data-parallel type with a given element type. * - * @tparam T the element type simd_mask applies on + * @tparam T element type */ - template - class SimdVec; -} \ No newline at end of file + template + class SimdVec + { + public: + using ValueType = T; + using XSimdType = xsimd::batch; + using IntrinsicType = typename XSimdType::register_type; + using MaskType = SimdMask; + + + /** + * @brief Set to [0, 0, 0, ...] + */ + SimdVec() noexcept + : value_ {T {}} + { + } + + + SimdVec(SimdVec const&) noexcept = default; + + + SimdVec(IntrinsicType value) noexcept + : value_ {value} + { + } + + + SimdVec(XSimdType value) noexcept + : value_ {value} + { + } + + + /** + * @brief Set to [value, value, ...] + * + * @param value value for each component of SIMD vector + */ + SimdVec(ValueType value) noexcept + : value_ {value} + { + } + + + /** + * @brief Load from location + * + * @param src memory location to load from + * @param aligned true indicates that an aligned read instruction should be used + */ + explicit SimdVec(ValueType const * src, bool aligned) noexcept + : value_ {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)} + { + } + + + /** + * @brief Masked load from location + * + * @param src memory location to load from + * @param mask load mask + * @param aligned true if @a src is SIMD-aligned + */ + explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept + : value_ {maskload(src, mask)} + { + } + + + /** + * @brief Number of elements in SIMD pack + */ + static size_t constexpr size() + { + return XSimdType::size; + } + + + /** + * @brief Set to 0 + */ + void reset() noexcept + { + value_ = ValueType {}; + } + + + operator IntrinsicType() const noexcept + { + return value_; + } + + + /** + * @brief Access single element + * + * @param i element index + * + * @return element value + */ + ValueType operator[](size_t i) const noexcept + { + return value_.get(i); + } + + + /** + * @brief Store to memory + * + * @param dst memory location to store to + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, bool aligned) const noexcept + { + if (aligned) + xsimd::store_aligned(dst, value_); + else + xsimd::store_unaligned(dst, value_); + } + + + /** + * @brief Masked store to memory + * + * @param dst memory location to store to + * @param mask store mask + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, MaskType mask, bool aligned) const noexcept + { + maskstore(value_, dst, mask); + } + + + /** + * @brief In-place multiplication + * + * @param a multiplier + * + * @return @a *this after multiplication with @a a + */ + SimdVec& operator*=(SimdVec const& a) noexcept + { + value_ *= a.value_; + return *this; + } + + + /** + * @brief In-place division + * + * @param a divisor + * + * @return @a *this after division by @a a + */ + SimdVec& operator/=(SimdVec const& a) noexcept + { + value_ /= a.value_; + return *this; + } + + + friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType const& mask) noexcept; + friend SimdVec abs<>(SimdVec const& a) noexcept; + friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept; + friend ValueType max<>(SimdVec const& x) noexcept; + friend MaskType operator><>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; + + /** + * @brief Maximum element of a batch and its index. + * + * For example, imax([-1., 0., 4., 3.], [1, 2, -1, 42]) == ([4., 4., 4., 4.], [-1, -1, -1, -1]) + * + * @param v batch of values + * @param idx batch of indices + * + * @return a tuple {A, I} such that A == [a, a, ..., a] and I == [idx[i], idx[i], ..., idx[i]], + * where a == max_j(v[j]), i = argmax_j(v[j]) + */ + friend std::tuple> imax(SimdVec const& v, SimdIndex const& idx) noexcept + { + return imax(v.value_, idx); + } + + private: + XSimdType value_; + }; + + + template + inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fma(a.value_, b.value_, c.value_); + } + + + template + inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fnma(a.value_, b.value_, c.value_); + } + + + template + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept + { + return xsimd::select(mask, a.value_, b.value_); + } + + + template + inline SimdVec abs(SimdVec const& a) noexcept + { + return xsimd::abs(a.value_); + } + + + template + inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::gt(a.value_, b.value_); + } + + + template + inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::mul(a.value_, b.value_); + } + + + template + inline SimdVec operator*(T const& a, SimdVec const& b) noexcept + { + return xsimd::mul(a, b.value_); + } + + + template + inline SimdVec operator*(SimdVec const& a, T const& b) noexcept + { + return xsimd::mul(a.value_, b); + } + + + template + inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::max(a.value_, b.value_); + } + + + template + inline T max(SimdVec const& x) noexcept + { + return xsimd::reduce_max(x.value_); + } +} diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp deleted file mode 100644 index 365258f2..00000000 --- a/include/blast/math/simd/SimdVecBase.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template - class SimdVecBase - { - public: - using ValueType = T; - using IntrinsicType = I; - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return SimdSize_v; - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - protected: - SimdVecBase(IntrinsicType value) - : value_ {value} - { - } - - IntrinsicType value_; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp new file mode 100644 index 00000000..b5aed8c1 --- /dev/null +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -0,0 +1,123 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include + + +namespace blast +{ + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_ps(src, _mm256_castps_si256(mask)); + } + + + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_pd(src, _mm256_castpd_si256(mask)); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_ps(dst, _mm256_castps_si256(mask), v); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_pd(dst, _mm256_castpd_si256(mask), v); + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) noexcept + { + /* v2 = [G H E F | C D A B] */ + __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); + __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + + /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + // __m256 v3 = _mm256_max_ps(v1, v2); + __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); + __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); + __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3)); + + /* v4 = [Z Z W W | X X Y Y] */ + __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); + __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + + /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + // __m256 v5 = _mm256_max_ps(v3, v4); + __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); + __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); + __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5)); + + /* v6 = [I I I I | J J J J] */ + __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); + __m256i const iv6 = _mm256_castps_si256( + _mm256_permute2f128_ps( + _mm256_castsi256_ps(iv5), + _mm256_castsi256_ps(iv5), + 0b0000'0001 + ) + ); + + /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); + __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); + __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); + __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7)); + + return {v7, iv7}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) noexcept + { + __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values + __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1); + + // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); + __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); + __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1)); + + __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + + // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); + __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); + __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m)); + + return {m, im}; + } +} diff --git a/include/blast/math/simd/avx256/IntScalarType.hpp b/include/blast/math/simd/avx256/IntScalarType.hpp deleted file mode 100644 index 59785c85..00000000 --- a/include/blast/math/simd/avx256/IntScalarType.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace blast -{ - template <> - struct IntScalarType<4> - { - using type = std::int64_t; - }; - - - template <> - struct IntScalarType<8> - { - using type = std::int32_t; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdMask.hpp b/include/blast/math/simd/avx256/SimdMask.hpp deleted file mode 100644 index 8dbcea64..00000000 --- a/include/blast/math/simd/avx256/SimdMask.hpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - -#include - - -namespace blast -{ - template - class SimdMask - { - static size_t constexpr SS = SimdSize_v; - - public: - using ValueType = IntElementType_t; - using IntrinsicType = __m256i; - using MaskType = __m256i; - - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdMask() noexcept - : value_ {_mm256_setzero_pd()} - { - } - - - SimdMask(IntrinsicType value) noexcept - : value_ {value} - { - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return SS; - } - - - /** - * @brief Access specified element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - - - private: - IntrinsicType value_; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdSize.hpp b/include/blast/math/simd/avx256/SimdSize.hpp deleted file mode 100644 index 2fa30b2f..00000000 --- a/include/blast/math/simd/avx256/SimdSize.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp deleted file mode 100644 index 34709629..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_ps()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_ps(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_ps(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_ps(dst, value_); - else - _mm256_storeu_ps(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_ps(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_ps(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_ps(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_ps(SimdVec {-0.f}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_ps(a, b); - } - - - /** - * @brief Horizontal max (across all elements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @param a pack of 32-bit floating point numbers - * - * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]) - */ - friend ValueType max(SimdVec const& x) - { - __m256 v1 = x.value_; /* v1 = [H G F E | D C B A] */ - __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01); /* v2 = [G H E F | C D A B] */ - __m256 v3 = _mm256_max_ps(v1, v2); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10); /* v4 = [Z Z W W | X X Y Y] */ - __m256 v5 = _mm256_max_ps(v3, v4); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - __m128 v6 = _mm256_extractf128_ps(v5, 1); /* v6 = [- - - - | J J J J] */ - __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - - return v7[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) - { - /* v2 = [G H E F | C D A B] */ - SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); - - /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - // __m256 v3 = _mm256_max_ps(v1, v2); - MaskType const mask_v3 = v2 > v1; - SimdVec const v3 = blend(v1, v2, mask_v3); - SimdVec const iv3 = blend(idx, iv2, mask_v3); - - /* v4 = [Z Z W W | X X Y Y] */ - SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); - - /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - // __m256 v5 = _mm256_max_ps(v3, v4); - MaskType const mask_v5 = v4 > v3; - SimdVec const v5 = blend(v3, v4, mask_v5); - SimdVec const iv5 = blend(iv3, iv4, mask_v5); - - /* v6 = [I I I I | J J J J] */ - SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - SimdVec const iv6 = _mm256_castps_si256( - _mm256_permute2f128_ps( - _mm256_castsi256_ps(iv5), - _mm256_castsi256_ps(iv5), - 0b0000'0001 - ) - ); - - /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); - MaskType const mask_v7 = v6 > v5; - SimdVec const v7 = blend(v5, v6, mask_v7); - SimdVec const iv7 = blend(iv5, iv6, mask_v7); - - return std::make_tuple(v7, iv7); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp deleted file mode 100644 index a5277ab6..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ /dev/null @@ -1,234 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_pd()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_pd(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_pd(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_pd(dst, value_); - else - _mm256_storeu_pd(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_pd(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_pd(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_pd(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_pd(SimdVec {-0.}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_pd(a, b); - } - - - /** - * @brief Horizontal max (across all alements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @return max(x[0], x[1], x[2], x[3]) - */ - friend ValueType max(SimdVec x) noexcept - { - __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - - return m[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& x, SimdVec const& idx) - { - SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); - - // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - MaskType const mask_m1 = y > x; - SimdVec const m1 = blend(x, y, mask_m1); - SimdVec const im1 = blend(idx, iy, mask_m1); - - SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); - - // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - MaskType const mask_m = m2 > m1; - SimdVec const m = blend(m1, m2, mask_m); - SimdVec const im = blend(im1, im2, mask_m); - - return std::make_tuple(m, im); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp deleted file mode 100644 index 8854f50f..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a, a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi32(a)} - { - } - - - /** - * @brief Initialize to (n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to (a7, a6, a5, a4, a3, a2, a1, a0), - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4, - ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - SimdVec& operator+=(ValueType x) noexcept - { - value_ = _mm256_add_epi32(value_, _mm256_set1_epi32(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi32(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n)); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - alignas(IntrinsicType) ValueType v[size()]; - _mm256_storeu_si256(reinterpret_cast(v), value_); - - return v[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp deleted file mode 100644 index 6edf2863..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Initialize to (0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi64x(a)} - { - } - - - /** - * @brief Initialize to (n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to [a3, a2, a1, a0], - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - SimdVec& operator+=(ValueType x) noexcept - { - value_ = _mm256_add_epi64(value_, _mm256_set1_epi64x(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi64(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n)); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/views/row/Panel.hpp b/include/blast/math/views/row/Panel.hpp index 5daadc0e..55c3f9f4 100644 --- a/include/blast/math/views/row/Panel.hpp +++ b/include/blast/math/views/row/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,9 +16,6 @@ #include #include -#include -#include - namespace blast { @@ -195,4 +191,4 @@ namespace blast BLAST_CONSTRAINT_MUST_NOT_BE_PANEL_SUBMATRIX_TYPE(MT); //********************************************************************************************** }; -} \ No newline at end of file +} diff --git a/include/blast/math/views/submatrix/Panel.hpp b/include/blast/math/views/submatrix/Panel.hpp index c922bba7..50ba6a79 100644 --- a/include/blast/math/views/submatrix/Panel.hpp +++ b/include/blast/math/views/submatrix/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -25,9 +24,6 @@ #include #include -#include -#include - namespace blast { @@ -283,4 +279,4 @@ namespace blast { return PanelSubmatrix(matrix.operand(), matrix.row() + row, matrix.column() + column, m, n); } -} \ No newline at end of file +} diff --git a/test/blast/CMakeLists.txt b/test/blast/CMakeLists.txt index 5edafda1..fa479125 100644 --- a/test/blast/CMakeLists.txt +++ b/test/blast/CMakeLists.txt @@ -13,7 +13,6 @@ find_package(LAPACK REQUIRED) add_executable(test-blast math/simd/RegisterMatrixTest.cpp math/simd/DynamicRegisterMatrixTest.cpp - math/simd/HsumTest.cpp math/simd/SimdVecTest.cpp math/dense/StaticVectorPointerTest.cpp diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp index 13fd564e..c319a422 100644 --- a/test/blast/math/panel/StaticPanelMatrixTest.cpp +++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp @@ -3,9 +3,12 @@ // license that can be found in the LICENSE file. #include +#include +#include #include +#include #include #include @@ -98,7 +101,7 @@ namespace blast :: testing for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) { - auto const xmm = A.template load(i, j); + auto const xmm = ptr(A, i, j).load(); for (size_t k = 0; k < SS; ++k) ASSERT_EQ(xmm[k], A(i + k, j)) << "element mismatch at i,j,k=" << i << "," << j << "," << k; @@ -113,16 +116,18 @@ namespace blast :: testing size_t constexpr N = 3 * SS + 2; StaticPanelMatrix A; - IntrinsicType_t val; + std::vector vec(SS); for (size_t i = 0; i < SS; ++i) - val[i] = TypeParam(i + 1); + vec[i] = i + 1; + + SimdVec val {vec.data(), false}; for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) { A = TypeParam(0.); - A.store(i, j, val); + ptr(A, i, j).store(val); for (size_t k = 0; k < SS && i + k < rows(A); ++k) ASSERT_EQ(A(i + k, j), val[k]) << "element mismatch at i,j,k=" << i << "," << j << "," << k; @@ -201,4 +206,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, StaticPanelMatrixTest, double); INSTANTIATE_TYPED_TEST_SUITE_P(float, StaticPanelMatrixTest, float); -} \ No newline at end of file +} diff --git a/test/blast/math/simd/HsumTest.cpp b/test/blast/math/simd/HsumTest.cpp deleted file mode 100644 index 3153e042..00000000 --- a/test/blast/math/simd/HsumTest.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include - -#include -#include - -#include - -using namespace blaze; - - -namespace blast :: testing -{ - TEST(HsumTest, test4x256d) - { - std::array a, b, c, d, r; - randomize(a); - randomize(b); - randomize(c); - randomize(d); - - __m256d mma = _mm256_loadu_pd(a.data()); - __m256d mmb = _mm256_loadu_pd(b.data()); - __m256d mmc = _mm256_loadu_pd(c.data()); - __m256d mmd = _mm256_loadu_pd(d.data()); - - __m256d mmr = hsum(mma, mmb, mmc, mmd); - _mm256_storeu_pd(r.data(), mmr); - - EXPECT_NEAR(r[0], std::accumulate(begin(a), end(a), 0.), 1.e-10); - EXPECT_NEAR(r[1], std::accumulate(begin(b), end(b), 0.), 1.e-10); - EXPECT_NEAR(r[2], std::accumulate(begin(c), end(c), 0.), 1.e-10); - EXPECT_NEAR(r[3], std::accumulate(begin(d), end(d), 0.), 1.e-10); - } -} \ No newline at end of file diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp index 90be9d1c..f5b2fa04 100644 --- a/test/blast/math/simd/SimdVecTest.cpp +++ b/test/blast/math/simd/SimdVecTest.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include #include @@ -64,9 +64,10 @@ namespace blast :: testing TYPED_TEST(SimdVecTest, testImax) { using Scalar = TypeParam; - size_t constexpr SS = SimdSize_v; + using SimdType = SimdVec; + size_t constexpr SS = SimdType::size(); - IntVecType_t idx {simd::sequenceTag}; + SimdIndex const idx = indexSequence(); for (int i = 0; i < 100; ++i) { @@ -75,15 +76,15 @@ namespace blast :: testing SimdVec const v {a.data(), false}; SimdVec v_max; - IntVecType_t idx_max; + SimdIndex idx_max; std::tie(v_max, idx_max) = imax(v, idx); auto const max_el = std::max_element(a.begin(), a.end()); - for (size_t i = 0; i < idx_max.size(); ++i) - ASSERT_EQ(idx_max[i], std::distance(a.begin(), max_el)) << "Error at element " << i; + for (size_t i = 0; i < idx_max.size; ++i) + ASSERT_EQ(idx_max.get(i), std::distance(a.begin(), max_el)) << "Error at element " << i; for (size_t i = 0; i < v_max.size(); ++i) ASSERT_EQ(v_max[i], *max_el) << "Error at element " << i; } } -} \ No newline at end of file +}