From e2a82887b98c5dd8b01e03c8b2057d265e1ea6ec Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 19 Sep 2024 14:33:20 +0200 Subject: [PATCH 1/5] Checking compiler before setting Clang-specific compiler options --- bench/common/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bench/common/CMakeLists.txt b/bench/common/CMakeLists.txt index 2ed8b4d0..030ae1e9 100644 --- a/bench/common/CMakeLists.txt +++ b/bench/common/CMakeLists.txt @@ -13,6 +13,9 @@ target_link_libraries(bench-blast-common PUBLIC benchmark::benchmark ) -target_compile_options(bench-blast-common - PUBLIC "-mllvm" "-inline-threshold=1000" -) +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # More aggressive inlining with Clang + target_compile_options(bench-blast-common + PUBLIC "-mllvm" "-inline-threshold=1000" + ) +endif() From 3f123124bfeb560238f8b5d0670781c38526308b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Mon, 5 Aug 2024 14:17:51 +0200 Subject: [PATCH 2/5] iamax() decoupled from Blaze --- bench/blast/math/dense/DynamicIamax.cpp | 19 ++++--------------- include/blast/math/dense/Iamax.hpp | 7 +++---- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/bench/blast/math/dense/DynamicIamax.cpp b/bench/blast/math/dense/DynamicIamax.cpp index ce28e659..eaf599a7 100644 --- a/bench/blast/math/dense/DynamicIamax.cpp +++ b/bench/blast/math/dense/DynamicIamax.cpp @@ -1,20 +1,9 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright (c) 2023-2024 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #include - -#include +#include #include #include diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 16e2e464..1c5551e3 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -168,13 +167,13 @@ namespace blast * * @return index of the first element in @a x having maximum absolute value. */ - template - inline size_t iamax(DenseVector const& x) + template + inline size_t iamax(VT const& x) { size_t const N = size(x); if (N == 0) BLAST_THROW_EXCEPTION(std::invalid_argument {"Vector is empty"}); - return iamax(N, ptr(*x)); + return iamax(N, ptr(x)); } } From 34dee1e7e7a29e233c4ad6d877873c9b2a5e7fee Mon Sep 17 00:00:00 2001 From: ugol-1 Date: Mon, 5 Aug 2024 17:35:42 +0200 Subject: [PATCH 3/5] Code compiles on armv8, with stubs for masked memory operations and iamax(). 
Make potrf() compile on fma3 Smarter way of calculating register matrix number of columns in potrf() Make the code compile for ARM again Fixed calculation of RegisterMatrix columns number in potrf() --- CMakeLists.txt | 3 - include/blast/math/algorithm/Tile.hpp | 4 + .../blast/math/algorithm/arch/avx2/Tile.hpp | 3 +- .../blast/math/algorithm/arch/neon64/Tile.hpp | 141 ++++++++++++++++++ include/blast/math/dense/Trmm.hpp | 1 + include/blast/math/panel/PanelSize.hpp | 7 + include/blast/math/panel/Potrf.hpp | 25 ++-- .../blast/math/panel/StaticPanelMatrix.hpp | 4 - include/blast/math/simd/RegisterCapacity.hpp | 7 +- include/blast/math/simd/Simd.hpp | 4 + include/blast/math/simd/SimdIndex.hpp | 24 +-- include/blast/math/simd/arch/Avx2.hpp | 9 ++ include/blast/math/simd/arch/Neon64.hpp | 78 ++++++++++ include/blast/system/Tile.hpp | 12 +- 14 files changed, 284 insertions(+), 38 deletions(-) create mode 100644 include/blast/math/algorithm/arch/neon64/Tile.hpp create mode 100644 include/blast/math/simd/arch/Neon64.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2828f75c..ec760caf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,6 @@ target_link_libraries(blast target_compile_options(blast INTERFACE "-Wno-ignored-attributes" "-fno-math-errno" "-ftemplate-backtrace-limit=0" - # Enable SIMD instruction sets, otherwise it does not compile. - # This will change when we support multiple architectures. 
- INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4" ) # BLAST_WITH_BLASFEO diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 7f6ecc63..3b4efef6 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -8,6 +8,10 @@ # include #endif +#if XSIMD_WITH_NEON64 +# include +#endif + #include #include diff --git a/include/blast/math/algorithm/arch/avx2/Tile.hpp b/include/blast/math/algorithm/arch/avx2/Tile.hpp index aac90b8a..f7decfde 100644 --- a/include/blast/math/algorithm/arch/avx2/Tile.hpp +++ b/include/blast/math/algorithm/arch/avx2/Tile.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -46,7 +47,7 @@ namespace blast :: detail BLAST_ALWAYS_INLINE void tile(xsimd::avx2 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) { size_t constexpr SS = SimdSize_v; - size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be ppoperly determined + size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); diff --git a/include/blast/math/algorithm/arch/neon64/Tile.hpp b/include/blast/math/algorithm/arch/neon64/Tile.hpp new file mode 100644 index 00000000..652a7917 --- /dev/null +++ b/include/blast/math/algorithm/arch/neon64/Tile.hpp @@ -0,0 +1,141 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include +#include +#include + +#include + +#include + + +namespace blast :: detail +{ + template + BLAST_ALWAYS_INLINE void tile_backend(xsimd::neon64, size_t m, size_t n, size_t i, FF&& f_full, FP&& f_partial) + { + RegisterMatrix ker; + + if (i + KM <= m) + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_full(ker, i, j); + + if (j < n) + f_partial(ker, i, j, KM, n - j); + } + else + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_partial(ker, i, j, m - i, KN); + + if (j < n) + f_partial(ker, i, j, m - i, n - j); + } + } + + + template + BLAST_ALWAYS_INLINE void tile(xsimd::neon64 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + { + size_t constexpr SS = SimdSize_v; + size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined + + static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); + + if (traversal_order == columnMajor) + { + size_t j = 0; + + // Main part + for (; j + TILE_STEP <= n; j += TILE_STEP) + { + size_t i = 0; + + // i + 4 * SS != m is to improve performance in case when the remaining number of rows is 4 * SS: + // it is more efficient to apply 2 * SS kernel 2 times than 3 * SS + 1 * SS kernel. 
+ for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + // Bottom side + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, ker.columns()); + } + } + + + // Right side + if (j < n) + { + size_t i = 0; + + // i + 4 * SS != m is to improve performance in case when the remaining number of rows is 4 * SS: + // it is more efficient to apply 2 * SS kernel 2 times than 3 * SS + 1 * SS kernel. + for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + // Bottom-right corner + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, n - j); + } + } + } + else + { + size_t i = 0; + + // i + 4 * SS != m is to improve performance in case when the remaining number of rows is 4 * SS: + // it is more efficient to apply 2 * SS kernel 2 times than 3 * SS + 1 * SS kernel. 
+ for (; i + 2 * SS < m && i + 4 * SS != m; i += 3 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 1 * SS < m; i += 2 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 0 * SS < m; i += 1 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + } + } +} diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp index 34dc7c4f..c82bd674 100644 --- a/include/blast/math/dense/Trmm.hpp +++ b/include/blast/math/dense/Trmm.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 6bcf6bbf..9f9b500e 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -14,6 +14,13 @@ namespace blast { + /** + * @brief Default size of a panel (in a panel matrix) for a given architecture and data type + * + * TODO: Is it always equal to SIMD size? Deprecate? + * + * @tparam Arch architecture + */ template size_t constexpr PanelSize_v = SimdSize_v; } diff --git a/include/blast/math/panel/Potrf.hpp b/include/blast/math/panel/Potrf.hpp index e1763809..2cd26740 100644 --- a/include/blast/math/panel/Potrf.hpp +++ b/include/blast/math/panel/Potrf.hpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -61,7 +61,7 @@ namespace blast PanelMatrix const& A, PanelMatrix& L) { using ET = ElementType_t; - size_t constexpr PANEL_SIZE = PanelSize_v; + size_t constexpr SS = SimdSize_v; BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); @@ -77,7 +77,14 @@ namespace blast if (columns(L) != N) BLAZE_THROW_INVALID_ARGUMENT("Invalid matrix size"); - size_t constexpr KN = 4; + // Calculate Maximum number of columns of a register matrix that can be used in ger() without spilling registers. + // NOTE: RegisterMatrix.potrf() has the limitation that it works only with matrices whose number of columns + // is not less than the number of rows. 
This limits the max number of columns by the number of rows + // of the smallest used RegisterMatrix, which is 1 * SS. + size_t constexpr RC = registerCapacity(xsimd::default_arch {}); + size_t constexpr MAX_RM = 3; // first dimension of the largest used RegisterMatrix, in SIMD registers + static_assert(RC >= MAX_RM + 1); + size_t constexpr KN = std::min((RC - (MAX_RM + 1)) / MAX_RM, SS); size_t k = 0; // This loop unroll gives some performance benefit for N >= 18, @@ -87,14 +94,14 @@ namespace blast { size_t i = k; - for (; i + 2 * PANEL_SIZE < M; i += 3 * PANEL_SIZE) - potrf_backend<3 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 2 * SS < M; i += 3 * SS) + potrf_backend<3 * SS, KN>(k, i, *A, *L); - for (; i + 1 * PANEL_SIZE < M; i += 2 * PANEL_SIZE) - potrf_backend<2 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 1 * SS < M; i += 2 * SS) + potrf_backend<2 * SS, KN>(k, i, *A, *L); - for (; i + 0 * PANEL_SIZE < M; i += 1 * PANEL_SIZE) - potrf_backend<1 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 0 * SS < M; i += 1 * SS) + potrf_backend<1 * SS, KN>(k, i, *A, *L); } } } diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index 47e7982c..ef29dfac 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -183,9 +182,6 @@ namespace blast ? i / panelSize_ * spacing_ + i % panelSize_ + j * panelSize_ : j / panelSize_ * spacing_ + j % panelSize_ + i * panelSize_; } - - - BLAZE_CONSTRAINT_MUST_BE_VECTORIZABLE_TYPE(Type); }; diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp index 35b2ac7e..0f48789d 100644 --- a/include/blast/math/simd/RegisterCapacity.hpp +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -21,12 +21,13 @@ namespace blast { /** - * @brief Number of available SIMD registers. 
+ * @brief Number of available SIMD registers for a given architecture. * * @return Number of SIMD registers for AVX2 */ - std::size_t constexpr registerCapacity(xsimd::avx2) + template + std::size_t constexpr registerCapacity(Arch arch) { - return 16; + return detail::registerCapacity(arch); } } diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index b5bd8873..4106ae31 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -18,3 +18,7 @@ #if XSIMD_WITH_AVX2 #include #endif + +#if XSIMD_WITH_NEON64 + #include +#endif diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp index 3d7ca7b2..9ca8c8fd 100644 --- a/include/blast/math/simd/SimdIndex.hpp +++ b/include/blast/math/simd/SimdIndex.hpp @@ -58,18 +58,25 @@ namespace blast using Type = std::uint64_t; }; + template + requires (xsimd::batch::size == 2) && std::is_integral_v + constexpr xsimd::batch integerSequence() + { + return {0, 1}; + } + template requires (xsimd::batch::size == 4) && std::is_integral_v - inline xsimd::batch indexSequence(T start) noexcept + constexpr xsimd::batch integerSequence() { - return {start, start + 1, start + 2, start + 3}; + return {0, 1, 2, 3}; } template requires (xsimd::batch::size == 8) && std::is_integral_v - inline xsimd::batch indexSequence(T start) noexcept + constexpr xsimd::batch integerSequence() { - return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7}; + return {0, 1, 2, 3, 4, 5, 6, 7}; } } @@ -85,14 +92,11 @@ namespace blast /// @brief Construct an integer index sequence /// - /// @param start start of the sequence - /// - /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ] - /// where N = SimdIndex::size + /// @return [0, 1, ..., SimdIndex::size - 1] /// template - inline SimdIndex indexSequence(typename SimdIndex::value_type start = 0) noexcept + constexpr SimdIndex indexSequence() { - return detail::indexSequence, 
Arch>(start); + return detail::integerSequence, Arch>(); } } diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp index b5aed8c1..560e9a62 100644 --- a/include/blast/math/simd/arch/Avx2.hpp +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -20,6 +20,15 @@ namespace blast { + namespace detail + { + std::size_t constexpr registerCapacity(xsimd::avx2) + { + return 16; + } + } + + template requires std::is_base_of_v inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept diff --git a/include/blast/math/simd/arch/Neon64.hpp b/include/blast/math/simd/arch/Neon64.hpp new file mode 100644 index 00000000..35b788f4 --- /dev/null +++ b/include/blast/math/simd/arch/Neon64.hpp @@ -0,0 +1,78 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include + +#include + + +namespace blast +{ + namespace detail + { + std::size_t constexpr registerCapacity(xsimd::neon64) + { + return 32; + } + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept, so the exception can propagate instead of calling std::terminate()) + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) + { + throw std::logic_error {"Not implemented"}; + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept) + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) + { + throw std::logic_error {"Not implemented"}; + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept) + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) + { + throw std::logic_error {"Not implemented"}; + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept) + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) + { + throw std::logic_error {"Not implemented"}; + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept) + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) + { + throw std::logic_error {"Not implemented"}; + } + + // Stub, not implemented for NEON64 yet: always throws std::logic_error (deliberately not noexcept) + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) + { + throw std::logic_error {"Not implemented"}; + } +} diff --git a/include/blast/system/Tile.hpp b/include/blast/system/Tile.hpp index dd471a6f..7820eb13 100644 --- a/include/blast/system/Tile.hpp +++ b/include/blast/system/Tile.hpp @@ -4,18 +4,14 @@ #pragma once -//************************************************************************************************* -// Includes -//************************************************************************************************* - -#include +#include namespace blast { - using namespace blaze; - - + /** + * @brief TODO: deprecate? 
+ */ template struct TileSize; From fd807f5ef2f36a6416019116c491ff462dfb1bf7 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 19 Sep 2024 13:04:37 +0200 Subject: [PATCH 4/5] Added DCMAKE_CXX_FLAGS in the CI build job --- .github/workflows/cmake.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ece754f4..dd8be963 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -65,6 +65,7 @@ jobs: cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ -DCMAKE_CXX_COMPILER=clang++-18 \ + -DCMAKE_CXX_FLAGS="-mfma -mavx -mavx2 -DXSIMD_DEFAULT_ARCH=\"fma3\"" \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON From 7d91cb8085ff9ca1b09b6401deb4d33152e0befc Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 19 Sep 2024 14:43:02 +0200 Subject: [PATCH 5/5] GitHub build workflow for ARM --- .github/workflows/build-aarch64.yml | 37 ++ .../{cmake.yml => build-and-test-x86_64.yml} | 2 +- docker/aarch64/Dockerfile | 46 ++ docker/aarch64/blasfeo/Makefile.rule | 479 ++++++++++++++++++ Dockerfile => docker/x86_64/Dockerfile | 2 +- docker/{ => x86_64}/blasfeo/Makefile.rule | 0 6 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-aarch64.yml rename .github/workflows/{cmake.yml => build-and-test-x86_64.yml} (98%) create mode 100644 docker/aarch64/Dockerfile create mode 100644 docker/aarch64/blasfeo/Makefile.rule rename Dockerfile => docker/x86_64/Dockerfile (98%) rename docker/{ => x86_64}/blasfeo/Makefile.rule (100%) diff --git a/.github/workflows/build-aarch64.yml b/.github/workflows/build-aarch64.yml new file mode 100644 index 00000000..9d722a05 --- /dev/null +++ b/.github/workflows/build-aarch64.yml @@ -0,0 +1,37 @@ +name: Build (aarch64) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 
+ BUILD_TYPE: Debug + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up QEMU for ARM + uses: docker/setup-qemu-action@v2 + with: + platforms: arm64 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Build Docker image for ARM + run: | + docker buildx create --use + docker buildx build --platform linux/arm64 -t my-arm-build --load -f docker/aarch64/Dockerfile . + + # - name: Run tests on ARM Docker container + # run: | + # docker run --rm my-arm-build ./run-tests.sh diff --git a/.github/workflows/cmake.yml b/.github/workflows/build-and-test-x86_64.yml similarity index 98% rename from .github/workflows/cmake.yml rename to .github/workflows/build-and-test-x86_64.yml index dd8be963..802c80fb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/build-and-test-x86_64.yml @@ -1,4 +1,4 @@ -name: CMake +name: Build and test (x86_64) on: push: diff --git a/docker/aarch64/Dockerfile b/docker/aarch64/Dockerfile new file mode 100644 index 00000000..f8fafdbc --- /dev/null +++ b/docker/aarch64/Dockerfile @@ -0,0 +1,46 @@ +FROM ubuntu:latest +WORKDIR /root +RUN apt-get update +# RUN apt-get upgrade -y +RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \ + build-essential clang-18 cmake git libopenblas-dev libboost-exception-dev pkg-config + +# Install GTest and GMock +RUN apt install -y libgtest-dev libgmock-dev + +# Install Google benchmark +RUN apt install -y libbenchmark-dev + +# Install Blaze +RUN git clone https://bitbucket.org/blaze-lib/blaze.git +RUN cd blaze && cmake -DBLAZE_BLAS_MODE=True -DBLAZE_BLAS_USE_MATRIX_MATRIX_MULTIPLICATION=False \ + -DBLAZE_BLAS_USE_MATRIX_VECTOR_MULTIPLICATION=False -DBLAZE_VECTORIZATION=False -DBLAZE_SHARED_MEMORY_PARALLELIZATION=False . 
&& make install + +# Install Eigen3 +RUN apt install -y libeigen3-dev + +# Install blasfeo +RUN apt-get install -y bc +RUN git clone https://github.com/giaf/blasfeo.git +RUN cd blasfeo && git checkout cc90e146ee9089de518f57dbb736e064bd82394e +COPY docker/aarch64/blasfeo/Makefile.rule blasfeo +RUN cd blasfeo && make -j `nproc` static_library && make install_static + +# Install xsimd +RUN apt install -y libxsimd-dev + +# Install Clang-18 +RUN apt install -y clang-18 +ENV CC=clang-18 +ENV CXX=clang++-18 + +# Build blast +WORKDIR /blast +COPY bench ./bench +COPY cmake ./cmake +COPY include ./include +COPY test ./test +COPY CMakeLists.txt . +ENV PKG_CONFIG_PATH=/usr/local/lib +RUN cmake -B build -DCMAKE_CXX_FLAGS="-march=native -DXSIMD_DEFAULT_ARCH='neon64'" -DBLAST_WITH_TEST=ON -DBLAST_WITH_BENCHMARK=ON . +RUN cd build && make -j `nproc` diff --git a/docker/aarch64/blasfeo/Makefile.rule b/docker/aarch64/blasfeo/Makefile.rule new file mode 100644 index 00000000..3072fef0 --- /dev/null +++ b/docker/aarch64/blasfeo/Makefile.rule @@ -0,0 +1,479 @@ +################################################################################################### +# # +# This file is part of BLASFEO. # +# # +# BLASFEO -- BLAS for embedded optimization. # +# Copyright (C) 2019 by Gianluca Frison. # +# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. # +# All rights reserved. # +# # +# The 2-Clause BSD License # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions are met: # +# # +# 1. Redistributions of source code must retain the above copyright notice, this # +# list of conditions and the following disclaimer. # +# 2. Redistributions in binary form must reproduce the above copyright notice, # +# this list of conditions and the following disclaimer in the documentation # +# and/or other materials provided with the distribution. 
# +# # +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# # +# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de # +# # +################################################################################################### + +# Do something in this makefile +$(info Parsing Makefile.rule) + +# Get path of Makefile.rule as main project directory +#CURRENT_DIR := $(dir $(lastword $(MAKEFILE_LIST))) +MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +CURRENT_DIR := $(patsubst %/,%,$(dir $(MAKEFILE_PATH))) + + + +################################################# +### main makefile options +################################################# + +# Select target architecture (TARGET) +# +# X64_INTEL_HASWELL: x86_64 architecture with AVX2 and FMA ISA (64 bit OS) +# Code optimized for Intel Haswell, Intel Skylake and AMD Zen architectures. +# +# X64_INTEL_SANDY_BRIDGE : x86_64 architecture with AVX ISA (64 bit OS) +# Code optimized for Intel Sandy-Bridge architecture. +# +# X64_INTEL_CORE : x86_64 architecture with SSE3 ISA (64 bit OS) +# Code optimized for Intel Core archiecture. +# +# X64_AMD_BULLDOZER : x86_64 architecture with AVX and FMA ISA (64 bit OS) +# Code optimized for AMD Bulldozer. 
+# +# X86_AMD_JAGUAR : x86 architecture with AVX ISA (32 bit OS) +# Code optimized for AMD Jaguar. +# +# X86_AMD_BARCELONA : x86 architecture with SSE3 ISA (32 bit OS) +# Code optimized for AMD Barcelona. +# +# ARMV8A_ARM_CORTEX_A57 : ARMv8A architecture with NEON (64 bit OS) +# Code optimized for ARM Cortex A57, A72, A73. +# +# ARMV8A_ARM_CORTEX_A53 : ARMv8A architecture with NEON (64 bit OS) +# Code optimized for ARM Cortex A53. +# +# ARMV7A_ARM_CORTEX_A15 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) +# Code optimized for ARM Cortex A15. +# +# ARMV7A_ARM_CORTEX_A9 : ARMv7A architecture with NEON-VFPv3 ISA (32 bit OS) +# Code optimized for ARM Cortex A9. +# +# ARMV7A_ARM_CORTEX_A7 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) +# Code optimized for ARM Cortex A7. +# +# GENERIC : generic architecture, plain C code. +# +# +# TARGET = X64_INTEL_HASWELL +# TARGET = X64_INTEL_SANDY_BRIDGE +# TARGET = X64_INTEL_CORE +# +# TARGET = X64_AMD_BULLDOZER +# TARGET = X86_AMD_JAGUAR +# TARGET = X86_AMD_BARCELONA +# +TARGET = ARMV8A_ARM_CORTEX_A57 +# TARGET = ARMV8A_ARM_CORTEX_A53 +# TARGET = ARMV7A_ARM_CORTEX_A15 +# TARGET = ARMV7A_ARM_CORTEX_A9 +# TARGET = ARMV7A_ARM_CORTEX_A7 +# +# TARGET = GENERIC + +# Select back-end linear lagebra version (LA): +# HIGH_PERFORMANCE : target-tailored; performance-optimized for cache resident matrices; panel-major matrix format +# REFERENCE : target-unspecific lightly-optimized; small code footprint; column-major matrix format +# EXTERNAL_BLAS_WRAPPER : call to external BLAS and LAPACK libraries; column-major matrix format +# +LA = HIGH_PERFORMANCE +# LA = REFERENCE +# LA = EXTERNAL_BLAS_WRAPPER + +# Select external BLAS and LAPACK implementation (to be provided by the user). +# Edit Makefile.external_blas to specify installation location (default /opt). +# It is used by the BLASFEO library if LA=EXTERNAL_BLAS_WRAPPER. +# It may also be used as a comparison in some benchmarks and tests. 
+# +EXTERNAL_BLAS = 0 +# EXTERNAL_BLAS = SYSTEM +# EXTERNAL_BLAS = OPENBLAS +# EXTERNAL_BLAS = NETLIB +# EXTERNAL_BLAS = MKL +# EXTERNAL_BLAS = BLIS +# EXTERNAL_BLAS = ATLAS +include $(CURRENT_DIR)/Makefile.external_blas + +# Select operating system (automatic selection for LINUX and MAC) +# +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S), Linux) + OS = LINUX +endif +ifeq ($(UNAME_S), Darwin) + OS = MAC +endif +# +# Select operating system (manual selection) +# +# OS = LINUX +# OS = MAC +# OS = WINDOWS + +# Compile the BLAS API routines provided by BLASFEO +# +# BLAS_API = 0 +BLAS_API = 1 + +# Export standard FORTRAN namings for BLAS API routines +# 0 : routines namings are in the form blasfeo_dgemm +# 1 : routines namings are in the form dgemm_ +# +FORTRAN_BLAS_API = 0 +# FORTRAN_BLAS_API = 1 + +# Complement the BLAS_API with the Netlib BLAS (only for FORTAN_BLAS_API=1) +COMPLEMENT_WITH_NETLIB_BLAS = 0 +# COMPLEMENT_WITH_NETLIB_BLAS = 1 + +# Complement the BLAS_API with the Netlib LAPACK (only for FORTAN_BLAS_API=1) +COMPLEMENT_WITH_NETLIB_LAPACK = 0 +# COMPLEMENT_WITH_NETLIB_LAPACK = 1 + +# Compile the CBLAS API routines provided by BLASFEO +# +CBLAS_API = 0 +# CBLAS_API = 1 + +#Compile the LAPACKE API from Netlib +LAPACKE_API = 0 +# LAPACKE_API = 1 + + + +################################################# +### other makefile options and settings +################################################# + +# In BLAS API, fallback to external BLAS library for some not-yet-implemented routines +# +FALLBACK_TO_EXTERNAL_BLAS = 0 +# FALLBACK_TO_EXTERNAL_BLAS = 1 + +# Maximum inner product length K for buffer allocation on stack (decrease this value if stack size is exceeded) +# +K_MAX_STACK = 1000 + +# Macro level (code size vs performance in assembly kernels): +# 0 : no macro (min code size) +# 1 : all macro but gemm kernel +# 2 : all macro (max performance) +# +MACRO_LEVEL = 1 + +# Use C99 extension to math library +# +# USE_C99_MATH = 0 +USE_C99_MATH = 1 + +# Compile 
auxiliary functions with external dependencies (for memory allocation, printing and timing) +# +# EXT_DEP = 0 +EXT_DEP = 1 + +# Compile reference implementations with test_ prefix +# in order to check HIGH_PERFORMANCE routines against reference +# TODO bug: if LA=EXTERNAL_BLAS_WRAPPER and TESTING_MODE=1, reference code is used for libblasfeo.a +# Also enables the compilation of tests +# +TESTING_MODE = 0 +# TESTING_MODE = 1 + +# Compile not-yet-implemented routine with just-return instead of print-and-exit +# Also enables the compilation of benchmarks +# +BENCHMARKS_MODE = 0 +# BENCHMARKS_MODE = 1 + +# Enables the compilation of sandbox (experimental) +# +SANDBOX_MODE = 0 +# SANDBOX_MODE = 1 + +# Enable on-line checks for matrix and vector dimensions (experimental) +# +RUNTIME_CHECKS = 0 +# RUNTIME_CHECKS = 1 + +# Print name of BLAS API routines when called (for debugging purposes) +# +PRINT_NAME = 0 +# PRINT_NAME = 1 + +# C Compiler +# +CC ?= gcc +# CC = clang +# CC = x86_64-w64-mingw32-gcc + +# archive routine +# +AR = ar + +# Installation directory +# +PREFIX = /opt + +# compiler / assembler / linker flags +# +# CFLAGS = +ASFLAGS = +LDFLAGS = + +# Common optimization flags +# +CFLAGS ?= -O2 +CFLAGS += -fPIC + +# Debugging flags +# +CFLAGS += #-g #-Wall -pedantic -Wfloat-equal #-pg +ASFLAGS += #-g + +# Profiling flags +# +#CFLAGS += --coverage + + + +# Installation directory +TOP = $(CURRENT_DIR) + +# Support local options +# TODO move somewhere else ??? 
+-include $(CURRENT_DIR)/Makefile.local + +# search directories +CFLAGS += -I$(TOP)/include + + + +# Conditional definitions and checks + +ifeq ($(LA), HIGH_PERFORMANCE) +CFLAGS += -DLA_HIGH_PERFORMANCE +BINARY_DIR = build/$(LA)/$(TARGET) +endif +ifeq ($(LA), REFERENCE) +CFLAGS += -DLA_REFERENCE +BINARY_DIR = build/$(LA)/$(TARGET) +endif +ifeq ($(LA), EXTERNAL_BLAS_WRAPPER) +ifeq ($(EXTERNAL_BLAS), 0) +$(error No EXTERNAL_BLAS selected for LA=EXTERNAL_BLAS_WRAPPER) +endif +CFLAGS += -DLA_EXTERNAL_BLAS_WRAPPER +BINARY_DIR = build/$(LA)/$(EXTERNAL_BLAS) +endif +# TODO remove and fix tests +# CFLAGS += -DBLASFEO_LA=$(LA) + +ifeq ($(BLAS_API), 1) +CFLAGS += -DBLAS_API +ASFLAGS += -DBLAS_API +ifeq ($(FORTRAN_BLAS_API), 1) +CFLAGS += -DFORTRAN_BLAS_API +ASFLAGS += +endif # FORTRAN_BLAS_API +ifeq ($(FALLBACK_TO_EXTERNAL_BLAS), 1) +CFLAGS += -DFALLBACK_TO_EXTERNAL_BLAS +ASFLAGS += -DFALLBACK_TO_EXTERNAL_BLAS +endif # FALLBACK_TO_EXTERNAL_BLAS +endif # BLAS_API + +ifeq ($(CBLAS_API), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when building CBLAS_API) +endif +endif + +ifeq ($(LAPACKE_API), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when building LAPACKE_API) +endif +endif + +ifeq ($(COMPLEMENT_WITH_NETLIB_BLAS), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when complementing with Netlib BLAS) +endif +endif + +ifeq ($(COMPLEMENT_WITH_NETLIB_LAPACK), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when complementing with Netlib LAPACK) +endif +endif + +STACK_SIZE := $(shell ulimit -s) +ifneq ($(STACK_SIZE), unlimited) +STACK_SIZE_EXCEEDED := $(shell echo $(K_MAX_STACK)*12*8*2 \> $(STACK_SIZE)*1024 | bc ) +ifeq ($(STACK_SIZE_EXCEEDED), 1) +$(error stack size likely to be exceeded, please decrease the value of K_MAX_STACK ) +endif +endif +CFLAGS += -DK_MAX_STACK=$(K_MAX_STACK) + +ifeq ($(USE_C99_MATH), 1) +CFLAGS += 
-DUSE_C99_MATH +endif + +ifeq ($(RUNTIME_CHECKS), 1) +CFLAGS += -DDIM_CHECK +endif + +ifeq ($(EXT_DEP), 1) +CFLAGS += -DEXT_DEP +endif + +ifeq ($(TESTING_MODE), 1) +CFLAGS += -DTESTING_MODE +endif + +ifeq ($(BENCHMARKS_MODE), 1) +CFLAGS += -DBENCHMARKS_MODE +endif + +ifeq ($(SANDBOX_MODE), 1) +CFLAGS += -DSANDBOX_MODE +endif + +ifeq ($(MACRO_LEVEL), 1) +ASFLAGS += -DMACRO_LEVEL=1 +endif +ifeq ($(MACRO_LEVEL), 2) +ASFLAGS += -DMACRO_LEVEL=2 +endif + +ifeq ($(PRINT_NAME), 1) +CFLAGS += -DPRINT_NAME +endif + +ifeq ($(OS), LINUX) +CFLAGS += -DOS_LINUX +ASFLAGS += -DOS_LINUX +endif +ifeq ($(OS), MAC) +CFLAGS += -DOS_MAC +ASFLAGS += -DOS_MAC +endif +ifeq ($(OS), WINDOWS) +CFLAGS += -DOS_WINDOWS +ASFLAGS += -DOS_WINDOWS +endif +ifeq ($(SOC), DSPACE) +CFLAGS += -D__DSPACE__ +ASFLAGS += -D__DSPACE__ +endif +ifeq ($(SOC), BACHMANN) +CFLAGS += -D__BACHMANN__ +ASFLAGS += -D__BACHMANN__ +endif + +# EXTERNAL_BLAS + +ifndef EXTERNAL_BLAS + EXTERNAL_BLAS = 0 +endif + +CFLAGS += $(INCLUDE_EXTERNAL_BLAS) +ifeq ($(EXTERNAL_BLAS), 0) +CFLAGS += +endif +ifeq ($(EXTERNAL_BLAS), SYSTEM) +CFLAGS += -DEXTERNAL_BLAS_SYSTEM +endif +ifeq ($(EXTERNAL_BLAS), OPENBLAS) +CFLAGS += -DEXTERNAL_BLAS_OPENBLAS +endif +ifeq ($(EXTERNAL_BLAS), BLIS) +CFLAGS += -DEXTERNAL_BLAS_BLIS -std=gnu99 +endif +ifeq ($(EXTERNAL_BLAS), NETLIB) +CFLAGS += -DEXTERNAL_BLAS_NETLIB +endif +ifeq ($(EXTERNAL_BLAS), MKL) +CFLAGS += -DEXTERNAL_BLAS_MKL -std=c99 -m64 -DMKL_DIRECT_CALL_SEQ +endif +ifeq ($(EXTERNAL_BLAS), ATLAS) +CFLAGS += -DEXTERNAL_BLAS_ATLAS +endif +# TODO remove and fix tests +# CFLAGS += -DEXTERNAL_BLAS=$(EXTERNAL_BLAS) + +# Architecture-specific flags +ifeq ($(TARGET), X64_INTEL_HASWELL) +CFLAGS += -m64 -mavx2 -mfma -DTARGET_X64_INTEL_HASWELL +endif +ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE) +CFLAGS += -m64 -mavx -DTARGET_X64_INTEL_SANDY_BRIDGE +endif +ifeq ($(TARGET), X64_INTEL_CORE) +CFLAGS += -m64 -msse3 -DTARGET_X64_INTEL_CORE +endif +ifeq ($(TARGET), X64_AMD_BULLDOZER) +CFLAGS += -m64 -mavx -mfma 
-DTARGET_X64_AMD_BULLDOZER +endif +ifeq ($(TARGET), X86_AMD_JAGUAR) +CFLAGS += -m32 -mavx -DTARGET_X86_AMD_JAGUAR +ASFLAGS += -m32 -mavx -DTARGET_X86_AMD_JAGUAR +endif +ifeq ($(TARGET), X86_AMD_BARCELONA) +CFLAGS += -m32 -msse3 -DTARGET_X86_AMD_BARCELONA +ASFLAGS += -m32 -msse3 -DTARGET_X86_AMD_BARCELONA +endif +ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57) +CFLAGS += -march=armv8-a+crc+crypto+simd -DTARGET_ARMV8A_ARM_CORTEX_A57 +ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A57 +endif +ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A53) +CFLAGS += -march=armv8-a+crc+crypto+simd -DTARGET_ARMV8A_ARM_CORTEX_A53 +ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A53 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15 -DTARGET_ARMV7A_ARM_CORTEX_A15 +ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A15 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A9) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon -mcpu=cortex-a9 -DTARGET_ARMV7A_ARM_CORTEX_A9 +ASFLAGS += -mfpu=neon -DTARGET_ARMV7A_ARM_CORTEX_A9 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A7) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a7 -DTARGET_ARMV7A_ARM_CORTEX_A7 +ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A7 +endif +ifeq ($(TARGET), GENERIC) +CFLAGS += -DTARGET_GENERIC +endif +# TODO remove and fix tests +# CFLAGS += -DBLASFEO_TARGET=$(TARGET) diff --git a/Dockerfile b/docker/x86_64/Dockerfile similarity index 98% rename from Dockerfile rename to docker/x86_64/Dockerfile index 6c1b7204..6b21c225 100644 --- a/Dockerfile +++ b/docker/x86_64/Dockerfile @@ -29,7 +29,7 @@ RUN mkdir -p eigen/build && cd eigen/build && cmake -DCMAKE_INSTALL_PREFIX=/usr/ RUN apt-get install -y bc RUN git clone https://github.com/giaf/blasfeo.git RUN cd blasfeo && git checkout cc90e146ee9089de518f57dbb736e064bd82394e -COPY docker/blasfeo/Makefile.rule blasfeo +COPY docker/x86_64/blasfeo/Makefile.rule blasfeo RUN cd blasfeo && make -j `nproc` static_library && make install_static # 
Install libxsmm diff --git a/docker/blasfeo/Makefile.rule b/docker/x86_64/blasfeo/Makefile.rule similarity index 100% rename from docker/blasfeo/Makefile.rule rename to docker/x86_64/blasfeo/Makefile.rule