From 9ac7b62eeb26e421fed0c94ab7a5ee94807748de Mon Sep 17 00:00:00 2001 From: Robin Verschueren Date: Mon, 5 Aug 2024 16:04:41 +0200 Subject: [PATCH 01/12] Run without only aggregates --- Makefile | 2 +- bench/analysis/dgemm_performance.py | 9 ++++++++- bench/analysis/dgemm_performance_ratio.py | 10 ++++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9fb445e7..a1fa3930 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ BENCH_BLAZE = build/bin/bench-blaze BENCH_EIGEN = build/bin/bench-eigen BENCH_BLAST = build/bin/bench-blast BENCH_LIBXSMM = build/bin/bench-libxsmm -BENCHMARK_OPTIONS = --benchmark_repetitions=5 --benchmark_counters_tabular=true --benchmark_out_format=json --benchmark_report_aggregates_only=true +BENCHMARK_OPTIONS = --benchmark_repetitions=10 --benchmark_counters_tabular=true --benchmark_out_format=json RUN_MATLAB = matlab -nodisplay -nosplash -nodesktop -r BENCH_DATA = bench_result/data BENCH_IMAGE = bench_result/image diff --git a/bench/analysis/dgemm_performance.py b/bench/analysis/dgemm_performance.py index 2998639a..7f401e91 100644 --- a/bench/analysis/dgemm_performance.py +++ b/bench/analysis/dgemm_performance.py @@ -3,7 +3,14 @@ def filter_aggregate(benchmarks, name): - return [b for b in benchmarks if b['aggregate_name'] == name] + result = [] + for b in benchmarks: + try: + if b['aggregate_name'] == name: + result.append(b) + except KeyError: + continue + return result factor = 1e+9 # Giga diff --git a/bench/analysis/dgemm_performance_ratio.py b/bench/analysis/dgemm_performance_ratio.py index bb2ee21e..e7f9d7b6 100644 --- a/bench/analysis/dgemm_performance_ratio.py +++ b/bench/analysis/dgemm_performance_ratio.py @@ -3,8 +3,14 @@ def filter_aggregate(benchmarks, name): - return [b for b in benchmarks if b['aggregate_name'] == name] - + result = [] + for b in benchmarks: + try: + if b['aggregate_name'] == name: + result.append(b) + except KeyError: + continue + return result def load_benchmark(file_name): with open(file_name) as f: From 84c8c583aedb97ea9813355539cb11e980ed6f2e Mon Sep 17 00:00:00 2001 From: Robin Verschueren Date: Tue, 6 Aug 2024 11:07:37 +0200 Subject: [PATCH 02/12] Compare performance across Blast commits --- Makefile | 25 +++++++++++++------------ bench/analysis/dgemm_performance.py | 20 +++++++++++--------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index a1fa3930..3240a33c 100644 --- a/Makefile +++ b/Makefile @@ -61,21 +61,21 @@ ${BENCH_DATA}/sgemm-blaze-static.json: $(BENCH_BLAZE) $(BENCH_BLAZE) --benchmark_filter="BM_gemm_static*" $(BENCHMARK_OPTIONS) \ --benchmark_out=${BENCH_DATA}/sgemm-blaze-static.json -${BENCH_DATA}/dgemm-blast-static-panel.json: $(BENCH_BLAST) +${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json: $(BENCH_BLAST) $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/dgemm-blast-static-panel.json + --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json -${BENCH_DATA}/dgemm-blast-dynamic-panel.json: $(BENCH_BLAST) +${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json: $(BENCH_BLAST) $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/dgemm-blast-dynamic-panel.json + --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json -${BENCH_DATA}/dgemm-blast-static-plain.json: $(BENCH_BLAST) +${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json: $(BENCH_BLAST) $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_plain" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/dgemm-blast-static-plain.json + --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json -${BENCH_DATA}/dgemm-blast-dynamic-plain.json: $(BENCH_BLAST) +${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json: $(BENCH_BLAST) $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_plain" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/dgemm-blast-dynamic-plain.json + --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json ${BENCH_DATA}/sgemm-blast-static-panel.json: $(BENCH_BLAST) $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ @@ -94,16 +94,17 @@ ${BENCH_DATA}/sgemm-libxsmm.json: $(BENCH_LIBXSMM) --benchmark_out=${BENCH_DATA}/sgemm-libxsmm.json dgemm-benchmarks: \ + $(shell mkdir -p ${BENCH_DATA}/$(shell git rev-parse --short HEAD)) \ ${BENCH_DATA}/dgemm-openblas.json \ ${BENCH_DATA}/dgemm-mkl.json \ ${BENCH_DATA}/dgemm-libxsmm.json \ ${BENCH_DATA}/dgemm-blasfeo.json \ ${BENCH_DATA}/dgemm-blaze-static.json \ ${BENCH_DATA}/dgemm-eigen-static.json \ - ${BENCH_DATA}/dgemm-blast-static-panel.json \ - ${BENCH_DATA}/dgemm-blast-dynamic-panel.json \ - ${BENCH_DATA}/dgemm-blast-static-plain.json \ - ${BENCH_DATA}/dgemm-blast-dynamic-plain.json + ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json \ + ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json \ + ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json \ + ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json # diff --git a/bench/analysis/dgemm_performance.py b/bench/analysis/dgemm_performance.py index 7f401e91..b3c16b40 100644 --- a/bench/analysis/dgemm_performance.py +++ b/bench/analysis/dgemm_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import json - +import glob +import pathlib def filter_aggregate(benchmarks, name): result = [] @@ -22,20 +23,21 @@ def filter_aggregate(benchmarks, name): plots = [ # {'data_file': 'dgemm-openblas.json', 'label': 'OpenBLAS'}, - {'data_file': 'dgemm-mkl.json', 'label': 'MKL'}, - {'data_file': 'dgemm-blasfeo.json', 'label': 'BLASFEO'}, + # {'data_file': 'dgemm-mkl.json', 'label': 'MKL'}, + # {'data_file': 'dgemm-blasfeo.json', 'label': 'BLASFEO'}, # {'data_file': 'dgemm-blasfeo-blas.json', 'label': 'BLASFEO*'}, - {'data_file': 'dgemm-libxsmm.json', 'label': 'LIBXSMM'}, + # {'data_file': 'dgemm-libxsmm.json', 'label': 'LIBXSMM'}, # {'data_file': 'dgemm-eigen-dynamic.json', 'label': 'Eigen (D)'}, # {'data_file': 'dgemm-eigen-static.json', 'label': 'Eigen (S)'}, # {'data_file': 'dgemm-blaze-dynamic.json', 'label': 'Blaze (D)'}, - {'data_file': 'dgemm-blaze-static.json', 'label': 'Blaze (S)'}, - {'data_file': 'dgemm-blast-static-panel.json', 'label': 'BLAST (SP)'}, - {'data_file': 'dgemm-blast-static-plain.json', 'label': 'BLAST (SD)'}, - {'data_file': 'dgemm-blast-dynamic-panel.json', 'label': 'BLAST (DP)'}, - {'data_file': 'dgemm-blast-dynamic-plain.json', 'label': 'BLAST (DD)'}, + # {'data_file': 'dgemm-blaze-static.json', 'label': 'Blaze (S)'}, ] +for benchmark_file, benchmark_label in [('dgemm-blast-static-panel.json', 'SP'), ('dgemm-blast-static-plain.json', 'SD'), ('dgemm-blast-dynamic-panel.json', 'DP'), ('dgemm-blast-dynamic-plain.json', 'DD')]: + files = glob.glob('./**/' + benchmark_file, recursive=True, root_dir='bench_result/data') + for file in files: + plots.append({'data_file': file, 'label': f'BLAST ({benchmark_label}) {pathlib.Path(file).parent.stem}'}) + fig = plt.figure(figsize=[10, 6]) ax = fig.subplots() From 1b0925db7c034049258fd674af880f258e8146c1 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Mon, 12 Aug 2024 19:24:15 +0200 Subject: [PATCH 03/12] `ptr()` clean-up (#24) - Matrix concept and type traits - Clean ptr mess - Blaze-specific code moved to blast/blaze --------- Co-authored-by: Mikhail Katliar --- bench/blast/math/dense/DynamicGemm.cpp | 4 +- bench/blast/math/dense/StaticGemm.cpp | 4 +- bench/blast/math/dense/StaticIamax.cpp | 2 +- bench/blast/math/dense/StaticTrmm.cpp | 7 +- bench/blast/math/panel/StaticGemm.cpp | 2 + bench/blast/math/simd/Trmm.cpp | 6 +- include/blast/blaze/Math.hpp | 10 + include/blast/blaze/math/TypeTraits.hpp | 12 + include/blast/blaze/math/Vector.hpp | 38 +++ .../blaze/math/typetraits/IsContiguous.hpp | 41 +++ .../blaze/math/typetraits/IsDenseMatrix.hpp | 23 ++ .../blaze/math/typetraits/IsDenseVector.hpp | 23 ++ .../blast/blaze/math/typetraits/IsStatic.hpp | 24 ++ .../math/typetraits/IsStaticallySpaced.hpp | 44 +++ .../blast/blaze/math/typetraits/Spacing.hpp | 59 ++++ include/blast/math/Matrix.hpp | 47 ++++ include/blast/math/TypeTraits.hpp | 23 +- include/blast/math/Vector.hpp | 45 ++++ .../blast/math/dense/DynamicMatrixPointer.hpp | 52 ++-- .../blast/math/dense/DynamicVectorPointer.hpp | 46 +++- include/blast/math/dense/Gemm.hpp | 8 +- include/blast/math/dense/Ger.hpp | 37 +-- include/blast/math/dense/Getf2.hpp | 9 +- include/blast/math/dense/Getrf.hpp | 4 +- include/blast/math/dense/Iamax.hpp | 3 +- include/blast/math/dense/Laswp.hpp | 9 +- include/blast/math/dense/MatrixPointer.hpp | 45 ---- include/blast/math/dense/Potrf.hpp | 36 ++- .../blast/math/dense/StaticMatrixPointer.hpp | 28 +- .../blast/math/dense/StaticVectorPointer.hpp | 43 +++ include/blast/math/dense/Swap.hpp | 5 +- include/blast/math/dense/Syrk.hpp | 71 +++-- include/blast/math/dense/Trmm.hpp | 40 +-- include/blast/math/dense/VectorPointer.hpp | 253 ------------------ .../blast/math/expressions/PMatTransExpr.hpp | 21 ++ .../blast/math/expressions/PanelMatrix.hpp | 21 +- .../blast/math/panel/DynamicPanelMatrix.hpp | 4 +- .../math/panel/DynamicPanelMatrixPointer.hpp | 26 +- include/blast/math/panel/Gemm.hpp | 2 +- include/blast/math/panel/MatrixPointer.hpp | 46 ---- .../blast/math/panel/StaticPanelMatrix.hpp | 14 + .../math/panel/StaticPanelMatrixPointer.hpp | 24 +- .../register_matrix/DynamicRegisterMatrix.hpp | 56 ++-- .../math/register_matrix/RegisterMatrix.hpp | 34 +-- .../blast/math/typetraits/IsContiguous.hpp | 26 ++ .../blast/math/typetraits/IsDenseMatrix.hpp | 27 ++ .../blast/math/typetraits/IsDenseVector.hpp | 28 ++ include/blast/math/typetraits/IsStatic.hpp | 37 +-- .../math/typetraits/IsStaticallySpaced.hpp | 27 ++ include/blast/math/typetraits/Matrix.hpp | 29 ++ include/blast/math/typetraits/Spacing.hpp | 32 +++ include/blast/math/typetraits/Vector.hpp | 29 ++ include/blast/util/Types.hpp | 9 + .../math/dense/DynamicVectorPointerTest.cpp | 22 +- test/blast/math/dense/GemmTest.cpp | 4 +- test/blast/math/dense/GerTest.cpp | 6 +- test/blast/math/dense/Getf2Test.cpp | 14 +- test/blast/math/dense/GetrfTest.cpp | 16 +- test/blast/math/dense/Iamax.cpp | 21 +- test/blast/math/dense/MatrixPointerTest.cpp | 16 +- test/blast/math/dense/PotrfTest.cpp | 13 +- .../math/dense/StaticVectorPointerTest.cpp | 5 +- test/blast/math/dense/SyrkTest.cpp | 11 +- test/blast/math/dense/TrmmTest.cpp | 5 +- test/blast/math/panel/PotrfTest.cpp | 3 +- .../math/panel/StaticPanelMatrixTest.cpp | 2 +- .../math/simd/DynamicRegisterMatrixTest.cpp | 5 +- test/blast/math/simd/RegisterMatrixTest.cpp | 18 +- 68 files changed, 1006 insertions(+), 750 deletions(-) create mode 100644 include/blast/blaze/Math.hpp create mode 100644 include/blast/blaze/math/TypeTraits.hpp create mode 100644 include/blast/blaze/math/Vector.hpp create mode 100644 include/blast/blaze/math/typetraits/IsContiguous.hpp create mode 100644 include/blast/blaze/math/typetraits/IsDenseMatrix.hpp create mode 100644 include/blast/blaze/math/typetraits/IsDenseVector.hpp create mode 100644 include/blast/blaze/math/typetraits/IsStatic.hpp create mode 100644 include/blast/blaze/math/typetraits/IsStaticallySpaced.hpp create mode 100644 include/blast/blaze/math/typetraits/Spacing.hpp create mode 100644 include/blast/math/Matrix.hpp create mode 100644 include/blast/math/Vector.hpp delete mode 100644 include/blast/math/dense/MatrixPointer.hpp delete mode 100644 include/blast/math/dense/VectorPointer.hpp delete mode 100644 include/blast/math/panel/MatrixPointer.hpp create mode 100644 include/blast/math/typetraits/IsContiguous.hpp create mode 100644 include/blast/math/typetraits/IsDenseMatrix.hpp create mode 100644 include/blast/math/typetraits/IsDenseVector.hpp create mode 100644 include/blast/math/typetraits/IsStaticallySpaced.hpp create mode 100644 include/blast/math/typetraits/Matrix.hpp create mode 100644 include/blast/math/typetraits/Spacing.hpp create mode 100644 include/blast/math/typetraits/Vector.hpp create mode 100644 include/blast/util/Types.hpp diff --git a/bench/blast/math/dense/DynamicGemm.cpp b/bench/blast/math/dense/DynamicGemm.cpp index e0f78260..9d702760 100644 --- a/bench/blast/math/dense/DynamicGemm.cpp +++ b/bench/blast/math/dense/DynamicGemm.cpp @@ -3,11 +3,11 @@ // license that can be found in the LICENSE file. #include +#include +#include #include -#include - #include diff --git a/bench/blast/math/dense/StaticGemm.cpp b/bench/blast/math/dense/StaticGemm.cpp index 22d6f68e..960c8ae9 100644 --- a/bench/blast/math/dense/StaticGemm.cpp +++ b/bench/blast/math/dense/StaticGemm.cpp @@ -3,11 +3,11 @@ // license that can be found in the LICENSE file. #include +#include +#include #include -#include - #include diff --git a/bench/blast/math/dense/StaticIamax.cpp b/bench/blast/math/dense/StaticIamax.cpp index a3a53101..6fab6a87 100644 --- a/bench/blast/math/dense/StaticIamax.cpp +++ b/bench/blast/math/dense/StaticIamax.cpp @@ -14,7 +14,7 @@ #include -#include +#include #include #include diff --git a/bench/blast/math/dense/StaticTrmm.cpp b/bench/blast/math/dense/StaticTrmm.cpp index 1365197b..1cc263ad 100644 --- a/bench/blast/math/dense/StaticTrmm.cpp +++ b/bench/blast/math/dense/StaticTrmm.cpp @@ -3,15 +3,12 @@ // license that can be found in the LICENSE file. #include - -#include +#include +#include #include #include -#include -#include - namespace blast :: benchmark { diff --git a/bench/blast/math/panel/StaticGemm.cpp b/bench/blast/math/panel/StaticGemm.cpp index 0d4f4ee2..c515c35e 100644 --- a/bench/blast/math/panel/StaticGemm.cpp +++ b/bench/blast/math/panel/StaticGemm.cpp @@ -3,7 +3,9 @@ // license that can be found in the LICENSE file. #include +#include #include +#include #include diff --git a/bench/blast/math/simd/Trmm.cpp b/bench/blast/math/simd/Trmm.cpp index 9ab4a7c3..5cc29978 100644 --- a/bench/blast/math/simd/Trmm.cpp +++ b/bench/blast/math/simd/Trmm.cpp @@ -2,14 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include #include #include #include -#include +#include namespace blast :: benchmark @@ -81,4 +81,4 @@ namespace blast :: benchmark BENCHMARK_TEMPLATE(BM_RegisterMatrix_trmmRightLower, float, 24, 4, columnMajor); BENCHMARK_TEMPLATE(BM_RegisterMatrix_trmmRightLower, float, 16, 5, columnMajor); BENCHMARK_TEMPLATE(BM_RegisterMatrix_trmmRightLower, float, 16, 6, columnMajor); -} \ No newline at end of file +} diff --git a/include/blast/blaze/Math.hpp b/include/blast/blaze/Math.hpp new file mode 100644 index 00000000..48c1c3cc --- /dev/null +++ b/include/blast/blaze/Math.hpp @@ -0,0 +1,10 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include + +#include diff --git a/include/blast/blaze/math/TypeTraits.hpp b/include/blast/blaze/math/TypeTraits.hpp new file mode 100644 index 00000000..40737815 --- /dev/null +++ b/include/blast/blaze/math/TypeTraits.hpp @@ -0,0 +1,12 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include +#include diff --git a/include/blast/blaze/math/Vector.hpp b/include/blast/blaze/math/Vector.hpp new file mode 100644 index 00000000..452cc7d4 --- /dev/null +++ b/include/blast/blaze/math/Vector.hpp @@ -0,0 +1,38 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +#include +#include + + +namespace blaze +{ + /** + * @brief Memory distance between consecutive elements of a dense Blaze vector + * + * NOTE: The function is declared in blaze namespace s.t. it can be found by ADL. + * + * @tparam VT vector type + * + * @param v vector + * + * @return memory distance between consecutive elements of @a v + */ + template + requires blaze::IsDenseVector_v + inline size_t spacing(VT const& v) noexcept + { + if constexpr (IsContiguous_v) + return 1; + else + if constexpr (blaze::IsView_v) + return spacing(v.operand()); + else + static_assert(false, "Spacing is not defined for a type which is not a view and is not contiguous"); + } +} diff --git a/include/blast/blaze/math/typetraits/IsContiguous.hpp b/include/blast/blaze/math/typetraits/IsContiguous.hpp new file mode 100644 index 00000000..544b4486 --- /dev/null +++ b/include/blast/blaze/math/typetraits/IsContiguous.hpp @@ -0,0 +1,41 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +#include +#include +#include +#include + +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze vectors which are not transpose expressions + * + * @tparam T type + */ + template + requires blaze::IsVector_v && (!blaze::IsTransExpr_v) + struct IsContiguous : blaze::IsContiguous {}; + + + /** + * @brief Specialization for Blaze vector transpose expressions + * + * The trasnposed vector expression is contiguous iff its operand is contiguous. + * + * This specialization is required to fix this Blaze bug: https://bitbucket.org/blaze-lib/blaze/issues/474 + * + * @tparam T type + */ + template + requires blaze::IsVector_v && blaze::IsTransExpr_v + struct IsContiguous : IsContiguous>> {}; +} diff --git a/include/blast/blaze/math/typetraits/IsDenseMatrix.hpp b/include/blast/blaze/math/typetraits/IsDenseMatrix.hpp new file mode 100644 index 00000000..e03e6c5a --- /dev/null +++ b/include/blast/blaze/math/typetraits/IsDenseMatrix.hpp @@ -0,0 +1,23 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +#include +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze matrices + * + * @tparam T matrix type + */ + template + requires blaze::IsMatrix_v + struct IsDenseMatrix : blaze::IsDenseMatrix {}; +} diff --git a/include/blast/blaze/math/typetraits/IsDenseVector.hpp b/include/blast/blaze/math/typetraits/IsDenseVector.hpp new file mode 100644 index 00000000..eaeae86e --- /dev/null +++ b/include/blast/blaze/math/typetraits/IsDenseVector.hpp @@ -0,0 +1,23 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +#include +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze vectors + * + * @tparam T vector type + */ + template + requires blaze::IsVector_v + struct IsDenseVector : blaze::IsDenseVector {}; +} diff --git a/include/blast/blaze/math/typetraits/IsStatic.hpp b/include/blast/blaze/math/typetraits/IsStatic.hpp new file mode 100644 index 00000000..3b0031ba --- /dev/null +++ b/include/blast/blaze/math/typetraits/IsStatic.hpp @@ -0,0 +1,24 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +#include +#include +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze matrix and vector types + * + * @tparam T matrix or vector type + */ + template + requires blaze::IsVector_v || blaze::IsMatrix_v + struct IsStatic : blaze::IsStatic {}; +} diff --git a/include/blast/blaze/math/typetraits/IsStaticallySpaced.hpp b/include/blast/blaze/math/typetraits/IsStaticallySpaced.hpp new file mode 100644 index 00000000..4184ab83 --- /dev/null +++ b/include/blast/blaze/math/typetraits/IsStaticallySpaced.hpp @@ -0,0 +1,44 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include + +#include +#include +#include +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze vectors which are not views + * + * Blaze vectors which are not views are statically spaced iff they are contiguous + * + * @tparam VT vector type + */ + template + requires blaze::IsVector_v && (!blaze::IsView_v) + struct IsStaticallySpaced : IsContiguous {}; + + + /** + * @brief Specialization for Blaze vectors which are views + * + * Blaze vectors which are views are statically spaced iff they are contiguous or their viewed type is static + * + * @tparam VT vector type + */ + template + requires blaze::IsVector_v && blaze::IsView_v + struct IsStaticallySpaced + : std::integral_constant || blaze::IsStatic_v> + > + {}; +} diff --git a/include/blast/blaze/math/typetraits/Spacing.hpp b/include/blast/blaze/math/typetraits/Spacing.hpp new file mode 100644 index 00000000..6ead89fc --- /dev/null +++ b/include/blast/blaze/math/typetraits/Spacing.hpp @@ -0,0 +1,59 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include + + +namespace blast +{ + /** + * @brief Specialization for Blaze dense matrices which are not transpose expressions + * + * @tparam MT matrix type + */ + template + requires blaze::IsDenseMatrix_v && (!blaze::IsTransExpr_v) + struct Spacing : std::integral_constant {}; + + + /** + * @brief Specialization for Blaze dense matrix transpose expressions + * + * @tparam MT matrix type + */ + template + requires blaze::IsDenseMatrix_v && blaze::IsTransExpr_v + struct Spacing : Spacing>> {}; + + + /** + * @brief Specialization for contiguous dense Blaze vectors + * + * @tparam VT vector type + */ + template + requires blaze::IsDenseVector_v && IsContiguous_v + struct Spacing : std::integral_constant {}; + + + /** + * @brief Specialization for non-contiguous dense Blaze vector views + * + * @tparam VT vector type + */ + template + requires blaze::IsDenseVector_v && (!IsContiguous_v) && blaze::IsView_v + struct Spacing : Spacing> {}; +} diff --git a/include/blast/math/Matrix.hpp b/include/blast/math/Matrix.hpp new file mode 100644 index 00000000..7002226d --- /dev/null +++ b/include/blast/math/Matrix.hpp @@ -0,0 +1,47 @@ +// Copyright (c) 2019-2024 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace blast +{ + /** + * @brief Pointer to the first element of a matrix + * + * @tparam MT matrix type + * + * @param m matrix + * + * @return pointer to @a m(0, 0) + */ + template + BLAST_ALWAYS_INLINE auto ptr(MT& m) + { + return ptr>(m, 0, 0); + } + + + /** + * @brief Pointer to the first element of a const matrix + * + * @tparam MT matrix type + * + * @param m matrix + * + * @return pointer to @a m(0, 0) + */ + template + BLAST_ALWAYS_INLINE auto ptr(MT const& m) + { + return ptr>(m, 0, 0); + } +} diff --git a/include/blast/math/TypeTraits.hpp b/include/blast/math/TypeTraits.hpp index a65b0ac3..bff67adc 100644 --- a/include/blast/math/TypeTraits.hpp +++ b/include/blast/math/TypeTraits.hpp @@ -1,25 +1,22 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #pragma once #include #include #include +#include +#include #include #include +#include +#include +#include #include #include #include #include +#include +#include diff --git a/include/blast/math/Vector.hpp b/include/blast/math/Vector.hpp new file mode 100644 index 00000000..f1026b29 --- /dev/null +++ b/include/blast/math/Vector.hpp @@ -0,0 +1,45 @@ +// Copyright 2023-2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include + + +namespace blast +{ + /** + * @brief Pointer to the first element of a vector + * + * @tparam VT vector type + * @param v vector + * + * @return pointer to the first element of @a v + */ + template + requires IsDenseVector_v + BLAST_ALWAYS_INLINE auto ptr(VT& v) + { + return ptr>(v, 0); + } + + + /** + * @brief Pointer to the first element of a const vector + * + * @tparam VT vector type + * @param v vector + * + * @return pointer to the first element of @a v + */ + template + requires IsDenseVector_v + BLAST_ALWAYS_INLINE auto ptr(VT const& v) + { + return ptr>(v, 0); + } +} diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp index 5610b1c9..b7651373 100644 --- a/include/blast/math/dense/DynamicMatrixPointer.hpp +++ b/include/blast/math/dense/DynamicMatrixPointer.hpp @@ -4,13 +4,12 @@ #pragma once - #include #include #include #include #include -#include +#include namespace blast @@ -203,51 +202,32 @@ namespace blast template - BLAZE_ALWAYS_INLINE auto trans(DynamicMatrixPointer const& p) noexcept + BLAST_ALWAYS_INLINE auto trans(DynamicMatrixPointer const& p) noexcept { return p.trans(); } - template - requires (!IsStatic_v) - BLAZE_ALWAYS_INLINE DynamicMatrixPointer, SO, AF, IsPadded_v> - ptr(DenseMatrix& m, size_t i, size_t j) - { - if constexpr (SO == columnMajor) - return {(*m).data() + i + spacing(m) * j, spacing(m)}; - else - return {(*m).data() + spacing(m) * i + j, spacing(m)}; - } - - - template - requires (!IsStatic_v) - BLAZE_ALWAYS_INLINE DynamicMatrixPointer const, SO, AF, IsPadded_v> - ptr(DenseMatrix const& m, size_t i, size_t j) + template + requires (!IsStatic_v) && IsDenseMatrix_v + BLAST_ALWAYS_INLINE DynamicMatrixPointer, StorageOrder_v, AF, IsPadded_v> + ptr(MT& m, size_t i, size_t j) { - if constexpr (SO == columnMajor) - return {(*m).data() + i + spacing(m) * j, spacing(m)}; + if constexpr (StorageOrder_v == columnMajor) + return {data(m) + i + spacing(m) * j, spacing(m)}; else - return {(*m).data() + spacing(m) * i + j, spacing(m)}; + return {data(m) + spacing(m) * i + j, spacing(m)}; } - template - requires (!IsStatic_v) - BLAZE_ALWAYS_INLINE DynamicMatrixPointer const, SO, AF, IsPadded_v> - ptr(DMatTransExpr const& m, size_t i, size_t j) + template + requires (!IsStatic_v) && IsDenseMatrix_v + BLAST_ALWAYS_INLINE DynamicMatrixPointer const, StorageOrder_v, AF, IsPadded_v> + ptr(MT const& m, size_t i, size_t j) { - if constexpr (SO == columnMajor) - return {(*m).data() + i + spacing(m) * j, spacing(m)}; + if constexpr (StorageOrder_v == columnMajor) + return {data(m) + i + spacing(m) * j, spacing(m)}; else - return {(*m).data() + spacing(m) * i + j, spacing(m)}; - } - - - template - BLAZE_ALWAYS_INLINE DynamicMatrixPointer ptr(T * p, size_t spacing) - { - return {p, spacing}; + return {data(m) + spacing(m) * i + j, spacing(m)}; } } diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp index 7638119a..6843925d 100644 --- a/include/blast/math/dense/DynamicVectorPointer.hpp +++ b/include/blast/math/dense/DynamicVectorPointer.hpp @@ -5,7 +5,9 @@ #pragma once #include +#include #include +#include #include @@ -187,8 +189,50 @@ namespace blast template - BLAZE_ALWAYS_INLINE auto trans(DynamicVectorPointer const& p) noexcept + BLAST_ALWAYS_INLINE auto trans(DynamicVectorPointer const& p) noexcept { return p.trans(); } + + + /** + * @brief Pointer to a dynamically spaced vector + * + * @tparam AF true if the pointer is SIMD-aligned + * @tparam VT vector type + * + * @param v vector + * @param i index within @a v + * + * @return vector pointer to @a i -th element of @a v + */ + template + requires (!IsStaticallySpaced_v) + BLAST_ALWAYS_INLINE auto ptr(VT& v, size_t i) noexcept + { + // NOTE: we don't use data(v) here because of this bug: + // https://bitbucket.org/blaze-lib/blaze/issues/457 + return DynamicVectorPointer, VT::transposeFlag, AF, IsPadded_v> {&v[i], spacing(v)}; + } + + + /** + * @brief Pointer to a dynamically spaced const vector + * + * @tparam AF true if the pointer is SIMD-aligned + * @tparam VT vector type + * + * @param v vector + * @param i index within @a v + * + * @return vector pointer to @a i -th element of @a v + */ + template + requires (!IsStaticallySpaced_v) + BLAST_ALWAYS_INLINE auto ptr(VT const& v, size_t i) noexcept + { + // NOTE: we don't use data(v) here because of this bug: + // https://bitbucket.org/blaze-lib/blaze/issues/457 + return DynamicVectorPointer const, VT::transposeFlag, AF, IsPadded_v> {&v[i], spacing(v)}; + } } diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp index a386baee..7c6cc40a 100644 --- a/include/blast/math/dense/Gemm.hpp +++ b/include/blast/math/dense/Gemm.hpp @@ -5,13 +5,9 @@ #pragma once #include -#include +#include #include -#include -#include -#include - namespace blast { @@ -80,4 +76,4 @@ namespace blast using ET = ElementType_t; gemm(ET(1.), A, B, ET(1.), C, D); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Ger.hpp b/include/blast/math/dense/Ger.hpp index 807412c3..c19557bf 100644 --- a/include/blast/math/dense/Ger.hpp +++ b/include/blast/math/dense/Ger.hpp @@ -1,26 +1,14 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2023-2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #pragma once #include - #include #include -#include -#include -#include +#include +#include namespace blast @@ -57,13 +45,9 @@ namespace blast MatrixPointer && (StorageOrder_v == columnMajor) inline void ger(size_t M, size_t N, Scalar alpha, VPX x, VPY y, MPA A, MPB B) { - using ET = ElementType_t; + using ET = Scalar; size_t constexpr TILE_SIZE = TileSize_v; - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - size_t j = 0; // Main part @@ -182,13 +166,6 @@ namespace blast DenseMatrix& B ) { - using ET = ElementType_t; - size_t constexpr TILE_SIZE = TileSize_v; - - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); - size_t const M = size(x); size_t const N = size(y); @@ -319,4 +296,4 @@ namespace blast { ger(alpha, x, y, A, B); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Getf2.hpp b/include/blast/math/dense/Getf2.hpp index 73614b01..bd359835 100644 --- a/include/blast/math/dense/Getf2.hpp +++ b/include/blast/math/dense/Getf2.hpp @@ -19,10 +19,7 @@ #include #include #include -#include -#include - -#include +#include namespace blast @@ -106,7 +103,7 @@ namespace blast template inline void getf2(DenseMatrix& A, size_t * ipiv) { - getf2(rows(*A), columns(*A), ptr(A), ipiv); + getf2(rows(*A), columns(*A), ptr(*A), ipiv); } @@ -165,4 +162,4 @@ namespace blast ); } } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Getrf.hpp b/include/blast/math/dense/Getrf.hpp index cc94faa0..543a3ae5 100644 --- a/include/blast/math/dense/Getrf.hpp +++ b/include/blast/math/dense/Getrf.hpp @@ -57,7 +57,7 @@ namespace blast size_t const M = rows(A); size_t const N = columns(A); - auto pA = ptr(A); + auto pA = ptr(*A); size_t k = 0; @@ -160,4 +160,4 @@ namespace blast ipiv[k] = k; } } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 530df07b..16e2e464 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -16,7 +16,8 @@ #include -#include +#include +#include #include #include diff --git a/include/blast/math/dense/Laswp.hpp b/include/blast/math/dense/Laswp.hpp index f363e756..844ec02a 100644 --- a/include/blast/math/dense/Laswp.hpp +++ b/include/blast/math/dense/Laswp.hpp @@ -14,9 +14,10 @@ #pragma once - #include +#include + namespace blast { @@ -34,10 +35,10 @@ namespace blast @a k0 ... @a k1 - 1 of @a ipiv are accessed. ipiv[k] = l implies rows k and l are to be interchanged. */ template - inline void laswp(DenseMatrix& A, size_t k0, size_t k1, size_t * ipiv) + inline void laswp(blaze::DenseMatrix& A, size_t k0, size_t k1, size_t * ipiv) { for (size_t k = k0; k < k1; ++k) if (k != ipiv[k]) - swap(row(*A, k, unchecked), row(*A, ipiv[k], unchecked)); + swap(row(*A, k, blaze::unchecked), row(*A, ipiv[k], blaze::unchecked)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/MatrixPointer.hpp b/include/blast/math/dense/MatrixPointer.hpp deleted file mode 100644 index 06372ed9..00000000 --- a/include/blast/math/dense/MatrixPointer.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include -#include - - -namespace blast -{ - template - BLAZE_ALWAYS_INLINE auto ptr(blaze::DenseMatrix& m) - { - return ptr>(m, 0, 0); - } - - - template - BLAZE_ALWAYS_INLINE auto ptr(blaze::DenseMatrix const& m) - { - return ptr>(m, 0, 0); - } - - - template - BLAZE_ALWAYS_INLINE auto ptr(blaze::DMatTransExpr const& m) - { - return ptr>(m, 0, 0); - } -} \ No newline at end of file diff --git a/include/blast/math/dense/Potrf.hpp b/include/blast/math/dense/Potrf.hpp index 50cfd56d..3b1e1e02 100644 --- a/include/blast/math/dense/Potrf.hpp +++ b/include/blast/math/dense/Potrf.hpp @@ -5,27 +5,23 @@ #pragma once -#include +#include #include #include #include -#include -#include +#include #include namespace blast { - using namespace blaze; - - template BLAZE_ALWAYS_INLINE void potrf_backend(size_t k, size_t i, - DenseMatrix const& A, DenseMatrix& L) + blaze::DenseMatrix const& A, blaze::DenseMatrix& L) { - using ET = ElementType_t; + using ET = blaze::ElementType_t; size_t constexpr TILE_SIZE = TileSize_v; size_t const M = rows(A); @@ -36,10 +32,10 @@ namespace blast RegisterMatrix ker; - ker.load(1., ptr(A, i, k)); + ker.load(1., ptr(*A, i, k)); - auto a = ptr(L, i, 0); - auto b = ptr(L, k, 0); + auto a = ptr(*L, i, 0); + auto b = ptr(*L, k, 0); for (size_t l = 0; l < k; ++l) ker.ger(ET(-1.), column(a(0, l)), row(trans(b)(l, 0))); @@ -50,31 +46,31 @@ namespace blast ker.potrf(); if (k + KN <= N) - ker.storeLower(ptr(L, i, k)); + ker.storeLower(ptr(*L, i, k)); else - ker.storeLower(ptr(L, i, k), std::min(M - i, KM), N - k); + ker.storeLower(ptr(*L, i, k), std::min(M - i, KM), N - k); } else { // Off-diagonal blocks - ker.trsm(Side::Right, UpLo::Upper, ptr(L, k, k).trans()); + ker.trsm(Side::Right, UpLo::Upper, ptr(*L, k, k).trans()); if (k + KN <= N) - ker.store(ptr(L, i, k)); + ker.store(ptr(*L, i, k)); else - ker.store(ptr(L, i, k), std::min(M - i, KM), N - k); + ker.store(ptr(*L, i, k), std::min(M - i, KM), N - k); } } template inline void potrf( - DenseMatrix const& A, DenseMatrix& L) + blaze::DenseMatrix const& A, blaze::DenseMatrix& L) { - using ET = ElementType_t; + using ET = blaze::ElementType_t; size_t constexpr TILE_SIZE = TileSize_v; - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); + BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(blaze::ElementType_t, ET); size_t const M = rows(A); size_t const N = columns(A); @@ -108,4 +104,4 @@ namespace blast potrf_backend<1 * TILE_SIZE, KN>(k, i, *A, *L); } } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp index 961b5e01..bb70c091 100644 --- a/include/blast/math/dense/StaticMatrixPointer.hpp +++ b/include/blast/math/dense/StaticMatrixPointer.hpp @@ -12,10 +12,6 @@ #include #include -#include -#include -#include - namespace blast { @@ -203,26 +199,18 @@ namespace blast } - template - requires IsStatic_v - BLAZE_ALWAYS_INLINE auto ptr(blaze::DenseMatrix& m, size_t i, size_t j) - { - return StaticMatrixPointer, MT::spacing(), SO, AF, IsPadded_v>((*m).data(), i, j); - } - - - template - requires IsStatic_v - BLAZE_ALWAYS_INLINE auto ptr(blaze::DenseMatrix const& m, size_t i, size_t j) + template + requires IsStatic_v && IsDenseMatrix_v + BLAZE_ALWAYS_INLINE auto ptr(MT& m, size_t i, size_t j) { - return StaticMatrixPointer const, MT::spacing(), SO, AF, IsPadded_v>((*m).data(), i, j); + return StaticMatrixPointer, Spacing_v, StorageOrder_v, AF, IsPadded_v>(data(m), i, j); } - template - requires IsStatic_v - BLAZE_ALWAYS_INLINE auto ptr(blaze::DMatTransExpr const& m, size_t i, size_t j) + template + requires IsStatic_v && IsDenseMatrix_v + BLAZE_ALWAYS_INLINE auto ptr(MT const& m, size_t i, size_t j) { - return trans(ptr(m.operand(), j, i)); + return StaticMatrixPointer const, Spacing_v, StorageOrder_v, AF, IsPadded_v>(data(m), i, j); } } diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp index 6017dafa..6f7a0e2e 100644 --- a/include/blast/math/dense/StaticVectorPointer.hpp +++ b/include/blast/math/dense/StaticVectorPointer.hpp @@ -6,6 +6,7 @@ #include +#include #include @@ -213,4 +214,46 @@ namespace blast { return p.trans(); } + + + /** + * @brief Pointer to a statically spaced vector + * + * @tparam AF true if the pointer is SIMD-aligned + * @tparam VT vector type + * + * @param v vector + * @param i index within @a v + * + * @return vector pointer to @a i -th element of @a v + */ + template + requires IsStaticallySpaced_v + BLAST_ALWAYS_INLINE auto ptr(VT& v, size_t i) + { + // NOTE: we don't use data(v) here because of this bug: + // https://bitbucket.org/blaze-lib/blaze/issues/457 + return StaticVectorPointer, Spacing_v, VT::transposeFlag, AF, IsPadded_v> {&v[i]}; + } + + + /** + * @brief Pointer to a statically spaced const vector + * + * @tparam AF true if the pointer is SIMD-aligned + * @tparam VT vector type + * + * @param v vector + * @param i index within @a v + * + * @return vector pointer to @a i -th element of @a v + */ + template + requires IsStaticallySpaced_v + BLAST_ALWAYS_INLINE auto ptr(VT const& v, size_t i) + { + // NOTE: we don't use data(v) here because of this bug: + // https://bitbucket.org/blaze-lib/blaze/issues/457 + return StaticVectorPointer const, Spacing_v, VT::transposeFlag, AF, IsPadded_v> {&v[i]}; + } } diff --git a/include/blast/math/dense/Swap.hpp b/include/blast/math/dense/Swap.hpp index d192e9fe..cf574c59 100644 --- a/include/blast/math/dense/Swap.hpp +++ b/include/blast/math/dense/Swap.hpp @@ -15,8 +15,7 @@ #pragma once #include - -#include +#include #include @@ -69,4 +68,4 @@ namespace blast if (N > 0) swap(N, ptr(*x), ptr(*y)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Syrk.hpp b/include/blast/math/dense/Syrk.hpp index 182789eb..a39d47f3 100644 --- a/include/blast/math/dense/Syrk.hpp +++ b/include/blast/math/dense/Syrk.hpp @@ -7,10 +7,7 @@ #include #include #include - -#include -#include -#include +#include namespace blast @@ -48,46 +45,46 @@ namespace blast for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j)); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0))); + ker.load(beta, ptr(*C, i, j)); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0))); if (i == j) - ker.storeLower(ptr(D, i, j)); + ker.storeLower(ptr(*D, i, j)); else - ker.store(ptr(D, i, j)); + ker.store(ptr(*D, i, j)); } for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j)); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0))); + ker.load(beta, ptr(*C, i, j)); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0))); if (i == j) - ker.storeLower(ptr(D, i, j)); + ker.storeLower(ptr(*D, i, j)); else - ker.store(ptr(D, i, j)); + ker.store(ptr(*D, i, j)); } for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j)); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0))); + ker.load(beta, ptr(*C, i, j)); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0))); if (i == j) - ker.storeLower(ptr(D, i, j)); + ker.storeLower(ptr(*D, i, j)); else - ker.store(ptr(D, i, j)); + ker.store(ptr(*D, i, j)); } // Bottom side if (i < M) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j), M - i, ker.columns()); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0)), M - i, ker.columns()); + ker.load(beta, ptr(*C, i, j), M - i, ker.columns()); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0)), M - i, ker.columns()); if (i == j) - ker.storeLower(ptr(D, i, j), M - i, ker.columns()); + ker.storeLower(ptr(*D, i, j), M - i, ker.columns()); else - ker.store(ptr(D, i, j), M - i, ker.columns()); + ker.store(ptr(*D, i, j), M - i, ker.columns()); } } @@ -102,48 +99,48 @@ namespace blast for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j), ker.rows(), M - j); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0)), ker.rows(), M - j); + ker.load(beta, ptr(*C, i, j), ker.rows(), M - j); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0)), ker.rows(), M - j); if (i == j) - ker.storeLower(ptr(D, i, j), ker.rows(), M - j); + ker.storeLower(ptr(*D, i, j), ker.rows(), M - j); else - ker.store(ptr(D, i, j), ker.rows(), M - j); + ker.store(ptr(*D, i, j), ker.rows(), M - j); } for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j), ker.rows(), M - j); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0)), ker.rows(), M - j); + ker.load(beta, ptr(*C, i, j), ker.rows(), M - j); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0)), ker.rows(), M - j); if (i == j) - ker.storeLower(ptr(D, i, j), ker.rows(), M - j); + ker.storeLower(ptr(*D, i, j), ker.rows(), M - j); else - ker.store(ptr(D, i, j), ker.rows(), M - j); + ker.store(ptr(*D, i, j), ker.rows(), M - j); } for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j), ker.rows(), M - j); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0)), ker.rows(), M - j); + ker.load(beta, ptr(*C, i, j), ker.rows(), M - j); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0)), ker.rows(), M - j); if (i == j) - ker.storeLower(ptr(D, i, j), ker.rows(), M - j); + ker.storeLower(ptr(*D, i, j), ker.rows(), M - j); else - ker.store(ptr(D, i, j), ker.rows(), M - j); + ker.store(ptr(*D, i, j), ker.rows(), M - j); } // Bottom-right corner if (i < M) { RegisterMatrix ker; - ker.load(beta, ptr(C, i, j), ker.rows(), M - j); - gemm(ker, K, alpha, ptr(A, i, 0), trans(ptr(A, j, 0)), M - i, M - j); + ker.load(beta, ptr(*C, i, j), ker.rows(), M - j); + gemm(ker, K, alpha, ptr(*A, i, 0), trans(ptr(*A, j, 0)), M - i, M - j); if (i == j) - ker.storeLower(ptr(D, i, j), M - i, M - j); + ker.storeLower(ptr(*D, i, j), M - i, M - j); else - ker.store(ptr(D, i, j), M - i, M - j); + ker.store(ptr(*D, i, j), M - i, M - j); } } } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp index 041159e0..34dc7c4f 100644 --- a/include/blast/math/dense/Trmm.hpp +++ b/include/blast/math/dense/Trmm.hpp @@ -44,15 +44,15 @@ namespace blast // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. for (; i + 2 * TILE_SIZE < M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) trmmLeftUpper_backend<3 * TILE_SIZE, TILE_SIZE>( - M - i, N, alpha, ptr(A, i, i), ptr(B, i, 0), ptr(C, i, 0)); + M - i, N, alpha, ptr(*A, i, i), ptr(*B, i, 0), ptr(*C, i, 0)); for (; i + 1 * TILE_SIZE < M; i += 2 * TILE_SIZE) trmmLeftUpper_backend<2 * TILE_SIZE, TILE_SIZE>( - M - i, N, alpha, ptr(A, i, i), ptr(B, i, 0), ptr(C, i, 0)); + M - i, N, alpha, ptr(*A, i, i), ptr(*B, i, 0), ptr(*C, i, 0)); for (; i + 0 * TILE_SIZE < M; i += 1 * TILE_SIZE) trmmLeftUpper_backend<1 * TILE_SIZE, TILE_SIZE>( - M - i, N, alpha, ptr(A, i, i), ptr(B, i, 0), ptr(C, i, 0)); + M - i, N, alpha, ptr(*A, i, i), ptr(*B, i, 0), ptr(*C, i, 0)); } @@ -92,46 +92,46 @@ namespace blast for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j)); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j)); /* ker.trmmRightLower(alpha, ptr(B, i, j), ptr(A, j, j)); ker.gemm(K, alpha, ptr(B, i, j + TILE_SIZE), ptr(A, j + TILE_SIZE, j)); */ - ker.store(ptr(C, i, j)); + ker.store(ptr(*C, i, j)); } for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j)); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j)); /* ker.trmmRightLower(alpha, ptr(B, i, j), ptr(A, j, j)); ker.gemm(K, alpha, ptr(B, i, j + TILE_SIZE), ptr(A, j + TILE_SIZE, j)); */ - ker.store(ptr(C, i, j)); + ker.store(ptr(*C, i, j)); } for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j)); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j)); /* ker.trmmRightLower(alpha, ptr(B, i, j), ptr(A, j, j)); ker.gemm(K, alpha, ptr(B, i, j + TILE_SIZE), ptr(A, j + TILE_SIZE, j)); */ - ker.store(ptr(C, i, j)); + ker.store(ptr(*C, i, j)); } // Bottom side if (i < M) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j), M - i, ker.columns()); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j), M - i, ker.columns()); /* ker.trmmRightLower(alpha, ptr(B, i, j), ptr(A, j, j)); ker.gemm(K, alpha, ptr(B, i, j + TILE_SIZE), ptr(A, j + TILE_SIZE, j), M - i, ker.columns()); */ - ker.store(ptr(C, i, j), M - i, ker.columns()); + ker.store(ptr(*C, i, j), M - i, ker.columns()); } } @@ -146,31 +146,31 @@ namespace blast for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j), ker.rows(), N - j); - ker.store(ptr(C, i, j), ker.rows(), N - j); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j), ker.rows(), N - j); + ker.store(ptr(*C, i, j), ker.rows(), N - j); } for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j), ker.rows(), N - j); - ker.store(ptr(C, i, j), ker.rows(), N - j); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j), ker.rows(), N - j); + ker.store(ptr(*C, i, j), ker.rows(), N - j); } for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j), ker.rows(), N - j); - ker.store(ptr(C, i, j), ker.rows(), N - j); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j), ker.rows(), N - j); + ker.store(ptr(*C, i, j), ker.rows(), N - j); } // Bottom-right corner if (i < M) { RegisterMatrix ker; - gemm(ker, N - j, alpha, ptr(B, i, j), ptr(A, j, j), M - i, N - j); - ker.store(ptr(C, i, j), M - i, N - j); + gemm(ker, N - j, alpha, ptr(*B, i, j), ptr(*A, j, j), M - i, N - j); + ker.store(ptr(*C, i, j), M - i, N - j); } } } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/VectorPointer.hpp b/include/blast/math/dense/VectorPointer.hpp deleted file mode 100644 index 39a4538e..00000000 --- a/include/blast/math/dense/VectorPointer.hpp +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - - -namespace blast -{ - /* - ======================================================================================= - - Pointer to static dense vector. - - ======================================================================================= - */ - - template - BLAZE_ALWAYS_INLINE auto ptr(StaticVector& v, size_t i) - { - return StaticVectorPointer {v.data() + i}; - } - - - template - BLAZE_ALWAYS_INLINE auto ptr(StaticVector const& v, size_t i) - { - return StaticVectorPointer {v.data() + i}; - } - - - /* - ======================================================================================= - - Pointer to dynamic dense vector. - - ======================================================================================= - */ - - template - BLAZE_ALWAYS_INLINE auto ptr(DynamicVector& v, size_t i) - { - using VT = DynamicVector; - return StaticVectorPointer> {v.data() + i}; - } - - - template - BLAZE_ALWAYS_INLINE auto ptr(DynamicVector const& v, size_t i) - { - using VT = DynamicVector; - return StaticVectorPointer> {v.data() + i}; - } - - - /* - ======================================================================================= - - Pointer to vector transpose expression. - - ======================================================================================= - */ - - template - requires IsDenseVector_v && IsTransExpr_v - BLAZE_ALWAYS_INLINE auto ptr(VT& v, size_t i) noexcept - { - return trans(ptr(v.operand(), i)); - } - - - template - requires IsDenseVector_v && IsTransExpr_v - BLAZE_ALWAYS_INLINE auto ptr(VT const& v, size_t i) noexcept - { - return trans(ptr(v.operand(), i)); - } - - - /* - ======================================================================================= - - Pointer to row of a matrix. - - ======================================================================================= - */ - - template - requires IsDenseVector_v && IsRow_v - BLAZE_ALWAYS_INLINE auto ptr(VT& v, size_t i) noexcept - { - using MT = ViewedType_t; - using ET = ElementType_t; - - if constexpr (IsRowMajorMatrix_v) - { - return StaticVectorPointer> {&(*v)[i]}; - } - else - { - if constexpr (IsStatic_v) - return StaticVectorPointer> {&(*v)[i]}; - else - return DynamicVectorPointer> {&(*v)[i], v.operand().spacing()}; - } - } - - - template - requires IsDenseVector_v && IsRow_v - BLAZE_ALWAYS_INLINE auto ptr(VT const& v, size_t i) noexcept - { - using MT = ViewedType_t; - using ET = ElementType_t; - - if constexpr (IsRowMajorMatrix_v) - { - return StaticVectorPointer> {&(*v)[i]}; - } - else - { - if constexpr (IsStatic_v) - return StaticVectorPointer> {&(*v)[i]}; - else - return DynamicVectorPointer> {&(*v)[i], v.operand().spacing()}; - } - } - - - /* - ======================================================================================= - - Pointer to column of a matrix. - - ======================================================================================= - */ - - template - requires IsDenseVector_v && IsColumn_v - BLAZE_ALWAYS_INLINE auto ptr(VT& v, size_t i) noexcept - { - using MT = ViewedType_t; - using ET = ElementType_t; - - if constexpr (IsColumnMajorMatrix_v) - { - return StaticVectorPointer> {&(*v)[i]}; - } - else - { - if constexpr (IsStatic_v) - return StaticVectorPointer> {&(*v)[i]}; - else - return DynamicVectorPointer> {&(*v)[i], v.operand().spacing()}; - } - } - - - template - requires IsDenseVector_v && IsColumn_v - BLAZE_ALWAYS_INLINE auto ptr(VT const& v, size_t i) noexcept - { - using MT = ViewedType_t; - using ET = ElementType_t; - - if constexpr (IsColumnMajorMatrix_v) - { - return StaticVectorPointer> {&(*v)[i]}; - } - else - { - if constexpr (IsStatic_v) - return StaticVectorPointer> {&(*v)[i]}; - else - return DynamicVectorPointer> {&(*v)[i], v.operand().spacing()}; - } - } - - - /* - ======================================================================================= - - Pointer to a subvector. - - ======================================================================================= - */ - - template - requires IsDenseVector_v && IsSubvector_v - BLAZE_ALWAYS_INLINE auto ptr(VT& v, size_t i) noexcept - { - return ptr(v.operand(), v.offset() + i); - } - - - template - requires IsDenseVector_v && IsSubvector_v - BLAZE_ALWAYS_INLINE auto ptr(VT const& v, size_t i) noexcept - { - return ptr(v.operand(), v.offset() + i); - } - - - /* - ======================================================================================= - - Pointer to general dense vector. - - ======================================================================================= - */ - template - requires IsDenseVector_v - BLAZE_ALWAYS_INLINE auto ptr(VT& v) - { - return ptr>(v, 0); - } - - - template - requires IsDenseVector_v - BLAZE_ALWAYS_INLINE auto ptr(VT const& v) - { - return ptr>(v, 0); - } -} \ No newline at end of file diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp index 3098741a..ad5b262c 100644 --- a/include/blast/math/expressions/PMatTransExpr.hpp +++ b/include/blast/math/expressions/PMatTransExpr.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -32,6 +33,8 @@ #include #include +#include + namespace blast { @@ -234,4 +237,22 @@ namespace blast using ReturnType = const PMatTransExpr; return ReturnType(*pm); } + + + template + struct IsPanelMatrix> : std::true_type {}; + + + template + struct IsStatic> : IsStatic {}; + + + /** + * @brief Specialization for @a PMatTransExpr + * + * @tparam expression operand type + * @tparam SO storage order + */ + template + struct Spacing> : Spacing {}; } diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp index d0494887..50d12268 100644 --- a/include/blast/math/expressions/PanelMatrix.hpp +++ b/include/blast/math/expressions/PanelMatrix.hpp @@ -5,28 +5,21 @@ #pragma once #include +#include #include #include #include #include -#include -#include +#include -#include -#include -#include -#include -#include +#include namespace blast { - using namespace blaze; - - template struct PanelMatrix - : public Matrix + : public blaze::Matrix { public: using TagType = Group0; @@ -80,14 +73,14 @@ namespace blast template inline auto assign(PanelMatrix& lhs, blaze::DMatTDMatMultExpr const& rhs) - -> blaze::EnableIf_t && IsRowMajorMatrix_v && IsPanelMatrix_v && IsRowMajorMatrix_v> + -> blaze::EnableIf_t && blaze::IsRowMajorMatrix_v && IsPanelMatrix_v && blaze::IsRowMajorMatrix_v> { BLAZE_THROW_LOGIC_ERROR("Not implemented 2"); } template - inline void assign(DenseMatrix& lhs, PanelMatrix const& rhs) + inline void assign(blaze::DenseMatrix& lhs, PanelMatrix const& rhs) { BLAZE_INTERNAL_ASSERT( (*lhs).rows() == (*rhs).rows() , "Invalid number of rows" ); BLAZE_INTERNAL_ASSERT( (*lhs).columns() == (*rhs).columns(), "Invalid number of columns" ); @@ -176,7 +169,7 @@ namespace blast template - inline void assign(PanelMatrix& lhs, DenseMatrix const& rhs) + inline void assign(PanelMatrix& lhs, blaze::DenseMatrix const& rhs) { size_t const m = (*rhs).rows(); size_t const n = (*rhs).columns(); diff --git a/include/blast/math/panel/DynamicPanelMatrix.hpp b/include/blast/math/panel/DynamicPanelMatrix.hpp index f82e51e6..4dd69d1f 100644 --- a/include/blast/math/panel/DynamicPanelMatrix.hpp +++ b/include/blast/math/panel/DynamicPanelMatrix.hpp @@ -109,7 +109,7 @@ namespace blast template< typename MT // Type of the right-hand side matrix , bool SO2 > // Storage order of the right-hand side matrix - DynamicPanelMatrix& operator=(Matrix const& rhs) + DynamicPanelMatrix& operator=(blaze::Matrix const& rhs) { assign(*this, *rhs); return *this; @@ -238,4 +238,4 @@ namespace blaze { using Type = blast::DynamicPanelMatrix; }; -} \ No newline at end of file +} diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp index 13d23cec..f58dcabf 100644 --- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp +++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp @@ -8,12 +8,8 @@ #include #include #include -#include -#include #include -#include - namespace blast { @@ -252,26 +248,18 @@ namespace blast } - template - requires (!IsStatic_v) - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix& m, size_t i, size_t j) - { - return DynamicPanelMatrixPointer, SO, AF, IsPadded_v>((*m).data(), spacing(m), i, j); - } - - - template - requires (!IsStatic_v) - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix const& m, size_t i, size_t j) + template + requires (!IsStatic_v) && IsPanelMatrix_v + BLAZE_ALWAYS_INLINE auto ptr(MT& m, size_t i, size_t j) { - return DynamicPanelMatrixPointer const, SO, AF, IsPadded_v>((*m).data(), spacing(m), i, j); + return DynamicPanelMatrixPointer, StorageOrder_v, AF, IsPadded_v>(data(m), spacing(m), i, j); } - template + template requires (!IsStatic_v) && IsPanelMatrix_v - BLAZE_ALWAYS_INLINE auto ptr(PMatTransExpr const& m, size_t i, size_t j) + BLAZE_ALWAYS_INLINE auto ptr(MT const& m, size_t i, size_t j) { - return trans(ptr(m.operand(), j, i)); + return DynamicPanelMatrixPointer const, StorageOrder_v, AF, IsPadded_v>(data(m), spacing(m), i, j); } } diff --git a/include/blast/math/panel/Gemm.hpp b/include/blast/math/panel/Gemm.hpp index aaeb9231..531c2801 100644 --- a/include/blast/math/panel/Gemm.hpp +++ b/include/blast/math/panel/Gemm.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include +#include #include #include diff --git a/include/blast/math/panel/MatrixPointer.hpp b/include/blast/math/panel/MatrixPointer.hpp deleted file mode 100644 index 6caca766..00000000 --- a/include/blast/math/panel/MatrixPointer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - - -namespace blast -{ - template - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix& m) - { - return ptr>(*m, 0, 0); - } - - - template - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix const& m) - { - return ptr>(*m, 0, 0); - } - - - template - requires IsPanelMatrix_v - BLAZE_ALWAYS_INLINE auto ptr(blaze::DMatTransExpr const& m) - { - return ptr>(*m, 0, 0); - } -} diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index 585b6f6b..82358689 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ #include #include +#include namespace blast @@ -223,6 +225,18 @@ namespace blast return *this; } + + + /** + * @brief Specialization for @a StaticPanelMatrix + * + * @tparam Type element type + * @tparam M number of rows + * @tparam N number of columns + * @tparam SO storage order + */ + template + struct Spacing> : std::integral_constant::spacing()> {}; } namespace blaze diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp index 85f3d1ba..da10f069 100644 --- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp +++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp @@ -8,8 +8,6 @@ #include #include #include -#include -#include #include @@ -244,26 +242,18 @@ namespace blast } - template - requires IsStatic_v - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix& m, size_t i, size_t j) - { - return StaticPanelMatrixPointer, MT::spacing(), SO, AF, IsPadded_v>((*m).data(), i, j); - } - - - template - requires IsStatic_v - BLAZE_ALWAYS_INLINE auto ptr(PanelMatrix const& m, size_t i, size_t j) + template + requires IsStatic_v && IsPanelMatrix_v + BLAZE_ALWAYS_INLINE auto ptr(MT& m, size_t i, size_t j) { - return StaticPanelMatrixPointer const, MT::spacing(), SO, AF, IsPadded_v>((*m).data(), i, j); + return StaticPanelMatrixPointer, Spacing_v, StorageOrder_v, AF, IsPadded_v>(data(m), i, j); } - template + template requires IsStatic_v && IsPanelMatrix_v - BLAZE_ALWAYS_INLINE auto ptr(PMatTransExpr const& m, size_t i, size_t j) + BLAZE_ALWAYS_INLINE auto ptr(MT const& m, size_t i, size_t j) { - return trans(ptr(m.operand(), j, i)); + return StaticPanelMatrixPointer const, Spacing_v, StorageOrder_v, AF, IsPadded_v>(data(m), i, j); } } diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index 9cd7f826..caa0eeb3 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -6,14 +6,16 @@ #include #include +#include #include #include +#include #include +#include #include #include #include -#include #include #include @@ -31,12 +33,12 @@ namespace blast /// @tparam SO orientation of SIMD registers. template class DynamicRegisterMatrix - : public Matrix, SO> + : public blaze::Matrix, SO> { public: static_assert(SO == columnMajor, "Only column-major register matrices are currently supported"); - using BaseType = Matrix, SO>; + using BaseType = blaze::Matrix, SO>; using BaseType::storageOrder; /// @brief Type of matrix elements @@ -135,7 +137,7 @@ namespace blast /// @brief R(0:m-1, 0:n-1) += beta * A template - requires MatrixPointer && (PA::storageOrder == columnMajor) + requires MatrixPointer && (PA::storageOrder == columnMajor) void axpy(T beta, PA a) noexcept { #pragma unroll @@ -148,25 +150,25 @@ namespace blast /// @brief Load from memory template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) void load(P p) noexcept; /// @brief Load from memory template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) void load(T beta, P p) noexcept; /// @brief Store matrix at location pointed by \a p template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) void store(P p) const noexcept; /// @brief Store lower-triangular part of the matrix at location pointed by \a p. template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) void storeLower(P p) const noexcept; @@ -178,7 +180,7 @@ namespace blast /// @param a pointer to the first element of the first matrix. /// @param b pointer to the first element of the second matrix. template - requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer void ger(T alpha, PA a, PB b) noexcept; @@ -189,7 +191,7 @@ namespace blast /// @param a pointer to the first element of the first matrix. /// @param b pointer to the first element of the second matrix. template - requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer void ger(PA a, PB b) noexcept; @@ -202,7 +204,7 @@ namespace blast /// @brief a pointer to a triangular matrix /// template - requires MatrixPointer + requires MatrixPointer void trsmRightUpper(P a); @@ -222,7 +224,7 @@ namespace blast /// @param b general matrix. /// template - requires MatrixPointer && (P1::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (P1::storageOrder == columnMajor) && MatrixPointer void trmmLeftUpper(T alpha, P1 a, P2 b) noexcept; @@ -242,7 +244,7 @@ namespace blast /// @param b general matrix. /// template - requires MatrixPointer && (P1::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (P1::storageOrder == columnMajor) && MatrixPointer void trmmRightLower(T alpha, P1 a, P2 b) noexcept; @@ -289,7 +291,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void DynamicRegisterMatrix::load(P p) noexcept { #pragma unroll @@ -302,7 +304,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void DynamicRegisterMatrix::load(T beta, P p) noexcept { #pragma unroll @@ -315,7 +317,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void DynamicRegisterMatrix::store(P p) const noexcept { // The compile-time constant size of the j loop in combination with the if() expression @@ -337,7 +339,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void DynamicRegisterMatrix::storeLower(P p) const noexcept { for (size_t j = 0; j < N; ++j) if (j < n_) @@ -393,7 +395,7 @@ namespace blast template template - requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer BLAZE_ALWAYS_INLINE void DynamicRegisterMatrix::ger(T alpha, PA a, PB b) noexcept { SimdVecType ax[RM]; @@ -416,7 +418,7 @@ namespace blast template template - requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer + requires MatrixPointer && (PA::storageOrder == columnMajor) && MatrixPointer BLAZE_ALWAYS_INLINE void DynamicRegisterMatrix::ger(PA a, PB b) noexcept { SimdVecType ax[RM]; @@ -550,23 +552,23 @@ namespace blast // } - template - inline bool operator==(DynamicRegisterMatrix const& rm, Matrix const& m) + template + inline bool operator==(DynamicRegisterMatrix const& rm, MT const& m) { if (rows(m) != rm.rows() || columns(m) != rm.columns()) return false; for (size_t i = 0; i < rm.rows(); ++i) for (size_t j = 0; j < rm.columns(); ++j) - if (rm(i, j) != (*m)(i, j)) + if (rm(i, j) != m(i, j)) return false; return true; } - template - inline bool operator==(Matrix const& m, DynamicRegisterMatrix const& rm) + template + inline bool operator==(MT const& m, DynamicRegisterMatrix const& rm) { return rm == m; } @@ -576,9 +578,9 @@ namespace blast typename T, size_t M, size_t N, bool SO, typename PA, typename PB, typename PC, typename PD > - requires MatrixPointer && (PA::storageOrder == columnMajor) - && MatrixPointer - && MatrixPointer && (PC::storageOrder == columnMajor) + requires MatrixPointer && (PA::storageOrder == columnMajor) + && MatrixPointer + && MatrixPointer && (PC::storageOrder == columnMajor) BLAZE_ALWAYS_INLINE void gemm(DynamicRegisterMatrix& ker, size_t K, T alpha, PA a, PB b, T beta, PC c, PD d) noexcept { diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index 631b530d..156031c0 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -7,9 +7,12 @@ #include #include #include +#include #include #include #include +#include +#include #include #include @@ -22,9 +25,6 @@ namespace blast { - using namespace blaze; - - /// @brief Register-resident matrix /// /// The RegisterMatrix class provides basic linear algebra operations that can be performed @@ -39,12 +39,12 @@ namespace blast /// template class RegisterMatrix - : public Matrix, SO> + : public blaze::Matrix, SO> { public: static_assert(SO == columnMajor, "Only column-major register matrices are currently supported"); - using BaseType = Matrix, SO>; + using BaseType = blaze::Matrix, SO>; using BaseType::storageOrder; /// @brief Type of matrix elements @@ -131,7 +131,7 @@ namespace blast /// @brief R += beta * A template - requires MatrixPointer && (PA::storageOrder == columnMajor) + requires MatrixPointer && (PA::storageOrder == columnMajor) void axpy(T beta, PA a) noexcept { SimdVecType const beta_simd {beta}; @@ -146,7 +146,7 @@ namespace blast /// @brief R(0:m-1, 0:n-1) += beta * A template - requires MatrixPointer && (PA::storageOrder == columnMajor) + requires MatrixPointer && (PA::storageOrder == columnMajor) void axpy(T beta, PA a, size_t m, size_t n) noexcept { SimdVecType const beta_simd {beta}; @@ -191,7 +191,7 @@ namespace blast /// @brief Store lower-triangular part of the matrix at location pointed by \a p. template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) void storeLower(P p) const noexcept; @@ -466,7 +466,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void RegisterMatrix::store(P p) const noexcept { #pragma unroll @@ -479,7 +479,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void RegisterMatrix::store(P p, size_t m, size_t n) const noexcept { // The compile-time constant size of the j loop in combination with the if() expression @@ -501,7 +501,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void RegisterMatrix::storeLower(P p) const noexcept { for (size_t j = 0; j < N; ++j) @@ -524,7 +524,7 @@ namespace blast template template - requires MatrixPointer && (P::storageOrder == columnMajor) + requires MatrixPointer && (P::storageOrder == columnMajor) inline void RegisterMatrix::storeLower(P p, size_t m, size_t n) const noexcept { for (size_t j = 0; j < N; ++j) if (j < n) @@ -802,23 +802,23 @@ namespace blast } - template - inline bool operator==(RegisterMatrix const& rm, Matrix const& m) + template + inline bool operator==(RegisterMatrix const& rm, MT const& m) { if (rows(m) != rm.rows() || columns(m) != rm.columns()) return false; for (size_t i = 0; i < rm.rows(); ++i) for (size_t j = 0; j < rm.columns(); ++j) - if (rm(i, j) != (*m)(i, j)) + if (rm(i, j) != m(i, j)) return false; return true; } - template - inline bool operator==(Matrix const& m, RegisterMatrix const& rm) + template + inline bool operator==(MT const& m, RegisterMatrix const& rm) { return rm == m; } diff --git a/include/blast/math/typetraits/IsContiguous.hpp b/include/blast/math/typetraits/IsContiguous.hpp new file mode 100644 index 00000000..5a978b7b --- /dev/null +++ b/include/blast/math/typetraits/IsContiguous.hpp @@ -0,0 +1,26 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + + +namespace blast +{ + /** + * @brief Tests whether the elements of a given data type lie contiguous in memory + * + * @tparam T data type + */ + template + struct IsContiguous; + + + /** + * @brief Shortcut for @a IsContiguous::value + * + * @tparam T data type + */ + template + bool constexpr IsContiguous_v = IsContiguous::value; +} diff --git a/include/blast/math/typetraits/IsDenseMatrix.hpp b/include/blast/math/typetraits/IsDenseMatrix.hpp new file mode 100644 index 00000000..0709f0f8 --- /dev/null +++ b/include/blast/math/typetraits/IsDenseMatrix.hpp @@ -0,0 +1,27 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + +namespace blast +{ + /** + * @brief Tests if the given data type is a dense matrix with column- or row-major storage + * + * @tparam T data type + */ + template + struct IsDenseMatrix : std::false_type {}; + + + /** + * @brief Shortcut for @a IsDenseMatrix::value + * + * @tparam T data type + */ + template + bool constexpr IsDenseMatrix_v = IsDenseMatrix::value; +} diff --git a/include/blast/math/typetraits/IsDenseVector.hpp b/include/blast/math/typetraits/IsDenseVector.hpp new file mode 100644 index 00000000..ae0df964 --- /dev/null +++ b/include/blast/math/typetraits/IsDenseVector.hpp @@ -0,0 +1,28 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + + +namespace blast +{ + /** + * @brief Tests if the given data type is a vector with dense storage + * + * @tparam T data type + */ + template + struct IsDenseVector : std::false_type {}; + + + /** + * @brief Shortcut for @a IsDenseVector::value + * + * @tparam T data type + */ + template + bool constexpr IsDenseVector_v = IsDenseVector::value; +} diff --git a/include/blast/math/typetraits/IsStatic.hpp b/include/blast/math/typetraits/IsStatic.hpp index 0f60867e..a6bb784b 100644 --- a/include/blast/math/typetraits/IsStatic.hpp +++ b/include/blast/math/typetraits/IsStatic.hpp @@ -1,23 +1,26 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #pragma once -#include - namespace blast { - using blaze::IsStatic_v; -} \ No newline at end of file + /** + * @brief Tests if dimensions of the given matrix or vector type are known at compile-time. + * + * @tparam T matrix or vector type + */ + template + struct IsStatic; + + + /** + * @brief Shortcut for @a IsStatic::value + * + * @tparam T matrix or vector type + */ + template + bool constexpr IsStatic_v = IsStatic::value; +} diff --git a/include/blast/math/typetraits/IsStaticallySpaced.hpp b/include/blast/math/typetraits/IsStaticallySpaced.hpp new file mode 100644 index 00000000..97abccc7 --- /dev/null +++ b/include/blast/math/typetraits/IsStaticallySpaced.hpp @@ -0,0 +1,27 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + + +namespace blast +{ + /** + * @brief Tests if the given vector data type has spacing between elements + * which is known at compile-time. + * + * @tparam VT vector data type + */ + template + struct IsStaticallySpaced; + + + /** + * @brief Shortcut for @a IsStaticallySpaced::value + * + * @tparam VT vector data type + */ + template + bool constexpr IsStaticallySpaced_v = IsStaticallySpaced::value; +} diff --git a/include/blast/math/typetraits/Matrix.hpp b/include/blast/math/typetraits/Matrix.hpp new file mode 100644 index 00000000..e263b625 --- /dev/null +++ b/include/blast/math/typetraits/Matrix.hpp @@ -0,0 +1,29 @@ +// Copyright (c) 2019-2023 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include + + +namespace blast +{ + /** + * @brief Matrix concept + * + * @tparam M matrix type + * @tparam T element type + */ + template > + concept Matrix = requires(M m, T v, T * p, size_t i, size_t j) + { + m; + // m(i, j) = v; + v = m(i, j); + i = rows(m); + j = columns(m); + // p = data(m); + }; +} diff --git a/include/blast/math/typetraits/Spacing.hpp b/include/blast/math/typetraits/Spacing.hpp new file mode 100644 index 00000000..118da285 --- /dev/null +++ b/include/blast/math/typetraits/Spacing.hpp @@ -0,0 +1,32 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include + + +namespace blast +{ + /** + * @brief Deduces memory spacing of the given data type if it is known at compile-time + * - for row-major matrices, deduced to the memory distance between rows + * - for column-major matrices, deduced to the memory distance between columns + * - for panel-major matrices, deduced to memory distance between panels + * - for vectors, deduced to memory distance between consecutive elements + * + * @tparam T data type + */ + template + struct Spacing; + + + /** + * @brief Shortcut for @a Spacing::value + * + * @tparam MT matrix type + */ + template + size_t constexpr Spacing_v = Spacing::value; +} diff --git a/include/blast/math/typetraits/Vector.hpp b/include/blast/math/typetraits/Vector.hpp new file mode 100644 index 00000000..1ebbff20 --- /dev/null +++ b/include/blast/math/typetraits/Vector.hpp @@ -0,0 +1,29 @@ +// Copyright (c) 2019-2023 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include + + +namespace blast +{ + /** + * @brief Vector concept + * + * @tparam V vector type + * @tparam T element type + */ + template > + concept Vector = requires(V v, T a, T * p, size_t i) + { + v; + // v[i] = a; + a = v[i]; + i = size(v); + // i = spacing(v); + p = data(v); + }; +} diff --git a/include/blast/util/Types.hpp b/include/blast/util/Types.hpp new file mode 100644 index 00000000..f7c82547 --- /dev/null +++ b/include/blast/util/Types.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace blast +{ + using std::size_t; + using std::ptrdiff_t; +} diff --git a/test/blast/math/dense/DynamicVectorPointerTest.cpp b/test/blast/math/dense/DynamicVectorPointerTest.cpp index de9d27db..4ce95ca0 100644 --- a/test/blast/math/dense/DynamicVectorPointerTest.cpp +++ b/test/blast/math/dense/DynamicVectorPointerTest.cpp @@ -2,7 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include +#include #include @@ -210,4 +211,21 @@ namespace blast :: testing { this->template testMatrixColumnSubvectorImpl(); } -} \ No newline at end of file + + + TEST(DynamicVectorPointerTest, testSpacing) + { + blaze::DynamicVector dv(10); + ASSERT_EQ(spacing(dv), 1); + + blaze::StaticVector sv; + ASSERT_EQ(spacing(sv), 1); + } + + + TEST(DynamicVectorPointerTest, testStaticallySpaced) + { + ASSERT_TRUE((IsStaticallySpaced_v>)); + ASSERT_TRUE((IsStaticallySpaced_v>)); + } +} diff --git a/test/blast/math/dense/GemmTest.cpp b/test/blast/math/dense/GemmTest.cpp index e19eaca7..82a8b318 100644 --- a/test/blast/math/dense/GemmTest.cpp +++ b/test/blast/math/dense/GemmTest.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include namespace blast :: testing @@ -121,4 +121,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DenseGemmTest, double); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/GerTest.cpp b/test/blast/math/dense/GerTest.cpp index 70576ef1..cbf84913 100644 --- a/test/blast/math/dense/GerTest.cpp +++ b/test/blast/math/dense/GerTest.cpp @@ -6,8 +6,8 @@ #define BLAST_USER_ASSERTION 1 #include - - +#include +#include #include #include @@ -64,4 +64,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DenseGerTest, double); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/Getf2Test.cpp b/test/blast/math/dense/Getf2Test.cpp index 80cdab7d..9f725701 100644 --- a/test/blast/math/dense/Getf2Test.cpp +++ b/test/blast/math/dense/Getf2Test.cpp @@ -2,9 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. - #include #include +#include #include #include @@ -23,13 +23,13 @@ namespace blast :: testing using Real = T; template - static DynamicMatrix luRestore(Matrix const& LU, size_t * ipiv) + static blaze::DynamicMatrix luRestore(blaze::Matrix const& LU, size_t * ipiv) { auto const M = rows(LU); auto const N = columns(LU); auto const K = std::min(M, N); - DynamicMatrix L(M, K, 0.); + blaze::DynamicMatrix L(M, K, 0.); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < i && j < K; ++j) @@ -39,7 +39,7 @@ namespace blast :: testing L(i, i) = 1.; } - DynamicMatrix U(K, N, 0.); + blaze::DynamicMatrix U(K, N, 0.); for (size_t i = 0; i < K; ++i) { for (size_t j = i; j < N; ++j) @@ -63,9 +63,9 @@ namespace blast :: testing // Init matrices // - DynamicMatrix A(M, N); + blaze::DynamicMatrix A(M, N); randomize(A); - DynamicMatrix A_orig = A; + blaze::DynamicMatrix A_orig = A; // Do getrf std::vector ipiv(K); @@ -105,4 +105,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DenseGetf2Test, double); // INSTANTIATE_TYPED_TEST_SUITE_P(Potrf_float, DensePotrtTest, float); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/GetrfTest.cpp b/test/blast/math/dense/GetrfTest.cpp index 4a1a3732..59eaa7cb 100644 --- a/test/blast/math/dense/GetrfTest.cpp +++ b/test/blast/math/dense/GetrfTest.cpp @@ -2,14 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "blast/math/dense/Laswp.hpp" #include #include #include - #include +#include -#include #include #include #include @@ -27,13 +25,13 @@ namespace blast :: testing using Real = T; template - static DynamicMatrix luRestore(Matrix const& LU, size_t * ipiv) + static blaze::DynamicMatrix luRestore(blaze::Matrix const& LU, size_t * ipiv) { auto const M = rows(LU); auto const N = columns(LU); auto const K = std::min(M, N); - DynamicMatrix L(M, K, 0.); + blaze::DynamicMatrix L(M, K, 0.); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < i && j < K; ++j) @@ -43,7 +41,7 @@ namespace blast :: testing L(i, i) = 1.; } - DynamicMatrix U(K, N, 0.); + blaze::DynamicMatrix U(K, N, 0.); for (size_t i = 0; i < K; ++i) { for (size_t j = i; j < N; ++j) @@ -67,9 +65,9 @@ namespace blast :: testing // Init matrices // - DynamicMatrix A(M, N); + blaze::DynamicMatrix A(M, N); randomize(A); - DynamicMatrix A_orig = A; + blaze::DynamicMatrix A_orig = A; // Do getrf std::vector ipiv(K); @@ -132,4 +130,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DenseGetrfTest, double); // INSTANTIATE_TYPED_TEST_SUITE_P(Potrf_float, DensePotrtTest, float); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/Iamax.cpp b/test/blast/math/dense/Iamax.cpp index 7d0cb254..c66134e1 100644 --- a/test/blast/math/dense/Iamax.cpp +++ b/test/blast/math/dense/Iamax.cpp @@ -1,18 +1,9 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #include +#include #include #include @@ -37,7 +28,7 @@ namespace blast :: testing for (size_t n = 1; n < 50; ++n) { - DynamicVector x(n); + blaze::DynamicVector x(n); randomize(x); size_t const ind = iamax(x); @@ -46,4 +37,4 @@ namespace blast :: testing ASSERT_LE(std::abs(v), x[ind]) << "Error at size " << n; } } -} \ No newline at end of file +} diff --git a/test/blast/math/dense/MatrixPointerTest.cpp b/test/blast/math/dense/MatrixPointerTest.cpp index 5bc45c2f..123a4dc0 100644 --- a/test/blast/math/dense/MatrixPointerTest.cpp +++ b/test/blast/math/dense/MatrixPointerTest.cpp @@ -2,19 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include -#include +#include #include #include -#include -#include -#include #include -#include - -#include +#include namespace blast :: testing @@ -53,7 +47,7 @@ namespace blast :: testing template struct MatrixType> { - using type = blaze::DynamicPanelMatrix; + using type = DynamicPanelMatrix; }; @@ -67,7 +61,7 @@ namespace blast :: testing static size_t constexpr N = SO == columnMajor ? S / SimdSize_v : defaultColumns; public: - using type = blaze::StaticPanelMatrix; + using type = StaticPanelMatrix; static_assert(type::spacing() == S); }; @@ -245,4 +239,4 @@ namespace blast :: testing EXPECT_EQ(p_trans.storageOrder, !p.storageOrder); } } -} \ No newline at end of file +} diff --git a/test/blast/math/dense/PotrfTest.cpp b/test/blast/math/dense/PotrfTest.cpp index 0e8def7a..f8fa24d4 100644 --- a/test/blast/math/dense/PotrfTest.cpp +++ b/test/blast/math/dense/PotrfTest.cpp @@ -2,13 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. - #include #include #include #include +#include + namespace blast :: testing { @@ -30,7 +31,7 @@ namespace blast :: testing { // Init matrices // - DynamicMatrix A(M, M), L(M, M); + blaze::DynamicMatrix A(M, M), L(M, M); makePositiveDefinite(A); reset(L); @@ -51,7 +52,7 @@ namespace blast :: testing // Init matrices // - StaticMatrix A, L; + blaze::StaticMatrix A, L; makePositiveDefinite(A); reset(L); @@ -71,10 +72,10 @@ namespace blast :: testing // Init matrices // - StaticMatrix A_orig; + blaze::StaticMatrix A_orig; makePositiveDefinite(A_orig); - StaticMatrix A = A_orig; + blaze::StaticMatrix A = A_orig; for (size_t i = 0; i < M; ++i) for (size_t j = i + 1; j < M; ++j) reset(A(i, j)); @@ -96,4 +97,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DensePotrtTest, double); // INSTANTIATE_TYPED_TEST_SUITE_P(Potrf_float, DensePotrtTest, float); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/StaticVectorPointerTest.cpp b/test/blast/math/dense/StaticVectorPointerTest.cpp index c25d3a43..a8050c8f 100644 --- a/test/blast/math/dense/StaticVectorPointerTest.cpp +++ b/test/blast/math/dense/StaticVectorPointerTest.cpp @@ -2,7 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include +#include #include @@ -114,4 +115,4 @@ namespace blast :: testing { this->template testMatrixRowSubvectorImpl(); } -} \ No newline at end of file +} diff --git a/test/blast/math/dense/SyrkTest.cpp b/test/blast/math/dense/SyrkTest.cpp index 7a2334ae..897b4d34 100644 --- a/test/blast/math/dense/SyrkTest.cpp +++ b/test/blast/math/dense/SyrkTest.cpp @@ -6,11 +6,12 @@ #include - - #include #include +#include + + namespace blast :: testing { template @@ -32,8 +33,8 @@ namespace blast :: testing { // Init Blaze matrices // - DynamicMatrix A(m, k); - DynamicMatrix C(m, m), D(m, m); + blaze::DynamicMatrix A(m, k); + blaze::DynamicMatrix C(m, m), D(m, m); randomize(A); makeSymmetric(C); // for (size_t i = 0; i < m; ++i) @@ -71,4 +72,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, DenseSyrkTest, double); -} \ No newline at end of file +} diff --git a/test/blast/math/dense/TrmmTest.cpp b/test/blast/math/dense/TrmmTest.cpp index e5c08041..a61e3c4c 100644 --- a/test/blast/math/dense/TrmmTest.cpp +++ b/test/blast/math/dense/TrmmTest.cpp @@ -4,8 +4,7 @@ #include #include - - +#include #include #include @@ -70,4 +69,4 @@ namespace blast :: testing << "trmm error at size m,n=" << m << "," << n; } } -} \ No newline at end of file +} diff --git a/test/blast/math/panel/PotrfTest.cpp b/test/blast/math/panel/PotrfTest.cpp index 28c27115..d0c267cd 100644 --- a/test/blast/math/panel/PotrfTest.cpp +++ b/test/blast/math/panel/PotrfTest.cpp @@ -5,8 +5,7 @@ #include #include #include - -#include +#include #include #include diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp index c319a422..486b61c8 100644 --- a/test/blast/math/panel/StaticPanelMatrixTest.cpp +++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include -#include +#include #include #include diff --git a/test/blast/math/simd/DynamicRegisterMatrixTest.cpp b/test/blast/math/simd/DynamicRegisterMatrixTest.cpp index ae1f91fb..45908e9b 100644 --- a/test/blast/math/simd/DynamicRegisterMatrixTest.cpp +++ b/test/blast/math/simd/DynamicRegisterMatrixTest.cpp @@ -4,8 +4,9 @@ #include #include -#include +#include #include +#include #include #include @@ -223,4 +224,4 @@ namespace blast :: testing // std::clog << "DynamicRegisterMatrixTest.testPotrf not implemented for kernels with columns more than rows!" << std::endl; // } // } -} \ No newline at end of file +} diff --git a/test/blast/math/simd/RegisterMatrixTest.cpp b/test/blast/math/simd/RegisterMatrixTest.cpp index a09865f1..2ad3866a 100644 --- a/test/blast/math/simd/RegisterMatrixTest.cpp +++ b/test/blast/math/simd/RegisterMatrixTest.cpp @@ -4,17 +4,15 @@ #include #include -#include -#include -#include +#include +#include #include +#include #include #include #include -#include - namespace blast :: testing { @@ -376,7 +374,7 @@ namespace blast :: testing blaze::randomize(alpha); TypeParam ker; - ker.load(1., ptr(C)); + ker.load(1., ptr(*C)); ker.ger(alpha, ptr(a), ptr(b)); BLAST_EXPECT_APPROX_EQ(ker, evaluate(C + alpha * a * b), absTol(), relTol()); @@ -445,7 +443,7 @@ namespace blast :: testing for (size_t n = 0; n <= columns(C); ++n) { TypeParam ker; - ker.load(1., ptr(C)); + ker.load(1., ptr(*C)); ker.ger(ET(1.), ptr(a), ptr(trans(b)), m, n); for (size_t i = 0; i < m; ++i) @@ -473,9 +471,9 @@ namespace blast :: testing randomize(C); TypeParam ker; - ker.load(1., ptr(C)); + ker.load(1., ptr(*C)); ker.ger(ET(1.), ptr(a), ptr(trans(b))); - ker.store(ptr(D)); + ker.store(ptr(*D)); BLAST_EXPECT_EQ(D, evaluate(C + a * trans(b))); } @@ -643,4 +641,4 @@ namespace blast :: testing // TODO: should be strictly equal? BLAST_ASSERT_APPROX_EQ(ker, alpha * B * A, absTol(), relTol()); } -} \ No newline at end of file +} From ed8835672db7ccdf8092b4e28cc208aaa030031f Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Mon, 5 Aug 2024 14:17:51 +0200 Subject: [PATCH 04/12] Low-level operations using xsimd (#12) --- .github/workflows/cmake.yml | 22 +- CMakeLists.txt | 2 + Dockerfile | 3 +- include/blast/math/RegisterMatrix.hpp | 3 - include/blast/math/RowColumnVectorPointer.hpp | 10 +- include/blast/math/Simd.hpp | 12 + .../blast/math/dense/DynamicMatrixPointer.hpp | 17 +- .../blast/math/dense/DynamicVectorPointer.hpp | 26 +- include/blast/math/dense/Iamax.hpp | 3 +- .../blast/math/dense/StaticMatrixPointer.hpp | 10 +- .../blast/math/dense/StaticVectorPointer.hpp | 30 +- .../blast/math/expressions/PMatTransExpr.hpp | 3 +- .../blast/math/expressions/PanelMatrix.hpp | 42 +- .../math/panel/DynamicPanelMatrixPointer.hpp | 11 +- include/blast/math/panel/PanelSize.hpp | 5 +- .../blast/math/panel/StaticPanelMatrix.hpp | 42 +- .../math/panel/StaticPanelMatrixPointer.hpp | 11 +- .../register_matrix/DynamicRegisterMatrix.hpp | 33 +- .../math/register_matrix/RegisterMatrix.hpp | 51 +- .../math/register_matrix/double_12_4_4.hpp | 167 ------- .../math/register_matrix/double_4_4_4.hpp | 79 --- .../math/register_matrix/double_8_4_4.hpp | 151 ------ include/blast/math/simd/Avx256.hpp | 25 - include/blast/math/simd/Hsum.hpp | 31 -- include/blast/math/simd/IntVecType.hpp | 24 - ...IntScalarType.hpp => RegisterCapacity.hpp} | 25 +- include/blast/math/simd/SequenceTag.hpp | 23 - include/blast/math/simd/Simd.hpp | 473 +----------------- include/blast/math/simd/SimdIndex.hpp | 98 ++++ include/blast/math/simd/SimdMask.hpp | 80 ++- include/blast/math/simd/SimdSize.hpp | 22 +- include/blast/math/simd/SimdVec.hpp | 374 +++++++++++++- include/blast/math/simd/SimdVecBase.hpp | 54 -- include/blast/math/simd/arch/Avx2.hpp | 123 +++++ .../blast/math/simd/avx256/IntScalarType.hpp | 34 -- include/blast/math/simd/avx256/SimdMask.hpp | 83 --- include/blast/math/simd/avx256/SimdSize.hpp | 50 -- .../blast/math/simd/avx256/SimdVecFloat32.hpp | 263 ---------- .../blast/math/simd/avx256/SimdVecFloat64.hpp | 234 --------- .../blast/math/simd/avx256/SimdVecInt32.hpp | 122 ----- .../blast/math/simd/avx256/SimdVecInt64.hpp | 124 ----- include/blast/math/views/row/Panel.hpp | 6 +- include/blast/math/views/submatrix/Panel.hpp | 6 +- test/blast/CMakeLists.txt | 1 - .../math/panel/StaticPanelMatrixTest.cpp | 15 +- test/blast/math/simd/HsumTest.cpp | 38 -- test/blast/math/simd/SimdVecTest.cpp | 15 +- 47 files changed, 882 insertions(+), 2194 deletions(-) create mode 100644 include/blast/math/Simd.hpp delete mode 100644 include/blast/math/register_matrix/double_12_4_4.hpp delete mode 100644 include/blast/math/register_matrix/double_4_4_4.hpp delete mode 100644 include/blast/math/register_matrix/double_8_4_4.hpp delete mode 100644 include/blast/math/simd/Avx256.hpp delete mode 100644 include/blast/math/simd/Hsum.hpp delete mode 100644 include/blast/math/simd/IntVecType.hpp rename include/blast/math/simd/{IntScalarType.hpp => RegisterCapacity.hpp} (62%) delete mode 100644 include/blast/math/simd/SequenceTag.hpp create mode 100644 include/blast/math/simd/SimdIndex.hpp delete mode 100644 include/blast/math/simd/SimdVecBase.hpp create mode 100644 include/blast/math/simd/arch/Avx2.hpp delete mode 100644 include/blast/math/simd/avx256/IntScalarType.hpp delete mode 100644 include/blast/math/simd/avx256/SimdMask.hpp delete mode 100644 include/blast/math/simd/avx256/SimdSize.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat32.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecFloat64.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecInt32.hpp delete mode 100644 include/blast/math/simd/avx256/SimdVecInt64.hpp delete mode 100644 test/blast/math/simd/HsumTest.cpp diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 9c1ace7c..ece754f4 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -21,7 +21,17 @@ jobs: - uses: actions/checkout@v3 - name: Install APT packages - run: sudo apt install clang libboost-exception-dev libbenchmark-dev -y + run: | + sudo apt-get update + sudo apt install libboost-exception-dev libbenchmark-dev -y + + - name: Install LLVM and Clang 18 + run: | + sudo apt-get update + sudo apt-get install -y wget gnupg lsb-release + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 - name: Install Blaze run: | @@ -36,6 +46,13 @@ jobs: -DCMAKE_INSTALL_PREFIX=/usr/local/ . \ && sudo make install + - name: Install xsimd + run: | + git clone https://github.com/xtensor-stack/xsimd.git + cd xsimd + cmake -B build -DCMAKE_INSTALL_PREFIX=/usr/local/ . + sudo cmake --build build --target install + - name: Install GTest run: | git clone https://github.com/google/googletest.git @@ -47,7 +64,7 @@ jobs: run: | cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ - -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_COMPILER=clang++-18 \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON @@ -60,4 +77,3 @@ jobs: # Execute tests defined by the CMake configuration. # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail run: ctest -C ${{env.BUILD_TYPE}} - diff --git a/CMakeLists.txt b/CMakeLists.txt index faffe2fd..3b0ad928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ set(CMAKE_DEBUG_POSTFIX d) find_package(Boost REQUIRED COMPONENTS exception) find_package(blaze REQUIRED) +find_package(xsimd REQUIRED) add_library(blast INTERFACE) @@ -35,6 +36,7 @@ target_include_directories(blast INTERFACE target_link_libraries(blast INTERFACE blaze::blaze + INTERFACE xsimd ) target_compile_options(blast diff --git a/Dockerfile b/Dockerfile index 1ccfab49..fb47dea6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib RUN mkdir -p blast/build && cd blast/build \ && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER="clang++-15" \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -DXSIMD_DEFAULT_ARCH='fma3'" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" .. \ -DBLAST_WITH_TEST=ON \ -DBLAST_WITH_BENCHMARK=ON \ @@ -79,4 +79,3 @@ CMD mkdir -p blast/bench_result/data \ && mkdir -p blast/bench_result/image \ && cd blast \ && make -j 1 bench_result/image/dgemm_performance.png bench_result/image/dgemm_performance_ratio.png - diff --git a/include/blast/math/RegisterMatrix.hpp b/include/blast/math/RegisterMatrix.hpp index 3c8944c4..851ef7ef 100644 --- a/include/blast/math/RegisterMatrix.hpp +++ b/include/blast/math/RegisterMatrix.hpp @@ -7,6 +7,3 @@ #include #include -#include -#include -#include diff --git a/include/blast/math/RowColumnVectorPointer.hpp b/include/blast/math/RowColumnVectorPointer.hpp index e13c311f..ae7a8b5a 100644 --- a/include/blast/math/RowColumnVectorPointer.hpp +++ b/include/blast/math/RowColumnVectorPointer.hpp @@ -15,7 +15,7 @@ #pragma once #include -#include +#include #include #include @@ -28,9 +28,9 @@ namespace blast { public: using ElementType = typename MP::ElementType; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static TransposeFlag constexpr transposeFlag = TF; static bool constexpr aligned = MP::aligned; @@ -176,7 +176,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); MP ptr_; }; @@ -244,4 +244,4 @@ namespace blast { return RowColumnVectorPointer {std::move(p)}; } -} \ No newline at end of file +} diff --git a/include/blast/math/Simd.hpp b/include/blast/math/Simd.hpp new file mode 100644 index 00000000..912948ce --- /dev/null +++ b/include/blast/math/Simd.hpp @@ -0,0 +1,12 @@ +// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include +#include diff --git a/include/blast/math/dense/DynamicMatrixPointer.hpp b/include/blast/math/dense/DynamicMatrixPointer.hpp index e6540576..5610b1c9 100644 --- a/include/blast/math/dense/DynamicMatrixPointer.hpp +++ b/include/blast/math/dense/DynamicMatrixPointer.hpp @@ -8,14 +8,11 @@ #include #include #include -#include -#include +#include #include - #include - namespace blast { template @@ -23,9 +20,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -83,9 +80,9 @@ namespace blast } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } @@ -187,7 +184,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -253,4 +250,4 @@ namespace blast { return {p, spacing}; } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/DynamicVectorPointer.hpp b/include/blast/math/dense/DynamicVectorPointer.hpp index b026d205..7638119a 100644 --- a/include/blast/math/dense/DynamicVectorPointer.hpp +++ b/include/blast/math/dense/DynamicVectorPointer.hpp @@ -4,8 +4,7 @@ #pragma once - -#include +#include #include #include @@ -18,9 +17,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -40,7 +39,7 @@ namespace blast , spacing_ {spacing} { BLAST_USER_ASSERT(spacing > 0, "Vector element spacing must be positive."); - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -51,6 +50,7 @@ namespace blast SimdVecType load() const noexcept { // Non-optimized + // TODO: use gather() IntrinsicType v; for (size_t i = 0; i < SS; ++i) v[i] = ptr_[spacing_ * i]; @@ -62,18 +62,18 @@ namespace blast SimdVecType load(MaskType mask) const noexcept { // Non-optimized - IntrinsicType v = blast::setzero, SS>(); + // TODO: use gather() + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[spacing_ * i]; + v[i] = mask[i] ? ptr_[spacing_ * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } @@ -172,7 +172,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -191,4 +191,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 6b03a36a..530df07b 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -177,4 +176,4 @@ namespace blast return iamax(N, ptr(*x)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticMatrixPointer.hpp b/include/blast/math/dense/StaticMatrixPointer.hpp index 2db833b8..961b5e01 100644 --- a/include/blast/math/dense/StaticMatrixPointer.hpp +++ b/include/blast/math/dense/StaticMatrixPointer.hpp @@ -6,8 +6,8 @@ #include #include -#include #include +#include #include #include #include @@ -24,9 +24,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -188,7 +188,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -225,4 +225,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/dense/StaticVectorPointer.hpp b/include/blast/math/dense/StaticVectorPointer.hpp index 6608ddd0..6017dafa 100644 --- a/include/blast/math/dense/StaticVectorPointer.hpp +++ b/include/blast/math/dense/StaticVectorPointer.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include #include @@ -16,9 +16,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr transposeFlag = TF; static bool constexpr aligned = AF; @@ -35,7 +35,7 @@ namespace blast constexpr StaticVectorPointer(T * ptr) noexcept : ptr_ {ptr} { - BLAST_USER_ASSERT(!AF || reinterpret_cast(ptr) % (SS * sizeof(T)) == 0, "Pointer is not aligned"); + BLAST_USER_ASSERT(!AF || isSimdAligned(ptr), "Pointer is not aligned"); } @@ -70,27 +70,27 @@ namespace blast else { // Non-optimized - IntrinsicType v = blast::setzero(); + // TODO: use gather + T v[SS]; for (size_t i = 0; i < SS; ++i) - if (mask[i]) - v[i] = ptr_[S * i]; + v[i] = mask[i] ? ptr_[S * i] : T {}; - return SimdVecType {v}; + return SimdVecType {v, false}; } } - IntrinsicType broadcast() const noexcept + SimdVecType broadcast() const noexcept { - return blast::broadcast(ptr_); + return *ptr_; } - void store(IntrinsicType val) const noexcept + void store(SimdVecType val) const noexcept { if constexpr (S == 1) { - blast::store(ptr_, val); + val.store(ptr_, AF); } else { @@ -105,7 +105,7 @@ namespace blast { if constexpr (S == 1) { - blast::maskstore(ptr_, mask, val); + val.store(ptr_, mask); } else { @@ -195,7 +195,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); T * ptrOffset(ptrdiff_t i) const noexcept @@ -213,4 +213,4 @@ namespace blast { return p.trans(); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PMatTransExpr.hpp b/include/blast/math/expressions/PMatTransExpr.hpp index fae79ffc..4ad4526f 100644 --- a/include/blast/math/expressions/PMatTransExpr.hpp +++ b/include/blast/math/expressions/PMatTransExpr.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -213,4 +212,4 @@ namespace blast using ReturnType = const PMatTransExpr; return ReturnType(*pm); } -} \ No newline at end of file +} diff --git a/include/blast/math/expressions/PanelMatrix.hpp b/include/blast/math/expressions/PanelMatrix.hpp index 4a556a33..d0494887 100644 --- a/include/blast/math/expressions/PanelMatrix.hpp +++ b/include/blast/math/expressions/PanelMatrix.hpp @@ -5,9 +5,12 @@ #pragma once #include -#include +#include +#include +#include #include -//#include +#include +#include #include #include @@ -91,13 +94,12 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; - using SIMD = Simd; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) @@ -108,22 +110,22 @@ namespace blast for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + s * j, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m - rem; - ET2 const * pr = &(*rhs)(i, 0); - ET1 * pl = data(lhs) + i; + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - maskstore(pl + s * j, mask, load(pr + PANEL_SIZE * j)); + pl(0, j).store(pr(0, j).load(), mask); } #if 0 @@ -185,22 +187,22 @@ namespace blast using ET1 = ElementType_t; using ET2 = ElementType_t; - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdSize_v; static size_t constexpr PANEL_SIZE = PanelSize_v; static_assert(PANEL_SIZE % SS == 0); - using MaskType = typename Simd::MaskType; - using IntType = typename Simd::IntType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; if constexpr (SO1 == columnMajor && SO2 == columnMajor) { for (size_t i = 0; i + SS <= m; i += SS) { - ET2 const * pr = data(rhs) + i; - ET1 * pl = &(*lhs)(i, 0); + auto pr = ptr(*rhs, i, 0); + auto pl = ptr(*lhs, i, 0); for (size_t j = 0; j < n; ++j) - store(pl + PANEL_SIZE * j, load(pr + s * j)); + pl(0, j).store(pr(0, j).load()); } if (IntType const rem = m % SS) @@ -243,4 +245,4 @@ namespace blast (*lhs)(i, j) = (*rhs)(i, j); } } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp index f965f4e1..1e3f1233 100644 --- a/include/blast/math/panel/DynamicPanelMatrixPointer.hpp +++ b/include/blast/math/panel/DynamicPanelMatrixPointer.hpp @@ -5,10 +5,9 @@ #pragma once #include -#include #include #include -#include +#include #include #include #include @@ -23,9 +22,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -211,7 +210,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -275,4 +274,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 027d5c63..6bcf6bbf 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -8,11 +8,12 @@ // Includes //************************************************************************************************* +#include #include namespace blast { - template - size_t constexpr PanelSize_v = Simd::size; + template + size_t constexpr PanelSize_v = SimdSize_v; } diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index cfeda753..585b6f6b 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -160,45 +159,6 @@ namespace blast } - Type * ptr(size_t i, size_t j) - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - Type const * ptr(size_t i, size_t j) const - { - // BLAST_USER_ASSERT(i % panelSize_ == 0, "Row index not aligned to panel boundary"); - return v_ + elementIndex(i, j); - } - - - template - auto load(size_t i, size_t j) const - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - return blast::load(v_ + elementIndex(i, j)); - } - - - template - void store(size_t i, size_t j, T val) - { - BLAZE_INTERNAL_ASSERT(i < M, "Invalid row access index"); - BLAZE_INTERNAL_ASSERT(j < N, "Invalid column access index"); - BLAZE_INTERNAL_ASSERT(i % panelSize_ == 0 || SO == rowMajor, "Row index not aligned to panel boundary"); - BLAZE_INTERNAL_ASSERT(j % panelSize_ == 0 || SO == columnMajor, "Column index not aligned to panel boundary"); - - // We never use maskstore here because we have padding - blast::store(v_ + elementIndex(i, j), val); - } - - private: static size_t constexpr panelSize_ = PanelSize_v; static size_t constexpr tileRows_ = M / panelSize_ + (M % panelSize_ > 0); @@ -329,4 +289,4 @@ namespace blaze { using Type = blast::StaticPanelMatrix; }; -} \ No newline at end of file +} diff --git a/include/blast/math/panel/StaticPanelMatrixPointer.hpp b/include/blast/math/panel/StaticPanelMatrixPointer.hpp index 50544d02..aaadbda9 100644 --- a/include/blast/math/panel/StaticPanelMatrixPointer.hpp +++ b/include/blast/math/panel/StaticPanelMatrixPointer.hpp @@ -7,8 +7,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -21,9 +20,9 @@ namespace blast { public: using ElementType = T; - using IntrinsicType = typename Simd>::IntrinsicType; - using MaskType = typename Simd>::MaskType; using SimdVecType = SimdVec>; + using IntrinsicType = SimdVecType::IntrinsicType; + using MaskType = SimdMask>; static bool constexpr storageOrder = SO; static bool constexpr aligned = AF; @@ -204,7 +203,7 @@ namespace blast private: - static size_t constexpr SS = Simd>::size; + static size_t constexpr SS = SimdVecType::size(); static TransposeFlag constexpr majorOrientation = SO == columnMajor ? columnVector : rowVector; @@ -267,4 +266,4 @@ namespace blast { return trans(ptr(m.operand(), j, i)); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp index a1b857cc..9cd7f826 100644 --- a/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp +++ b/include/blast/math/register_matrix/DynamicRegisterMatrix.hpp @@ -4,8 +4,7 @@ #pragma once -#include -#include +#include #include #include #include @@ -18,8 +17,6 @@ #include #include -#include - namespace blast { @@ -121,7 +118,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } @@ -250,14 +247,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -266,9 +263,9 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a DynamicRegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a DynamicRegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; size_t const m_; size_t const n_; @@ -329,7 +326,7 @@ namespace blast if (IntType const rem = m_ % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m_ / SS; for (size_t j = 0; j < n_ && j < columns(); ++j) @@ -349,10 +346,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m_ - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -490,7 +487,7 @@ namespace blast // ax[i] = alpha * a.load(i * SS, 0); // if (rem) - // ax[ii] = alpha * a.load(ii * SS, 0, SIMD::index() < rem); + // ax[ii] = alpha * a.load(ii * SS, 0, indexSequence() < rem); // #pragma unroll // for (size_t j = 0; j < N; ++j) @@ -598,4 +595,4 @@ namespace blast ker.axpy(beta, c); ker.store(d); } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/RegisterMatrix.hpp b/include/blast/math/register_matrix/RegisterMatrix.hpp index d4309043..631b530d 100644 --- a/include/blast/math/register_matrix/RegisterMatrix.hpp +++ b/include/blast/math/register_matrix/RegisterMatrix.hpp @@ -4,8 +4,7 @@ #pragma once -#include -#include +#include #include #include #include @@ -20,8 +19,6 @@ #include #include -#include - namespace blast { @@ -117,7 +114,7 @@ namespace blast { for (size_t i = 0; i < RM; ++i) for (size_t j = 0; j < N; ++j) - v_[i][j] = setzero(); + v_[i][j].reset(); } @@ -360,14 +357,14 @@ namespace blast private: - using SIMD = Simd; - using IntrinsicType = typename SIMD::IntrinsicType; - using MaskType = typename SIMD::MaskType; - using IntType = typename SIMD::IntType; - using SimdVecType = SimdVec; + using Arch = xsimd::default_arch; + using SimdVecType = SimdVec; + using IntrinsicType = typename SimdVecType::IntrinsicType; + using MaskType = SimdMask; + using IntType = typename SimdIndex::value_type; // SIMD size - static size_t constexpr SS = Simd::size; + static size_t constexpr SS = SimdVecType::size(); // Numberf of SIMD registers required to store a single column of the matrix. static size_t constexpr RM = M / SS; @@ -376,9 +373,9 @@ namespace blast BLAZE_STATIC_ASSERT_MSG((RM > 0), "Number of rows must be not less than SIMD size"); BLAZE_STATIC_ASSERT_MSG((RN > 0), "Number of columns must be positive"); BLAZE_STATIC_ASSERT_MSG((M % SS == 0), "Number of rows must be a multiple of SIMD size"); - BLAZE_STATIC_ASSERT_MSG((RM * RN <= RegisterCapacity_v), "Not enough registers for a RegisterMatrix"); + BLAZE_STATIC_ASSERT_MSG((RM * RN <= registerCapacity(Arch {})), "Not enough registers for a RegisterMatrix"); - IntrinsicType v_[RM][RN]; + SimdVecType v_[RM][RN]; /// @brief Reference to the matrix element at row \a i and column \a j @@ -461,7 +458,7 @@ namespace blast v_[i][j] = beta * p(SS * i, j).load(); if (size_t const rem = m % SS) - v_[m / SS][j] = beta * p(m - rem, j).load(SIMD::index() < rem); + v_[m / SS][j] = beta * p(m - rem, j).load(indexSequence() < rem); } } } @@ -493,7 +490,7 @@ namespace blast if (IntType const rem = m % SS) { - MaskType const mask = SIMD::index() < rem; + MaskType const mask = indexSequence() < rem; size_t const i = m / SS; for (size_t j = 0; j < n && j < columns(); ++j) @@ -514,7 +511,7 @@ namespace blast if (skip && ri < RM) { - MaskType const mask = SIMD::index() >= skip; + MaskType const mask = indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); ++ri; } @@ -536,10 +533,10 @@ namespace blast { IntType const skip = j - ri * SS; IntType const rem = m - ri * SS; - MaskType mask = SIMD::index() < rem; + MaskType mask = indexSequence() < rem; if (skip > 0) - mask &= SIMD::index() >= skip; + mask &= indexSequence() >= skip; p(SS * ri, j).store(v_[ri][j], mask); } @@ -562,7 +559,7 @@ namespace blast #pragma unroll for (size_t k = 0; k < j; ++k) { - IntrinsicType const a_kj = (~A)(k, j).broadcast(); + SimdVecType const a_kj = (~A)(k, j).broadcast(); #pragma unroll for (size_t i = 0; i < RM; ++i) @@ -596,7 +593,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(T alpha, PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -623,7 +620,7 @@ namespace blast VectorPointer && (PB::transposeFlag == rowVector) BLAZE_ALWAYS_INLINE void RegisterMatrix::ger(PA a, PB b) noexcept { - BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= RegisterCapacity_v), "Not enough registers for ger()"); + BLAZE_STATIC_ASSERT_MSG((RM * RN + RM + 1 <= registerCapacity(Arch {})), "Not enough registers for ger()"); SimdVecType ax[RM]; @@ -697,7 +694,7 @@ namespace blast BLAZE_ALWAYS_INLINE void RegisterMatrix::potrf() noexcept { static_assert(M >= N, "potrf() not implemented for register matrices with columns more than rows"); - static_assert(RM * RN + 2 <= RegisterCapacity_v, "Not enough registers"); + static_assert(RM * RN + 2 <= registerCapacity(Arch {}), "Not enough registers"); #pragma unroll for (size_t k = 0; k < N; ++k) @@ -705,11 +702,11 @@ namespace blast #pragma unroll for (size_t j = 0; j < k; ++j) { - T const a_kj = v_[k / SS][j][k % SS]; + SimdVecType const a_kj = v_[k / SS][j][k % SS]; #pragma unroll for (size_t i = 0; i < RM; ++i) if (i >= k / SS) - v_[i][k] = fnmadd(set1(a_kj), v_[i][j], v_[i][k]); + v_[i][k] = fnmadd(a_kj, v_[i][j], v_[i][k]); } T const sqrt_a_kk = std::sqrt(v_[k / SS][k][k % SS]); @@ -718,7 +715,7 @@ namespace blast for (size_t i = 0; i < RM; ++i) { if (i < k / SS) - v_[i][k] = setzero(); + v_[i][k].reset(); else v_[i][k] /= sqrt_a_kk; } @@ -745,7 +742,7 @@ namespace blast ax[i] = alpha * a(i * SS, 0).load(); if (rem) - ax[ii] = alpha * a(ii * SS, 0).load(SIMD::index() < rem); + ax[ii] = alpha * a(ii * SS, 0).load(indexSequence() < rem); #pragma unroll for (size_t j = 0; j < N; ++j) @@ -825,4 +822,4 @@ namespace blast { return rm == m; } -} \ No newline at end of file +} diff --git a/include/blast/math/register_matrix/double_12_4_4.hpp b/include/blast/math/register_matrix/double_12_4_4.hpp deleted file mode 100644 index a640d2c1..00000000 --- a/include/blast/math/register_matrix/double_12_4_4.hpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - v_[2][0] = beta * ptr(2 * SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - v_[2][1] = beta * ptr(2 * SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - v_[2][2] = beta * ptr(2 * SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - v_[2][3] = beta * ptr(2 * SS, 3).load(); - } - } - - - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 3; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[3], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - bx[2] = alpha * b.load(2 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - bx[2] = alpha * b.load(2 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - bx[2] = alpha * b.load(2 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - bx[2] = alpha * b.load(2 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - v_[2][0] = fmadd(bx[2], ax, v_[2][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - v_[2][1] = fmadd(bx[2], ax, v_[2][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - v_[2][2] = fmadd(bx[2], ax, v_[2][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - v_[2][3] = fmadd(bx[2], ax, v_[2][3]); - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_4_4_4.hpp b/include/blast/math/register_matrix/double_4_4_4.hpp deleted file mode 100644 index 1c73605c..00000000 --- a/include/blast/math/register_matrix/double_4_4_4.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - - -namespace blast -{ - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - v_[0][0] = beta * ptr.load(); - - if (n > 1) - v_[0][1] = beta * ptr(0, 1).load(); - - if (n > 2) - v_[0][2] = beta * ptr(0, 2).load(); - - if (n > 3) - v_[0][3] = beta * ptr(0, 3).load(); - } - - -#if 1 - /// Magically, this function specialization is slightly faster than the default implementation of RegisterMatrix<>::store. - template <> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - BLAZE_ALWAYS_INLINE void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - if (m >= 4) - { - if (n > 0) - ptr.store(v_[0][0]); - - if (n > 1) - ptr(0, 1).store(v_[0][1]); - - if (n > 2) - ptr(0, 2).store(v_[0][2]); - - if (n > 3) - ptr(0, 3).store(v_[0][3]); - } - else if (m > 0) - { - // Magically, the code below is significantly faster than this: - // __m256i const mask = _mm256_cmpgt_epi64(_mm256_set_epi64x(m, m, m, m), _mm256_set_epi64x(3, 2, 1, 0)); - __m256i const mask = _mm256_set_epi64x( - m > 3 ? 0x8000000000000000ULL : 0, - m > 2 ? 0x8000000000000000ULL : 0, - m > 1 ? 0x8000000000000000ULL : 0, - m > 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr.store(v_[0][0], mask); - - if (n > 1) - ptr(0, 1).store(v_[0][1], mask); - - if (n > 2) - ptr(0, 2).store(v_[0][2], mask); - - if (n > 3) - ptr(0, 3).store(v_[0][3], mask); - } - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/register_matrix/double_8_4_4.hpp b/include/blast/math/register_matrix/double_8_4_4.hpp deleted file mode 100644 index 4ec3d416..00000000 --- a/include/blast/math/register_matrix/double_8_4_4.hpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include - -#include - -#include - - -namespace blast -{ - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::load(double beta, P ptr, size_t m, size_t n) noexcept - { - if (n > 0) - { - v_[0][0] = beta * ptr.load(); - v_[1][0] = beta * ptr(SS, 0).load(); - } - - if (n > 1) - { - v_[0][1] = beta * ptr(0, 1).load(); - v_[1][1] = beta * ptr(SS, 1).load(); - } - - if (n > 2) - { - v_[0][2] = beta * ptr(0, 2).load(); - v_[1][2] = beta * ptr(SS, 2).load(); - } - - if (n > 3) - { - v_[0][3] = beta * ptr(0, 3).load(); - v_[1][3] = beta * ptr(SS, 3).load(); - } - } - - -#if 1 - template<> - template - requires MatrixPointer && (P::storageOrder == columnMajor) - inline void RegisterMatrix::store(P ptr, size_t m, size_t n) const noexcept - { - #pragma unroll - for (size_t i = 0; i < 2; ++i) - { - if (m >= 4 * i + 4) - { - if (n > 0) - ptr(SS * i, 0).store(v_[i][0]); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1]); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2]); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3]); - } - else if (m > 4 * i) - { - __m256i const mask = _mm256_set_epi64x( - m > 4 * i + 3 ? 0x8000000000000000ULL : 0, - m > 4 * i + 2 ? 0x8000000000000000ULL : 0, - m > 4 * i + 1 ? 0x8000000000000000ULL : 0, - m > 4 * i + 0 ? 0x8000000000000000ULL : 0); - - if (n > 0) - ptr(SS * i, 0).store(v_[i][0], mask); - - if (n > 1) - ptr(SS * i, 1).store(v_[i][1], mask); - - if (n > 2) - ptr(SS * i, 2).store(v_[i][2], mask); - - if (n > 3) - ptr(SS * i, 3).store(v_[i][3], mask); - } - } - } -#endif - - -#ifdef BLAST_USE_CUSTOM_TRMM_RIGHT_LOWER - template <> - template - requires MatrixPointer && (PB::storageOrder == columnMajor) && MatrixPointer - BLAZE_ALWAYS_INLINE void RegisterMatrix::trmmRightLower(double alpha, PB b, PA a) noexcept - { - IntrinsicType bx[2], ax; - - // k == 0 - bx[0] = alpha * b.load(0 * SS, 0); - bx[1] = alpha * b.load(1 * SS, 0); - ax = a.broadcast(0, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - - // k == 1 - bx[0] = alpha * b.load(0 * SS, 1); - bx[1] = alpha * b.load(1 * SS, 1); - ax = a.broadcast(1, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(1, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - - // k == 2 - bx[0] = alpha * b.load(0 * SS, 2); - bx[1] = alpha * b.load(1 * SS, 2); - ax = a.broadcast(2, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(2, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(2, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - - // k == 3 - bx[0] = alpha * b.load(0 * SS, 3); - bx[1] = alpha * b.load(1 * SS, 3); - ax = a.broadcast(3, 0); - v_[0][0] = fmadd(bx[0], ax, v_[0][0]); - v_[1][0] = fmadd(bx[1], ax, v_[1][0]); - ax = a.broadcast(3, 1); - v_[0][1] = fmadd(bx[0], ax, v_[0][1]); - v_[1][1] = fmadd(bx[1], ax, v_[1][1]); - ax = a.broadcast(3, 2); - v_[0][2] = fmadd(bx[0], ax, v_[0][2]); - v_[1][2] = fmadd(bx[1], ax, v_[1][2]); - ax = a.broadcast(3, 3); - v_[0][3] = fmadd(bx[0], ax, v_[0][3]); - v_[1][3] = fmadd(bx[1], ax, v_[1][3]); - } -#endif -} \ No newline at end of file diff --git a/include/blast/math/simd/Avx256.hpp b/include/blast/math/simd/Avx256.hpp deleted file mode 100644 index 8e34d3b3..00000000 --- a/include/blast/math/simd/Avx256.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -// #include -#include - -#include diff --git a/include/blast/math/simd/Hsum.hpp b/include/blast/math/simd/Hsum.hpp deleted file mode 100644 index 9c7b5f4b..00000000 --- a/include/blast/math/simd/Hsum.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include - - -namespace blast -{ - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m128d hsum(__m256d v0, __m256d v1) - { - __m256d v01 = _mm256_hadd_pd(v0, v1); - return _mm_add_pd(_mm256_extractf128_pd(v01, 1), _mm256_castpd256_pd128(v01)); - } - - - /// dst[255:192] = sum(v3) - /// dst[191:128] = sum(v2) - /// dst[127:64] = sum(v1) - /// dst[63:0] = sum(v0) - inline __m256d hsum(__m256d v0, __m256d v1, __m256d v2, __m256d v3) - { - __m128d v01 = hsum(v0, v1); - __m128d v23 = hsum(v2, v3); - return _mm256_insertf128_pd(_mm256_castpd128_pd256(v01), v23, 1); - } -} \ No newline at end of file diff --git a/include/blast/math/simd/IntVecType.hpp b/include/blast/math/simd/IntVecType.hpp deleted file mode 100644 index 0f8faab7..00000000 --- a/include/blast/math/simd/IntVecType.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -namespace blast -{ - template - using IntVecType_t = SimdVec>; -} \ No newline at end of file diff --git a/include/blast/math/simd/IntScalarType.hpp b/include/blast/math/simd/RegisterCapacity.hpp similarity index 62% rename from include/blast/math/simd/IntScalarType.hpp rename to include/blast/math/simd/RegisterCapacity.hpp index 3fb119ac..35b2ac7e 100644 --- a/include/blast/math/simd/IntScalarType.hpp +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -1,29 +1,32 @@ -// Copyright 2023 Mikhail Katliar +// Copyright 2024 Mikhail Katliar // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once +#include + #include -#include namespace blast { - template - struct IntScalarType; - - - template - using IntScalarType_t = typename IntScalarType::type; -} \ No newline at end of file + /** + * @brief Number of available SIMD registers. + * + * @return Number of SIMD registers for AVX2 + */ + std::size_t constexpr registerCapacity(xsimd::avx2) + { + return 16; + } +} diff --git a/include/blast/math/simd/SequenceTag.hpp b/include/blast/math/simd/SequenceTag.hpp deleted file mode 100644 index 8f37c552..00000000 --- a/include/blast/math/simd/SequenceTag.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - - -namespace blast :: simd -{ - struct SequenceTag {}; - - inline SequenceTag constexpr sequenceTag; -} \ No newline at end of file diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index 040f4ffe..b5bd8873 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -1,459 +1,20 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once -#include - -#include -#include -#include - -#include - -#include -#include - - -namespace blast -{ - using namespace blaze; - - - template - struct Simd; - - - template <> - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256d; - using MaskType = __m256i; - using IntType = long long; - - static size_t constexpr size = 4; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(_mm256_set1_epi64x(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi64(val_, _mm256_set1_epi64x(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi64x(3, 2, 1, 0); - }; - }; - - - template <> - struct Simd - { - private: - class Index; - - - public: - using IntrinsicType = __m256; - using MaskType = __m256i; - using IntType = int; - - static size_t constexpr size = 8; - static size_t constexpr registerCapacity = 16; - - - //******************************************************* - // - // CUSTOM - // - //******************************************************* - static Index index() noexcept - { - return Index {}; - } - - - private: - class Index - { - public: - MaskType operator<(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(_mm256_set1_epi32(n), val_); - } - - - MaskType operator>(IntType n) const noexcept - { - return _mm256_cmpgt_epi32(val_, _mm256_set1_epi32(n)); - } - - - MaskType operator<=(IntType n) const noexcept - { - return *this < n + 1; - } - - - MaskType operator>=(IntType n) const noexcept - { - return *this > n - 1; - } - - - private: - MaskType val_ = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - }; - }; - - - template - using IntrinsicType_t = typename Simd::IntrinsicType; - - - template - struct SimdTraits; - - - template <> - struct SimdTraits<__m256d> - { - using ScalarType = double; - using MaskType = __m256i; - using IntType = long long; - static size_t constexpr size = 4; - }; - - - template <> - struct SimdTraits<__m256> - { - using ScalarType = float; - using MaskType = __m256i; - using IntType = int; - static size_t constexpr size = 8; - }; - - - template - using ScalarType_t = typename SimdTraits::ScalarType; - - - template - using MaskType_t = typename SimdTraits::MaskType; - - template - using IntType_t = typename SimdTraits::IntType; - - - template - size_t constexpr RegisterCapacity_v = Simd::registerCapacity; - - - //******************************************************* - // - // LOAD - // - //******************************************************* - - template - auto load(T const * ptr); - - - template - inline auto load(T const * ptr) - { - return load::size>(ptr); - } - - - template - auto broadcast(T const * ptr); - - - template <> - inline auto load(double const * ptr) - { - return _mm256_load_pd(ptr); - } - - - template <> - inline auto load(double const * ptr) - { - return _mm256_loadu_pd(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_load_ps(ptr); - } - - - template <> - inline auto load(float const * ptr) - { - return _mm256_loadu_ps(ptr); - } - - - inline auto maskload(double const * ptr, __m256i mask) - { - return _mm256_maskload_pd(ptr, mask); - } - - - inline auto maskload(float const * ptr, __m256i mask) - { - return _mm256_maskload_ps(ptr, mask); - } - - - template <> - inline auto broadcast<4, double>(double const * ptr) - { - return _mm256_broadcast_sd(ptr); - } - - - template <> - inline auto broadcast<8, float>(float const * ptr) - { - return _mm256_broadcast_ss(ptr); - } - - - //******************************************************* - // - // STORE - // - //******************************************************* - template - void store(double * ptr, __m256d a); - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_store_pd(ptr, a); - } - - - template <> - inline void store(double * ptr, __m256d a) - { - _mm256_storeu_pd(ptr, a); - } - - - template - void store(float * ptr, __m256 a); - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_store_ps(ptr, a); - } - - - template <> - inline void store(float * ptr, __m256 a) - { - _mm256_storeu_ps(ptr, a); - } - - - inline void maskstore(double * ptr, __m256i m, __m256d a) - { - _mm256_maskstore_pd(ptr, m, a); - } - - - inline void maskstore(float * ptr, __m256i m, __m256 a) - { - _mm256_maskstore_ps(ptr, m, a); - } - - - //******************************************************* - // - // SET - // - //******************************************************* - - inline auto set(double a3, double a2, double a1, double a0) - { - return _mm256_set_pd(a3, a2, a1, a0); - } - - - inline auto set(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) - { - return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0); - } - - - inline auto set(long long a3, long long a2, long long a1, long long a0) - { - return _mm256_set_epi64x(a3, a2, a1, a0); - } - - - template - auto set1(T a); - - - inline auto set1(double a) - { - return _mm256_set1_pd(a); - } - - - inline auto set1(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<4, double>(double a) - { - return _mm256_set1_pd(a); - } - - - template <> - inline auto set1<8, float>(float a) - { - return _mm256_set1_ps(a); - } - - - template <> - inline auto set1<8, int>(int val) - { - return _mm256_set1_epi32(val); - } - - - template <> - inline auto set1<4, long long>(long long val) - { - return _mm256_set1_epi64x(val); - } - - - template - auto setzero(); - - - template <> - inline auto setzero() - { - return _mm256_setzero_pd(); - } - - - template <> - inline auto setzero() - { - return _mm256_setzero_ps(); - } - - - //******************************************************* - // - // ARITHMETIC - // - //******************************************************* - - inline auto fnmadd(__m256d a, __m256d b, __m256d c) - { - return _mm256_fnmadd_pd(a, b, c); - } - - - inline auto fnmadd(__m256 a, __m256 b, __m256 c) - { - return _mm256_fnmadd_ps(a, b, c); - } - - - inline auto abs(__m256d a) - { - return _mm256_andnot_pd(set1(-0.), a); - } - - - inline auto abs(__m256 a) - { - return _mm256_andnot_ps(set1(-0.f), a); - } - - - //******************************************************* - // - // COMPARE - // - //******************************************************* - - template - auto cmpgt(MM a, MM b); - - - template <> - inline auto cmpgt<4>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi64(a, b); - } - +#include - template <> - inline auto cmpgt<8>(__m256i a, __m256i b) - { - return _mm256_cmpgt_epi32(a, b); - } -} \ No newline at end of file +#if XSIMD_WITH_AVX2 + #include +#endif diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp new file mode 100644 index 00000000..3d7ca7b2 --- /dev/null +++ b/include/blast/math/simd/SimdIndex.hpp @@ -0,0 +1,98 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include + + +namespace blast +{ + namespace detail + { + /// @brief Deduces integer type of given size, sized or unsigned. + /// + /// @tparam N type size in bytes + /// @tparam S true for signed type, false for unsigned + /// + template + struct IntOfSize; + + template + using IntOfSize_t = typename IntOfSize::Type; + + template <> + struct IntOfSize<4, true> + { + using Type = std::int32_t; + }; + + template <> + struct IntOfSize<4, false> + { + using Type = std::uint32_t; + }; + + template <> + struct IntOfSize<8, true> + { + using Type = std::int64_t; + }; + + template <> + struct IntOfSize<8, false> + { + using Type = std::uint64_t; + }; + + template + requires (xsimd::batch::size == 4) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3}; + } + + template + requires (xsimd::batch::size == 8) && std::is_integral_v + inline xsimd::batch indexSequence(T start) noexcept + { + return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7}; + } + } + + + /// @brief Integer SIMD type that can be used for indexing or gather-scatter operations. + /// + /// @tparam T the deduced type will be the same size as @a T + /// @tparam Arch instruction set architecture + /// + template + using SimdIndex = xsimd::batch, Arch>; + + + /// @brief Construct an integer index sequence + /// + /// @param start start of the sequence + /// + /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ] + /// where N = SimdIndex::size + /// + template + inline SimdIndex indexSequence(typename SimdIndex::value_type start = 0) noexcept + { + return detail::indexSequence, Arch>(start); + } +} diff --git a/include/blast/math/simd/SimdMask.hpp b/include/blast/math/simd/SimdMask.hpp index 7e65c867..bdb26082 100644 --- a/include/blast/math/simd/SimdMask.hpp +++ b/include/blast/math/simd/SimdMask.hpp @@ -14,6 +14,8 @@ #pragma once +#include + namespace blast { @@ -22,7 +24,79 @@ namespace blast * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. * * @tparam T the element type simd_mask applies on + * @tparam Arch instruction set architecture */ - template - class SimdMask; -} \ No newline at end of file + template + class SimdMask + : public xsimd::batch_bool + { + public: + using XSimdType = xsimd::batch_bool; + using IntrinsicType = typename XSimdType::register_type; + + /** + * @brief Construct from an intrinsic register type + * + * @param v the value to construct from + */ + SimdMask(IntrinsicType v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of the same type + * + * @param v the value to construct from + */ + SimdMask(XSimdType const& v) noexcept + : XSimdType {v} + { + } + + /** + * @brief Construct from an @a xsimd::batch_bool of a different type + * + * We need to allow conversion from batch_bool with U != T, + * because we need e.g. the following code to work: + * + * int n; + * MaskType mask = indexSequence() < n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() < n is a batch_bool, + * which can not be directly assigned to a batch_bool, + * although their underlying register_type's are identical. + * + * @param v the value to construct from + */ + template + SimdMask(xsimd::batch_bool const& v) noexcept + : XSimdType {xsimd::batch_bool_cast(v)} + { + } + + /** + * @brief In-place logical and. + * + * We need to define this operator to make the following code work: + * + * MaskType mask; + * int n; + * mask &= indexSequence() >= n; + * + * In the code above, the result of indexSequence() is a batch, + * and the result of indexSequence() >= n is a batch_bool, + * which can not be directly used as a right-hand side operand of batch_bool::operator&=(). + * Defining operator&=(SimdMask const&) allows the right-hand side to be implicitly converted to SimdMask. + * + * @param v + * @return SimdMask& + */ + SimdMask& operator&=(SimdMask const& v) noexcept + { + XSimdType::operator&=(v); + return *this; + } + }; +} diff --git a/include/blast/math/simd/SimdSize.hpp b/include/blast/math/simd/SimdSize.hpp index 4e862f72..01a64825 100644 --- a/include/blast/math/simd/SimdSize.hpp +++ b/include/blast/math/simd/SimdSize.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Mikhail Katliar +// Copyright 2023-2024 Mikhail Katliar // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,24 +14,20 @@ #pragma once +#include + #include +#include namespace blast { - template - struct SimdSize; - - - template - struct SimdSize : SimdSize {}; - - /** - * @brief Number of elements in a SimdVec object. + * @brief Size of a SIMD register conatining scalars of a given type. * * @tparam T scalar type + * @tparam Arch instruction set architecture */ - template - size_t constexpr SimdSize_v = SimdSize::value; -} \ No newline at end of file + template + std::size_t constexpr SimdSize_v = xsimd::batch, Arch>::size; +} diff --git a/include/blast/math/simd/SimdVec.hpp b/include/blast/math/simd/SimdVec.hpp index a3195ff7..140516ef 100644 --- a/include/blast/math/simd/SimdVec.hpp +++ b/include/blast/math/simd/SimdVec.hpp @@ -14,15 +14,377 @@ #pragma once +#include +#include +#include + +#include + namespace blast { + template + class SimdVec; + + /** + * @brief Fused negative multiply-add + * + * Calculate -a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + + /** + * @brief Fused multiply-add + * + * Calculate a * b + c + * + * @param a first multiplier + * @param b second multiplier + * @param c addendum + * + * @return @a a * @a b + @a c element-wise + */ + template + SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + + template + SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept; + + template + SimdVec abs(SimdVec const& a) noexcept; + + /** + * @brief Vertical max (across two vectors) + * + * @param a first vector + * @param b second vector + * + * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] + */ + template + SimdVec max(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Horizontal max (across all elements) + * + * @param a pack of 32-bit floating point numbers + * + * @return max(a[0], a[1], ..., a[N-1]) + */ + template + T max(SimdVec const& x) noexcept; + + template + SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Multiplication + * + * @param a first multiplier + * @param b second multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept; + + /** + * @brief Left multiplication with a scalar + * + * @param a scalar multiplier + * @param b batch multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(T const& a, SimdVec const& b) noexcept; + + /** - * @brief Data-parallel type with the element type bool. - * The width of a given simd_mask instantiation is a constant expression, determined by the template parameter. + * @brief Right multiplication with a scalar + * + * @param a batch multiplier + * @param b scalar multiplier + * + * @return product @a a * @a b + */ + template + SimdVec operator*(SimdVec const& a, T const& b) noexcept; + + + /** + * @brief Data-parallel type with a given element type. * - * @tparam T the element type simd_mask applies on + * @tparam T element type */ - template - class SimdVec; -} \ No newline at end of file + template + class SimdVec + { + public: + using ValueType = T; + using XSimdType = xsimd::batch; + using IntrinsicType = typename XSimdType::register_type; + using MaskType = SimdMask; + + + /** + * @brief Set to [0, 0, 0, ...] + */ + SimdVec() noexcept + : value_ {T {}} + { + } + + + SimdVec(SimdVec const&) noexcept = default; + + + SimdVec(IntrinsicType value) noexcept + : value_ {value} + { + } + + + SimdVec(XSimdType value) noexcept + : value_ {value} + { + } + + + /** + * @brief Set to [value, value, ...] + * + * @param value value for each component of SIMD vector + */ + SimdVec(ValueType value) noexcept + : value_ {value} + { + } + + + /** + * @brief Load from location + * + * @param src memory location to load from + * @param aligned true indicates that an aligned read instruction should be used + */ + explicit SimdVec(ValueType const * src, bool aligned) noexcept + : value_ {aligned ? xsimd::load_aligned(src) : xsimd::load_unaligned(src)} + { + } + + + /** + * @brief Masked load from location + * + * @param src memory location to load from + * @param mask load mask + * @param aligned true if @a src is SIMD-aligned + */ + explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept + : value_ {maskload(src, mask)} + { + } + + + /** + * @brief Number of elements in SIMD pack + */ + static size_t constexpr size() + { + return XSimdType::size; + } + + + /** + * @brief Set to 0 + */ + void reset() noexcept + { + value_ = ValueType {}; + } + + + operator IntrinsicType() const noexcept + { + return value_; + } + + + /** + * @brief Access single element + * + * @param i element index + * + * @return element value + */ + ValueType operator[](size_t i) const noexcept + { + return value_.get(i); + } + + + /** + * @brief Store to memory + * + * @param dst memory location to store to + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, bool aligned) const noexcept + { + if (aligned) + xsimd::store_aligned(dst, value_); + else + xsimd::store_unaligned(dst, value_); + } + + + /** + * @brief Masked store to memory + * + * @param dst memory location to store to + * @param mask store mask + * @param aligned true if @a dst is SIMD-aligned + */ + void store(ValueType * dst, MaskType mask, bool aligned) const noexcept + { + maskstore(value_, dst, mask); + } + + + /** + * @brief In-place multiplication + * + * @param a multiplier + * + * @return @a *this after multiplication with @a a + */ + SimdVec& operator*=(SimdVec const& a) noexcept + { + value_ *= a.value_; + return *this; + } + + + /** + * @brief In-place division + * + * @param a divisor + * + * @return @a *this after division by @a a + */ + SimdVec& operator/=(SimdVec const& a) noexcept + { + value_ /= a.value_; + return *this; + } + + + friend SimdVec fmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec fnmadd<>(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept; + friend SimdVec blend<>(SimdVec const& a, SimdVec const& b, MaskType const& mask) noexcept; + friend SimdVec abs<>(SimdVec const& a) noexcept; + friend SimdVec max<>(SimdVec const& a, SimdVec const& b) noexcept; + friend ValueType max<>(SimdVec const& x) noexcept; + friend MaskType operator><>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(ValueType const& a, SimdVec const& b) noexcept; + friend SimdVec operator*<>(SimdVec const& a, ValueType const& b) noexcept; + + /** + * @brief Maximum element of a batch and its index. + * + * For example, imax([-1., 0., 4., 3.], [1, 2, -1, 42]) == ([4., 4., 4., 4.], [-1, -1, -1, -1]) + * + * @param v batch of values + * @param idx batch of indices + * + * @return a tuple {A, I} such that A == [a, a, ..., a] and I == [idx[i], idx[i], ..., idx[i]], + * where a == max_j(v[j]), i = argmax_j(v[j]) + */ + friend std::tuple> imax(SimdVec const& v, SimdIndex const& idx) noexcept + { + return imax(v.value_, idx); + } + + private: + XSimdType value_; + }; + + + template + inline SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fma(a.value_, b.value_, c.value_); + } + + + template + inline SimdVec fnmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept + { + return xsimd::fnma(a.value_, b.value_, c.value_); + } + + + template + inline SimdVec blend(SimdVec const& a, SimdVec const& b, SimdMask const& mask) noexcept + { + return xsimd::select(mask, a.value_, b.value_); + } + + + template + inline SimdVec abs(SimdVec const& a) noexcept + { + return xsimd::abs(a.value_); + } + + + template + inline SimdMask operator>(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::gt(a.value_, b.value_); + } + + + template + inline SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::mul(a.value_, b.value_); + } + + + template + inline SimdVec operator*(T const& a, SimdVec const& b) noexcept + { + return xsimd::mul(a, b.value_); + } + + + template + inline SimdVec operator*(SimdVec const& a, T const& b) noexcept + { + return xsimd::mul(a.value_, b); + } + + + template + inline SimdVec max(SimdVec const& a, SimdVec const& b) noexcept + { + return xsimd::max(a.value_, b.value_); + } + + + template + inline T max(SimdVec const& x) noexcept + { + return xsimd::reduce_max(x.value_); + } +} diff --git a/include/blast/math/simd/SimdVecBase.hpp b/include/blast/math/simd/SimdVecBase.hpp deleted file mode 100644 index 365258f2..00000000 --- a/include/blast/math/simd/SimdVecBase.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template - class SimdVecBase - { - public: - using ValueType = T; - using IntrinsicType = I; - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return SimdSize_v; - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - protected: - SimdVecBase(IntrinsicType value) - : value_ {value} - { - } - - IntrinsicType value_; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp new file mode 100644 index 00000000..b5aed8c1 --- /dev/null +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -0,0 +1,123 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include + + +namespace blast +{ + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_ps(src, _mm256_castps_si256(mask)); + } + + + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) noexcept + { + return _mm256_maskload_pd(src, _mm256_castpd_si256(mask)); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_ps(dst, _mm256_castps_si256(mask), v); + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) noexcept + { + _mm256_maskstore_pd(dst, _mm256_castpd_si256(mask), v); + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) noexcept + { + /* v2 = [G H E F | C D A B] */ + __m256 const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); + __m256i const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); + + /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ + /* v3 = [W W Z Z | Y Y X X] */ + // __m256 v3 = _mm256_max_ps(v1, v2); + __m256 const mask_v3 = _mm256_cmp_ps(v2, v1, _CMP_GT_OQ); + __m256 const v3 = _mm256_blendv_ps(v1, v2, mask_v3); + __m256i const iv3 = _mm256_blendv_epi8(idx, iv2, _mm256_castps_si256(mask_v3)); + + /* v4 = [Z Z W W | X X Y Y] */ + __m256 const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); + __m256i const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); + + /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ + /* v5 = [J J J J | I I I I] */ + // __m256 v5 = _mm256_max_ps(v3, v4); + __m256 const mask_v5 = _mm256_cmp_ps(v4, v3, _CMP_GT_OQ); + __m256 const v5 = _mm256_blendv_ps(v3, v4, mask_v5); + __m256i const iv5 = _mm256_blendv_epi8(iv3, iv4, _mm256_castps_si256(mask_v5)); + + /* v6 = [I I I I | J J J J] */ + __m256 const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); + __m256i const iv6 = _mm256_castps_si256( + _mm256_permute2f128_ps( + _mm256_castsi256_ps(iv5), + _mm256_castsi256_ps(iv5), + 0b0000'0001 + ) + ); + + /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ + // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); + __m256 const mask_v7 = _mm256_cmp_ps(v6, v5, _CMP_GT_OQ); + __m256 const v7 = _mm256_blendv_ps(v5, v6, mask_v7); + __m256i const iv7 = _mm256_blendv_epi8(iv5, iv6, _mm256_castps_si256(mask_v7)); + + return {v7, iv7}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) noexcept + { + __m256d const y = _mm256_permute2f128_pd(x, x, 1); // permute 128-bit values + __m256i const iy = _mm256_permute2f128_si256(idx, idx, 1); + + // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. + __m256d const mask_m1 = _mm256_cmp_pd(y, x, _CMP_GT_OQ); + __m256d const m1 = _mm256_blendv_pd(x, y, mask_m1); + __m256i const im1 = _mm256_blendv_epi8(idx, iy, _mm256_castpd_si256(mask_m1)); + + __m256d const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. + __m256i const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); + + // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) + __m256d const mask_m = _mm256_cmp_pd(m2, m1, _CMP_GT_OQ); + __m256d const m = _mm256_blendv_pd(m1, m2, mask_m); + __m256i const im = _mm256_blendv_epi8(im1, im2, _mm256_castpd_si256(mask_m)); + + return {m, im}; + } +} diff --git a/include/blast/math/simd/avx256/IntScalarType.hpp b/include/blast/math/simd/avx256/IntScalarType.hpp deleted file mode 100644 index 59785c85..00000000 --- a/include/blast/math/simd/avx256/IntScalarType.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace blast -{ - template <> - struct IntScalarType<4> - { - using type = std::int64_t; - }; - - - template <> - struct IntScalarType<8> - { - using type = std::int32_t; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdMask.hpp b/include/blast/math/simd/avx256/SimdMask.hpp deleted file mode 100644 index 8dbcea64..00000000 --- a/include/blast/math/simd/avx256/SimdMask.hpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - -#include - - -namespace blast -{ - template - class SimdMask - { - static size_t constexpr SS = SimdSize_v; - - public: - using ValueType = IntElementType_t; - using IntrinsicType = __m256i; - using MaskType = __m256i; - - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdMask() noexcept - : value_ {_mm256_setzero_pd()} - { - } - - - SimdMask(IntrinsicType value) noexcept - : value_ {value} - { - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - /** - * @brief Number of elements in SIMD pack - */ - static size_t constexpr size() - { - return SS; - } - - - /** - * @brief Access specified element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - - - private: - IntrinsicType value_; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdSize.hpp b/include/blast/math/simd/avx256/SimdSize.hpp deleted file mode 100644 index 2fa30b2f..00000000 --- a/include/blast/math/simd/avx256/SimdSize.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include - - -namespace blast -{ - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 8; - }; - - - template <> - struct SimdSize - { - static size_t constexpr value = 4; - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecFloat32.hpp b/include/blast/math/simd/avx256/SimdVecFloat32.hpp deleted file mode 100644 index 34709629..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat32.hpp +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_ps()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_ps(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_ps(src) : _mm256_loadu_ps(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_ps(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_ps(dst, value_); - else - _mm256_storeu_ps(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_ps(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castps_si256(_mm256_cmp_ps(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_ps(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_ps(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_ps(a.value_, b.value_, _mm256_castsi256_ps(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_ps(SimdVec {-0.f}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[7], b7), [max(a[6], b6), max(a[5], b5), max(a[4], b4), max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_ps(a, b); - } - - - /** - * @brief Horizontal max (across all elements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @param a pack of 32-bit floating point numbers - * - * @return max(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]) - */ - friend ValueType max(SimdVec const& x) - { - __m256 v1 = x.value_; /* v1 = [H G F E | D C B A] */ - __m256 v2 = _mm256_permute_ps(v1, 0b10'11'00'01); /* v2 = [G H E F | C D A B] */ - __m256 v3 = _mm256_max_ps(v1, v2); /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - __m256 v4 = _mm256_permute_ps(v3, 0b00'00'10'10); /* v4 = [Z Z W W | X X Y Y] */ - __m256 v5 = _mm256_max_ps(v3, v4); /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - __m128 v6 = _mm256_extractf128_ps(v5, 1); /* v6 = [- - - - | J J J J] */ - __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); /* v7 = [- - - - | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - - return v7[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& v1, SimdVec const& idx) - { - /* v2 = [G H E F | C D A B] */ - SimdVec const v2 = _mm256_permute_ps(v1, 0b10'11'00'01); - SimdVec const iv2 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(idx), 0b10'11'00'01)); - - /* v3 = [W=max(G,H) W=max(G,H) Z=max(E,F) Z=max(E,F) | Y=max(C,D) Y=max(C,D) X=max(A,B) X=max(A,B)] */ - /* v3 = [W W Z Z | Y Y X X] */ - // __m256 v3 = _mm256_max_ps(v1, v2); - MaskType const mask_v3 = v2 > v1; - SimdVec const v3 = blend(v1, v2, mask_v3); - SimdVec const iv3 = blend(idx, iv2, mask_v3); - - /* v4 = [Z Z W W | X X Y Y] */ - SimdVec const v4 = _mm256_permute_ps(v3, 0b00'00'10'10); - SimdVec const iv4 = _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(iv3), 0b00'00'10'10)); - - /* v5 = [J=max(Z,W) J=max(Z,W) J=max(Z,W) J=max(Z,W) | I=max(X,Y) I=max(X,Y) I=max(X,Y) I=max(X,Y)] */ - /* v5 = [J J J J | I I I I] */ - // __m256 v5 = _mm256_max_ps(v3, v4); - MaskType const mask_v5 = v4 > v3; - SimdVec const v5 = blend(v3, v4, mask_v5); - SimdVec const iv5 = blend(iv3, iv4, mask_v5); - - /* v6 = [I I I I | J J J J] */ - SimdVec const v6 = _mm256_permute2f128_ps(v5, v5, 0b0000'0001); - SimdVec const iv6 = _mm256_castps_si256( - _mm256_permute2f128_ps( - _mm256_castsi256_ps(iv5), - _mm256_castsi256_ps(iv5), - 0b0000'0001 - ) - ); - - /* v7 = [M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J) | M=max(I,J) M=max(I,J) M=max(I,J) M=max(I,J)] */ - // __m128 v7 = _mm_max_ps(_mm256_castps256_ps128(v5), v6); - MaskType const mask_v7 = v6 > v5; - SimdVec const v7 = blend(v5, v6, mask_v7); - SimdVec const iv7 = blend(iv5, iv6, mask_v7); - - return std::make_tuple(v7, iv7); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecFloat64.hpp b/include/blast/math/simd/avx256/SimdVecFloat64.hpp deleted file mode 100644 index a5277ab6..00000000 --- a/include/blast/math/simd/avx256/SimdVecFloat64.hpp +++ /dev/null @@ -1,234 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Set to [0, 0, 0, ...] - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_pd()} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - /** - * @brief Set to [value, value, ...] - * - * TODO: add "explicit" - * - * @param value value for each component of SIMD vector - */ - SimdVec(ValueType value) noexcept - : SimdVecBase {_mm256_set1_pd(value)} - { - } - - - /** - * @brief Load from location - * - * @param src memory location to load from - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, bool aligned) noexcept - : SimdVecBase {aligned ? _mm256_load_pd(src) : _mm256_loadu_pd(src)} - { - } - - - /** - * @brief Masked load from location - * - * @param src memory location to load from - * @param mask load mask - * @param aligned true if @a src is SIMD-aligned - */ - explicit SimdVec(ValueType const * src, MaskType mask, bool aligned) noexcept - : SimdVecBase {_mm256_maskload_pd(src, mask)} - { - } - - - /** - * @brief Store to memory - * - * @param dst memory location to store to - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, bool aligned) const noexcept - { - if (aligned) - _mm256_store_pd(dst, value_); - else - _mm256_storeu_pd(dst, value_); - } - - - /** - * @brief Masked store to memory - * - * @param dst memory location to store to - * @param mask store mask - * @param aligned true if @a dst is SIMD-aligned - */ - void store(ValueType * dst, MaskType mask, bool aligned) const noexcept - { - _mm256_maskstore_pd(dst, mask, value_); - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_castpd_si256(_mm256_cmp_pd(a.value_, b.value_, _CMP_GT_OQ)); - } - - - /** - * @brief Multiplication - * - * @param a first multiplier - * @param b second multiplier - * - * @return product @a a * @a b - */ - friend SimdVec operator*(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_mul_pd(a.value_, b.value_); - } - - - /** - * @brief Fused multiply-add - * - * Calculate a * b + c - * - * @param a first multiplier - * @param b second multiplier - * @param c addendum - * - * @return @a a * @a b + @a c element-wise - */ - friend SimdVec fmadd(SimdVec const& a, SimdVec const& b, SimdVec const& c) noexcept - { - return _mm256_fmadd_pd(a.value_, b.value_, c.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_pd(a.value_, b.value_, _mm256_castsi256_pd(mask)); - } - - - friend SimdVec abs(SimdVec const& a) noexcept - { - return _mm256_andnot_pd(SimdVec {-0.}, a.value_); - } - - - /** - * @brief Vertical max (across two vectors) - * - * @param a first vector - * @param b second vector - * - * @return [max(a[3], b3), max(a[2], b2), max(a[1], b1), max(a[0], b0)] - */ - friend SimdVec max(SimdVec const& a, SimdVec const& b) - { - return _mm256_max_pd(a, b); - } - - - /** - * @brief Horizontal max (across all alements) - * - * The implementation is based on - * https://stackoverflow.com/questions/9795529/how-to-find-the-horizontal-maximum-in-a-256-bit-avx-vector - * - * @return max(x[0], x[1], x[2], x[3]) - */ - friend ValueType max(SimdVec x) noexcept - { - __m256d y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - __m256d m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - - return m[0]; - } - - - template - requires (SimdSize_v == size()) - friend std::tuple> imax(SimdVec const& x, SimdVec const& idx) - { - SimdVec const y = _mm256_permute2f128_pd(x.value_, x.value_, 1); // permute 128-bit values - SimdVec const iy = _mm256_permute2f128_si256(idx, idx, 1); - - // __m256d m1 = _mm256_max_pd(x.value_, y); // m1[0] = max(x[0], x[2]), m1[1] = max(x[1], x[3]), etc. - MaskType const mask_m1 = y > x; - SimdVec const m1 = blend(x, y, mask_m1); - SimdVec const im1 = blend(idx, iy, mask_m1); - - SimdVec const m2 = _mm256_permute_pd(m1, 5); // set m2[0] = m1[1], m2[1] = m1[0], etc. - SimdVec const im2 = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(im1), 5)); - - // __m256d m = _mm256_max_pd(m1, m2); // all m[0] ... m[3] contain the horizontal max(x[0], x[1], x[2], x[3]) - MaskType const mask_m = m2 > m1; - SimdVec const m = blend(m1, m2, mask_m); - SimdVec const im = blend(im1, im2, mask_m); - - return std::make_tuple(m, im); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecInt32.hpp b/include/blast/math/simd/avx256/SimdVecInt32.hpp deleted file mode 100644 index 8854f50f..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt32.hpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Initialize to (0, 0, 0, 0, 0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a, a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi32(a)} - { - } - - - /** - * @brief Initialize to (n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 7, n + 6, n + 5, n + 4, n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to (a7, a6, a5, a4, a3, a2, a1, a0), - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a7, ValueType a6, ValueType a5, ValueType a4, - ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - SimdVec& operator+=(ValueType x) noexcept - { - value_ = _mm256_add_epi32(value_, _mm256_set1_epi32(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi32(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi32(a.value_, _mm256_set1_epi32(n)); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - alignas(IntrinsicType) ValueType v[size()]; - _mm256_storeu_si256(reinterpret_cast(v), value_); - - return v[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/simd/avx256/SimdVecInt64.hpp b/include/blast/math/simd/avx256/SimdVecInt64.hpp deleted file mode 100644 index 6edf2863..00000000 --- a/include/blast/math/simd/avx256/SimdVecInt64.hpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include - -#include - - -namespace blast -{ - template <> - class SimdVec - : public SimdVecBase - { - public: - using MaskType = __m256i; - - /** - * @brief Initialize to (0, 0, 0, 0) - */ - SimdVec() noexcept - : SimdVecBase {_mm256_setzero_si256()} - { - } - - - /** - * @brief Initialize to (a, a, a, a) - */ - SimdVec(ValueType a) noexcept - : SimdVecBase {_mm256_set1_epi64x(a)} - { - } - - - /** - * @brief Initialize to (n + 3, n + 2, n + 1, n) - * - * @param n - */ - SimdVec(simd::SequenceTag, ValueType n = 0) - : SimdVec {n + 3, n + 2, n + 1, n} - { - } - - - /** - * @brief Initialize to [a3, a2, a1, a0], - * where a0 corresponds to the lower bits. - */ - SimdVec(ValueType a3, ValueType a2, ValueType a1, ValueType a0) noexcept - : SimdVecBase {_mm256_set_epi64x(a3, a2, a1, a0)} - { - } - - - SimdVec(IntrinsicType value) noexcept - : SimdVecBase {value} - { - } - - - operator IntrinsicType() const noexcept - { - return value_; - } - - - SimdVec& operator+=(ValueType x) noexcept - { - value_ = _mm256_add_epi64(value_, _mm256_set1_epi64x(x)); - return *this; - } - - - friend MaskType operator>(SimdVec const& a, SimdVec const& b) noexcept - { - return _mm256_cmpgt_epi64(a.value_, b.value_); - } - - - friend SimdVec blend(SimdVec const& a, SimdVec const& b, MaskType mask) noexcept - { - return _mm256_blendv_epi8(a.value_, b.value_, mask); - } - - - friend SimdVec operator+(SimdVec const& a, ValueType n) noexcept - { - return _mm256_add_epi64(a.value_, _mm256_set1_epi64x(n)); - } - - - /** - * @brief Access single element - * - * @param i element index - * - * @return element value - */ - ValueType operator[](size_t i) const noexcept - { - return value_[i]; - } - }; -} \ No newline at end of file diff --git a/include/blast/math/views/row/Panel.hpp b/include/blast/math/views/row/Panel.hpp index 5daadc0e..55c3f9f4 100644 --- a/include/blast/math/views/row/Panel.hpp +++ b/include/blast/math/views/row/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,9 +16,6 @@ #include #include -#include -#include - namespace blast { @@ -195,4 +191,4 @@ namespace blast BLAST_CONSTRAINT_MUST_NOT_BE_PANEL_SUBMATRIX_TYPE(MT); //********************************************************************************************** }; -} \ No newline at end of file +} diff --git a/include/blast/math/views/submatrix/Panel.hpp b/include/blast/math/views/submatrix/Panel.hpp index c922bba7..50ba6a79 100644 --- a/include/blast/math/views/submatrix/Panel.hpp +++ b/include/blast/math/views/submatrix/Panel.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -25,9 +24,6 @@ #include #include -#include -#include - namespace blast { @@ -283,4 +279,4 @@ namespace blast { return PanelSubmatrix(matrix.operand(), matrix.row() + row, matrix.column() + column, m, n); } -} \ No newline at end of file +} diff --git a/test/blast/CMakeLists.txt b/test/blast/CMakeLists.txt index 5edafda1..fa479125 100644 --- a/test/blast/CMakeLists.txt +++ b/test/blast/CMakeLists.txt @@ -13,7 +13,6 @@ find_package(LAPACK REQUIRED) add_executable(test-blast math/simd/RegisterMatrixTest.cpp math/simd/DynamicRegisterMatrixTest.cpp - math/simd/HsumTest.cpp math/simd/SimdVecTest.cpp math/dense/StaticVectorPointerTest.cpp diff --git a/test/blast/math/panel/StaticPanelMatrixTest.cpp b/test/blast/math/panel/StaticPanelMatrixTest.cpp index 13fd564e..c319a422 100644 --- a/test/blast/math/panel/StaticPanelMatrixTest.cpp +++ b/test/blast/math/panel/StaticPanelMatrixTest.cpp @@ -3,9 +3,12 @@ // license that can be found in the LICENSE file. #include +#include +#include #include +#include #include #include @@ -98,7 +101,7 @@ namespace blast :: testing for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) { - auto const xmm = A.template load(i, j); + auto const xmm = ptr(A, i, j).load(); for (size_t k = 0; k < SS; ++k) ASSERT_EQ(xmm[k], A(i + k, j)) << "element mismatch at i,j,k=" << i << "," << j << "," << k; @@ -113,16 +116,18 @@ namespace blast :: testing size_t constexpr N = 3 * SS + 2; StaticPanelMatrix A; - IntrinsicType_t val; + std::vector vec(SS); for (size_t i = 0; i < SS; ++i) - val[i] = TypeParam(i + 1); + vec[i] = i + 1; + + SimdVec val {vec.data(), false}; for (size_t i = 0; i < M; i += SS) for (size_t j = 0; j < N; ++j) { A = TypeParam(0.); - A.store(i, j, val); + ptr(A, i, j).store(val); for (size_t k = 0; k < SS && i + k < rows(A); ++k) ASSERT_EQ(A(i + k, j), val[k]) << "element mismatch at i,j,k=" << i << "," << j << "," << k; @@ -201,4 +206,4 @@ namespace blast :: testing INSTANTIATE_TYPED_TEST_SUITE_P(double, StaticPanelMatrixTest, double); INSTANTIATE_TYPED_TEST_SUITE_P(float, StaticPanelMatrixTest, float); -} \ No newline at end of file +} diff --git a/test/blast/math/simd/HsumTest.cpp b/test/blast/math/simd/HsumTest.cpp deleted file mode 100644 index 3153e042..00000000 --- a/test/blast/math/simd/HsumTest.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include - -#include -#include - -#include - -using namespace blaze; - - -namespace blast :: testing -{ - TEST(HsumTest, test4x256d) - { - std::array a, b, c, d, r; - randomize(a); - randomize(b); - randomize(c); - randomize(d); - - __m256d mma = _mm256_loadu_pd(a.data()); - __m256d mmb = _mm256_loadu_pd(b.data()); - __m256d mmc = _mm256_loadu_pd(c.data()); - __m256d mmd = _mm256_loadu_pd(d.data()); - - __m256d mmr = hsum(mma, mmb, mmc, mmd); - _mm256_storeu_pd(r.data(), mmr); - - EXPECT_NEAR(r[0], std::accumulate(begin(a), end(a), 0.), 1.e-10); - EXPECT_NEAR(r[1], std::accumulate(begin(b), end(b), 0.), 1.e-10); - EXPECT_NEAR(r[2], std::accumulate(begin(c), end(c), 0.), 1.e-10); - EXPECT_NEAR(r[3], std::accumulate(begin(d), end(d), 0.), 1.e-10); - } -} \ No newline at end of file diff --git a/test/blast/math/simd/SimdVecTest.cpp b/test/blast/math/simd/SimdVecTest.cpp index 90be9d1c..f5b2fa04 100644 --- a/test/blast/math/simd/SimdVecTest.cpp +++ b/test/blast/math/simd/SimdVecTest.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include #include @@ -64,9 +64,10 @@ namespace blast :: testing TYPED_TEST(SimdVecTest, testImax) { using Scalar = TypeParam; - size_t constexpr SS = SimdSize_v; + using SimdType = SimdVec; + size_t constexpr SS = SimdType::size(); - IntVecType_t idx {simd::sequenceTag}; + SimdIndex const idx = indexSequence(); for (int i = 0; i < 100; ++i) { @@ -75,15 +76,15 @@ namespace blast :: testing SimdVec const v {a.data(), false}; SimdVec v_max; - IntVecType_t idx_max; + SimdIndex idx_max; std::tie(v_max, idx_max) = imax(v, idx); auto const max_el = std::max_element(a.begin(), a.end()); - for (size_t i = 0; i < idx_max.size(); ++i) - ASSERT_EQ(idx_max[i], std::distance(a.begin(), max_el)) << "Error at element " << i; + for (size_t i = 0; i < idx_max.size; ++i) + ASSERT_EQ(idx_max.get(i), std::distance(a.begin(), max_el)) << "Error at element " << i; for (size_t i = 0; i < v_max.size(); ++i) ASSERT_EQ(v_max[i], *max_el) << "Error at element " << i; } } -} \ No newline at end of file +} From 8ddc815fa8e080855c83065ac08acb4c9bc53ec0 Mon Sep 17 00:00:00 2001 From: Robin Verschueren Date: Tue, 6 Aug 2024 15:46:49 +0200 Subject: [PATCH 05/12] Add BLASFEO back again --- bench/analysis/dgemm_performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/analysis/dgemm_performance.py b/bench/analysis/dgemm_performance.py index b3c16b40..c74b7d8e 100644 --- a/bench/analysis/dgemm_performance.py +++ b/bench/analysis/dgemm_performance.py @@ -24,7 +24,7 @@ def filter_aggregate(benchmarks, name): plots = [ # {'data_file': 'dgemm-openblas.json', 'label': 'OpenBLAS'}, # {'data_file': 'dgemm-mkl.json', 'label': 'MKL'}, - # {'data_file': 'dgemm-blasfeo.json', 'label': 'BLASFEO'}, + {'data_file': 'dgemm-blasfeo.json', 'label': 'BLASFEO'}, # {'data_file': 'dgemm-blasfeo-blas.json', 'label': 'BLASFEO*'}, # {'data_file': 'dgemm-libxsmm.json', 'label': 'LIBXSMM'}, # {'data_file': 'dgemm-eigen-dynamic.json', 'label': 'Eigen (D)'}, From 64fae3cd0c907ac5aca1c98df5e9e9ecc452e355 Mon Sep 17 00:00:00 2001 From: Robin Verschueren Date: Tue, 13 Aug 2024 09:40:55 +0200 Subject: [PATCH 06/12] Address comments + add benchmark enhancements --- Makefile | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 3240a33c..4f8dc708 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,10 @@ BENCH_BLASFEO = build/bin/bench-blasfeo BENCH_BLAZE = build/bin/bench-blaze BENCH_EIGEN = build/bin/bench-eigen BENCH_BLAST = build/bin/bench-blast +BENCH_BLAST_OUTPUT_DIR = $(shell git rev-parse --short HEAD) +BENCH_BLAST_CPU_CORE = 11 BENCH_LIBXSMM = build/bin/bench-libxsmm -BENCHMARK_OPTIONS = --benchmark_repetitions=10 --benchmark_counters_tabular=true --benchmark_out_format=json +BENCHMARK_OPTIONS = --benchmark_repetitions=30 --benchmark_counters_tabular=true --benchmark_out_format=json --benchmark_enable_random_interleaving=true --benchmark_min_warmup_time=10 --benchmark_min_time=1000000x RUN_MATLAB = matlab -nodisplay -nosplash -nodesktop -r BENCH_DATA = bench_result/data BENCH_IMAGE = bench_result/image @@ -61,29 +63,29 @@ ${BENCH_DATA}/sgemm-blaze-static.json: $(BENCH_BLAZE) $(BENCH_BLAZE) --benchmark_filter="BM_gemm_static*" $(BENCHMARK_OPTIONS) \ --benchmark_out=${BENCH_DATA}/sgemm-blaze-static.json -${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json +${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-panel.json: $(BENCH_BLAST) + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-panel.json -${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json +${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-panel.json: $(BENCH_BLAST) + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_panel" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-panel.json -${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_plain" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json +${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-plain.json: $(BENCH_BLAST) + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_plain" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-plain.json -${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_plain" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json +${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-plain.json: $(BENCH_BLAST) + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_plain" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-plain.json ${BENCH_DATA}/sgemm-blast-static-panel.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/sgemm-blast-static-panel.json + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_static_panel" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/sgemm-blast-static-panel.json ${BENCH_DATA}/sgemm-blast-dynamic-panel.json: $(BENCH_BLAST) - $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_panel" $(BENCHMARK_OPTIONS) \ - --benchmark_out=${BENCH_DATA}/sgemm-blast-dynamic-panel.json + taskset -c ${BENCH_BLAST_CPU_CORE} $(BENCH_BLAST) --benchmark_filter="BM_gemm_dynamic_panel" $(BENCHMARK_OPTIONS) \ + --benchmark_out=${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/sgemm-blast-dynamic-panel.json ${BENCH_DATA}/dgemm-libxsmm.json: $(BENCH_LIBXSMM) $(BENCH_LIBXSMM) --benchmark_filter="BM_gemm_nt" $(BENCHMARK_OPTIONS) \ @@ -94,17 +96,17 @@ ${BENCH_DATA}/sgemm-libxsmm.json: $(BENCH_LIBXSMM) --benchmark_out=${BENCH_DATA}/sgemm-libxsmm.json dgemm-benchmarks: \ - $(shell mkdir -p ${BENCH_DATA}/$(shell git rev-parse --short HEAD)) \ + $(shell mkdir -p ${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}) \ ${BENCH_DATA}/dgemm-openblas.json \ ${BENCH_DATA}/dgemm-mkl.json \ ${BENCH_DATA}/dgemm-libxsmm.json \ ${BENCH_DATA}/dgemm-blasfeo.json \ ${BENCH_DATA}/dgemm-blaze-static.json \ ${BENCH_DATA}/dgemm-eigen-static.json \ - ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-panel.json \ - ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-panel.json \ - ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-static-plain.json \ - ${BENCH_DATA}/$(shell git rev-parse --short HEAD)/dgemm-blast-dynamic-plain.json + ${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-panel.json \ + ${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-panel.json \ + ${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-static-plain.json \ + ${BENCH_DATA}/${BENCH_BLAST_OUTPUT_DIR}/dgemm-blast-dynamic-plain.json # @@ -239,4 +241,4 @@ ${BENCH_IMAGE}/mpc_software.pdf_tex: ${BENCH_IMAGE}/mpc_software.svg /usr/bin/inkscape --without-gui --file=${BENCH_IMAGE}/mpc_software.svg --export-pdf=${BENCH_IMAGE}/mpc_software.pdf --export-latex --export-area-drawing ${BENCH_IMAGE}/mpc_software.pdf: ${BENCH_IMAGE}/mpc_software.svg - /usr/bin/inkscape --without-gui --file=${BENCH_IMAGE}/mpc_software.svg --export-pdf=${BENCH_IMAGE}/mpc_software.pdf --export-area-drawing \ No newline at end of file + /usr/bin/inkscape --without-gui --file=${BENCH_IMAGE}/mpc_software.svg --export-pdf=${BENCH_IMAGE}/mpc_software.pdf --export-area-drawing From 49030d9131bb0330241486405018191c0b41e272 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Mon, 5 Aug 2024 13:06:40 +0200 Subject: [PATCH 07/12] Moved avx2-specific implementation of tile() to blast/math/algorithm/arch/avx2 --- include/blast/math/algorithm/Gemm.hpp | 3 +- include/blast/math/algorithm/Tile.hpp | 141 ++-------------- .../blast/math/algorithm/arch/avx2/Tile.hpp | 150 ++++++++++++++++++ include/blast/system/Inline.hpp | 12 +- 4 files changed, 176 insertions(+), 130 deletions(-) create mode 100644 include/blast/math/algorithm/arch/avx2/Tile.hpp diff --git a/include/blast/math/algorithm/Gemm.hpp b/include/blast/math/algorithm/Gemm.hpp index 211c3acb..36b14939 100644 --- a/include/blast/math/algorithm/Gemm.hpp +++ b/include/blast/math/algorithm/Gemm.hpp @@ -62,6 +62,7 @@ namespace blast BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); tile)>( + xsimd::default_arch {}, D.cachePreferredTraversal, M, N, [&] (auto& ker, size_t i, size_t j) @@ -74,4 +75,4 @@ namespace blast } ); } -} \ No newline at end of file +} diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 41d7aadb..280267fa 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -12,45 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include +#include + +#if XSIMD_WITH_AVX2 +# include +#endif + #include -#include +#include #include namespace blast { - template - BLAZE_ALWAYS_INLINE void tile_backend(size_t m, size_t n, size_t i, FF&& f_full, FP&& f_partial) - { - RegisterMatrix ker; - - if (i + KM <= m) - { - size_t j = 0; - - for (; j + KN <= n; j += KN) - f_full(ker, i, j); - - if (j < n) - f_partial(ker, i, j, KM, n - j); - } - else - { - size_t j = 0; - - for (; j + KN <= n; j += KN) - f_partial(ker, i, j, m - i, KN); - - if (j < n) - f_partial(ker, i, j, m - i, n - j); - } - } - - /** * @brief Cover a matrix with tiles of different sizes in a performance-efficient way. * @@ -69,106 +44,18 @@ namespace blast * * @tparam ET type of matrix elements * @tparam SO matrix storage order - * @tparam F functor type + * @tparam FF functor type for full tiles + * @tparam FP functor type for partial tiles + * @tparam Arch instruction set architecture * * @param m number of matrix rows * @param n number of matrix columns * @param f_full functor to call on full tiles * @param f_partial functor to call on partial tiles */ - template - BLAST_ALWAYS_INLINE void tile(StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + template + BLAST_ALWAYS_INLINE void tile(Arch arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) { - size_t constexpr SS = SimdSize_v; - size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be ppoperly determined - - static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); - - if (traversal_order == columnMajor) - { - size_t j = 0; - - // Main part - for (; j + TILE_STEP <= n; j += TILE_STEP) - { - size_t i = 0; - - // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: - // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. - for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) - { - RegisterMatrix ker; - f_full(ker, i, j); - } - - for (; i + 2 * SS <= m; i += 2 * SS) - { - RegisterMatrix ker; - f_full(ker, i, j); - } - - for (; i + 1 * SS <= m; i += 1 * SS) - { - RegisterMatrix ker; - f_full(ker, i, j); - } - - // Bottom side - if (i < m) - { - RegisterMatrix ker; - f_partial(ker, i, j, m - i, ker.columns()); - } - } - - - // Right side - if (j < n) - { - size_t i = 0; - - // i + 4 * TILE_STEP != M is to improve performance in case when the remaining number of rows is 4 * TILE_STEP: - // it is more efficient to apply 2 * TILE_STEP kernel 2 times than 3 * TILE_STEP + 1 * TILE_STEP kernel. - for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) - { - RegisterMatrix ker; - f_partial(ker, i, j, ker.rows(), n - j); - } - - for (; i + 2 * SS <= m; i += 2 * SS) - { - RegisterMatrix ker; - f_partial(ker, i, j, ker.rows(), n - j); - } - - for (; i + 1 * SS <= m; i += 1 * SS) - { - RegisterMatrix ker; - f_partial(ker, i, j, ker.rows(), n - j); - } - - // Bottom-right corner - if (i < m) - { - RegisterMatrix ker; - f_partial(ker, i, j, m - i, n - j); - } - } - } - else - { - size_t i = 0; - - // i + 4 * SS != M is to improve performance in case when the remaining number of rows is 4 * SS: - // it is more efficient to apply 2 * SS kernel 2 times than 3 * SS + 1 * SS kernel. - for (; i + 2 * SS < m && i + 4 * SS != m; i += 3 * SS) - tile_backend(m, n, i, f_full, f_partial); - - for (; i + 1 * SS < m; i += 2 * SS) - tile_backend(m, n, i, f_full, f_partial); - - for (; i + 0 * SS < m; i += 1 * SS) - tile_backend(m, n, i, f_full, f_partial); - } + detail::tile(arch, traversal_order, m, n, f_full, f_partial); } -} \ No newline at end of file +} diff --git a/include/blast/math/algorithm/arch/avx2/Tile.hpp b/include/blast/math/algorithm/arch/avx2/Tile.hpp new file mode 100644 index 00000000..01974521 --- /dev/null +++ b/include/blast/math/algorithm/arch/avx2/Tile.hpp @@ -0,0 +1,150 @@ +// Copyright 2023 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include + +#include + + +namespace blast :: detail +{ + template + BLAST_ALWAYS_INLINE void tile_backend(xsimd::avx2, size_t m, size_t n, size_t i, FF&& f_full, FP&& f_partial) + { + RegisterMatrix ker; + + if (i + KM <= m) + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_full(ker, i, j); + + if (j < n) + f_partial(ker, i, j, KM, n - j); + } + else + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_partial(ker, i, j, m - i, KN); + + if (j < n) + f_partial(ker, i, j, m - i, n - j); + } + } + + + template + BLAST_ALWAYS_INLINE void tile(xsimd::avx2 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + { + size_t constexpr SS = SimdSize_v; + size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be ppoperly determined + + static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); + + if (traversal_order == columnMajor) + { + size_t j = 0; + + // Main part + for (; j + TILE_STEP <= n; j += TILE_STEP) + { + size_t i = 0; + + // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: + // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. + for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + // Bottom side + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, ker.columns()); + } + } + + + // Right side + if (j < n) + { + size_t i = 0; + + // i + 4 * TILE_STEP != M is to improve performance in case when the remaining number of rows is 4 * TILE_STEP: + // it is more efficient to apply 2 * TILE_STEP kernel 2 times than 3 * TILE_STEP + 1 * TILE_STEP kernel. + for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + // Bottom-right corner + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, n - j); + } + } + } + else + { + size_t i = 0; + + // i + 4 * SS != M is to improve performance in case when the remaining number of rows is 4 * SS: + // it is more efficient to apply 2 * SS kernel 2 times than 3 * SS + 1 * SS kernel. + for (; i + 2 * SS < m && i + 4 * SS != m; i += 3 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 1 * SS < m; i += 2 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 0 * SS < m; i += 1 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + } + } +} diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp index 8ec59251..ff88fb88 100644 --- a/include/blast/system/Inline.hpp +++ b/include/blast/system/Inline.hpp @@ -14,6 +14,14 @@ #pragma once -#include +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +# define BLAST_STRONG_INLINE __forceinline +#else +# define BLAST_STRONG_INLINE inline +#endif -#define BLAST_ALWAYS_INLINE BLAZE_ALWAYS_INLINE +#if defined(__GNUC__) +# define BLAST_ALWAYS_INLINE __attribute__((always_inline)) inline +#else +# define BLAST_ALWAYS_INLINE BLAST_STRONG_INLINE +#endif From da170a12342b3973c1e61e2133475ddddb973a9b Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Tue, 13 Aug 2024 12:23:26 +0200 Subject: [PATCH 08/12] gemm() implementation independent of matrix types --- bench/blast/math/dense/DynamicGemm.cpp | 2 +- bench/blast/math/dense/StaticGemm.cpp | 2 +- bench/blast/math/panel/DynamicGemm.cpp | 2 +- bench/blast/math/panel/StaticGemm.cpp | 2 +- include/blast/math/StaticPanelMatrix.hpp | 1 + include/blast/math/algorithm/Gemm.hpp | 94 ++++++++++++++----- include/blast/math/algorithm/Tile.hpp | 19 +--- .../blast/math/algorithm/arch/avx2/Tile.hpp | 16 +--- include/blast/math/dense/Gemm.hpp | 79 ---------------- include/blast/math/dense/Getrf.hpp | 2 +- include/blast/math/panel/Gemm.hpp | 80 ---------------- include/blast/math/panel/Potrf.hpp | 4 +- .../blast/math/panel/StaticPanelMatrix.hpp | 3 +- .../math/views/submatrix/BaseTemplate.hpp | 4 +- include/blast/system/Inline.hpp | 16 +--- test/blast/math/dense/GemmTest.cpp | 2 +- test/blast/math/dense/TrmmTest.cpp | 2 +- .../math/expressions/PMatTransExprTest.cpp | 4 +- test/blast/math/panel/GemmTest.cpp | 2 +- test/blast/math/panel/PotrfTest.cpp | 2 +- 20 files changed, 96 insertions(+), 242 deletions(-) delete mode 100644 include/blast/math/dense/Gemm.hpp delete mode 100644 include/blast/math/panel/Gemm.hpp diff --git a/bench/blast/math/dense/DynamicGemm.cpp b/bench/blast/math/dense/DynamicGemm.cpp index 9d702760..10d8fb32 100644 --- a/bench/blast/math/dense/DynamicGemm.cpp +++ b/bench/blast/math/dense/DynamicGemm.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include #include #include diff --git a/bench/blast/math/dense/StaticGemm.cpp b/bench/blast/math/dense/StaticGemm.cpp index 960c8ae9..2565d36a 100644 --- a/bench/blast/math/dense/StaticGemm.cpp +++ b/bench/blast/math/dense/StaticGemm.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include +#include #include #include diff --git a/bench/blast/math/panel/DynamicGemm.cpp b/bench/blast/math/panel/DynamicGemm.cpp index 96ac4d62..3b8866d0 100644 --- a/bench/blast/math/panel/DynamicGemm.cpp +++ b/bench/blast/math/panel/DynamicGemm.cpp @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include -#include +#include #include diff --git a/bench/blast/math/panel/StaticGemm.cpp b/bench/blast/math/panel/StaticGemm.cpp index c515c35e..0c201346 100644 --- a/bench/blast/math/panel/StaticGemm.cpp +++ b/bench/blast/math/panel/StaticGemm.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/include/blast/math/StaticPanelMatrix.hpp b/include/blast/math/StaticPanelMatrix.hpp index b191d789..cfbcde3f 100644 --- a/include/blast/math/StaticPanelMatrix.hpp +++ b/include/blast/math/StaticPanelMatrix.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include diff --git a/include/blast/math/algorithm/Gemm.hpp b/include/blast/math/algorithm/Gemm.hpp index 36b14939..ae80d0d1 100644 --- a/include/blast/math/algorithm/Gemm.hpp +++ b/include/blast/math/algorithm/Gemm.hpp @@ -4,15 +4,11 @@ #pragma once -#include -#include +#include #include #include - -#include - -#include -#include +#include +#include namespace blast @@ -26,12 +22,12 @@ namespace blast * alpha and beta are scalars, and A, B and C are matrices, with A * an m by k matrix, B a k by n matrix and C an m by n matrix. * - * @tparam ST1 - * @tparam MPA - * @tparam MPB - * @tparam ST2 - * @tparam MPC - * @tparam MPD + * @tparam ST1 scalar type for @a alpha + * @tparam MPA matrix pointer type for @a A + * @tparam MPB matrix pointer type for @a B + * @tparam ST2 scalar type for @a beta + * @tparam MPC matrix pointer type for @a C + * @tparam MPD matrix pointer type for @a D * * @param M the number of rows of the matrices A, C, and D. * @param N the number of columns of the matrices B and C. @@ -44,23 +40,13 @@ namespace blast * @param D the output matrix D */ template < - typename ST1, typename MPA, typename MPB, - typename ST2, typename MPC, typename MPD + typename ST1, MatrixPointer MPA, MatrixPointer MPB, + typename ST2, MatrixPointer MPC, MatrixPointer MPD > - requires ( - MatrixPointer && StorageOrder_v == columnMajor && - MatrixPointer && - MatrixPointer && StorageOrder_v == columnMajor && - MatrixPointer && StorageOrder_v == columnMajor - ) - BLAST_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) + inline void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) { using ET = std::remove_cv_t>; - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); - BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); - tile)>( xsimd::default_arch {}, D.cachePreferredTraversal, @@ -75,4 +61,60 @@ namespace blast } ); } + + + /** + * @brief Matrix-matrix multiplication for @a DenseMatrix arguments + * + * D := alpha*A*B + beta*C + * + * alpha and beta are scalars, and A, B and C are matrices, with A + * an m by k matrix, B a k by n matrix and C an m by n matrix. + * + * @param alpha the scalar alpha + * @param A the matrix A + * @param B the matrix B + * @param beta the scalar beta + * @param C the matrix C + * @param D the output matrix D + */ + template + inline void gemm(ST1 alpha, MT1 const& A, MT2 const& B, ST2 beta, MT3 const& C, MT4& D) + { + size_t const M = rows(A); + size_t const N = columns(B); + size_t const K = columns(A); + + if (rows(B) != K) + BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); + + if (rows(C) != M || columns(C) != N) + BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); + + if (rows(D) != M || columns(D) != N) + BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); + + gemm(M, N, K, alpha, ptr(*A), ptr(*B), beta, ptr(*C), ptr(*D)); + } + + + /** + * @brief Matrix-matrix multiplication for @a DenseMatrix arguments + * + * D := A*B + C + * + * A, B and C are matrices, with A + * an m by k matrix, B a k by n matrix and C an m by n matrix. + * + * @param A the matrix A + * @param B the matrix B + * @param C the matrix C + * @param D the output matrix D + */ + template + inline void gemm(MT1 const& A, MT2 const& B, MT3 const& C, MT4& D) + { + using ET = ElementType_t; + gemm(ET(1.), A, B, ET(1.), C, D); + } } diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 280267fa..7f6ecc63 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -1,16 +1,6 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #include @@ -19,7 +9,6 @@ #endif #include -#include #include @@ -54,7 +43,7 @@ namespace blast * @param f_partial functor to call on partial tiles */ template - BLAST_ALWAYS_INLINE void tile(Arch arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + inline void tile(Arch arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) { detail::tile(arch, traversal_order, m, n, f_full, f_partial); } diff --git a/include/blast/math/algorithm/arch/avx2/Tile.hpp b/include/blast/math/algorithm/arch/avx2/Tile.hpp index 01974521..aac90b8a 100644 --- a/include/blast/math/algorithm/arch/avx2/Tile.hpp +++ b/include/blast/math/algorithm/arch/avx2/Tile.hpp @@ -1,16 +1,6 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #include #include diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp deleted file mode 100644 index 7c6cc40a..00000000 --- a/include/blast/math/dense/Gemm.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include -#include - - -namespace blast -{ - /** - * @brief Matrix-matrix multiplication for @a DenseMatrix arguments - * - * D := alpha*A*B + beta*C - * - * alpha and beta are scalars, and A, B and C are matrices, with A - * an m by k matrix, B a k by n matrix and C an m by n matrix. - * - * @param alpha the scalar alpha - * @param A the matrix A - * @param B the matrix B - * @param beta the scalar beta - * @param C the matrix C - * @param D the output matrix D - */ - template < - typename ST1, typename MT1, typename MT2, bool SO2, - typename ST2, typename MT3, typename MT4 - > - inline void gemm( - ST1 alpha, - DenseMatrix const& A, DenseMatrix const& B, - ST2 beta, DenseMatrix const& C, DenseMatrix& D) - { - size_t const M = rows(A); - size_t const N = columns(B); - size_t const K = columns(A); - - if (rows(B) != K) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - if (rows(C) != M || columns(C) != N) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - if (rows(D) != M || columns(D) != N) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - gemm(M, N, K, alpha, ptr(*A), ptr(*B), beta, ptr(*C), ptr(*D)); - } - - - /** - * @brief Matrix-matrix multiplication for @a DenseMatrix arguments - * - * D := A*B + C - * - * A, B and C are matrices, with A - * an m by k matrix, B a k by n matrix and C an m by n matrix. - * - * @param A the matrix A - * @param B the matrix B - * @param C the matrix C - * @param D the output matrix D - */ - template < - typename MT1, typename MT2, bool SO2, - typename MT3, typename MT4 - > - inline void gemm( - DenseMatrix const& A, DenseMatrix const& B, - DenseMatrix const& C, DenseMatrix& D) - { - using ET = ElementType_t; - gemm(ET(1.), A, B, ET(1.), C, D); - } -} diff --git a/include/blast/math/dense/Getrf.hpp b/include/blast/math/dense/Getrf.hpp index 543a3ae5..7b14d59a 100644 --- a/include/blast/math/dense/Getrf.hpp +++ b/include/blast/math/dense/Getrf.hpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/blast/math/panel/Gemm.hpp b/include/blast/math/panel/Gemm.hpp deleted file mode 100644 index 531c2801..00000000 --- a/include/blast/math/panel/Gemm.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019-2020 Mikhail Katliar All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#pragma once - -#include -#include -#include -#include - - -namespace blast -{ - /** - * @brief Matrix-matrix multiplication for @a PanelMatrix arguments - * - * D := alpha*A*B + beta*C - * - * alpha and beta are scalars, and A, B and C are matrices, with A - * an m by k matrix, B a k by n matrix and C an m by n matrix. - * - * @param alpha the scalar alpha - * @param A the matrix A - * @param B the matrix B - * @param beta the scalar beta - * @param C the matrix C - * @param D the output matrix D - */ - template < - typename ST1, typename MT1, typename MT2, bool SO2, - typename ST2, typename MT3, typename MT4 - > - inline void gemm( - ST1 alpha, - PanelMatrix const& A, PanelMatrix const& B, - ST2 beta, PanelMatrix const& C, PanelMatrix& D) - { - size_t const M = rows(A); - size_t const N = columns(B); - size_t const K = columns(A); - - if (rows(B) != K) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - if (rows(C) != M || columns(C) != N) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - if (rows(D) != M || columns(D) != N) - BLAST_THROW_EXCEPTION(std::invalid_argument {"Matrix sizes do not match"}); - - gemm(M, N, K, alpha, ptr(*A), ptr(*B), beta, ptr(*C), ptr(*D)); - } - - - /** - * @brief Matrix-matrix multiplication for @a PanelMatrix arguments - * - * D := A*B + C - * - * A, B and C are matrices, with A - * an m by k matrix, B a k by n matrix and C an m by n matrix. - * - * @param A the matrix A - * @param B the matrix B - * @param C the matrix C - * @param D the output matrix D - */ - template < - typename MT1, typename MT2, bool SO2, - typename MT3, typename MT4 - > - inline void gemm( - PanelMatrix const& A, PanelMatrix const& B, - PanelMatrix const& C, PanelMatrix& D) - { - using ET = ElementType_t; - gemm(ET(1.), A, B, ET(1.), C, D); - } -} diff --git a/include/blast/math/panel/Potrf.hpp b/include/blast/math/panel/Potrf.hpp index 879c46f0..e1763809 100644 --- a/include/blast/math/panel/Potrf.hpp +++ b/include/blast/math/panel/Potrf.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include @@ -97,4 +97,4 @@ namespace blast potrf_backend<1 * PANEL_SIZE, KN>(k, i, *A, *L); } } -} \ No newline at end of file +} diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index 82358689..0fa0a7b3 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -5,8 +5,8 @@ #pragma once #include -#include #include +#include #include #include #include @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/include/blast/math/views/submatrix/BaseTemplate.hpp b/include/blast/math/views/submatrix/BaseTemplate.hpp index ef3cc409..a19b7cfa 100644 --- a/include/blast/math/views/submatrix/BaseTemplate.hpp +++ b/include/blast/math/views/submatrix/BaseTemplate.hpp @@ -4,6 +4,8 @@ #pragma once +#include + namespace blast { @@ -13,4 +15,4 @@ namespace blast class PanelSubmatrix { }; -} \ No newline at end of file +} diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp index ff88fb88..68897585 100644 --- a/include/blast/system/Inline.hpp +++ b/include/blast/system/Inline.hpp @@ -1,16 +1,6 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. #pragma once diff --git a/test/blast/math/dense/GemmTest.cpp b/test/blast/math/dense/GemmTest.cpp index 82a8b318..d56daaf5 100644 --- a/test/blast/math/dense/GemmTest.cpp +++ b/test/blast/math/dense/GemmTest.cpp @@ -4,7 +4,7 @@ #define BLAST_USER_ASSERTION 1 -#include +#include #include #include diff --git a/test/blast/math/dense/TrmmTest.cpp b/test/blast/math/dense/TrmmTest.cpp index a61e3c4c..04005df3 100644 --- a/test/blast/math/dense/TrmmTest.cpp +++ b/test/blast/math/dense/TrmmTest.cpp @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include -#include +#include #include #include diff --git a/test/blast/math/expressions/PMatTransExprTest.cpp b/test/blast/math/expressions/PMatTransExprTest.cpp index d268c250..bd850f35 100644 --- a/test/blast/math/expressions/PMatTransExprTest.cpp +++ b/test/blast/math/expressions/PMatTransExprTest.cpp @@ -4,12 +4,10 @@ #include #include -#include -#include #include #include -#include + namespace blast :: testing { diff --git a/test/blast/math/panel/GemmTest.cpp b/test/blast/math/panel/GemmTest.cpp index 2accdfcb..71e88bd1 100644 --- a/test/blast/math/panel/GemmTest.cpp +++ b/test/blast/math/panel/GemmTest.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include diff --git a/test/blast/math/panel/PotrfTest.cpp b/test/blast/math/panel/PotrfTest.cpp index d0c267cd..46502097 100644 --- a/test/blast/math/panel/PotrfTest.cpp +++ b/test/blast/math/panel/PotrfTest.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include From c697c69dbad23448a39bc4798cc46e7f95cba661 Mon Sep 17 00:00:00 2001 From: ugol-1 Date: Mon, 2 Sep 2024 12:53:15 +0200 Subject: [PATCH 09/12] Remove platform-specific compiler options from the main CMakeLists.txt --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b0ad928..a57ed077 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,9 +41,6 @@ target_link_libraries(blast target_compile_options(blast INTERFACE "-Wno-ignored-attributes" "-fno-math-errno" "-ftemplate-backtrace-limit=0" - # Enable SIMD instruction sets, otherwise it does not compile. - # This will change when we support multiple architectures. - INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4" ) # BLAST_WITH_BLASFEO From c0c2aa3e76332a332890d648928b86b85bf74e37 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 6 Sep 2024 08:36:08 +0000 Subject: [PATCH 10/12] Declare dependency on Boost in package.xml --- package.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package.xml b/package.xml index 952ada91..b67347d1 100644 --- a/package.xml +++ b/package.xml @@ -7,6 +7,8 @@ Mikhail Katliar BSD 3-Clause License + boost + cmake From 82f085da520da81509f2e5f5528d8eebe9b95373 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Fri, 25 Apr 2025 12:36:04 +0000 Subject: [PATCH 11/12] Don't apply -ftemplate-backtrace-limit to C files --- CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a57ed077..dddf56c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,12 @@ target_link_libraries(blast ) target_compile_options(blast - INTERFACE "-Wno-ignored-attributes" "-fno-math-errno" "-ftemplate-backtrace-limit=0" + INTERFACE "-Wno-ignored-attributes" "-fno-math-errno" +) + +target_compile_options(blast + INTERFACE + $<$:-ftemplate-backtrace-limit=0> ) # BLAST_WITH_BLASFEO From 9b67433f619752ee1153575ca9f7d71f16672f90 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Wed, 7 May 2025 10:37:54 +0000 Subject: [PATCH 12/12] Got rid of obsolete forcePrint() --- include/test/Testing.hpp | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/include/test/Testing.hpp b/include/test/Testing.hpp index 2802ab79..687c923c 100644 --- a/include/test/Testing.hpp +++ b/include/test/Testing.hpp @@ -21,32 +21,6 @@ namespace blast :: testing using namespace ::testing; - namespace detail - { - template - class ForcePrintImpl - : public T - { - friend void PrintTo(const ForcePrintImpl &m, ::std::ostream *o) - { - *o << "\n" << m; - } - }; - } - - - /* - * Makes the Eigen3 matrix classes printable from GTest checking macros like EXPECT_EQ. - * Usage: EXPECT_EQ(forcePrint(a), forcePrint(b)) - * Taken from this post: http://stackoverflow.com/questions/25146997/teach-google-test-how-to-print-eigen-matrix - */ - template - decltype(auto) forcePrint(T const& val) - { - return static_cast const&>(val); - } - - MATCHER_P(FloatNearPointwise, tol, "Out of range") { return (std::get<0>(arg) > std::get<1>(arg) - tol && std::get<0>(arg) < std::get<1>(arg) + tol) ; @@ -202,7 +176,7 @@ namespace blast :: testing ASSERT_TRUE(::blast::testing::approxEqual(val, expected, abs_tol, rel_tol)) #define BLAST_EXPECT_EQ(val, expected) \ - EXPECT_EQ(::blast::testing::forcePrint(val), ::blast::testing::forcePrint(expected)) + EXPECT_EQ(val, expected) #define BLAST_ASSERT_EQ(val, expected) \ - ASSERT_EQ(::blast::testing::forcePrint(val), ::blast::testing::forcePrint(expected)) + ASSERT_EQ(val, expected)