From ff514cb0f8ac3b2a39c37f1143c03d058c1b97f1 Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 1/9] Started drafting mxm using 2.5D algo --- include/alp/omp/blas3.hpp | 268 ++++++++++++++++++++++++++++++++++++++ include/alp/omp/io.hpp | 2 + 2 files changed, 270 insertions(+) create mode 100644 include/alp/omp/blas3.hpp diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp new file mode 100644 index 000000000..cdd24324b --- /dev/null +++ b/include/alp/omp/blas3.hpp @@ -0,0 +1,268 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_BLAS3 +#define _H_ALP_OMP_BLAS3 + +#include // for std::min/max +#include // for std::enable_if + +#include +#include +#include + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include + #include +#endif + +#include "matrix.hpp" +#include "storage.hpp" + + +namespace alp { + + namespace internal { + + /** + * \internal general mxm implementation that all mxm variants using + * structured matrices refer to. + */ + template< + bool allow_void, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + class Operator, class Monoid, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2 + > + RC mxm_generic( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const typename std::enable_if< !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && ! + alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && + alp::is_monoid< Monoid >::value, + void >::type * const = NULL + ) { + + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ), + "alp::internal::mxm_generic: the operator-monoid version of mxm cannot be " + "used if either of the input matrices is a pattern matrix (of type " + "void)" + ); + +#ifdef _DEBUG + std::cout << "In alp::internal::mxm_generic (omp)\n"; +#endif + + // Early exit checks + if( ! internal::getInitialized( A ) || + ! internal::getInitialized( B ) || + ! 
internal::getInitialized( C ) + ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; + const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; + + if( m != m_a || k != k_b || n != n_b ) { + return MISMATCH; + } + + const Distribution &da = internal::getAmf( A ).getDistribution(); + const Distribution &db = internal::getAmf( B ).getDistribution(); + const Distribution &dc = internal::getAmf( C ).getDistribution(); + + RC rc = SUCCESS; + + #pragma omp parallel for + for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + + const auto th_ijk_a = da.getThreadCoords( thread ); + const auto th_ijk_b = da.getThreadCoords( thread ); + const auto th_ijk_c = da.getThreadCoords( thread ); + + const auto block_grid_dims_a = d.getLocalBlockGridDims( t_coords_a ); + const auto block_grid_dims_b = d.getLocalBlockGridDims( t_coords_b ); + const auto block_grid_dims_c = d.getLocalBlockGridDims( t_coords_c ); + + RC local_rc = SUCCESS; + + // Broadcast Aij and Bij to all c layers + if( t_coords.rt > 0 ) { + + typename Distribution::ThreadCoord th_ij0_a( t_coords_a.tr, t_coords_a.tc, 0 ); + + for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { + + auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); + auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); + + local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); + + + } + } + + if( local_rc != SUCCESS ) { + + typename Distribution::ThreadCoord th_ij0_b( t_coords_b.tr, t_coords_b.tc, 0 ); + + for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { + + auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); + auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); + } + } + } + + } + // End Broadcast of Aij and Bij + #pragma omp barrier + + + // for( size_t br = 0; br < block_grid_dims.first; ++br ) { + // for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + + // // Get a sequential matrix view over the block + // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + + // // Construct a sequential Scalar container from the input Scalar + // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + + // // Delegate the call to the sequential set implementation + // local_rc = local_rc ? local_rc : set( refC, ref_val ); + + // if( local_rc != SUCCESS ) { + // rc = local_rc; + // } + // } + // } + } + + internal::setInitialized( C, true ); + return rc; + + } + + } // namespace internal + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with semiring parameter. + * + * @tparam descr The descriptors under which to perform the computation. + * @tparam OutputStructMatT The structured matrix type of the output matrix. + * @tparam InputStructMatT1 The structured matrix type of the the left-hand side input + * matrix. + * @tparam InputStructMatT2 The structured matrix type of the right-hand side input + * matrix. 
+ * @tparam Semiring The semiring under which to perform the + * multiplication. + * + * @returns SUCCESS If the computation completed as intended. + * @returns MISMATCH Whenever the structures or dimensions of \a A, \a B, and \a C do + * not match. All input data containers are left + * untouched if this exit code is returned; it will be + * as though this call was never made. + * + * @param[out] C The output matrix \f$ C = AB \f$ when the function returns + * #SUCCESS. + * @param[in] A The left-hand side input matrix \f$ A \f$. + * @param[in] B The left-hand side input matrix \f$ B \f$. + * @param[in] ring (Optional.) The semiring under which the computation should + * proceed. + * @param phase The execution phase. + */ + template< + typename OutputStructMatT, + typename InputStructMatT1, + typename InputStructMatT2, + class Semiring + > + RC mxm( OutputStructMatT & C, + const InputStructMatT1 & A, + const InputStructMatT2 & B, + const Semiring & ring = Semiring(), + const PHASE &phase = NUMERICAL, + const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value, + void >::type * const = NULL ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + } + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with additive monoid and multiplicative operator + */ + template< typename OutputStructMatT, + typename InputStructMatT1, + typename InputStructMatT2, + class Operator, class Monoid + > + RC mxm( OutputStructMatT & C, + const InputStructMatT1 & A, + const InputStructMatT2 & B, + const Operator & mulOp, + const Monoid & addM, + const PHASE &phase = NUMERICAL, + const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void >::type * const = NULL ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + } + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_BLAS3'' + diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index 0f5793ac6..bc5b6aa97 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -34,6 +34,8 @@ #include #endif +#include "storage.hpp" + #define NO_CAST_ASSERT( x, y, z ) \ static_assert( x, \ "\n\n" \ From f9e7773d19152c90547d98c9b3492ed15d071905 Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 2/9] WIP implementation of shifting + compute --- include/alp/omp/blas3.hpp | 75 +++++++++++++++++++++++-------------- include/alp/omp/config.hpp | 6 ++- include/alp/omp/storage.hpp | 9 ++++- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index cdd24324b..833f19a70 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -112,29 +112,35 @@ namespace alp { return MISMATCH; } - const Distribution &da = internal::getAmf( A ).getDistribution(); - const Distribution &db = internal::getAmf( B ).getDistribution(); - const Distribution &dc = internal::getAmf( C ).getDistribution(); + const Distribution_2_5D &da = internal::getAmf( A ).getDistribution(); + const Distribution_2_5D &db = internal::getAmf( B ).getDistribution(); + const Distribution_2_5D &dc = internal::getAmf( C ).getDistribution(); + + const auto tg_a = da.getThreadGridDims(); + const auto tg_b = db.getThreadGridDims(); + const auto tg_c = dc.getThreadGridDims(); + + using th_coord_t = typename Distribution_2_5D::ThreadCoords; RC rc = SUCCESS; #pragma omp parallel for for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - const auto th_ijk_a = da.getThreadCoords( thread ); - const auto th_ijk_b = da.getThreadCoords( thread ); - const auto th_ijk_c = da.getThreadCoords( thread ); + const th_coord_t th_ijk_a = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = da.getThreadCoords( thread ); + const th_coord_t th_ijk_c = da.getThreadCoords( thread ); - const auto block_grid_dims_a = d.getLocalBlockGridDims( t_coords_a ); - const auto block_grid_dims_b = d.getLocalBlockGridDims( t_coords_b ); - const auto block_grid_dims_c = d.getLocalBlockGridDims( t_coords_c ); + const auto block_grid_dims_a = d.getLocalBlockGridDims( th_ijk_a ); + const auto block_grid_dims_b = d.getLocalBlockGridDims( th_ijk_b ); + const auto block_grid_dims_c = d.getLocalBlockGridDims( th_ijk_c ); RC local_rc = SUCCESS; // Broadcast Aij and Bij to all c layers - if( t_coords.rt > 0 ) { + if( th_ijk_a.rt > 0 ) { - typename Distribution::ThreadCoord th_ij0_a( t_coords_a.tr, t_coords_a.tc, 0 ); + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { @@ -150,7 +156,7 @@ namespace alp { if( local_rc != SUCCESS ) { - typename Distribution::ThreadCoord th_ij0_b( t_coords_b.tr, t_coords_b.tc, 0 ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { @@ -166,25 +172,38 @@ namespace alp { } // End Broadcast of Aij and Bij #pragma omp barrier - - - // for( size_t br = 0; br < block_grid_dims.first; ++br ) { - // for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - - // // Get a sequential matrix view over the block - // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); - - // // Construct a sequential Scalar container from the input Scalar - // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); - - // // Delegate the call to the sequential set implementation - // local_rc = local_rc ? local_rc : set( refC, ref_val ); - - // if( local_rc != SUCCESS ) { - // rc = local_rc; + + auto mod = [](const size_t _k, const size_t _n) = { + return ( ( _k %= _n ) < 0 ) ? 
_k + _n : _k; + }; + + // TODO from here + // size_t s_a = mod( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); + // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + + // size_t s_b = mod( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); + // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + + // for( size_t bk = 0; bk < block_grid_dims_a.second; ++bk ) { + // for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + // for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + // // Get a sequential matrix view over the block + // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + + // // Construct a sequential Scalar container from the input Scalar + // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + + // // Delegate the call to the sequential set implementation + // local_rc = local_rc ? local_rc : set( refC, ref_val ); + + // if( local_rc != SUCCESS ) { + // rc = local_rc; + // } // } // } // } + } internal::setInitialized( C, true ); diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 3bdb700ed..5e51df69c 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -37,7 +37,11 @@ namespace alp { constexpr size_t BLOCK_ROW_DIM = 16; constexpr size_t BLOCK_COL_DIM = 16; - constexpr size_t REPLICATION_FACTOR_THREADS = 1; + // Temporary solution to accomodate for 2.5D mxm + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; + + constexpr size_t REPLICATION_FACTOR_THREADS = 2; } // namespace config diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index fb24da0bf..6bd333a70 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -172,8 +172,8 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( static_cast< size_t >( sqrt( num_threads/ Rt ) ) ), - Tc( num_threads / Rt / Tr ), + Tr( config::THREAD_ROW_DIM ), // Temporary + Tc( config::THREAD_COL_DIM ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { @@ -231,6 +231,11 @@ namespace alp { return Tr * Tc * Rt; } + /** Returns the thread grid size */ + ThreadCoords getThreadGridDims() const { + return { Tr, Tc, Rt }; + } + /** Returns the total global amount of blocks */ std::pair< size_t, size_t > getGlobalBlockGridDims() const { return { Br, Bc }; From f874c53ae48f1035d2d9a655db9fd5adfc45d54d Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 3/9] First complete draft --- include/alp/omp/blas3.hpp | 147 ++++++++++++++++++++++++++++---------- include/alp/utils.hpp | 17 ++++- 2 files changed, 125 insertions(+), 39 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 833f19a70..390f5f186 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -120,6 +120,14 @@ namespace alp { const auto tg_b = db.getThreadGridDims(); const auto tg_c = dc.getThreadGridDims(); + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt ) { + return MISMATCH; + } + + if( tg_c.tr != tg_a.tr || tg_c.tc != tg_b.tc || tg_a.tc != tg_b.tr ) { + return MISMATCH; + } + using th_coord_t = typename Distribution_2_5D::ThreadCoords; RC rc = SUCCESS; @@ -128,38 +136,46 @@ namespace alp { for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { const th_coord_t th_ijk_a = da.getThreadCoords( thread ); - const th_coord_t th_ijk_b = da.getThreadCoords( thread ); - const th_coord_t th_ijk_c = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = db.getThreadCoords( thread ); + const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); - const auto block_grid_dims_a = d.getLocalBlockGridDims( th_ijk_a ); - const auto block_grid_dims_b = d.getLocalBlockGridDims( th_ijk_b ); - const auto block_grid_dims_c = d.getLocalBlockGridDims( th_ijk_c ); + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); RC local_rc = SUCCESS; - // Broadcast Aij and Bij to all c layers - if( th_ijk_a.rt > 0 ) { + if( block_grid_dims_c.first != set_block_grid_dims_a.first + || block_grid_dims_c.second != set_block_grid_dims_b.second + || set_block_grid_dims_a.second != set_block_grid_dims_b.first + ) { + local_rc = MISMATCH; + } + + // Broadcast Aij and Bij to all c-dimensional layers + if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); - for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); - } } + + } // End Broadcast of Aij - if( local_rc != SUCCESS ) { + if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); - for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); @@ -173,40 +189,95 @@ namespace alp { // End Broadcast of Aij and Bij #pragma omp barrier - auto mod = [](const size_t _k, const size_t _n) = { - return ( ( _k %= _n ) < 0 ) ? 
_k + _n : _k; - }; - - // TODO from here - // size_t s_a = mod( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); - // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + if( local_rc == SUCCESS ) { + + // Initialize circular shifts at stride of Rt + size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + + // Accumulate remaining local c-dimension factor + for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { + + const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); + + const auto mxm_block_grid_dims_a = d.getLocalBlockGridDims( th_isk_a ); + const auto mxm_block_grid_dims_b = d.getLocalBlockGridDims( th_sjk_b ); + + if( block_grid_dims_c.first != mxm_block_grid_dims_a.first + || block_grid_dims_c.second != mxm_block_grid_dims_b.second + || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first + ) { + local_rc = MISMATCH; + } + + if( local_rc == SUCCESS ) { - // size_t s_b = mod( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); - // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + for( size_t bk = 0; bk < mxm_block_grid_dims_a.second; ++bk ) { + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + + const auto refA_loc = internal::get_view( A, th_isk_a, br, bk ); - // for( size_t bk = 0; bk < block_grid_dims_a.second; ++bk ) { - // for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - // for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - // // Get a sequential matrix view over the block - // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + const auto refB_loc = internal::get_view( B, th_sjk_b, bk, bc ); + auto refC_ijk = internal::get_view( C, th_ijk_c, br, bc ); - // // Construct a sequential Scalar container from the input Scalar - // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + // Delegate the call to the sequential mxm implementation + local_rc = local_rc ? local_rc : internal::mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); - // // Delegate the call to the sequential set implementation - // local_rc = local_rc ? local_rc : set( refC, ref_val ); + } + } + } + + // Circular shift rightwards for A + s_a = utils::modulus( s_a + 1, tg_a.tc ); + // Circular shift downwards for B + s_b = utils::modulus( s_b + 1, tg_b.tr ); + + } else { + break; + } - // if( local_rc != SUCCESS ) { - // rc = local_rc; - // } - // } - // } - // } + } + + // End layer-by-layer partial computation + #pragma omp barrier + + // Global c-dimension reduction + // (Consider if rt > 0 critical section?) + if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { + + for( size_t r = 1; r < tg_c.rt; ++r ) { + + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + + // Final result in C at layer 0 + local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); + } + } + + if( local_rc != SUCCESS ) { + break; + } + } + + } + } + + // Different values for rc could converge here (eg, MISMATCH, FAILED). 
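// Recap of the 2.5D schedule implemented above, per thread (tr, tc, rt)
// of the Tr x Tc x Rt grid (the code assumes Rt divides Tc):
//   1. threads with rt > 0 copy layer 0's local A and B blocks (the
//      "broadcast" step);
//   2. every layer performs Tc / Rt shifted block products, starting at
//      A's column panel s_a = (tc - tr + rt * Tc / Rt) mod Tc and B's
//      row panel s_b = (tr - tc + rt * Tr / Rt) mod Tr and advancing
//      both by one per step, so the Rt layers jointly cover all panels;
//   3. threads with rt == 0 fold the partial C blocks of layers rt > 0
//      into layer 0, which holds the final result.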
+ if( local_rc != SUCCESS ) { + rc = local_rc; + } } - internal::setInitialized( C, true ); return rc; } diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 330c69b3b..7e002cbff 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -26,6 +26,7 @@ #include #include //fabs +#include // modulus #include //numeric_limits #include @@ -295,7 +296,21 @@ namespace alp { } }; - + + + /** + * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder + * of division for type \a T. + * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. + */ + template < typename T > + T modulus( const T x, const T n, const T zero) { + + const T rem = std::modulus( _k, _n ); + + return ( rem < zero ) ? rem + _n : rem; + }; + } // namespace utils } // namespace alp From b7f46ff92c8b9526650eb35d4c231a04ba2b293e Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 4/9] compiling mxm test on general matrices --- include/alp/blas3.hpp | 3 + include/alp/omp/blas3.hpp | 177 +++++++++++---- include/alp/omp/config.hpp | 8 +- include/alp/omp/io.hpp | 47 +++- include/alp/omp/matrix.hpp | 11 +- include/alp/reference/blas3.hpp | 65 ++++-- include/alp/reference/io.hpp | 3 - include/alp/utils.hpp | 8 +- src/alp/omp/init.cpp | 4 +- tests/unit/CMakeLists.txt | 4 + tests/unit/dense_omp_mxm.cpp | 367 ++++++++++++++++++++++++++++++++ 11 files changed, 611 insertions(+), 86 deletions(-) create mode 100644 tests/unit/dense_omp_mxm.cpp diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp index e1f6d0baa..976d3ccb7 100644 --- a/include/alp/blas3.hpp +++ b/include/alp/blas3.hpp @@ -26,6 +26,9 @@ #ifdef _ALP_WITH_REFERENCE #include #endif +#ifdef _ALP_WITH_OMP + #include +#endif #endif // end _H_ALP_BLAS3 diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 390f5f186..539366bfc 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -28,7 +28,10 @@ #include #include +#include +#include #include +#include // Include backend to which sequential work is delegated #ifdef _ALP_OMP_WITH_REFERENCE @@ -36,8 +39,9 @@ #include #endif -#include "matrix.hpp" -#include "storage.hpp" +#ifndef _NDEBUG +#include "../../../tests/utils/print_alp_containers.hpp" +#endif namespace alp { @@ -63,9 +67,9 @@ namespace alp { RC mxm_generic( alp::Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, - const alp::Matrix< InputType1, InputStructure1, + alp::Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, - const alp::Matrix< InputType2, InputStructure2, + alp::Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Operator &oper, const Monoid &monoid, @@ -88,7 +92,7 @@ namespace alp { "void)" ); -#ifdef _DEBUG +#ifndef _NDEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif @@ -132,8 +136,11 @@ namespace alp { RC rc = SUCCESS; - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + // #pragma omp for + // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + const size_t thread = config::OMP::current_thread_ID(); const th_coord_t th_ijk_a = da.getThreadCoords( thread ); const th_coord_t th_ijk_b = db.getThreadCoords( thread ); @@ -149,6 +156,11 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != 
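// (block-level conformability: C's block grid must be A-block-rows by
//  B-block-columns, with A's block columns equal to B's block rows)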
set_block_grid_dims_b.first ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tMismatching local block grid size on set." << std::endl; +#endif local_rc = MISMATCH; } @@ -160,11 +172,16 @@ namespace alp { for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { - auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); - auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); + auto refAij0 = get_view( A, th_ij0_a, br, bc ); + auto refAijk = get_view( A, th_ijk_a, br, bc ); local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); +#ifndef _NDEBUG + print_matrix("refAij0", refAij0); + print_matrix("refAijk", refAijk); +#endif + } } @@ -172,42 +189,59 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { - th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); - for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { - for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { - auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); - auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); + auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); - local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); - } + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); } } + } + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues replicating input matrices." << std::endl; +#endif + rc = local_rc; } - // End Broadcast of Aij and Bij + + // } // End Broadcast of Aij and Bij #pragma omp barrier - if( local_rc == SUCCESS ) { + if( rc == SUCCESS ) { + // #pragma omp for + // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + // Initialize circular shifts at stride of Rt size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); - // Accumulate remaining local c-dimension factor + // Per-c-dimensional-layer partial computation for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); - const auto mxm_block_grid_dims_a = d.getLocalBlockGridDims( th_isk_a ); - const auto mxm_block_grid_dims_b = d.getLocalBlockGridDims( th_sjk_b ); + const auto mxm_block_grid_dims_a = da.getLocalBlockGridDims( th_isk_a ); + const auto mxm_block_grid_dims_b = db.getLocalBlockGridDims( th_sjk_b ); if( block_grid_dims_c.first != mxm_block_grid_dims_a.first || block_grid_dims_c.second != mxm_block_grid_dims_b.second || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tMismatching local block grid size on mxm." 
<< std::endl; +#endif local_rc = MISMATCH; } @@ -235,15 +269,28 @@ namespace alp { // Circular shift downwards for B s_b = utils::modulus( s_b + 1, tg_b.tr ); - } else { - break; - } + } + // else { + // break; + // } } + // } + } + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with local mxm computations." << std::endl; + rc = local_rc; + } - // End layer-by-layer partial computation - #pragma omp barrier + // End layer-by-layer partial computation + #pragma omp barrier + if( rc == SUCCESS ) { + // Global c-dimension reduction // (Consider if rt > 0 critical section?) if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { @@ -263,9 +310,9 @@ namespace alp { } } - if( local_rc != SUCCESS ) { - break; - } + // if( local_rc != SUCCESS ) { + // break; + // } } } @@ -273,6 +320,9 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with final reduction." << std::endl; rc = local_rc; } @@ -312,44 +362,77 @@ namespace alp { * @param phase The execution phase. */ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > - RC mxm( OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value, - void >::type * const = NULL ) { + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL + ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); + } /** * Dense Matrix-Matrix multiply between structured matrices. 
* Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > - RC mxm( OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, - void >::type * const = NULL ) { + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL + ) { (void)phase; return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + } } // end namespace ``alp'' diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 5e51df69c..dff95c18e 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,12 +34,12 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 16; - constexpr size_t BLOCK_COL_DIM = 16; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; // Temporary solution to accomodate for 2.5D mxm - constexpr size_t THREAD_ROW_DIM = 4; - constexpr size_t THREAD_COL_DIM = 4; + constexpr size_t THREAD_ROW_DIM = 2; + constexpr size_t THREAD_COL_DIM = 2; constexpr size_t REPLICATION_FACTOR_THREADS = 2; diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index bc5b6aa97..2794b1336 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_IO #define _H_ALP_OMP_IO +#include + #include #include "matrix.hpp" @@ -34,8 +36,6 @@ #include #endif -#include "storage.hpp" - #define NO_CAST_ASSERT( x, y, z ) \ static_assert( x, \ "\n\n" \ @@ -138,6 +138,49 @@ namespace alp { return rc; } + /** + * Temporarily assuming 1-1, row-major mapping with user container. + */ + template< + typename InputType, typename Structure, typename View, + typename ImfR, typename ImfC, typename fwd_iterator + > + RC buildMatrix( + Matrix< InputType, Structure, Density::Dense, View, ImfR, ImfC, omp > &A, + const fwd_iterator &start, + const fwd_iterator &end + ) noexcept { + + static_assert( + ( std::is_same< InputType, typename fwd_iterator::value_type >::value ), + "alp::buildMatrix (omp): Mismatching type between user-provided input " + "container and output ALP container." 
+ ); + + const size_t m = nrows( A ); + const size_t n = ncols( A ); + + if( ( end - start ) != static_cast< std::ptrdiff_t >( m * n ) ) { + + std::cerr << "alp::buildMatrix (omp): Mismatching sizes between " + "user-provided input container and output ALP container." << std::endl; + + return MISMATCH; + } + + internal::setInitialized(A, true); + + fwd_iterator it = start; + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + internal::access( A, internal::getStorageIndex( A, i, j ) ) = *( it++ ); + } + } + + return SUCCESS; + } + } // end namespace ``alp'' #undef NO_CAST_ASSERT diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index afb16d412..9d279ba7f 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_MATRIX #define _H_ALP_OMP_MATRIX +#include + #include #include @@ -37,6 +39,7 @@ #include // For AMFFactory #endif + namespace alp { // Currently no backend specific implementation @@ -56,10 +59,10 @@ namespace alp { typename SourceMatrix, typename ThreadCoords, std::enable_if_t< - is_matrix< SourceMatrix >::value + alp::is_matrix< SourceMatrix >::value > * = nullptr > - typename internal::new_container_type_from< + typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { @@ -68,7 +71,7 @@ namespace alp { const auto &distribution = getAmf( source ).getDistribution(); const size_t thread_id = distribution.getThreadId( t ); const size_t block_id = distribution.getLocalBlockId( t, br, bc ); - auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id ); + auto &container = getLocalContainer( getContainer( source ), thread_id, block_id ); // make an AMF // note: When making a view over a vector, the second imf must be imf::Zero @@ -92,7 +95,7 @@ namespace alp { ); // create a sequential container with the container and AMF above - using target_t = typename internal::new_container_type_from< + using target_t = typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type; diff --git a/include/alp/reference/blas3.hpp b/include/alp/reference/blas3.hpp index b0ad5ef3d..8a76ee6e3 100644 --- a/include/alp/reference/blas3.hpp +++ b/include/alp/reference/blas3.hpp @@ -490,46 +490,71 @@ namespace alp { * @param phase The execution phase. 
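 *
 * \par Example
 * A minimal usage sketch (mirroring the unit test added later in this
 * series; the element type and size are illustrative):
 * \code
 * const size_t n = 4;
 * alp::Semiring<
 * 	alp::operators::add< double >, alp::operators::mul< double >,
 * 	alp::identities::zero, alp::identities::one
 * > ring;
 * alp::Matrix< double, alp::structures::General > A( n, n ), B( n, n ), C( n, n );
 * // ... fill A, B, and C via alp::buildMatrix ...
 * alp::RC rc = alp::mxm( C, A, B, ring );
 * \endcode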
*/ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); } /** * Dense Matrix-Matrix multiply between structured matrices. * Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL ) { (void)phase; diff --git a/include/alp/reference/io.hpp b/include/alp/reference/io.hpp index c6de63f5d..da1e287c5 100644 --- a/include/alp/reference/io.hpp +++ b/include/alp/reference/io.hpp @@ -878,9 +878,6 @@ namespace alp { const fwd_iterator &start, const fwd_iterator &end ) noexcept { - (void)A; - (void)start; - (void)end; // Temporarily assuming 1-1 mapping with user container internal::setInitialized(A, true); diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 7e002cbff..d053f998d 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -300,15 +300,15 @@ namespace alp { /** * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder - * of division for type \a T. + * of division for type \a T within range [0, n). * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. */ template < typename T > - T modulus( const T x, const T n, const T zero) { + T modulus( const T x, const T n ) { - const T rem = std::modulus( _k, _n ); + const T rem = x % n; - return ( rem < zero ) ? rem + _n : rem; + return ( rem < static_cast< T >( 0 ) ) ? rem + n : rem; }; } // namespace utils diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp index f7f8b5d42..f57351285 100644 --- a/src/alp/omp/init.cpp +++ b/src/alp/omp/init.cpp @@ -36,7 +36,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data RC rc = alp::SUCCESS; // print output const auto T = config::OMP::threads(); - std::cerr << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; + std::cout << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; // sanity checks if( P > 1 ) { @@ -54,7 +54,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data template<> alp::RC alp::finalize< alp::omp >() { - std::cerr << "Info: alp::finalize (omp) called.\n"; + std::cout << "Info: alp::finalize (omp) called.\n"; return alp::SUCCESS; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b54e0a43b..3eed07ea7 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -269,6 +269,10 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp BACKENDS alp_omp ) +add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp + BACKENDS alp_reference alp_omp +) + add_grb_executables( dense_outer dense_outer.cpp BACKENDS alp_reference ) diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp new file mode 100644 index 000000000..25f6426ce --- /dev/null +++ b/tests/unit/dense_omp_mxm.cpp @@ -0,0 +1,367 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +//#include "../utils/print_alp_containers.hpp" + +using namespace alp; + + +template< typename T > +void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda ) { + + std::cout << "Vec " << name << ":" << std::endl; + for( size_t row = 0; row < m; ++row ) { + std::cout << "[\t"; + for( size_t col = 0; col < n; ++col ) { + std::cout << vA[ row * lda + col ] << "\t"; + } + std::cout << "]" << std::endl; + } +} + +template< typename T, typename Operator, typename Monoid > +void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, + const std::vector< T > & vA, const size_t lda, + const std::vector< T > & vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid ) { + + T temp; + + print_stdvec_as_matrix("vA", vA, n, n, n); + print_stdvec_as_matrix("vB", vB, n, n, n); + print_stdvec_as_matrix("vC - PRE", vC, n, n, n); + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + T & c_val { vC[ i * ldc + j ] }; + for( size_t l = 0; l < k; ++l ) { + const T & a_val { vA[ i * lda + l ] }; + const T & b_val { vB[ l * ldb + j ] }; + // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; + (void)internal::apply( temp, a_val, b_val, oper ); + // std::cout << "temp = " << temp << std::endl; + (void)internal::foldl( c_val, temp, monoid.getOperator() ); + } + } + } + + print_stdvec_as_matrix("vC - POST", vC, n, n, n); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { + + if( std::is_same< Structure, structures::General >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = one; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = vA[ col * lda + row ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > & vA, const T one ) { + + std::fill( vA.begin(), vA.end(), one ); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( 
std::vector< T > & vA, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper + for( auto & elem: vA ) { + elem = val; + val += inc; + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper + for( auto & elem: vA ) { + elem = val; + val += inc; + } + } + +} + +template< typename MatType, typename T > +void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold=1e-7 ) { + + if( std::is_same< typename MatType::structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } + +} + +void alp_program( const size_t & n, alp::RC & rc ) { + + typedef double T; + + alp::Semiring< alp::operators::add< T >, alp::operators::mul< T >, alp::identities::zero, alp::identities::one > ring; + + T one = ring.getOne< T >(); + T zero = ring.getZero< T >(); + + std::vector< T > A_data( n * n ); + std::vector< T > B_data( n * n ); + std::vector< T > C_data( n * n, zero ); + + // std::vector< T > A_packed( n * ( n + 1 ) / 2 ); + // std::vector< T > B_packed( n * ( n + 1 ) / 2 ); + // std::vector< T > C_packed( n * ( n + 1 ) / 2, zero ); + + std::vector< T > A_vec( n * n ); + std::vector< T > B_vec( n * n ); + std::vector< T > C_vec( n * n, zero ); + + std::cout << "\tTesting dense General mxm " << n << std::endl; + + stdvec_build_matrix< structures::General >( A_data, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, n, n, n, zero, one, one ); + + // initialize test + alp::Matrix< T, structures::General > A( n, n ); + alp::Matrix< T, structures::General > B( n, n ); + alp::Matrix< T, structures::General > C( n, n ); + + // Initialize input matrices + rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); + rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + print_matrix("A", A); + print_matrix("B", B); + print_matrix("C - PRE", C); + + rc = alp::mxm( C, A, B, ring ); + + print_matrix("C - POST", C); + + stdvec_build_matrix< 
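// Verification: rebuild the same inputs as plain std::vectors, run the
// scalar triple-loop reference mxm_stdvec_as_matrix, then compare
// element-wise against the ALP result with diff_stdvec_matrix.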
structures::General >( A_vec, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, n, n, n, zero, one, one ); + + mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + diff_stdvec_matrix( C_vec, n, n, n, C ); + + // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; + + // alp::Matrix< T, structures::UpperTriangular > UA( n ); + // alp::Matrix< T, structures::UpperTriangular > UB( n ); + // alp::Matrix< T, structures::UpperTriangular > UC( n ); + + // stdvec_build_matrix_packed< structures::UpperTriangular >( A_packed, one, one ); + // stdvec_build_matrix_packed< structures::UpperTriangular >( B_packed, one, one ); + + // rc = alp::buildMatrix( UA, A_packed.begin(), A_packed.end() ); + // rc = alp::buildMatrix( UB, B_packed.begin(), B_packed.end() ); + // rc = alp::buildMatrix( UC, C_packed.begin(), C_packed.end() ); + + // print_matrix("UA", UA); + // print_matrix("UB", UB); + // print_matrix("UC - PRE", UC); + // rc = alp::mxm( UC, UA, UB, ring ); + // print_matrix("UC - POST", UC); + + // stdvec_build_matrix< structures::UpperTriangular >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::UpperTriangular >( B_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, UC ); + + // std::cout << "\n\n=========== Testing Symmetric Output ============\n\n"; + + // alp::Matrix< T, structures::Symmetric > SC( n ); + + // stdvec_build_matrix< structures::Symmetric >( A_data, n, n, n, zero, one, one ); + + // rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); + // rc = alp::buildMatrix( SC, C_packed.begin(), C_packed.end() ); + + // print_matrix("A", A ); + // print_matrix("A^T", alp::get_view< alp::view::transpose >( A ) ); + // print_matrix("SC - PRE", SC); + // rc = alp::mxm( SC, A, alp::get_view< alp::view::transpose >( A ), ring ); + // print_matrix("SC - POST", SC); + + // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::Symmetric >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, A_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, SC ); + + // std::cout << "\n\n=========== Testing Symmetric x Symmetric Output ============\n\n"; + + // alp::Matrix< T, structures::Symmetric > SA( n ); + // alp::Matrix< T, structures::Symmetric > SB( n ); + + // stdvec_build_matrix_packed< structures::Symmetric >( A_packed, one, one ); + // stdvec_build_matrix_packed< structures::Symmetric >( B_packed, one, one + one ); + + // rc = alp::buildMatrix( SA, A_packed.begin(), A_packed.end() ); + // rc = alp::buildMatrix( SB, B_packed.begin(), B_packed.end() ); + // rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + // print_matrix("SA", SA); + // print_matrix("SB", SB); + // print_matrix("C - PRE", C); + // rc = alp::mxm( C, SA, SB, ring ); + // print_matrix("C - POST", C); + + // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::Symmetric >( B_vec, n, n, n, zero, one, one + one ); + // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, 
A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, C ); + + +} + +int main( int argc, char ** argv ) { + // defaults + bool printUsage = false; + size_t in = 4; + + // error checking + if( argc > 2 ) { + printUsage = true; + } + if( argc == 2 ) { + size_t read; + std::istringstream ss( argv[ 1 ] ); + if( ! ( ss >> read ) ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( ! ss.eof() ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( read % 2 != 0 ) { + std::cerr << "Given value for n is odd\n"; + printUsage = true; + } else { + // all OK + in = read; + } + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << " [n]\n"; + std::cerr << " -n (optional, default is 100): an even integer, the " + "test size.\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << " " << in << "\n"; + alp::Launcher< AUTOMATIC > launcher; + alp::RC out; + if( launcher.exec( &alp_program, in, out, true ) != SUCCESS ) { + std::cerr << "Launching test FAILED\n"; + return 255; + } + if( out != SUCCESS ) { + std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl; + } else { + std::cout << "Test OK" << std::endl; + } + return 0; +} + From 0e47854602a32f24e029575df5a77f4a05ca7438 Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 5/9] shared mem (ge) mxm passing functional tests --- include/alp/omp/blas3.hpp | 65 ++++++++++++++++++++---------------- include/alp/omp/config.hpp | 4 +-- include/alp/omp/matrix.hpp | 4 ++- tests/unit/dense_omp_mxm.cpp | 13 ++++++-- 4 files changed, 52 insertions(+), 34 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 539366bfc..d720a9e8e 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -39,7 +39,7 @@ #include #endif -#ifndef _NDEBUG +#ifdef _DEBUG #include "../../../tests/utils/print_alp_containers.hpp" #endif @@ -92,14 +92,14 @@ namespace alp { "void)" ); -#ifndef _NDEBUG +#ifdef _DEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif // Early exit checks - if( ! internal::getInitialized( A ) || - ! internal::getInitialized( B ) || - ! internal::getInitialized( C ) + if( ! internal::getInitialized( A ) || + ! internal::getInitialized( B ) || + ! internal::getInitialized( C ) ) { internal::setInitialized( C, false ); return SUCCESS; @@ -156,7 +156,7 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != set_block_grid_dims_b.first ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on set." << std::endl; @@ -164,7 +164,7 @@ namespace alp { local_rc = MISMATCH; } - // Broadcast Aij and Bij to all c-dimensional layers + // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); @@ -177,15 +177,10 @@ namespace alp { local_rc = local_rc ? 
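// first-error-wins chaining: a prior non-SUCCESS local_rc (which
// converts to true) is kept, and the set() in the unselected ternary
// branch is not evaluated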
local_rc : set( refAijk, refAij0 ); -#ifndef _NDEBUG - print_matrix("refAij0", refAij0); - print_matrix("refAijk", refAijk); -#endif - } } - } // End Broadcast of Aij + } // End Broadcast of A if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { @@ -194,17 +189,31 @@ namespace alp { for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { - auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); - auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + auto refBij0 = get_view( B, th_ij0_b, br, bc ); + auto refBijk = get_view( B, th_ijk_b, br, bc ); local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); } } - } + } // End Broadcast of B + + if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCijk = get_view( C, th_ijk_c, br, bc ); + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + local_rc = local_rc ? local_rc : set( refCijk, zero ); + } + } + } // End Zero-ing of C // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues replicating input matrices." << std::endl; @@ -212,7 +221,7 @@ namespace alp { rc = local_rc; } - // } // End Broadcast of Aij and Bij + // End Broadcast of A and B and zero-ing of C #pragma omp barrier if( rc == SUCCESS ) { @@ -237,7 +246,7 @@ namespace alp { || block_grid_dims_c.second != mxm_block_grid_dims_b.second || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on mxm." << std::endl; @@ -250,15 +259,15 @@ namespace alp { for( size_t bk = 0; bk < mxm_block_grid_dims_a.second; ++bk ) { for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - const auto refA_loc = internal::get_view( A, th_isk_a, br, bk ); + const auto refA_loc = get_view( A, th_isk_a, br, bk ); for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - const auto refB_loc = internal::get_view( B, th_sjk_b, bk, bc ); - auto refC_ijk = internal::get_view( C, th_ijk_c, br, bc ); + const auto refB_loc = get_view( B, th_sjk_b, bk, bc ); + auto refC_ijk = get_view( C, th_ijk_c, br, bc ); // Delegate the call to the sequential mxm implementation - local_rc = local_rc ? local_rc : internal::mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); + local_rc = local_rc ? local_rc : mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); } } @@ -270,9 +279,6 @@ namespace alp { s_b = utils::modulus( s_b + 1, tg_b.tr ); } - // else { - // break; - // } } // } @@ -280,9 +286,11 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with local mxm computations." << std::endl; +#endif rc = local_rc; } @@ -310,9 +318,6 @@ namespace alp { } } - // if( local_rc != SUCCESS ) { - // break; - // } } } @@ -320,9 +325,11 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). 
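			// At this point every replication layer rt > 0 holds private copies of
			// its (tr, tc) tiles of A and B, and its C tiles hold the additive
			// identity, while layer rt == 0 keeps the original tiles; each layer
			// can therefore accumulate an independent partial product that the
			// final c-dimension reduction folds back into layer 0.
			//
			// The compute phase below walks the local k-tiles in circular shifts:
			// every layer starts from a per-thread base offset staggered by
			// tg.tc / tg.rt and advances one tile per iteration, so the Rt layers
			// jointly visit each k-tile exactly once. A minimal stand-alone sketch
			// of that coverage property (assuming Rt divides Tc, which the series
			// later checks explicitly, and for any per-thread base offset):
			//
			//   bool covers_all_k_tiles( size_t base, size_t Tc, size_t Rt ) {
			//       std::vector< size_t > visits( Tc, 0 );
			//       for( size_t rt = 0; rt < Rt; ++rt ) {
			//           size_t s = ( base + rt * ( Tc / Rt ) ) % Tc;
			//           for( size_t r = 0; r < Tc / Rt; ++r ) {
			//               ++visits[ s ];
			//               s = ( s + 1 ) % Tc;
			//           }
			//       }
			//       for( const size_t v : visits ) {
			//           if( v != 1 ) { return false; } // a k-tile skipped or repeated
			//       }
			//       return true;
			//   }
			//
			// The Rt starts are spaced Tc / Rt apart and each walk covers Tc / Rt
			// consecutive positions, hence the union is a disjoint cover of all Tc
			// k-tiles for any base offset.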
if( local_rc != SUCCESS ) { +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with final reduction." << std::endl; +#endif rc = local_rc; } diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index dff95c18e..4a4ec7323 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,8 +34,8 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 2; - constexpr size_t BLOCK_COL_DIM = 2; + constexpr size_t BLOCK_ROW_DIM = 64; + constexpr size_t BLOCK_COL_DIM = 64; // Temporary solution to accomodate for 2.5D mxm constexpr size_t THREAD_ROW_DIM = 2; diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index 9d279ba7f..94e600979 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -99,7 +99,9 @@ namespace alp { typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type; - return target_t( container, amf ); + target_t blk_matrix( container, amf ); + internal::setInitialized( blk_matrix, internal::getInitialized( source ) ); + return blk_matrix; } } // namespace internal diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index 25f6426ce..415499595 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -20,7 +20,9 @@ #include #include -//#include "../utils/print_alp_containers.hpp" +#ifndef _DEBUG + #include "../utils/print_alp_containers.hpp" +#endif using namespace alp; @@ -48,9 +50,11 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, T temp; +#ifdef _DEBUG print_stdvec_as_matrix("vA", vA, n, n, n); print_stdvec_as_matrix("vB", vB, n, n, n); print_stdvec_as_matrix("vC - PRE", vC, n, n, n); +#endif for( size_t i = 0; i < m; ++i ) { for( size_t j = 0; j < n; ++j ) { @@ -66,8 +70,9 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, } } +#ifdef _DEBUG print_stdvec_as_matrix("vC - POST", vC, n, n, n); - +#endif } template< typename Structure, typename T > @@ -224,13 +229,17 @@ void alp_program( const size_t & n, alp::RC & rc ) { rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); +#ifdef _DEBUG print_matrix("A", A); print_matrix("B", B); print_matrix("C - PRE", C); +#endif rc = alp::mxm( C, A, B, ring ); +#ifdef _DEBUG print_matrix("C - POST", C); +#endif stdvec_build_matrix< structures::General >( A_vec, n, n, n, zero, one, one ); stdvec_build_matrix< structures::General >( B_vec, n, n, n, zero, one, one ); From 474d1075e25338bb9f0d8d87921c86d96b2ac57d Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 6/9] Tmp debugging info + fix in shift computation --- include/alp/omp/blas3.hpp | 134 +++++++++++++++++++++++++++-------- include/alp/omp/config.hpp | 10 +-- include/alp/omp/storage.hpp | 28 +++++--- tests/unit/dense_omp_mxm.cpp | 46 ++++++------ 4 files changed, 154 insertions(+), 64 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index d720a9e8e..8f194b82c 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -39,7 +39,7 @@ #include #endif -#ifdef _DEBUG +#ifndef NDEBUG #include "../../../tests/utils/print_alp_containers.hpp" #endif @@ -92,7 +92,7 @@ namespace alp { "void)" ); -#ifdef _DEBUG +#ifndef NDEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif @@ -123,8 +123,9 @@ namespace alp { const auto tg_a = da.getThreadGridDims(); const auto tg_b = db.getThreadGridDims(); const auto tg_c = dc.getThreadGridDims(); - - if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt ) { + + // 2.5D factor ranges within 3D limits + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt || ( tg_a.tc % tg_a.rt ) != 0 ) { return MISMATCH; } @@ -136,10 +137,14 @@ namespace alp { RC rc = SUCCESS; +#ifndef NDEBUG + #pragma omp critical + std::cerr << "In alp::internal::mxm_generic (omp)\n" + "\tBeginning parallel region" << std::endl; +#endif + #pragma omp parallel { - // #pragma omp for - // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { const size_t thread = config::OMP::current_thread_ID(); const th_coord_t th_ijk_a = da.getThreadCoords( thread ); @@ -156,7 +161,7 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != set_block_grid_dims_b.first ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on set." << std::endl; @@ -167,6 +172,12 @@ namespace alp { // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying A." << std::endl; +#endif + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { @@ -184,6 +195,12 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying B." << std::endl; +#endif + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { @@ -199,13 +216,19 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tZeroing C." << std::endl; +#endif + + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - auto refCijk = get_view( C, th_ijk_c, br, bc ); - alp::Scalar< - OutputType, alp::structures::General, config::default_sequential_backend - > zero( monoid.template getIdentity< OutputType >() ); local_rc = local_rc ? 
local_rc : set( refCijk, zero ); } } @@ -213,7 +236,7 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues replicating input matrices." << std::endl; @@ -223,21 +246,38 @@ namespace alp { // End Broadcast of A and B and zero-ing of C #pragma omp barrier - + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tPassing barrier" << std::endl; +#endif + if( rc == SUCCESS ) { - // #pragma omp for - // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - // Initialize circular shifts at stride of Rt - size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); - size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + // size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + // size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); // Per-c-dimensional-layer partial computation for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { - const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); - const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); + const th_coord_t th_isk_a( th_ijk_a.tr, c_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( r_b, th_ijk_b.tc, th_ijk_b.rt ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCompute iteration r=" << r << "\n" + "\tStarting from A( " << th_ijk_a.tr << ", " << c_a << ", " << th_ijk_a.rt << " )\n" + "\tStarting from B( " << r_b << ", " << th_ijk_b.tc << ", " << th_ijk_b.rt << " )" <( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + print_matrix("Cref post", refC_ijk ); + } + } +#endif + } } } // Circular shift rightwards for A - s_a = utils::modulus( s_a + 1, tg_a.tc ); + c_a = utils::modulus( c_a + 1, tg_a.tc ); // Circular shift downwards for B - s_b = utils::modulus( s_b + 1, tg_b.tr ); + r_b = utils::modulus( r_b + 1, tg_b.tr ); } } - // } - } + + } // End computation per layer of c-dimension // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with local mxm computations." << std::endl; @@ -299,12 +362,24 @@ namespace alp { if( rc == SUCCESS ) { - // Global c-dimension reduction + // Final c-dimension reduction // (Consider if rt > 0 critical section?) if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." 
<< std::endl; +#endif + for( size_t r = 1; r < tg_c.rt; ++r ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; +#endif + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { @@ -321,11 +396,12 @@ namespace alp { } } - } + + } // End of final reduction along c-dimension // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with final reduction." << std::endl; diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 4a4ec7323..d313a3a6d 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,14 +34,14 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 64; - constexpr size_t BLOCK_COL_DIM = 64; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; // Temporary solution to accomodate for 2.5D mxm - constexpr size_t THREAD_ROW_DIM = 2; - constexpr size_t THREAD_COL_DIM = 2; + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; - constexpr size_t REPLICATION_FACTOR_THREADS = 2; + constexpr size_t REPLICATION_FACTOR_THREADS = 1; } // namespace config diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 6bd333a70..47a728e9c 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -156,14 +156,14 @@ namespace alp { /** Row and column dimensions of the associated container */ const size_t m; const size_t n; - /** The row and column dimensions of the thread grid */ - const size_t Tr; - const size_t Tc; /** Replication factor in thread-coordinate space */ static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; /** The row and column dimensions of the global block grid */ const size_t Br; const size_t Bc; + /** The row and column dimensions of the thread grid */ + const size_t Tr; + const size_t Tc; public: @@ -172,13 +172,14 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( config::THREAD_ROW_DIM ), // Temporary - Tc( config::THREAD_COL_DIM ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), - Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { - - if( num_threads != Tr * Tc * Rt ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n"; + Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), // Temporary + Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) + { + if( ( Tr > 1 ) && ( Tc > 1 ) && ( num_threads != Tr * Tc * Rt ) ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid:\n" + "\t" << Tr << " x " << Tc << " x " << Rt << std::endl; } } @@ -303,6 +304,15 @@ namespace alp { } ThreadCoords getThreadCoords( const size_t thread_id ) const { + // const size_t _Tr = ( Tr == 1 ) ? Tc : Tr; + // const size_t _Tc = ( Tc == 1 ) ? 
Tr : Tc;
+
+			// const size_t rt = thread_id / ( _Tr * _Tc );
+			// const size_t tr = ( thread_id % ( _Tr * _Tc ) ) / _Tc;
+			// const size_t tc = ( thread_id % ( _Tr * _Tc ) ) % _Tc;
+
+			// return { tr % Tr, tc % Tc, rt };
+
 			const size_t rt = thread_id / ( Tr * Tc );
 			const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc;
 			const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc;
diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp
index 415499595..be05c340b 100644
--- a/tests/unit/dense_omp_mxm.cpp
+++ b/tests/unit/dense_omp_mxm.cpp
@@ -20,7 +20,7 @@
 #include 
 #include 
 
-#ifndef _DEBUG
+#ifndef NDEBUG
 	#include "../utils/print_alp_containers.hpp"
 #endif
 
@@ -50,7 +50,7 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc,
 
 	T temp;
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_stdvec_as_matrix("vA", vA, n, n, n);
 	print_stdvec_as_matrix("vB", vB, n, n, n);
 	print_stdvec_as_matrix("vC - PRE", vC, n, n, n);
@@ -70,7 +70,7 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc,
 		}
 	}
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_stdvec_as_matrix("vC - POST", vC, n, n, n);
 #endif
 }
 
@@ -193,6 +193,10 @@ void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size
 
 }
 
+#define M ( alp::config::BLOCK_ROW_DIM * n )
+#define K ( alp::config::BLOCK_COL_DIM * n )
+#define N ( alp::config::BLOCK_COL_DIM * n )
+
 void alp_program( const size_t & n, alp::RC & rc ) {
 
 	typedef double T;
@@ -202,34 +206,34 @@ void alp_program( const size_t & n, alp::RC & rc ) {
 	T one = ring.getOne< T >();
 	T zero = ring.getZero< T >();
 
-	std::vector< T > A_data( n * n );
-	std::vector< T > B_data( n * n );
-	std::vector< T > C_data( n * n, zero );
+	std::vector< T > A_data( M * K );
+	std::vector< T > B_data( K * N );
+	std::vector< T > C_data( M * N, zero );
 
 	// std::vector< T > A_packed( n * ( n + 1 ) / 2 );
 	// std::vector< T > B_packed( n * ( n + 1 ) / 2 );
 	// std::vector< T > C_packed( n * ( n + 1 ) / 2, zero );
 
-	std::vector< T > A_vec( n * n );
-	std::vector< T > B_vec( n * n );
-	std::vector< T > C_vec( n * n, zero );
+	std::vector< T > A_vec( M * K );
+	std::vector< T > B_vec( K * N );
+	std::vector< T > C_vec( M * N, zero );
 
-	std::cout << "\tTesting dense General mxm " << n << std::endl;
+	std::cout << "\tTesting dense General mxm " << M << " " << K << " " << N << std::endl;
 
-	stdvec_build_matrix< structures::General >( A_data, n, n, n, zero, one, one );
-	stdvec_build_matrix< structures::General >( B_data, n, n, n, zero, one, one );
+	stdvec_build_matrix< structures::General >( A_data, M, K, K, zero, one, one );
+	stdvec_build_matrix< structures::General >( B_data, K, N, N, zero, one, one );
 
 	// initialize test
-	alp::Matrix< T, structures::General > A( n, n );
-	alp::Matrix< T, structures::General > B( n, n );
-	alp::Matrix< T, structures::General > C( n, n );
+	alp::Matrix< T, structures::General > A( M, K );
+	alp::Matrix< T, structures::General > B( K, N );
+	alp::Matrix< T, structures::General > C( M, N );
 
 	// Initialize input matrices
 	rc = alp::buildMatrix( A, A_data.begin(), A_data.end() );
 	rc = alp::buildMatrix( B, B_data.begin(), B_data.end() );
 	rc = alp::buildMatrix( C, C_data.begin(), C_data.end() );
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_matrix("A", A);
 	print_matrix("B", B);
 	print_matrix("C - PRE", C);
@@ -237,16 +241,16 @@ void alp_program( const size_t & n, alp::RC & rc ) {
 
 	rc = alp::mxm( C, A, B, ring );
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_matrix("C - POST", C);
 #endif
 
-	stdvec_build_matrix< structures::General >( A_vec, n, n, n, zero, one, one );
-	stdvec_build_matrix< 
structures::General >( B_vec, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( A_vec, M, K, K, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, K, N, N, zero, one, one ); - mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + mxm_stdvec_as_matrix( C_vec, N, A_vec, K, B_vec, N, M, K, N, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - diff_stdvec_matrix( C_vec, n, n, n, C ); + diff_stdvec_matrix( C_vec, M, N, N, C ); // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; From 47d879bc959bc7d58a4bd7c2e17f1fb0ee27e49e Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 7/9] Refactoring includes --- include/alp/omp/blas3.hpp | 11 +++++++---- include/alp/scalar.hpp | 2 -- tests/unit/CMakeLists.txt | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 8f194b82c..eae9fd9e6 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -26,15 +26,18 @@ #include // for std::min/max #include // for std::enable_if -#include #include -#include -#include #include -#include + +// #include +#include + +#include "matrix.hpp" +#include "storage.hpp" // Include backend to which sequential work is delegated #ifdef _ALP_OMP_WITH_REFERENCE + #include #include #include #endif diff --git a/include/alp/scalar.hpp b/include/alp/scalar.hpp index 95c73cfbc..33210aba2 100644 --- a/include/alp/scalar.hpp +++ b/include/alp/scalar.hpp @@ -24,8 +24,6 @@ #include "base/scalar.hpp" -#include - // now include all specialisations contained in the backend directories: #ifdef _ALP_WITH_REFERENCE #include diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 3eed07ea7..f16504294 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -270,7 +270,7 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp ) add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp - BACKENDS alp_reference alp_omp + BACKENDS alp_omp ) add_grb_executables( dense_outer dense_outer.cpp From 68995561f3674d7ebb7b882abfc318683d8900cc Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Mon, 5 Dec 2022 18:39:50 +0100 Subject: [PATCH 8/9] - Fixing mismatching new/delete bug - Enabling allocation/computation on sub-grid of threads - Added unit tests for non-cubic mxm --- include/alp/omp/blas3.hpp | 85 +++++++++--------- include/alp/omp/config.hpp | 2 +- include/alp/omp/storage.hpp | 136 +++++++++++++++-------------- include/alp/omp/vector.hpp | 82 +++++++++++------- tests/unit/dense_omp_mxm.cpp | 162 ++++++++++++++--------------------- 5 files changed, 226 insertions(+), 241 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index eae9fd9e6..0bba4444b 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -154,26 +154,22 @@ namespace alp { const th_coord_t th_ijk_b = db.getThreadCoords( thread ); const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); - const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); - const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); - const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); - RC local_rc = SUCCESS; - if( block_grid_dims_c.first != set_block_grid_dims_a.first - || block_grid_dims_c.second != set_block_grid_dims_b.second - || set_block_grid_dims_a.second != set_block_grid_dims_b.first - ) { -#ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tMismatching local block grid size on set." << std::endl; -#endif - local_rc = MISMATCH; - } +// if( block_grid_dims_c.first != set_block_grid_dims_a.first +// || block_grid_dims_c.second != set_block_grid_dims_b.second +// || set_block_grid_dims_a.second != set_block_grid_dims_b.first +// ) { +// #ifndef NDEBUG +// #pragma omp critical +// std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" +// "\tMismatching local block grid size on set." << std::endl; +// #endif +// local_rc = MISMATCH; +// } // Broadcast A and B to all c-dimensional layers - if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { + if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -181,6 +177,8 @@ namespace alp { "\tCopying A." << std::endl; #endif + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { @@ -196,7 +194,7 @@ namespace alp { } // End Broadcast of A - if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { + if( local_rc == SUCCESS && db.isActiveThread( th_ijk_b ) && th_ijk_b.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -204,6 +202,8 @@ namespace alp { "\tCopying B." << std::endl; #endif + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { @@ -217,7 +217,7 @@ namespace alp { } } // End Broadcast of B - if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { + if( local_rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -225,6 +225,8 @@ namespace alp { "\tZeroing C." 
<< std::endl; #endif + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + alp::Scalar< OutputType, alp::structures::General, config::default_sequential_backend > zero( monoid.template getIdentity< OutputType >() ); @@ -256,11 +258,11 @@ namespace alp { "\tPassing barrier" << std::endl; #endif - if( rc == SUCCESS ) { - + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) ) { + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + // Initialize circular shifts at stride of Rt - // size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); - // size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); @@ -363,39 +365,36 @@ namespace alp { // End layer-by-layer partial computation #pragma omp barrier - if( rc == SUCCESS ) { + // Final c-dimension reduction + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt == 0 ) { - // Final c-dimension reduction - // (Consider if rt > 0 critical section?) - if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { - #ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tEntering reduction." << std::endl; + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." << std::endl; #endif - for( size_t r = 1; r < tg_c.rt; ++r ) { + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + for( size_t r = 1; r < tg_c.rt; ++r ) { #ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tReduction iteration r=" << r << std::endl; + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; #endif - const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); - for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 - auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); - // Final result in C at layer 0 - local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); - } + // Final result in C at layer 0 + local_rc = local_rc ? 
local_rc : foldl( refCij0, refCijr, monoid ); } - } } diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index d313a3a6d..79200558f 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -37,7 +37,7 @@ namespace alp { constexpr size_t BLOCK_ROW_DIM = 2; constexpr size_t BLOCK_COL_DIM = 2; - // Temporary solution to accomodate for 2.5D mxm + // Temporary squared solution to accomodate for 2.5D mxm constexpr size_t THREAD_ROW_DIM = 4; constexpr size_t THREAD_COL_DIM = 4; diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 47a728e9c..8bfe855fc 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -131,25 +131,25 @@ namespace alp { }; - /** Type encapsulating the local block coordinate. */ - struct LocalBlockCoord { - - const size_t tr; - const size_t tc; - const size_t rt; - const size_t br; - const size_t bc; - - LocalBlockCoord( - const size_t tr, const size_t tc, - const size_t rt, - const size_t br, const size_t bc - ) : - tr( tr ), tc( tc ), - rt( rt ), - br( br ), bc( bc ) {} - - }; + // /** Type encapsulating the local block coordinate. */ + // struct LocalBlockCoord { + + // const size_t tr; + // const size_t tc; + // const size_t rt; + // const size_t br; + // const size_t bc; + + // LocalBlockCoord( + // const size_t tr, const size_t tc, + // const size_t rt, + // const size_t br, const size_t bc + // ) : + // tr( tr ), tc( tc ), + // rt( rt ), + // br( br ), bc( bc ) {} + + // }; private: @@ -174,25 +174,25 @@ namespace alp { m( m ), n( n ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), - Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), // Temporary + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) { - if( ( Tr > 1 ) && ( Tc > 1 ) && ( num_threads != Tr * Tc * Rt ) ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid:\n" - "\t" << Tr << " x " << Tc << " x " << Rt << std::endl; + if( num_threads != config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 2.5D grid:\n" + "\t" << num_threads << " != " << config::THREAD_ROW_DIM << " x " << config::THREAD_COL_DIM << " x " << config::REPLICATION_FACTOR_THREADS << std::endl; } } - LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { - (void) g; - return LocalBlockCoord( 0, 0, 0, 0, 0 ); - } + // LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { + // (void) g; + // return LocalBlockCoord( 0, 0, 0, 0, 0 ); + // } - GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { - const size_t block_id_r = l.br * Tr + l.tr; - const size_t block_id_c = l.bc * Tc + l.tc; - return GlobalBlockCoord( block_id_r, block_id_c ); - } + // GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { + // const size_t block_id_r = l.br * Tr + l.tr; + // const size_t block_id_c = l.bc * Tc + l.tc; + // return GlobalBlockCoord( block_id_r, block_id_c );// Temporary + // } LocalCoord mapGlobalToLocal( const GlobalCoord &g ) const { const size_t global_br = g.i / config::BLOCK_ROW_DIM; @@ -213,23 +213,26 @@ namespace alp { ); } - /** - * Maps coordinates from local to global space. 
- * - * \todo Add implementation + // /** + // * Maps coordinates from local to global space. + // * + // * \todo Add implementation + // */ + // GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { + // (void) l; + // return GlobalCoord( 0, 0 ); + // } + + /** + * Returns the thread ID corresponding to the given thread coordinates. + * The fixed thread grid enables to map left-over threads. */ - GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { - (void) l; - return GlobalCoord( 0, 0 ); - } - - /** Returns the thread ID corresponding to the given thread coordinates. */ size_t getThreadId( const ThreadCoords t ) const { - return t.rt * Tr * Tc + t.tr * Tc + t.tc; + return t.rt * config::THREAD_ROW_DIM * config::THREAD_COL_DIM + t.tr * config::THREAD_COL_DIM + t.tc; } size_t getNumberOfThreads() const { - return Tr * Tc * Rt; + return config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS; } /** Returns the thread grid size */ @@ -251,17 +254,17 @@ namespace alp { return { blocks_r, blocks_c }; } - /** Returns the global block coordinates based on the thread and local block coordinates */ - std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const size_t global_br = br * Tr + tr % Tr; - const size_t global_bc = bc * Tc + tc % Tc; - return { global_br, global_bc }; - } + // /** Returns the global block coordinates based on the thread and local block coordinates */ + // std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const size_t global_br = br * Tr + tr % Tr; + // const size_t global_bc = bc * Tc + tc % Tc; + // return { global_br, global_bc }; + // } - size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); - return global_coords.first * Bc + global_coords.second; - } + // size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); + // return global_coords.first * Bc + global_coords.second; + // } size_t getLocalBlockId( const LocalCoord &local ) const { return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc; @@ -297,27 +300,28 @@ namespace alp { } /** For a given block, returns its offset from the beginning of the buffer in which it is stored */ - size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const { + size_t getBlocksOffset( const ThreadCoords &t, const size_t br, const size_t bc ) const { // The offset is calculated as the sum of sizes of all previous blocks const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc; return block_coord_1D * getBlockSize(); } ThreadCoords getThreadCoords( const size_t thread_id ) const { - // const size_t _Tr = ( Tr == 1 ) ? Tc : Tr; - // const size_t _Tc = ( Tc == 1 ) ? 
Tr : Tc; - - // const size_t rt = thread_id / ( _Tr * _Tc ); - // const size_t tr = ( thread_id % ( _Tr * _Tc ) ) / _Tc; - // const size_t tc = ( thread_id % ( _Tr * _Tc ) ) % _Tc; + const size_t rt = thread_id / ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ); + const size_t tr = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) / config::THREAD_COL_DIM; + const size_t tc = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) % config::THREAD_COL_DIM; + return { tr, tc, rt }; + } - // return { tr % Tr, tc % Tc, rt }; + bool isActiveThread(const ThreadCoords &t ) const { + return t.tr < Tr && t.tc < Tc && t.rt < Rt; + } - const size_t rt = thread_id / ( Tr * Tc ); - const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc; - const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc; - return { tr, tc, rt }; + bool isActiveThread(const size_t thread_id ) const { + const auto th_coords = getThreadCoords( thread_id ); + return isActiveThread( th_coords ); } + }; namespace storage { diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp index 8116f1f9f..b653de004 100644 --- a/include/alp/omp/vector.hpp +++ b/include/alp/omp/vector.hpp @@ -104,6 +104,9 @@ namespace alp { private: + /** The distribution of the vector among threads. */ + Distribution_2_5D distr; + /** The number of buffers. */ size_t num_buffers; @@ -149,7 +152,8 @@ namespace alp { Vector( const Distribution_2_5D &d, const size_t cap = 0 - ) : num_buffers( d.getNumberOfThreads() ), + ) : distr( d ), + num_buffers( d.getNumberOfThreads() ), containers( num_buffers ), initialized( false ) { @@ -165,47 +169,54 @@ namespace alp { throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); } - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + const auto t_coords = d.getThreadCoords( thread ); - const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); - // Assuming that all blocks are of the same size - const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); + if ( d.isActiveThread( t_coords ) ) { + + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); + + // Assuming that all blocks are of the same size + const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); #ifdef DEBUG - #pragma omp critical - { - if( thread != config::OMP::current_thread_ID() ) { - std::cout << "Warning: thread != OMP::current_thread_id()\n"; + #pragma omp critical + { + if( thread != config::OMP::current_thread_ID() ) { + std::cout << "Warning: thread != OMP::current_thread_id()\n"; + } + std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc + << " on OpenMP thread " << config::OMP::current_thread_ID() + << " allocating buffer of " << alloc_size << " elements " + << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; } - std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc - << " on OpenMP thread " << config::OMP::current_thread_ID() - << " allocating buffer of " << alloc_size << " elements " - << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; - } #endif - // TODO: Implement allocation properly - buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; + // TODO: Implement allocation properly + 
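+					// Per-thread allocation: each buffer is requested by the thread
+					// that owns it, so that when that thread later initialises its
+					// blocks, first-touch page placement (where the OS applies it)
+					// can keep the data local to that thread's NUMA node. The
+					// matching release is the per-thread delete [] in ~Vector().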
buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; - if( buffers[ thread ] == nullptr ) { - throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); - } + if( buffers[ thread ] == nullptr ) { + throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); + } - // Reserve space for all internal container wrappers to avoid re-allocation - containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); + // Reserve space for all internal container wrappers to avoid re-allocation + containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); - // Populate the array of internal container wrappers - for( size_t br = 0; br < block_grid_dims.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - const size_t offset = d.getBlocksOffset( t_coords, br, bc ); - containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + // Populate the array of internal container wrappers + for( size_t br = 0; br < block_grid_dims.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + const size_t offset = d.getBlocksOffset( t_coords, br, bc ); + containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + } } - } - // Ensure that the array contains the expected number of containers - assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + // Ensure that the array contains the expected number of containers + assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + + } // End active threads allocation } } @@ -277,9 +288,14 @@ namespace alp { */ ~Vector() { if( buffers != nullptr ) { - for( size_t i = 0; i < num_buffers; ++i ) { - if( buffers[ i ] != nullptr ) { - delete buffers[ i ]; + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + if ( distr.isActiveThread( thread ) ) { + if( buffers[ thread ] != nullptr ) { + delete [] buffers[ thread ]; + } } } delete [] buffers; diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index be05c340b..157b71494 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -193,45 +193,44 @@ void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size } -#define M ( alp::config::BLOCK_ROW_DIM * n ) -#define K ( alp::config::BLOCK_COL_DIM * n ) -#define N ( alp::config::BLOCK_COL_DIM * n ) - -void alp_program( const size_t & n, alp::RC & rc ) { - - typedef double T; +template < typename T, typename SemiringT > +void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { - alp::Semiring< alp::operators::add< T >, alp::operators::mul< T >, alp::identities::zero, alp::identities::one > ring; + const SemiringT ring; + const T one = ring.template getOne< T >(); + const T zero = ring.template getZero< T >(); - T one = ring.getOne< T >(); - T zero = ring.getZero< T >(); + std::vector< T > A_data( m * k ); + std::vector< T > B_data( k * n ); + std::vector< T > C_data( m * n, zero ); - std::vector< T > A_data( M * K ); - std::vector< T > B_data( K * N ); - std::vector< T > C_data( M * N, zero ); + std::vector< T > A_vec( m * k ); + std::vector< T > B_vec( k * n ); + std::vector< T > C_vec( m * n, zero ); - // std::vector< T > A_packed( n * ( n + 1 ) / 2 ); - // std::vector< T > B_packed( n * ( n + 1 ) / 2 ); - // std::vector< T > C_packed( n * ( n + 1 ) / 2, zero ); + std::cout << "\tTesting dense 
General mxm " << m << " " << k << " " << n << std::endl; - std::vector< T > A_vec( M * K ); - std::vector< T > B_vec( K * N ); - std::vector< T > C_vec( M * N, zero ); - - std::cout << "\tTesting dense General mxm " << M << " " << K << " " << N << std::endl; - - stdvec_build_matrix< structures::General >( A_data, M, K, K, zero, one, one ); - stdvec_build_matrix< structures::General >( B_data, K, N, N, zero, one, one ); + stdvec_build_matrix< structures::General >( A_data, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, k, n, n, zero, one, one ); // initialize test - alp::Matrix< T, structures::General > A( M, K ); - alp::Matrix< T, structures::General > B( K, N ); - alp::Matrix< T, structures::General > C( M, N ); + alp::Matrix< T, structures::General > A( m, k ); + alp::Matrix< T, structures::General > B( k, n ); + alp::Matrix< T, structures::General > C( m, n ); // Initialize input matrices - rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); - rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); - rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + rc = rc ? rc : alp::buildMatrix( A, A_data.begin(), A_data.end() ); + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building A" << std::endl; + return; + } + rc = rc ? rc : alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = rc ? rc : alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building matrices" << std::endl; + return; + } #ifndef NDEBUG print_matrix("A", A); @@ -239,94 +238,61 @@ void alp_program( const size_t & n, alp::RC & rc ) { print_matrix("C - PRE", C); #endif - rc = alp::mxm( C, A, B, ring ); + rc = rc ? rc : alp::mxm( C, A, B, ring ); #ifndef NDEBUG print_matrix("C - POST", C); #endif - stdvec_build_matrix< structures::General >( A_vec, M, K, K, zero, one, one ); - stdvec_build_matrix< structures::General >( B_vec, K, N, N, zero, one, one ); - - mxm_stdvec_as_matrix( C_vec, N, A_vec, K, B_vec, N, M, K, N, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - diff_stdvec_matrix( C_vec, M, N, N, C ); - - // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; + if ( rc != alp::SUCCESS ) + return; - // alp::Matrix< T, structures::UpperTriangular > UA( n ); - // alp::Matrix< T, structures::UpperTriangular > UB( n ); - // alp::Matrix< T, structures::UpperTriangular > UC( n ); + stdvec_build_matrix< structures::General >( A_vec, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, k, n, n, zero, one, one ); - // stdvec_build_matrix_packed< structures::UpperTriangular >( A_packed, one, one ); - // stdvec_build_matrix_packed< structures::UpperTriangular >( B_packed, one, one ); + mxm_stdvec_as_matrix( C_vec, n, A_vec, k, B_vec, n, m, k, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - // rc = alp::buildMatrix( UA, A_packed.begin(), A_packed.end() ); - // rc = alp::buildMatrix( UB, B_packed.begin(), B_packed.end() ); - // rc = alp::buildMatrix( UC, C_packed.begin(), C_packed.end() ); + diff_stdvec_matrix( C_vec, m, n, n, C ); - // print_matrix("UA", UA); - // print_matrix("UB", UB); - // print_matrix("UC - PRE", UC); - // rc = alp::mxm( UC, UA, UB, ring ); - // print_matrix("UC - POST", UC); - // stdvec_build_matrix< structures::UpperTriangular >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::UpperTriangular >( B_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::General >( C_vec, n, n, n, 
zero, zero ); + std::cout << "\tDone." << std::endl; - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - // diff_stdvec_matrix( C_vec, n, n, n, UC ); - - // std::cout << "\n\n=========== Testing Symmetric Output ============\n\n"; - - // alp::Matrix< T, structures::Symmetric > SC( n ); - - // stdvec_build_matrix< structures::Symmetric >( A_data, n, n, n, zero, one, one ); - - // rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); - // rc = alp::buildMatrix( SC, C_packed.begin(), C_packed.end() ); - - // print_matrix("A", A ); - // print_matrix("A^T", alp::get_view< alp::view::transpose >( A ) ); - // print_matrix("SC - PRE", SC); - // rc = alp::mxm( SC, A, alp::get_view< alp::view::transpose >( A ), ring ); - // print_matrix("SC - POST", SC); - - // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::Symmetric >( C_vec, n, n, n, zero, zero ); - - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, A_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - // diff_stdvec_matrix( C_vec, n, n, n, SC ); +} - // std::cout << "\n\n=========== Testing Symmetric x Symmetric Output ============\n\n"; +#define M ( alp::config::BLOCK_ROW_DIM * n ) +#define K ( alp::config::BLOCK_COL_DIM * 2 * n ) +#define N ( alp::config::BLOCK_COL_DIM * 3 * n ) - // alp::Matrix< T, structures::Symmetric > SA( n ); - // alp::Matrix< T, structures::Symmetric > SB( n ); +void alp_program( const size_t &n, alp::RC &rc ) { - // stdvec_build_matrix_packed< structures::Symmetric >( A_packed, one, one ); - // stdvec_build_matrix_packed< structures::Symmetric >( B_packed, one, one + one ); + using T = double; - // rc = alp::buildMatrix( SA, A_packed.begin(), A_packed.end() ); - // rc = alp::buildMatrix( SB, B_packed.begin(), B_packed.end() ); - // rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + using SemiringT = alp::Semiring< + alp::operators::add< T >, alp::operators::mul< T >, + alp::identities::zero, alp::identities::one + >; - // print_matrix("SA", SA); - // print_matrix("SB", SB); - // print_matrix("C - PRE", C); - // rc = alp::mxm( C, SA, SB, ring ); - // print_matrix("C - POST", C); + rc = alp::SUCCESS; - // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::Symmetric >( B_vec, n, n, n, zero, one, one + one ); - // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + /** + * Testing cubic mxm. + */ + run_mxm< T, SemiringT >( M, M, M, rc ); - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + /** + * Testing rectangular mxm + */ + run_mxm< T, SemiringT >( M, K, N, rc ); - // diff_stdvec_matrix( C_vec, n, n, n, C ); + /** + * Testing outer-prod of blocks mxm + */ + run_mxm< T, SemiringT >( M, alp::config::BLOCK_COL_DIM, N, rc ); + /** + * Testing dot-prod of blocks mxm + */ + run_mxm< T, SemiringT >( alp::config::BLOCK_ROW_DIM, M, alp::config::BLOCK_COL_DIM, rc ); } From 16f4c97387a28008c186a7ba99528aaafd1bb0bd Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Tue, 13 Dec 2022 15:15:15 +0100 Subject: [PATCH 9/9] Fixed style and typos --- include/alp/omp/blas3.hpp | 35 +++++++--------------- include/alp/omp/io.hpp | 2 +- include/alp/omp/storage.hpp | 2 +- tests/unit/dense_omp_mxm.cpp | 56 +++++++++++++++++++----------------- 4 files changed, 41 insertions(+), 54 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 0bba4444b..4954e00b5 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -29,7 +29,6 @@ #include #include -// #include #include #include "matrix.hpp" @@ -77,12 +76,12 @@ namespace alp { const Operator &oper, const Monoid &monoid, const MulMonoid &mulMonoid, - const typename std::enable_if< !alp::is_object< OutputType >::value && + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && ! alp::is_object< InputType2 >::value && alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, - void >::type * const = NULL + void > * const = NULL ) { static_assert( @@ -100,18 +99,17 @@ namespace alp { #endif // Early exit checks - if( ! internal::getInitialized( A ) || - ! internal::getInitialized( B ) || + if( ! internal::getInitialized( A ) || ! internal::getInitialized( B ) || ! internal::getInitialized( C ) ) { internal::setInitialized( C, false ); return SUCCESS; } - const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; - const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; - const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; @@ -156,18 +154,6 @@ namespace alp { RC local_rc = SUCCESS; -// if( block_grid_dims_c.first != set_block_grid_dims_a.first -// || block_grid_dims_c.second != set_block_grid_dims_b.second -// || set_block_grid_dims_a.second != set_block_grid_dims_b.first -// ) { -// #ifndef NDEBUG -// #pragma omp critical -// std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" -// "\tMismatching local block grid size on set." 
<< std::endl; -// #endif -// local_rc = MISMATCH; -// } - // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { @@ -308,7 +294,6 @@ namespace alp { for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - const auto refB_loc = get_view( B, th_sjk_b, bk, bc ); auto refC_ijk = get_view( C, th_ijk_c, br, bc ); @@ -465,13 +450,13 @@ namespace alp { Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const typename std::enable_if< + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && !alp::is_object< InputType2 >::value && alp::is_semiring< Semiring >::value, void - >::type * const = NULL + > * const = NULL ) { (void)phase; @@ -506,13 +491,13 @@ namespace alp { const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const typename std::enable_if< + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && !alp::is_object< InputType2 >::value && alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, void - >::type * const = NULL + > * const = NULL ) { (void)phase; diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index 2794b1336..9fc66e815 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -152,7 +152,7 @@ namespace alp { ) noexcept { static_assert( - ( std::is_same< InputType, typename fwd_iterator::value_type >::value ), + std::is_same< InputType, typename fwd_iterator::value_type >::value, "alp::buildMatrix (omp): Mismatching type between user-provided input " "container and output ALP container." ); diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 8bfe855fc..6e3e23cf6 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -314,7 +314,7 @@ namespace alp { } bool isActiveThread(const ThreadCoords &t ) const { - return t.tr < Tr && t.tc < Tc && t.rt < Rt; + return ( t.tr < Tr ) && ( t.tc < Tc ) && ( t.rt < Rt ); } bool isActiveThread(const size_t thread_id ) const { diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index 157b71494..3628d5d45 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -28,7 +28,7 @@ using namespace alp; template< typename T > -void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda ) { +void print_stdvec_as_matrix( std::string name, const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda ) { std::cout << "Vec " << name << ":" << std::endl; for( size_t row = 0; row < m; ++row ) { @@ -41,27 +41,29 @@ void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, cons } template< typename T, typename Operator, typename Monoid > -void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, - const std::vector< T > & vA, const size_t lda, - const std::vector< T > & vB, const size_t ldb, - const size_t m, const size_t k, const size_t n, - const Operator oper, - const Monoid monoid ) { +void mxm_stdvec_as_matrix( + std::vector< T > &vC, const size_t ldc, + const std::vector< T > &vA, const size_t lda, + const std::vector< T > &vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid +) { T temp; #ifndef NDEBUG - print_stdvec_as_matrix("vA", vA, n, n, n); - print_stdvec_as_matrix("vB", vB, 
n, n, n); - print_stdvec_as_matrix("vC - PRE", vC, n, n, n); + print_stdvec_as_matrix( "vA", vA, n, n, n ); + print_stdvec_as_matrix( "vB", vB, n, n, n ); + print_stdvec_as_matrix( "vC - PRE", vC, n, n, n ); #endif for( size_t i = 0; i < m; ++i ) { for( size_t j = 0; j < n; ++j ) { - T & c_val { vC[ i * ldc + j ] }; + T &c_val { vC[ i * ldc + j ] }; for( size_t l = 0; l < k; ++l ) { - const T & a_val { vA[ i * lda + l ] }; - const T & b_val { vB[ l * ldb + j ] }; + const T &a_val { vA[ i * lda + l ] }; + const T &b_val { vB[ l * ldb + j ] }; // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; (void)internal::apply( temp, a_val, b_val, oper ); // std::cout << "temp = " << temp << std::endl; @@ -71,12 +73,12 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, } #ifndef NDEBUG - print_stdvec_as_matrix("vC - POST", vC, n, n, n); + print_stdvec_as_matrix( "vC - POST", vC, n, n, n ); #endif } template< typename Structure, typename T > -void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { if( std::is_same< Structure, structures::General >::value ) { std::fill( vA.begin(), vA.end(), one ); @@ -96,7 +98,7 @@ void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, } template< typename Structure, typename T > -void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { T val = one; if( std::is_same< Structure, structures::General >::value ) { @@ -128,23 +130,23 @@ void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, } template< typename Structure, typename T > -void stdvec_build_matrix_packed( std::vector< T > & vA, const T one ) { +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one ) { std::fill( vA.begin(), vA.end(), one ); } template< typename Structure, typename T > -void stdvec_build_matrix_packed( std::vector< T > & vA, const T one, const T inc ) { +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one, const T inc ) { T val = one; if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper - for( auto & elem: vA ) { + for( auto &elem: vA ) { elem = val; val += inc; } } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper - for( auto & elem: vA ) { + for( auto &elem: vA ) { elem = val; val += inc; } @@ -153,8 +155,8 @@ void stdvec_build_matrix_packed( std::vector< T > & vA, const T one, const T inc } template< typename MatType, typename T > -void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, - const MatType & mA, double threshold=1e-7 ) { +void diff_stdvec_matrix( const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold = 1e-7 ) { if( std::is_same< typename MatType::structure, structures::General >::value ) { for( size_t row = 0; row < m; ++row ) { @@ -233,15 +235,15 @@ void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { } #ifndef NDEBUG - print_matrix("A", A); - print_matrix("B", B); - print_matrix("C - PRE", C); + print_matrix( 
"A", A ); + print_matrix( "B", B ); + print_matrix( "C - PRE", C ); #endif rc = rc ? rc : alp::mxm( C, A, B, ring ); #ifndef NDEBUG - print_matrix("C - POST", C); + print_matrix( "C - POST", C ); #endif if ( rc != alp::SUCCESS ) @@ -296,7 +298,7 @@ void alp_program( const size_t &n, alp::RC &rc ) { } -int main( int argc, char ** argv ) { +int main( int argc, char **argv ) { // defaults bool printUsage = false; size_t in = 4;