diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp index e1f6d0baa..976d3ccb7 100644 --- a/include/alp/blas3.hpp +++ b/include/alp/blas3.hpp @@ -26,6 +26,9 @@ #ifdef _ALP_WITH_REFERENCE #include #endif +#ifdef _ALP_WITH_OMP + #include +#endif #endif // end _H_ALP_BLAS3 diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp new file mode 100644 index 000000000..4954e00b5 --- /dev/null +++ b/include/alp/omp/blas3.hpp @@ -0,0 +1,511 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_BLAS3 +#define _H_ALP_OMP_BLAS3 + +#include // for std::min/max +#include // for std::enable_if + +#include +#include + +#include + +#include "matrix.hpp" +#include "storage.hpp" + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include + #include + #include +#endif + +#ifndef NDEBUG +#include "../../../tests/utils/print_alp_containers.hpp" +#endif + + +namespace alp { + + namespace internal { + + /** + * \internal general mxm implementation that all mxm variants using + * structured matrices refer to. 
+ */ + template< + bool allow_void, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + class Operator, class Monoid, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2 + > + RC mxm_generic( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const std::enable_if_t< !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && ! + alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && + alp::is_monoid< Monoid >::value, + void > * const = NULL + ) { + + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ), + "alp::internal::mxm_generic: the operator-monoid version of mxm cannot be " + "used if either of the input matrices is a pattern matrix (of type " + "void)" + ); + +#ifndef NDEBUG + std::cout << "In alp::internal::mxm_generic (omp)\n"; +#endif + + // Early exit checks + if( ! internal::getInitialized( A ) || ! internal::getInitialized( B ) || + ! 
internal::getInitialized( C ) + ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; + const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; + + if( m != m_a || k != k_b || n != n_b ) { + return MISMATCH; + } + + const Distribution_2_5D &da = internal::getAmf( A ).getDistribution(); + const Distribution_2_5D &db = internal::getAmf( B ).getDistribution(); + const Distribution_2_5D &dc = internal::getAmf( C ).getDistribution(); + + const auto tg_a = da.getThreadGridDims(); + const auto tg_b = db.getThreadGridDims(); + const auto tg_c = dc.getThreadGridDims(); + + // 2.5D factor ranges within 3D limits + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt || ( tg_a.tc % tg_a.rt ) != 0 ) { + return MISMATCH; + } + + if( tg_c.tr != tg_a.tr || tg_c.tc != tg_b.tc || tg_a.tc != tg_b.tr ) { + return MISMATCH; + } + + using th_coord_t = typename Distribution_2_5D::ThreadCoords; + + RC rc = SUCCESS; + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "In alp::internal::mxm_generic (omp)\n" + "\tBeginning parallel region" << std::endl; +#endif + + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + const th_coord_t th_ijk_a = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = db.getThreadCoords( thread ); + const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); + + RC local_rc = SUCCESS; + + // Broadcast A and B to all c-dimensional layers + if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic 
(omp)\n" + "\tCopying A." << std::endl; +#endif + + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { + + auto refAij0 = get_view( A, th_ij0_a, br, bc ); + auto refAijk = get_view( A, th_ijk_a, br, bc ); + + local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); + + } + } + + } // End Broadcast of A + + if( local_rc == SUCCESS && db.isActiveThread( th_ijk_b ) && th_ijk_b.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying B." << std::endl; +#endif + + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { + + auto refBij0 = get_view( B, th_ij0_b, br, bc ); + auto refBijk = get_view( B, th_ijk_b, br, bc ); + + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); + } + } + } // End Broadcast of B + + if( local_rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tZeroing C." << std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + auto refCijk = get_view( C, th_ijk_c, br, bc ); + local_rc = local_rc ? 
local_rc : set( refCijk, zero ); + } + } + } // End Zero-ing of C + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues replicating input matrices." << std::endl; +#endif + rc = local_rc; + } + + // End Broadcast of A and B and zero-ing of C + #pragma omp barrier + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tPassing barrier" << std::endl; +#endif + + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) ) { + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + // Initialize circular shifts at stride of Rt + size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + + // Per-c-dimensional-layer partial computation + for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { + + const th_coord_t th_isk_a( th_ijk_a.tr, c_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( r_b, th_ijk_b.tc, th_ijk_b.rt ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCompute iteration r=" << r << "\n" + "\tStarting from A( " << th_ijk_a.tr << ", " << c_a << ", " << th_ijk_a.rt << " )\n" + "\tStarting from B( " << r_b << ", " << th_ijk_b.tc << ", " << th_ijk_b.rt << " )" <( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + print_matrix("Cref post", refC_ijk ); + } + } +#endif + + } + } + } + + // Circular shift rightwards for A + c_a = utils::modulus( c_a + 1, tg_a.tc ); + // Circular shift downwards for B + r_b = utils::modulus( r_b + 1, tg_b.tr ); + + } + + } + + } // End computation per layer of 
c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with local mxm computations." << std::endl; +#endif + rc = local_rc; + } + + // End layer-by-layer partial computation + #pragma omp barrier + + // Final c-dimension reduction + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt == 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." << std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + for( size_t r = 1; r < tg_c.rt; ++r ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; +#endif + + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + + // Final result in C at layer 0 + local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); + } + } + + } + + } // End of final reduction along c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with final reduction." << std::endl; +#endif + rc = local_rc; + } + + } + + return rc; + + } + + } // namespace internal + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with semiring parameter. + * + * @tparam descr The descriptors under which to perform the computation. 
+ * @tparam OutputStructMatT The structured matrix type of the output matrix. + * @tparam InputStructMatT1 The structured matrix type of the left-hand side input + * matrix. + * @tparam InputStructMatT2 The structured matrix type of the right-hand side input + * matrix. + * @tparam Semiring The semiring under which to perform the + * multiplication. + * + * @returns SUCCESS If the computation completed as intended. + * @returns MISMATCH Whenever the structures or dimensions of \a A, \a B, and \a C do + * not match. All input data containers are left + * untouched if this exit code is returned; it will be + * as though this call was never made. + * + * @param[out] C The output matrix \f$ C = AB \f$ when the function returns + * #SUCCESS. + * @param[in] A The left-hand side input matrix \f$ A \f$. + * @param[in] B The right-hand side input matrix \f$ B \f$. + * @param[in] ring (Optional.) The semiring under which the computation should + * proceed. + * @param phase The execution phase. 
+ */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Semiring + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Semiring & ring = Semiring(), + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); + + } + + /** + * Dense Matrix-Matrix multiply between structured matrices. 
+ * Version with additive monoid and multiplicative operator + */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Operator, class Monoid + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator & mulOp, + const Monoid & addM, + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + + } + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_BLAS3'' + diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 3bdb700ed..79200558f 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,8 +34,12 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 16; - constexpr size_t BLOCK_COL_DIM = 16; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; + + // Temporary squared solution to accomodate for 2.5D mxm + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; constexpr size_t REPLICATION_FACTOR_THREADS = 1; diff --git a/include/alp/omp/io.hpp 
b/include/alp/omp/io.hpp index 0f5793ac6..9fc66e815 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_IO #define _H_ALP_OMP_IO +#include + #include #include "matrix.hpp" @@ -136,6 +138,49 @@ namespace alp { return rc; } + /** + * Temporarily assuming 1-1, row-major mapping with user container. + */ + template< + typename InputType, typename Structure, typename View, + typename ImfR, typename ImfC, typename fwd_iterator + > + RC buildMatrix( + Matrix< InputType, Structure, Density::Dense, View, ImfR, ImfC, omp > &A, + const fwd_iterator &start, + const fwd_iterator &end + ) noexcept { + + static_assert( + std::is_same< InputType, typename fwd_iterator::value_type >::value, + "alp::buildMatrix (omp): Mismatching type between user-provided input " + "container and output ALP container." + ); + + const size_t m = nrows( A ); + const size_t n = ncols( A ); + + if( ( end - start ) != static_cast< std::ptrdiff_t >( m * n ) ) { + + std::cerr << "alp::buildMatrix (omp): Mismatching sizes between " + "user-provided input container and output ALP container." 
<< std::endl; + + return MISMATCH; + } + + internal::setInitialized(A, true); + + fwd_iterator it = start; + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + internal::access( A, internal::getStorageIndex( A, i, j ) ) = *( it++ ); + } + } + + return SUCCESS; + } + } // end namespace ``alp'' #undef NO_CAST_ASSERT diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index afb16d412..94e600979 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_MATRIX #define _H_ALP_OMP_MATRIX +#include + #include #include @@ -37,6 +39,7 @@ #include // For AMFFactory #endif + namespace alp { // Currently no backend specific implementation @@ -56,10 +59,10 @@ namespace alp { typename SourceMatrix, typename ThreadCoords, std::enable_if_t< - is_matrix< SourceMatrix >::value + alp::is_matrix< SourceMatrix >::value > * = nullptr > - typename internal::new_container_type_from< + typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { @@ -68,7 +71,7 @@ namespace alp { const auto &distribution = getAmf( source ).getDistribution(); const size_t thread_id = distribution.getThreadId( t ); const size_t block_id = distribution.getLocalBlockId( t, br, bc ); - auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id ); + auto &container = getLocalContainer( getContainer( source ), thread_id, block_id ); // make an AMF // note: When making a view over a vector, the second imf must be imf::Zero @@ -92,11 +95,13 @@ namespace alp { ); // create a sequential container with the container and AMF above - using target_t = typename internal::new_container_type_from< + using target_t = typename new_container_type_from< typename SourceMatrix::template view_type< 
view::gather >::type >::template change_backend< config::default_sequential_backend >::type; - return target_t( container, amf ); + target_t blk_matrix( container, amf ); + internal::setInitialized( blk_matrix, internal::getInitialized( source ) ); + return blk_matrix; } } // namespace internal diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index fb24da0bf..6e3e23cf6 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -131,39 +131,39 @@ namespace alp { }; - /** Type encapsulating the local block coordinate. */ - struct LocalBlockCoord { - - const size_t tr; - const size_t tc; - const size_t rt; - const size_t br; - const size_t bc; - - LocalBlockCoord( - const size_t tr, const size_t tc, - const size_t rt, - const size_t br, const size_t bc - ) : - tr( tr ), tc( tc ), - rt( rt ), - br( br ), bc( bc ) {} - - }; + // /** Type encapsulating the local block coordinate. */ + // struct LocalBlockCoord { + + // const size_t tr; + // const size_t tc; + // const size_t rt; + // const size_t br; + // const size_t bc; + + // LocalBlockCoord( + // const size_t tr, const size_t tc, + // const size_t rt, + // const size_t br, const size_t bc + // ) : + // tr( tr ), tc( tc ), + // rt( rt ), + // br( br ), bc( bc ) {} + + // }; private: /** Row and column dimensions of the associated container */ const size_t m; const size_t n; - /** The row and column dimensions of the thread grid */ - const size_t Tr; - const size_t Tc; /** Replication factor in thread-coordinate space */ static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; /** The row and column dimensions of the global block grid */ const size_t Br; const size_t Bc; + /** The row and column dimensions of the thread grid */ + const size_t Tr; + const size_t Tc; public: @@ -172,26 +172,27 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( static_cast< size_t >( sqrt( num_threads/ Rt ) ) ), - Tc( num_threads / Rt / Tr ), Br( static_cast< size_t >( 
std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), - Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { - - if( num_threads != Tr * Tc * Rt ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n"; + Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), + Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) + { + if( num_threads != config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 2.5D grid:\n" + "\t" << num_threads << " != " << config::THREAD_ROW_DIM << " x " << config::THREAD_COL_DIM << " x " << config::REPLICATION_FACTOR_THREADS << std::endl; } } - LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { - (void) g; - return LocalBlockCoord( 0, 0, 0, 0, 0 ); - } + // LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { + // (void) g; + // return LocalBlockCoord( 0, 0, 0, 0, 0 ); + // } - GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { - const size_t block_id_r = l.br * Tr + l.tr; - const size_t block_id_c = l.bc * Tc + l.tc; - return GlobalBlockCoord( block_id_r, block_id_c ); - } + // GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { + // const size_t block_id_r = l.br * Tr + l.tr; + // const size_t block_id_c = l.bc * Tc + l.tc; + // return GlobalBlockCoord( block_id_r, block_id_c );// Temporary + // } LocalCoord mapGlobalToLocal( const GlobalCoord &g ) const { const size_t global_br = g.i / config::BLOCK_ROW_DIM; @@ -212,23 +213,31 @@ namespace alp { ); } - /** - * Maps coordinates from local to global space. - * - * \todo Add implementation + // /** + // * Maps coordinates from local to global space. 
+ // * + // * \todo Add implementation + // */ + // GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { + // (void) l; + // return GlobalCoord( 0, 0 ); + // } + + /** + * Returns the thread ID corresponding to the given thread coordinates. + * The fixed thread grid enables to map left-over threads. */ - GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { - (void) l; - return GlobalCoord( 0, 0 ); - } - - /** Returns the thread ID corresponding to the given thread coordinates. */ size_t getThreadId( const ThreadCoords t ) const { - return t.rt * Tr * Tc + t.tr * Tc + t.tc; + return t.rt * config::THREAD_ROW_DIM * config::THREAD_COL_DIM + t.tr * config::THREAD_COL_DIM + t.tc; } size_t getNumberOfThreads() const { - return Tr * Tc * Rt; + return config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS; + } + + /** Returns the thread grid size */ + ThreadCoords getThreadGridDims() const { + return { Tr, Tc, Rt }; } /** Returns the total global amount of blocks */ @@ -245,17 +254,17 @@ namespace alp { return { blocks_r, blocks_c }; } - /** Returns the global block coordinates based on the thread and local block coordinates */ - std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const size_t global_br = br * Tr + tr % Tr; - const size_t global_bc = bc * Tc + tc % Tc; - return { global_br, global_bc }; - } + // /** Returns the global block coordinates based on the thread and local block coordinates */ + // std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const size_t global_br = br * Tr + tr % Tr; + // const size_t global_bc = bc * Tc + tc % Tc; + // return { global_br, global_bc }; + // } - size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); - return 
global_coords.first * Bc + global_coords.second; - } + // size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); + // return global_coords.first * Bc + global_coords.second; + // } size_t getLocalBlockId( const LocalCoord &local ) const { return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc; @@ -291,18 +300,28 @@ namespace alp { } /** For a given block, returns its offset from the beginning of the buffer in which it is stored */ - size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const { + size_t getBlocksOffset( const ThreadCoords &t, const size_t br, const size_t bc ) const { // The offset is calculated as the sum of sizes of all previous blocks const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc; return block_coord_1D * getBlockSize(); } ThreadCoords getThreadCoords( const size_t thread_id ) const { - const size_t rt = thread_id / ( Tr * Tc ); - const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc; - const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc; + const size_t rt = thread_id / ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ); + const size_t tr = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) / config::THREAD_COL_DIM; + const size_t tc = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) % config::THREAD_COL_DIM; return { tr, tc, rt }; } + + bool isActiveThread(const ThreadCoords &t ) const { + return ( t.tr < Tr ) && ( t.tc < Tc ) && ( t.rt < Rt ); + } + + bool isActiveThread(const size_t thread_id ) const { + const auto th_coords = getThreadCoords( thread_id ); + return isActiveThread( th_coords ); + } + }; namespace storage { diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp index 8116f1f9f..b653de004 100644 --- a/include/alp/omp/vector.hpp +++ b/include/alp/omp/vector.hpp @@ -104,6 +104,9 @@ 
namespace alp { private: + /** The distribution of the vector among threads. */ + Distribution_2_5D distr; + /** The number of buffers. */ size_t num_buffers; @@ -149,7 +152,8 @@ namespace alp { Vector( const Distribution_2_5D &d, const size_t cap = 0 - ) : num_buffers( d.getNumberOfThreads() ), + ) : distr( d ), + num_buffers( d.getNumberOfThreads() ), containers( num_buffers ), initialized( false ) { @@ -165,47 +169,54 @@ namespace alp { throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); } - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + const auto t_coords = d.getThreadCoords( thread ); - const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); - // Assuming that all blocks are of the same size - const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); + if ( d.isActiveThread( t_coords ) ) { + + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); + + // Assuming that all blocks are of the same size + const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); #ifdef DEBUG - #pragma omp critical - { - if( thread != config::OMP::current_thread_ID() ) { - std::cout << "Warning: thread != OMP::current_thread_id()\n"; + #pragma omp critical + { + if( thread != config::OMP::current_thread_ID() ) { + std::cout << "Warning: thread != OMP::current_thread_id()\n"; + } + std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc + << " on OpenMP thread " << config::OMP::current_thread_ID() + << " allocating buffer of " << alloc_size << " elements " + << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; } - std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc - << " on OpenMP thread " << 
config::OMP::current_thread_ID() - << " allocating buffer of " << alloc_size << " elements " - << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; - } #endif - // TODO: Implement allocation properly - buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; + // TODO: Implement allocation properly + buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; - if( buffers[ thread ] == nullptr ) { - throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); - } + if( buffers[ thread ] == nullptr ) { + throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); + } - // Reserve space for all internal container wrappers to avoid re-allocation - containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); + // Reserve space for all internal container wrappers to avoid re-allocation + containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); - // Populate the array of internal container wrappers - for( size_t br = 0; br < block_grid_dims.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - const size_t offset = d.getBlocksOffset( t_coords, br, bc ); - containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + // Populate the array of internal container wrappers + for( size_t br = 0; br < block_grid_dims.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + const size_t offset = d.getBlocksOffset( t_coords, br, bc ); + containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + } } - } - // Ensure that the array contains the expected number of containers - assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + // Ensure that the array contains the expected number of containers + assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + + 
} // End active threads allocation } } @@ -277,9 +288,14 @@ namespace alp { */ ~Vector() { if( buffers != nullptr ) { - for( size_t i = 0; i < num_buffers; ++i ) { - if( buffers[ i ] != nullptr ) { - delete buffers[ i ]; + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + if ( distr.isActiveThread( thread ) ) { + if( buffers[ thread ] != nullptr ) { + delete [] buffers[ thread ]; + } } } delete [] buffers; diff --git a/include/alp/reference/blas3.hpp b/include/alp/reference/blas3.hpp index b0ad5ef3d..8a76ee6e3 100644 --- a/include/alp/reference/blas3.hpp +++ b/include/alp/reference/blas3.hpp @@ -490,46 +490,71 @@ namespace alp { * @param phase The execution phase. */ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); } /** * Dense Matrix-Matrix multiply between structured matrices. * Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL ) { (void)phase; diff --git a/include/alp/reference/io.hpp b/include/alp/reference/io.hpp index c6de63f5d..da1e287c5 100644 --- a/include/alp/reference/io.hpp +++ b/include/alp/reference/io.hpp @@ -878,9 +878,6 @@ namespace alp { const fwd_iterator &start, const fwd_iterator &end ) noexcept { - (void)A; - (void)start; - (void)end; // Temporarily assuming 1-1 mapping with user container internal::setInitialized(A, true); diff --git a/include/alp/scalar.hpp b/include/alp/scalar.hpp index 95c73cfbc..33210aba2 100644 --- a/include/alp/scalar.hpp +++ b/include/alp/scalar.hpp @@ -24,8 +24,6 @@ #include "base/scalar.hpp" -#include - // now include all specialisations contained in the backend directories: #ifdef _ALP_WITH_REFERENCE #include diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 330c69b3b..d053f998d 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -26,6 +26,7 @@ #include #include //fabs +#include // modulus #include //numeric_limits #include @@ -295,7 +296,21 @@ namespace alp { } }; - + + + /** + * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder + * of division for type \a T within range [0, n). + * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. + */ + template < typename T > + T modulus( const T x, const T n ) { + + const T rem = x % n; + + return ( rem < static_cast< T >( 0 ) ) ? 
rem + n : rem; + }; + } // namespace utils } // namespace alp diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp index f7f8b5d42..f57351285 100644 --- a/src/alp/omp/init.cpp +++ b/src/alp/omp/init.cpp @@ -36,7 +36,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data RC rc = alp::SUCCESS; // print output const auto T = config::OMP::threads(); - std::cerr << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; + std::cout << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; // sanity checks if( P > 1 ) { @@ -54,7 +54,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data template<> alp::RC alp::finalize< alp::omp >() { - std::cerr << "Info: alp::finalize (omp) called.\n"; + std::cout << "Info: alp::finalize (omp) called.\n"; return alp::SUCCESS; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b54e0a43b..f16504294 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -269,6 +269,10 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp BACKENDS alp_omp ) +add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp + BACKENDS alp_omp +) + add_grb_executables( dense_outer dense_outer.cpp BACKENDS alp_reference ) diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp new file mode 100644 index 000000000..3628d5d45 --- /dev/null +++ b/tests/unit/dense_omp_mxm.cpp @@ -0,0 +1,348 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#ifdef NDEBUG + #include "../utils/print_alp_containers.hpp" +#endif + +using namespace alp; + + +template< typename T > +void print_stdvec_as_matrix( std::string name, const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda ) { + + std::cout << "Vec " << name << ":" << std::endl; + for( size_t row = 0; row < m; ++row ) { + std::cout << "[\t"; + for( size_t col = 0; col < n; ++col ) { + std::cout << vA[ row * lda + col ] << "\t"; + } + std::cout << "]" << std::endl; + } +} + +template< typename T, typename Operator, typename Monoid > +void mxm_stdvec_as_matrix( + std::vector< T > &vC, const size_t ldc, + const std::vector< T > &vA, const size_t lda, + const std::vector< T > &vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid +) { + + T temp; + +#ifndef NDEBUG + print_stdvec_as_matrix( "vA", vA, n, n, n ); + print_stdvec_as_matrix( "vB", vB, n, n, n ); + print_stdvec_as_matrix( "vC - PRE", vC, n, n, n ); +#endif + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + T &c_val { vC[ i * ldc + j ] }; + for( size_t l = 0; l < k; ++l ) { + const T &a_val { vA[ i * lda + l ] }; + const T &b_val { vB[ l * ldb + j ] }; + // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; + (void)internal::apply( temp, a_val, b_val, oper ); + // std::cout << "temp = " << temp << std::endl; + (void)internal::foldl( c_val, temp, monoid.getOperator() ); + } + } + } + +#ifndef NDEBUG + 
print_stdvec_as_matrix( "vC - POST", vC, n, n, n ); +#endif +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { + + if( std::is_same< Structure, structures::General >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = one; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = vA[ col * lda + row ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one ) { + + std::fill( vA.begin(), vA.end(), one ); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one, const T inc ) 
{ + + T val = one; + if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper + for( auto &elem: vA ) { + elem = val; + val += inc; + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper + for( auto &elem: vA ) { + elem = val; + val += inc; + } + } + +} + +template< typename MatType, typename T > +void diff_stdvec_matrix( const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold = 1e-7 ) { + + if( std::is_same< typename MatType::structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " 
v " << vm << std::endl; + } + } + } + } + +} + +template < typename T, typename SemiringT > +void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { + + const SemiringT ring; + const T one = ring.template getOne< T >(); + const T zero = ring.template getZero< T >(); + + std::vector< T > A_data( m * k ); + std::vector< T > B_data( k * n ); + std::vector< T > C_data( m * n, zero ); + + std::vector< T > A_vec( m * k ); + std::vector< T > B_vec( k * n ); + std::vector< T > C_vec( m * n, zero ); + + std::cout << "\tTesting dense General mxm " << m << " " << k << " " << n << std::endl; + + stdvec_build_matrix< structures::General >( A_data, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, k, n, n, zero, one, one ); + + // initialize test + alp::Matrix< T, structures::General > A( m, k ); + alp::Matrix< T, structures::General > B( k, n ); + alp::Matrix< T, structures::General > C( m, n ); + + // Initialize input matrices + rc = rc ? rc : alp::buildMatrix( A, A_data.begin(), A_data.end() ); + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building A" << std::endl; + return; + } + rc = rc ? rc : alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = rc ? rc : alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building matrices" << std::endl; + return; + } + +#ifndef NDEBUG + print_matrix( "A", A ); + print_matrix( "B", B ); + print_matrix( "C - PRE", C ); +#endif + + rc = rc ? 
rc : alp::mxm( C, A, B, ring ); + +#ifndef NDEBUG + print_matrix( "C - POST", C ); +#endif + + if ( rc != alp::SUCCESS ) + return; + + stdvec_build_matrix< structures::General >( A_vec, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, k, n, n, zero, one, one ); + + mxm_stdvec_as_matrix( C_vec, n, A_vec, k, B_vec, n, m, k, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + diff_stdvec_matrix( C_vec, m, n, n, C ); + + + std::cout << "\tDone." << std::endl; + +} + +#define M ( alp::config::BLOCK_ROW_DIM * n ) +#define K ( alp::config::BLOCK_COL_DIM * 2 * n ) +#define N ( alp::config::BLOCK_COL_DIM * 3 * n ) + +void alp_program( const size_t &n, alp::RC &rc ) { + + using T = double; + + using SemiringT = alp::Semiring< + alp::operators::add< T >, alp::operators::mul< T >, + alp::identities::zero, alp::identities::one + >; + + rc = alp::SUCCESS; + + /** + * Testing cubic mxm. + */ + run_mxm< T, SemiringT >( M, M, M, rc ); + + /** + * Testing rectangular mxm + */ + run_mxm< T, SemiringT >( M, K, N, rc ); + + /** + * Testing outer-prod of blocks mxm + */ + run_mxm< T, SemiringT >( M, alp::config::BLOCK_COL_DIM, N, rc ); + + /** + * Testing dot-prod of blocks mxm + */ + run_mxm< T, SemiringT >( alp::config::BLOCK_ROW_DIM, M, alp::config::BLOCK_COL_DIM, rc ); + +} + +int main( int argc, char **argv ) { + // defaults + bool printUsage = false; + size_t in = 4; + + // error checking + if( argc > 2 ) { + printUsage = true; + } + if( argc == 2 ) { + size_t read; + std::istringstream ss( argv[ 1 ] ); + if( ! ( ss >> read ) ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( ! 
ss.eof() ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( read % 2 != 0 ) { + std::cerr << "Given value for n is odd\n"; + printUsage = true; + } else { + // all OK + in = read; + } + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << " [n]\n"; + std::cerr << " -n (optional, default is 100): an even integer, the " + "test size.\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << " " << in << "\n"; + alp::Launcher< AUTOMATIC > launcher; + alp::RC out; + if( launcher.exec( &alp_program, in, out, true ) != SUCCESS ) { + std::cerr << "Launching test FAILED\n"; + return 255; + } + if( out != SUCCESS ) { + std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl; + } else { + std::cout << "Test OK" << std::endl; + } + return 0; +} +