diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index 3c550f5a9..01fcdebc8 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -99,15 +99,14 @@ namespace alp { return SUCCESS; } - const Distribution &d = internal::getAmf( C ).getDistribution(); + const Distribution_2_5D &d = internal::getAmf( C ).getDistribution(); RC rc = SUCCESS; #pragma omp parallel for for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - const size_t tr = d.getThreadCoords( thread ).first; - const size_t tc = d.getThreadCoords( thread ).second; - const auto block_grid_dims = d.getLocalBlockGridDims( tr, tc ); + const auto t_coords = d.getThreadCoords( thread ); + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); RC local_rc = SUCCESS; @@ -115,7 +114,7 @@ namespace alp { for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { // Get a sequential matrix view over the block - auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + auto refC = internal::get_view( C, t_coords, br, bc ); // Construct a sequential Scalar container from the input Scalar Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index 3ecd193f8..9b57f5d8b 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -50,6 +50,7 @@ namespace alp { template< enum view::Views target_view = view::original, typename SourceMatrix, + typename ThreadCoords, std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr @@ -57,14 +58,12 @@ namespace alp { typename internal::new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type - get_view( SourceMatrix &source, const size_t tr, const size_t tc, const size_t rt, const size_t br, const size_t bc ) { - - (void) rt; + get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { // get the container const auto &distribution = getAmf( source ).getDistribution(); - const size_t thread_id = tr * distribution.getThreadGridDims().second + tc; - const size_t block_id = br * distribution.getLocalBlockGridDims( tr, tc ).second + bc; + const size_t thread_id = distribution.getThreadId( t ); + const size_t block_id = distribution.getLocalBlockId( t, br, bc ); auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id ); // make an AMF diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 7c6e060dd..fb24da0bf 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -70,10 +70,19 @@ namespace alp { * among threads. * */ - class Distribution { + class Distribution_2_5D { public: + /** Type encapsulating thread coordinates within the thread grid. */ + struct ThreadCoords { + const size_t tr; + const size_t tc; + const size_t rt; + + ThreadCoords( const size_t tr, const size_t tc, const size_t rt ) : tr( tr ), tc( tc ), rt( rt ) {} + }; + /** Type encapsulating the global element coordinate. */ struct GlobalCoord { @@ -87,9 +96,7 @@ namespace alp { /** Type encapsulating the local element coordinate. */ struct LocalCoord { - const size_t tr; - const size_t tc; - const size_t rt; + const ThreadCoords t; const size_t br; const size_t bc; const size_t i; @@ -101,10 +108,13 @@ namespace alp { const size_t br, const size_t bc, const size_t i, const size_t j ) : - tr( tr ), tc( tc ), - rt( rt ), + t( tr, tc, rt ), br( br ), bc( bc ), i( i ), j( j ) {} + + const ThreadCoords &getThreadCoords() const { + return t; + } }; @@ -150,26 +160,25 @@ namespace alp { const size_t Tr; const size_t Tc; /** Replication factor in thread-coordinate space */ - const size_t Rt; + static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; /** The row and column dimensions of the global block grid */ const size_t Br; const size_t Bc; public: - Distribution( + Distribution_2_5D( const size_t m, const size_t n, const size_t num_threads ) : m( m ), n( n ), - Tr( static_cast< size_t >( sqrt( num_threads ) ) ), - Tc( num_threads / Tr ), - Rt( config::REPLICATION_FACTOR_THREADS ), + Tr( static_cast< size_t >( sqrt( num_threads/ Rt ) ) ), + Tc( num_threads / Rt / Tr ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { - if( num_threads != Tr * Tc ) { - std::cerr << "Error\n"; + if( num_threads != Tr * Tc * Rt ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n"; } } @@ -197,25 +206,29 @@ namespace alp { return LocalCoord( tr, tc, - 0, // Rt + 0, // Rt always maps to the front layer local_br, local_bc, local_i, local_j ); } + /** + * Maps coordinates from local to global space. + * + * \todo Add implementation + */ GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { (void) l; return GlobalCoord( 0, 0 ); } - /** Returns the dimensions of the thread grid */ - std::pair< size_t, size_t > getThreadGridDims() const { - return { Tr, Tc }; + /** Returns the thread ID corresponding to the given thread coordinates. */ + size_t getThreadId( const ThreadCoords t ) const { + return t.rt * Tr * Tc + t.tr * Tc + t.tc; } - /** Returns the thread ID corresponding to the given thread coordinates. */ - size_t getThreadId( const size_t tr, const size_t tc ) const { - return tr * Tc + tc; + size_t getNumberOfThreads() const { + return Tr * Tc * Rt; } /** Returns the total global amount of blocks */ @@ -224,11 +237,11 @@ namespace alp { } /** Returns the dimensions of the block grid associated to the given thread */ - std::pair< size_t, size_t > getLocalBlockGridDims( const size_t tr, const size_t tc ) const { - // The LHS of the + operand covers the case + std::pair< size_t, size_t > getLocalBlockGridDims( const ThreadCoords t ) const { + // The RHS of the + operand covers the case // when the last block of threads is not full - const size_t blocks_r = Br / Tr + ( tr < Br % Tr ? 1 : 0 ); - const size_t blocks_c = Bc / Tc + ( tc < Bc % Tc ? 1 : 0 ); + const size_t blocks_r = Br / Tr + ( t.tr < Br % Tr ? 1 : 0 ); + const size_t blocks_c = Bc / Tc + ( t.tc < Bc % Tc ? 1 : 0 ); return { blocks_r, blocks_c }; } @@ -244,6 +257,14 @@ namespace alp { return global_coords.first * Bc + global_coords.second; } + size_t getLocalBlockId( const LocalCoord &local ) const { + return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc; + } + + size_t getLocalBlockId( const ThreadCoords &t, const size_t br, const size_t bc ) const { + return br * getLocalBlockGridDims( t ).second + bc; + } + /** * Returns the dimensions of the block given by the block id */ @@ -270,16 +291,17 @@ namespace alp { } /** For a given block, returns its offset from the beginning of the buffer in which it is stored */ - size_t getBlocksOffset( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const { // The offset is calculated as the sum of sizes of all previous blocks - const size_t block_coord_1D = br * getLocalBlockGridDims( tr, tc ).second + bc; + const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc; return block_coord_1D * getBlockSize(); } - std::pair< size_t, size_t > getThreadCoords( const size_t thread_id ) const { - const size_t tr = thread_id / Tc; - const size_t tc = thread_id % Tc; - return { tr, tc }; + ThreadCoords getThreadCoords( const size_t thread_id ) const { + const size_t rt = thread_id / ( Tr * Tc ); + const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc; + const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc; + return { tr, tc, rt }; } }; @@ -334,7 +356,7 @@ namespace alp { */ const size_t num_threads; - const Distribution distribution; + const Distribution_2_5D distribution; AMF( ImfR imf_r, @@ -360,7 +382,7 @@ namespace alp { std::cout << "Entering OMP AMF move constructor\n"; } - const Distribution &getDistribution() const { + const Distribution_2_5D &getDistribution() const { return distribution; } @@ -400,12 +422,11 @@ namespace alp { storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const { (void) s; (void) P; - const typename Distribution::GlobalCoord global( imf_r.map( i ), imf_c.map( j ) ); - const typename Distribution::LocalCoord local = distribution.mapGlobalToLocal( global ); - - const size_t thread = local.tr * distribution.getThreadGridDims().second + local.tc; + const typename Distribution_2_5D::GlobalCoord global( imf_r.map( i ), imf_c.map( j ) ); + const typename Distribution_2_5D::LocalCoord local = distribution.mapGlobalToLocal( global ); - const size_t local_block = local.br * distribution.getLocalBlockGridDims( local.tr, local.tc ).second + local.bc; + const size_t thread = distribution.getThreadId( local.getThreadCoords() ); + const size_t local_block = distribution.getLocalBlockId( local ); const size_t local_element = local.i * config::BLOCK_ROW_DIM + local.j; return storage_index_type( thread, local_block, local_element ); diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp index 096e79c45..8fc712b7a 100644 --- a/include/alp/omp/vector.hpp +++ b/include/alp/omp/vector.hpp @@ -149,9 +149,9 @@ namespace alp { * code sections. */ Vector( - const Distribution &d, + const Distribution_2_5D &d, const size_t cap = 0 - ) : num_buffers( d.getThreadGridDims().first * d.getThreadGridDims().second ), + ) : num_buffers( d.getNumberOfThreads() ), containers( num_buffers ), initialized( false ) { @@ -169,9 +169,8 @@ namespace alp { #pragma omp parallel for for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - const size_t tr = d.getThreadCoords( thread ).first; - const size_t tc = d.getThreadCoords( thread ).second; - const auto block_grid_dims = d.getLocalBlockGridDims( tr, tc ); + const auto t_coords = d.getThreadCoords( thread ); + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); // Assuming that all blocks are of the same size const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); @@ -182,7 +181,7 @@ namespace alp { if( thread != config::OMP::current_thread_ID() ) { std::cout << "Warning: thread != OMP::current_thread_id()\n"; } - std::cout << "Thread with global coordinates tr = " << tr << " tc = " << tc + std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc << " on OpenMP thread " << config::OMP::current_thread_ID() << " allocating buffer of " << alloc_size << " elements " << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; @@ -202,7 +201,7 @@ namespace alp { // Populate the array of internal container wrappers for( size_t br = 0; br < block_grid_dims.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - const size_t offset = d.getBlocksOffset( tr, tc, br, bc ); + const size_t offset = d.getBlocksOffset( t_coords, br, bc ); containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); } }