Algebraic-Programming · Nov 25, 2022 · Nov 25, 2022 · Nov 25, 2022 · Nov 25, 2022 · Nov 25, 2022
diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp
@@ -99,23 +99,22 @@ namespace alp {
 			return SUCCESS;
 		}
 
-		const Distribution &d = internal::getAmf( C ).getDistribution();
+		const Distribution_2_5D &d = internal::getAmf( C ).getDistribution();
 
 		RC rc = SUCCESS;
 
 		#pragma omp parallel for
 		for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) {
-			const size_t tr = d.getThreadCoords( thread ).first;
-			const size_t tc = d.getThreadCoords( thread ).second;
-			const auto block_grid_dims = d.getLocalBlockGridDims( tr, tc );
+			const auto t_coords = d.getThreadCoords( thread );
+			const auto block_grid_dims = d.getLocalBlockGridDims( t_coords );
 
 			RC local_rc = SUCCESS;
 
 			for( size_t br = 0; br < block_grid_dims.first; ++br ) {
 				for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) {
 
 					// Get a sequential matrix view over the block
-					auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc );
+					auto refC = internal::get_view( C, t_coords, br, bc );
 
 					// Construct a sequential Scalar container from the input Scalar
 					Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val );

diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp
@@ -50,21 +50,20 @@ namespace alp {
 		template<
 			enum view::Views target_view = view::original,
 			typename SourceMatrix,
+			typename ThreadCoords,
 			std::enable_if_t<
 				is_matrix< SourceMatrix >::value
 			> * = nullptr
 		>
 		typename internal::new_container_type_from<
 			typename SourceMatrix::template view_type< view::gather >::type
 		>::template change_backend< config::default_sequential_backend >::type
-		get_view( SourceMatrix &source, const size_t tr, const size_t tc, const size_t rt, const size_t br, const size_t bc ) {
-
-			(void) rt;
+		get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) {
 
 			// get the container
 			const auto &distribution = getAmf( source ).getDistribution();
-			const size_t thread_id = tr * distribution.getThreadGridDims().second + tc;
-			const size_t block_id = br * distribution.getLocalBlockGridDims( tr, tc ).second + bc;
+			const size_t thread_id = distribution.getThreadId( t );
+			const size_t block_id = distribution.getLocalBlockId( t, br, bc );
 			auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id );
 
 			// make an AMF

diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp
@@ -70,10 +70,19 @@ namespace alp {
 	 * among threads.
 	 *
 	 */
-	class Distribution {
+	class Distribution_2_5D {
 
 		public:
 
+			/** Position of a thread in the thread grid: row (tr), column (tc) and replication layer (rt). */
+			struct ThreadCoords {
+				const size_t tr;
+				const size_t tc;
+				const size_t rt;
+
+				ThreadCoords( const size_t row, const size_t col, const size_t layer ) : tr( row ), tc( col ), rt( layer ) {}
+			};
+
 			/** Type encapsulating the global element coordinate. */
 			struct GlobalCoord {
 
@@ -87,9 +96,7 @@ namespace alp {
 			/** Type encapsulating the local element coordinate. */
 			struct LocalCoord {
 
-				const size_t tr;
-				const size_t tc;
-				const size_t rt;
+				const ThreadCoords t;
 				const size_t br;
 				const size_t bc;
 				const size_t i;
@@ -101,10 +108,13 @@ namespace alp {
 					const size_t br, const size_t bc,
 					const size_t i, const size_t j
 				) :
-					tr( tr ), tc( tc ),
-					rt( rt ),
+					t( tr, tc, rt ),
 					br( br ), bc( bc ),
 					i( i ), j( j ) {}
+
+				const ThreadCoords &getThreadCoords() const {
+					return t; // the ( tr, tc, rt ) thread coordinates stored in this local coordinate
+				}
 
 			};
 
@@ -150,26 +160,25 @@ namespace alp {
 			const size_t Tr;
 			const size_t Tc;
 			/** Replication factor in thread-coordinate space */
-			const size_t Rt;
+			static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS;
 			/** The row and column dimensions of the global block grid */
 			const size_t Br;
 			const size_t Bc;
 
 		public:
 
-			Distribution(
+			Distribution_2_5D(
 				const size_t m, const size_t n,
 				const size_t num_threads
 			) :
 				m( m ), n( n ),
-				Tr( static_cast< size_t >( sqrt( num_threads ) ) ),
-				Tc( num_threads / Tr ),
-				Rt( config::REPLICATION_FACTOR_THREADS ),
+				Tr( num_threads < Rt ? 1 : static_cast< size_t >( sqrt( num_threads / Rt ) ) ), // clamped to 1 so the divisions by Tr stay defined
+				Tc( num_threads < Rt ? 1 : num_threads / Rt / Tr ), // clamped to 1 as well; the mismatch is reported below
 				Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ),
 				Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) {
 
-				if( num_threads != Tr * Tc ) {
-					std::cerr << "Error\n";
+				if( num_threads != Tr * Tc * Rt ) {
+					std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n";
 				}
 			}
 
@@ -197,25 +206,29 @@ namespace alp {
 
 				return LocalCoord(
 					tr, tc,
-					0, // Rt
+					0, // Rt always maps to the front layer
 					local_br, local_bc,
 					local_i, local_j
 				);
 			}
 
+			/**
+			 * Maps coordinates from local to global space.
+			 *
+			 * \todo Add implementation; for now \a l is ignored and GlobalCoord( 0, 0 ) is returned.
+			 */
 			GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const {
 				(void) l;
 				return GlobalCoord( 0, 0 );
 			}
 
-			/** Returns the dimensions of the thread grid */
-			std::pair< size_t, size_t > getThreadGridDims() const {
-				return { Tr, Tc };
+			/** Linearises thread coordinates into a thread ID: replication layers outermost, then rows, then columns. */
+			size_t getThreadId( const ThreadCoords t ) const {
+				return ( t.rt * Tr + t.tr ) * Tc + t.tc;
 			}
 
-			/** Returns the thread ID corresponding to the given thread coordinates. */
-			size_t getThreadId( const size_t tr, const size_t tc ) const {
-				return tr * Tc + tc;
+			size_t getNumberOfThreads() const {
+				return Tr * Tc * Rt; // size of the full thread grid: rows x columns x replication factor
 			}
 
 			/** Returns the total global amount of blocks */
@@ -224,11 +237,11 @@ namespace alp {
 			}
 
 			/** Returns the dimensions of the block grid associated to the given thread */
-			std::pair< size_t, size_t > getLocalBlockGridDims( const size_t tr, const size_t tc ) const {
-				// The LHS of the + operand covers the case
+			std::pair< size_t, size_t > getLocalBlockGridDims( const ThreadCoords t ) const {
+				// The RHS operand of the + adds one extra block in the case
 				// when the last block of threads is not full
-				const size_t blocks_r = Br / Tr + ( tr < Br % Tr ? 1 : 0 );
-				const size_t blocks_c = Bc / Tc + ( tc < Bc % Tc ? 1 : 0 );
+				const size_t blocks_r = Br / Tr + ( t.tr < Br % Tr ? 1 : 0 );
+				const size_t blocks_c = Bc / Tc + ( t.tc < Bc % Tc ? 1 : 0 );
 				return { blocks_r, blocks_c };
 			}
 
@@ -244,6 +257,14 @@ namespace alp {
 				return global_coords.first * Bc + global_coords.second;
 			}
 
+			/** Returns the row-major index of the block addressed by \a local within its thread's local block grid. */
+			size_t getLocalBlockId( const LocalCoord &local ) const {
+				return getLocalBlockId( local.getThreadCoords(), local.br, local.bc );
+			}
+
+			/** Returns the row-major index of block ( br, bc ) within the local block grid of thread \a t. */
+			size_t getLocalBlockId( const ThreadCoords &t, const size_t br, const size_t bc ) const { return br * getLocalBlockGridDims( t ).second + bc; }
+
 			/**
 			 * Returns the dimensions of the block given by the block id
 			 */
@@ -270,16 +291,17 @@ namespace alp {
 			}
 
 			/** For a given block, returns its offset from the beginning of the buffer in which it is stored */
-			size_t getBlocksOffset( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const {
+			size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const {
 				// The offset is calculated as the sum of sizes of all previous blocks
-				const size_t block_coord_1D = br * getLocalBlockGridDims( tr, tc ).second + bc;
+				const size_t block_coord_1D = getLocalBlockId( t, br, bc ); // same row-major block index the AMF uses
 				return block_coord_1D * getBlockSize();
 			}
 
-			std::pair< size_t, size_t > getThreadCoords( const size_t thread_id ) const {
-				const size_t tr = thread_id / Tc;
-				const size_t tc = thread_id % Tc;
-				return { tr, tc };
+			ThreadCoords getThreadCoords( const size_t thread_id ) const {
+				// Inverts the linearisation of getThreadId: thread_id = ( rt * Tr + tr ) * Tc + tc
+				const size_t layer_size = Tr * Tc;
+				const size_t in_layer = thread_id % layer_size;
+				return ThreadCoords( in_layer / Tc, in_layer % Tc, thread_id / layer_size );
 			}
 	};
 
@@ -334,7 +356,7 @@ namespace alp {
 				 */
 				const size_t num_threads;
 
-				const Distribution distribution;
+				const Distribution_2_5D distribution;
 
 				AMF(
 					ImfR imf_r,
@@ -360,7 +382,7 @@ namespace alp {
 					std::cout << "Entering OMP AMF move constructor\n";
 				}
 
-				const Distribution &getDistribution() const {
+				const Distribution_2_5D &getDistribution() const { // read-only access to the distribution member
 					return distribution;
 				}
 
@@ -400,12 +422,11 @@ namespace alp {
 				storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
 					(void) s;
 					(void) P;
-					const typename Distribution::GlobalCoord global( imf_r.map( i ), imf_c.map( j ) );
-					const typename Distribution::LocalCoord local = distribution.mapGlobalToLocal( global );
-
-					const size_t thread = local.tr * distribution.getThreadGridDims().second + local.tc;
+					const typename Distribution_2_5D::GlobalCoord global( imf_r.map( i ), imf_c.map( j ) );
+					const typename Distribution_2_5D::LocalCoord local = distribution.mapGlobalToLocal( global );
 
-					const size_t local_block = local.br * distribution.getLocalBlockGridDims( local.tr, local.tc ).second + local.bc;
+					const size_t thread = distribution.getThreadId( local.getThreadCoords() );
+					const size_t local_block = distribution.getLocalBlockId( local );
 					const size_t local_element = local.i * config::BLOCK_ROW_DIM + local.j;
 
 					return storage_index_type( thread, local_block, local_element );

diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp
@@ -149,9 +149,9 @@ namespace alp {
 				 *          code sections.
 				 */
 				Vector(
-					const Distribution &d,
+					const Distribution_2_5D &d,
 					const size_t cap = 0
-				) : num_buffers( d.getThreadGridDims().first * d.getThreadGridDims().second ),
+				) : num_buffers( d.getNumberOfThreads() ),
 					containers( num_buffers ),
 					initialized( false ) {
 
@@ -169,9 +169,8 @@ namespace alp {
 
 					#pragma omp parallel for
 					for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) {
-						const size_t tr = d.getThreadCoords( thread ).first;
-						const size_t tc = d.getThreadCoords( thread ).second;
-						const auto block_grid_dims = d.getLocalBlockGridDims( tr, tc );
+						const auto t_coords = d.getThreadCoords( thread );
+						const auto block_grid_dims = d.getLocalBlockGridDims( t_coords );
 
 						// Assuming that all blocks are of the same size
 						const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize();
@@ -182,7 +181,7 @@ namespace alp {
 							if( thread != config::OMP::current_thread_ID() ) {
 								std::cout << "Warning: thread != OMP::current_thread_id()\n";
 							}
-							std::cout << "Thread with global coordinates tr = " << tr << " tc = " << tc
+							std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc
 								<< " on OpenMP thread " << config::OMP::current_thread_ID()
 								<< " allocating buffer of " << alloc_size << " elements "
 								<< " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n";
@@ -202,7 +201,7 @@ namespace alp {
 						// Populate the array of internal container wrappers
 						for( size_t br = 0; br < block_grid_dims.first; ++br ) {
 							for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) {
-								const size_t offset = d.getBlocksOffset( tr, tc, br, bc );
+								const size_t offset = d.getBlocksOffset( t_coords, br, bc );
 								containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() );
 							}
 						}