From ff514cb0f8ac3b2a39c37f1143c03d058c1b97f1 Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 1/9] Started drafting mxm using 2.5D algo --- include/alp/omp/blas3.hpp | 268 ++++++++++++++++++++++++++++++++++++++ include/alp/omp/io.hpp | 2 + 2 files changed, 270 insertions(+) create mode 100644 include/alp/omp/blas3.hpp diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp new file mode 100644 index 000000000..cdd24324b --- /dev/null +++ b/include/alp/omp/blas3.hpp @@ -0,0 +1,268 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_BLAS3 +#define _H_ALP_OMP_BLAS3 + +#include // for std::min/max +#include // for std::enable_if + +#include +#include +#include + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include + #include +#endif + +#include "matrix.hpp" +#include "storage.hpp" + + +namespace alp { + + namespace internal { + + /** + * \internal general mxm implementation that all mxm variants using + * structured matrices refer to. + */ + template< + bool allow_void, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + class Operator, class Monoid, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2 + > + RC mxm_generic( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const typename std::enable_if< !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && ! + alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && + alp::is_monoid< Monoid >::value, + void >::type * const = NULL + ) { + + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ), + "alp::internal::mxm_generic: the operator-monoid version of mxm cannot be " + "used if either of the input matrices is a pattern matrix (of type " + "void)" + ); + +#ifdef _DEBUG + std::cout << "In alp::internal::mxm_generic (omp)\n"; +#endif + + // Early exit checks + if( ! internal::getInitialized( A ) || + ! internal::getInitialized( B ) || + ! 
internal::getInitialized( C ) + ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; + const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; + + if( m != m_a || k != k_b || n != n_b ) { + return MISMATCH; + } + + const Distribution &da = internal::getAmf( A ).getDistribution(); + const Distribution &db = internal::getAmf( B ).getDistribution(); + const Distribution &dc = internal::getAmf( C ).getDistribution(); + + RC rc = SUCCESS; + + #pragma omp parallel for + for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + + const auto th_ijk_a = da.getThreadCoords( thread ); + const auto th_ijk_b = da.getThreadCoords( thread ); + const auto th_ijk_c = da.getThreadCoords( thread ); + + const auto block_grid_dims_a = d.getLocalBlockGridDims( t_coords_a ); + const auto block_grid_dims_b = d.getLocalBlockGridDims( t_coords_b ); + const auto block_grid_dims_c = d.getLocalBlockGridDims( t_coords_c ); + + RC local_rc = SUCCESS; + + // Broadcast Aij and Bij to all c layers + if( t_coords.rt > 0 ) { + + typename Distribution::ThreadCoord th_ij0_a( t_coords_a.tr, t_coords_a.tc, 0 ); + + for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { + + auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); + auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); + + local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); + + + } + } + + if( local_rc != SUCCESS ) { + + typename Distribution::ThreadCoord th_ij0_b( t_coords_b.tr, t_coords_b.tc, 0 ); + + for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { + + auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); + auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); + } + } + } + + } + // End Broadcast of Aij and Bij + #pragma omp barrier + + + // for( size_t br = 0; br < block_grid_dims.first; ++br ) { + // for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + + // // Get a sequential matrix view over the block + // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + + // // Construct a sequential Scalar container from the input Scalar + // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + + // // Delegate the call to the sequential set implementation + // local_rc = local_rc ? local_rc : set( refC, ref_val ); + + // if( local_rc != SUCCESS ) { + // rc = local_rc; + // } + // } + // } + } + + internal::setInitialized( C, true ); + return rc; + + } + + } // namespace internal + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with semiring parameter. + * + * @tparam descr The descriptors under which to perform the computation. + * @tparam OutputStructMatT The structured matrix type of the output matrix. + * @tparam InputStructMatT1 The structured matrix type of the the left-hand side input + * matrix. + * @tparam InputStructMatT2 The structured matrix type of the right-hand side input + * matrix. 
+ * @tparam Semiring The semiring under which to perform the + * multiplication. + * + * @returns SUCCESS If the computation completed as intended. + * @returns MISMATCH Whenever the structures or dimensions of \a A, \a B, and \a C do + * not match. All input data containers are left + * untouched if this exit code is returned; it will be + * as though this call was never made. + * + * @param[out] C The output matrix \f$ C = AB \f$ when the function returns + * #SUCCESS. + * @param[in] A The left-hand side input matrix \f$ A \f$. + * @param[in] B The left-hand side input matrix \f$ B \f$. + * @param[in] ring (Optional.) The semiring under which the computation should + * proceed. + * @param phase The execution phase. + */ + template< + typename OutputStructMatT, + typename InputStructMatT1, + typename InputStructMatT2, + class Semiring + > + RC mxm( OutputStructMatT & C, + const InputStructMatT1 & A, + const InputStructMatT2 & B, + const Semiring & ring = Semiring(), + const PHASE &phase = NUMERICAL, + const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value, + void >::type * const = NULL ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + } + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with additive monoid and multiplicative operator + */ + template< typename OutputStructMatT, + typename InputStructMatT1, + typename InputStructMatT2, + class Operator, class Monoid + > + RC mxm( OutputStructMatT & C, + const InputStructMatT1 & A, + const InputStructMatT2 & B, + const Operator & mulOp, + const Monoid & addM, + const PHASE &phase = NUMERICAL, + const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void >::type * const = NULL ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + } + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_BLAS3'' + diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index 0f5793ac6..bc5b6aa97 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -34,6 +34,8 @@ #include #endif +#include "storage.hpp" + #define NO_CAST_ASSERT( x, y, z ) \ static_assert( x, \ "\n\n" \ From f9e7773d19152c90547d98c9b3492ed15d071905 Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 2/9] WIP implementation of shifting + compute --- include/alp/omp/blas3.hpp | 75 +++++++++++++++++++++++-------------- include/alp/omp/config.hpp | 6 ++- include/alp/omp/storage.hpp | 9 ++++- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index cdd24324b..833f19a70 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -112,29 +112,35 @@ namespace alp { return MISMATCH; } - const Distribution &da = internal::getAmf( A ).getDistribution(); - const Distribution &db = internal::getAmf( B ).getDistribution(); - const Distribution &dc = internal::getAmf( C ).getDistribution(); + const Distribution_2_5D &da = internal::getAmf( A ).getDistribution(); + const Distribution_2_5D &db = internal::getAmf( B ).getDistribution(); + const Distribution_2_5D &dc = internal::getAmf( C ).getDistribution(); + + const auto tg_a = da.getThreadGridDims(); + const auto tg_b = db.getThreadGridDims(); + const auto tg_c = dc.getThreadGridDims(); + + using th_coord_t = typename Distribution_2_5D::ThreadCoords; RC rc = SUCCESS; #pragma omp parallel for for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - const auto th_ijk_a = da.getThreadCoords( thread ); - const auto th_ijk_b = da.getThreadCoords( thread ); - const auto th_ijk_c = da.getThreadCoords( thread ); + const th_coord_t th_ijk_a = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = da.getThreadCoords( thread ); + const th_coord_t th_ijk_c = da.getThreadCoords( thread ); - const auto block_grid_dims_a = d.getLocalBlockGridDims( t_coords_a ); - const auto block_grid_dims_b = d.getLocalBlockGridDims( t_coords_b ); - const auto block_grid_dims_c = d.getLocalBlockGridDims( t_coords_c ); + const auto block_grid_dims_a = d.getLocalBlockGridDims( th_ijk_a ); + const auto block_grid_dims_b = d.getLocalBlockGridDims( th_ijk_b ); + const auto block_grid_dims_c = d.getLocalBlockGridDims( th_ijk_c ); RC local_rc = SUCCESS; // Broadcast Aij and Bij to all c layers - if( t_coords.rt > 0 ) { + if( th_ijk_a.rt > 0 ) { - typename Distribution::ThreadCoord th_ij0_a( t_coords_a.tr, t_coords_a.tc, 0 ); + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { @@ -150,7 +156,7 @@ namespace alp { if( local_rc != SUCCESS ) { - typename Distribution::ThreadCoord th_ij0_b( t_coords_b.tr, t_coords_b.tc, 0 ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { @@ -166,25 +172,38 @@ namespace alp { } // End Broadcast of Aij and Bij #pragma omp barrier - - - // for( size_t br = 0; br < block_grid_dims.first; ++br ) { - // for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - - // // Get a sequential matrix view over the block - // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); - - // // Construct a sequential Scalar container from the input Scalar - // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); - - // // Delegate the call to the sequential set implementation - // local_rc = local_rc ? local_rc : set( refC, ref_val ); - - // if( local_rc != SUCCESS ) { - // rc = local_rc; + + auto mod = [](const size_t _k, const size_t _n) = { + return ( ( _k %= _n ) < 0 ) ? 
_k + _n : _k; + }; + + // TODO from here + // size_t s_a = mod( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); + // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + + // size_t s_b = mod( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); + // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + + // for( size_t bk = 0; bk < block_grid_dims_a.second; ++bk ) { + // for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + // for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + // // Get a sequential matrix view over the block + // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + + // // Construct a sequential Scalar container from the input Scalar + // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + + // // Delegate the call to the sequential set implementation + // local_rc = local_rc ? local_rc : set( refC, ref_val ); + + // if( local_rc != SUCCESS ) { + // rc = local_rc; + // } // } // } // } + } internal::setInitialized( C, true ); diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 3bdb700ed..5e51df69c 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -37,7 +37,11 @@ namespace alp { constexpr size_t BLOCK_ROW_DIM = 16; constexpr size_t BLOCK_COL_DIM = 16; - constexpr size_t REPLICATION_FACTOR_THREADS = 1; + // Temporary solution to accomodate for 2.5D mxm + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; + + constexpr size_t REPLICATION_FACTOR_THREADS = 2; } // namespace config diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index fb24da0bf..6bd333a70 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -172,8 +172,8 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( static_cast< size_t >( sqrt( num_threads/ Rt ) ) ), - Tc( num_threads / Rt / Tr ), + Tr( config::THREAD_ROW_DIM ), // Temporary + Tc( config::THREAD_COL_DIM ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { @@ -231,6 +231,11 @@ namespace alp { return Tr * Tc * Rt; } + /** Returns the thread grid size */ + ThreadCoords getThreadGridDims() const { + return { Tr, Tc, Rt }; + } + /** Returns the total global amount of blocks */ std::pair< size_t, size_t > getGlobalBlockGridDims() const { return { Br, Bc }; From f874c53ae48f1035d2d9a655db9fd5adfc45d54d Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Wed, 14 Dec 2022 14:17:49 +0100 Subject: [PATCH 3/9] First complete draft --- include/alp/omp/blas3.hpp | 147 ++++++++++++++++++++++++++++---------- include/alp/utils.hpp | 17 ++++- 2 files changed, 125 insertions(+), 39 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 833f19a70..390f5f186 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -120,6 +120,14 @@ namespace alp { const auto tg_b = db.getThreadGridDims(); const auto tg_c = dc.getThreadGridDims(); + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt ) { + return MISMATCH; + } + + if( tg_c.tr != tg_a.tr || tg_c.tc != tg_b.tc || tg_a.tc != tg_b.tr ) { + return MISMATCH; + } + using th_coord_t = typename Distribution_2_5D::ThreadCoords; RC rc = SUCCESS; @@ -128,38 +136,46 @@ namespace alp { for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { const th_coord_t th_ijk_a = da.getThreadCoords( thread ); - const th_coord_t th_ijk_b = da.getThreadCoords( thread ); - const th_coord_t th_ijk_c = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = db.getThreadCoords( thread ); + const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); - const auto block_grid_dims_a = d.getLocalBlockGridDims( th_ijk_a ); - const auto block_grid_dims_b = d.getLocalBlockGridDims( th_ijk_b ); - const auto block_grid_dims_c = d.getLocalBlockGridDims( th_ijk_c ); + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); RC local_rc = SUCCESS; - // Broadcast Aij and Bij to all c layers - if( th_ijk_a.rt > 0 ) { + if( block_grid_dims_c.first != set_block_grid_dims_a.first + || block_grid_dims_c.second != set_block_grid_dims_b.second + || set_block_grid_dims_a.second != set_block_grid_dims_b.first + ) { + local_rc = MISMATCH; + } + + // Broadcast Aij and Bij to all c-dimensional layers + if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); - for( size_t br = 0; br < block_grid_dims_a.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_a.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); - } } + + } // End Broadcast of Aij - if( local_rc != SUCCESS ) { + if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); - for( size_t br = 0; br < block_grid_dims_b.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_b.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); @@ -173,40 +189,95 @@ namespace alp { // End Broadcast of Aij and Bij #pragma omp barrier - auto mod = [](const size_t _k, const size_t _n) = { - return ( ( _k %= _n ) < 0 ) ? 
_k + _n : _k; - }; - - // TODO from here - // size_t s_a = mod( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); - // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + if( local_rc == SUCCESS ) { + + // Initialize circular shifts at stride of Rt + size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + + // Accumulate remaining local c-dimension factor + for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { + + const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); + + const auto mxm_block_grid_dims_a = d.getLocalBlockGridDims( th_isk_a ); + const auto mxm_block_grid_dims_b = d.getLocalBlockGridDims( th_sjk_b ); + + if( block_grid_dims_c.first != mxm_block_grid_dims_a.first + || block_grid_dims_c.second != mxm_block_grid_dims_b.second + || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first + ) { + local_rc = MISMATCH; + } + + if( local_rc == SUCCESS ) { - // size_t s_b = mod( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_a.tr / th_ijk_a.rt, tg_a.tr ); - // th_coord_t th_isk_a( th_ijk_a.tr, s, th_ijk_a.tr ); + for( size_t bk = 0; bk < mxm_block_grid_dims_a.second; ++bk ) { + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + + const auto refA_loc = internal::get_view( A, th_isk_a, br, bk ); - // for( size_t bk = 0; bk < block_grid_dims_a.second; ++bk ) { - // for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - // for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - // // Get a sequential matrix view over the block - // auto refC = internal::get_view( C, tr, tc, 1 /* rt */, br, bc ); + const auto refB_loc = internal::get_view( B, th_sjk_b, bk, bc ); + auto refC_ijk = internal::get_view( C, th_ijk_c, br, bc ); - // // Construct a sequential Scalar container from the input Scalar - // Scalar< InputType, InputStructure, config::default_sequential_backend > ref_val( *val ); + // Delegate the call to the sequential mxm implementation + local_rc = local_rc ? local_rc : internal::mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); - // // Delegate the call to the sequential set implementation - // local_rc = local_rc ? local_rc : set( refC, ref_val ); + } + } + } + + // Circular shift rightwards for A + s_a = utils::modulus( s_a + 1, tg_a.tc ); + // Circular shift downwards for B + s_b = utils::modulus( s_b + 1, tg_b.tr ); + + } else { + break; + } - // if( local_rc != SUCCESS ) { - // rc = local_rc; - // } - // } - // } - // } + } + + // End layer-by-layer partial computation + #pragma omp barrier + + // Global c-dimension reduction + // (Consider if rt > 0 critical section?) + if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { + + for( size_t r = 1; r < tg_c.rt; ++r ) { + + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + + // Final result in C at layer 0 + local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); + } + } + + if( local_rc != SUCCESS ) { + break; + } + } + + } + } + + // Different values for rc could converge here (eg, MISMATCH, FAILED). 
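// Recap of the 2.5D schedule implemented above, per thread (tr, tc, rt)
// of the Tr x Tc x Rt grid (the code assumes Rt divides Tc):
//   1. threads with rt > 0 copy layer 0's local A and B blocks (the
//      "broadcast" step);
//   2. every layer performs Tc / Rt shifted block products, starting at
//      A's column panel s_a = (tc - tr + rt * Tc / Rt) mod Tc and B's
//      row panel s_b = (tr - tc + rt * Tr / Rt) mod Tr and advancing
//      both by one per step, so the Rt layers jointly cover all panels;
//   3. threads with rt == 0 fold the partial C blocks of layers rt > 0
//      into layer 0, which holds the final result.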
+ if( local_rc != SUCCESS ) { + rc = local_rc; + } } - internal::setInitialized( C, true ); return rc; } diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 330c69b3b..7e002cbff 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -26,6 +26,7 @@ #include #include //fabs +#include // modulus #include //numeric_limits #include @@ -295,7 +296,21 @@ namespace alp { } }; - + + + /** + * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder + * of division for type \a T. + * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. + */ + template < typename T > + T modulus( const T x, const T n, const T zero) { + + const T rem = std::modulus( _k, _n ); + + return ( rem < zero ) ? rem + _n : rem; + }; + } // namespace utils } // namespace alp From b7f46ff92c8b9526650eb35d4c231a04ba2b293e Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 4/9] compiling mxm test on general matrices --- include/alp/blas3.hpp | 3 + include/alp/omp/blas3.hpp | 177 +++++++++++---- include/alp/omp/config.hpp | 8 +- include/alp/omp/io.hpp | 47 +++- include/alp/omp/matrix.hpp | 11 +- include/alp/reference/blas3.hpp | 65 ++++-- include/alp/reference/io.hpp | 3 - include/alp/utils.hpp | 8 +- src/alp/omp/init.cpp | 4 +- tests/unit/CMakeLists.txt | 4 + tests/unit/dense_omp_mxm.cpp | 367 ++++++++++++++++++++++++++++++++ 11 files changed, 611 insertions(+), 86 deletions(-) create mode 100644 tests/unit/dense_omp_mxm.cpp diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp index e1f6d0baa..976d3ccb7 100644 --- a/include/alp/blas3.hpp +++ b/include/alp/blas3.hpp @@ -26,6 +26,9 @@ #ifdef _ALP_WITH_REFERENCE #include #endif +#ifdef _ALP_WITH_OMP + #include +#endif #endif // end _H_ALP_BLAS3 diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 390f5f186..539366bfc 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -28,7 +28,10 @@ #include #include +#include +#include #include +#include // Include backend to which sequential work is delegated #ifdef _ALP_OMP_WITH_REFERENCE @@ -36,8 +39,9 @@ #include #endif -#include "matrix.hpp" -#include "storage.hpp" +#ifndef _NDEBUG +#include "../../../tests/utils/print_alp_containers.hpp" +#endif namespace alp { @@ -63,9 +67,9 @@ namespace alp { RC mxm_generic( alp::Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, - const alp::Matrix< InputType1, InputStructure1, + alp::Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, - const alp::Matrix< InputType2, InputStructure2, + alp::Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Operator &oper, const Monoid &monoid, @@ -88,7 +92,7 @@ namespace alp { "void)" ); -#ifdef _DEBUG +#ifndef _NDEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif @@ -132,8 +136,11 @@ namespace alp { RC rc = SUCCESS; - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + // #pragma omp for + // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + const size_t thread = config::OMP::current_thread_ID(); const th_coord_t th_ijk_a = da.getThreadCoords( thread ); const th_coord_t th_ijk_b = db.getThreadCoords( thread ); @@ -149,6 +156,11 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != 
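// (block-level conformability: C's block grid must be A-block-rows by
//  B-block-columns, with A's block columns equal to B's block rows)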
set_block_grid_dims_b.first ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tMismatching local block grid size on set." << std::endl; +#endif local_rc = MISMATCH; } @@ -160,11 +172,16 @@ namespace alp { for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { - auto refAij0 = internal::get_view( A, th_ij0_a, br, bc ); - auto refAijk = internal::get_view( A, th_ijk_a, br, bc ); + auto refAij0 = get_view( A, th_ij0_a, br, bc ); + auto refAijk = get_view( A, th_ijk_a, br, bc ); local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); +#ifndef _NDEBUG + print_matrix("refAij0", refAij0); + print_matrix("refAijk", refAijk); +#endif + } } @@ -172,42 +189,59 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { - th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); - for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { - for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { - auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); - auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); + auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); - local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); - } + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); } } + } + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues replicating input matrices." << std::endl; +#endif + rc = local_rc; } - // End Broadcast of Aij and Bij + + // } // End Broadcast of Aij and Bij #pragma omp barrier - if( local_rc == SUCCESS ) { + if( rc == SUCCESS ) { + // #pragma omp for + // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + // Initialize circular shifts at stride of Rt size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); - // Accumulate remaining local c-dimension factor + // Per-c-dimensional-layer partial computation for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); - const auto mxm_block_grid_dims_a = d.getLocalBlockGridDims( th_isk_a ); - const auto mxm_block_grid_dims_b = d.getLocalBlockGridDims( th_sjk_b ); + const auto mxm_block_grid_dims_a = da.getLocalBlockGridDims( th_isk_a ); + const auto mxm_block_grid_dims_b = db.getLocalBlockGridDims( th_sjk_b ); if( block_grid_dims_c.first != mxm_block_grid_dims_a.first || block_grid_dims_c.second != mxm_block_grid_dims_b.second || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first ) { +#ifndef _NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tMismatching local block grid size on mxm." 
<< std::endl; +#endif local_rc = MISMATCH; } @@ -235,15 +269,28 @@ namespace alp { // Circular shift downwards for B s_b = utils::modulus( s_b + 1, tg_b.tr ); - } else { - break; - } + } + // else { + // break; + // } } + // } + } + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with local mxm computations." << std::endl; + rc = local_rc; + } - // End layer-by-layer partial computation - #pragma omp barrier + // End layer-by-layer partial computation + #pragma omp barrier + if( rc == SUCCESS ) { + // Global c-dimension reduction // (Consider if rt > 0 critical section?) if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { @@ -263,9 +310,9 @@ namespace alp { } } - if( local_rc != SUCCESS ) { - break; - } + // if( local_rc != SUCCESS ) { + // break; + // } } } @@ -273,6 +320,9 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with final reduction." << std::endl; rc = local_rc; } @@ -312,44 +362,77 @@ namespace alp { * @param phase The execution phase. */ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > - RC mxm( OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value, - void >::type * const = NULL ) { + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL + ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); + } /** * Dense Matrix-Matrix multiply between structured matrices. 
* Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > - RC mxm( OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const typename std::enable_if< ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, - void >::type * const = NULL ) { + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL + ) { (void)phase; return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + } } // end namespace ``alp'' diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 5e51df69c..dff95c18e 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,12 +34,12 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 16; - constexpr size_t BLOCK_COL_DIM = 16; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; // Temporary solution to accomodate for 2.5D mxm - constexpr size_t THREAD_ROW_DIM = 4; - constexpr size_t THREAD_COL_DIM = 4; + constexpr size_t THREAD_ROW_DIM = 2; + constexpr size_t THREAD_COL_DIM = 2; constexpr size_t REPLICATION_FACTOR_THREADS = 2; diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index bc5b6aa97..2794b1336 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_IO #define _H_ALP_OMP_IO +#include + #include #include "matrix.hpp" @@ -34,8 +36,6 @@ #include #endif -#include "storage.hpp" - #define NO_CAST_ASSERT( x, y, z ) \ static_assert( x, \ "\n\n" \ @@ -138,6 +138,49 @@ namespace alp { return rc; } + /** + * Temporarily assuming 1-1, row-major mapping with user container. + */ + template< + typename InputType, typename Structure, typename View, + typename ImfR, typename ImfC, typename fwd_iterator + > + RC buildMatrix( + Matrix< InputType, Structure, Density::Dense, View, ImfR, ImfC, omp > &A, + const fwd_iterator &start, + const fwd_iterator &end + ) noexcept { + + static_assert( + ( std::is_same< InputType, typename fwd_iterator::value_type >::value ), + "alp::buildMatrix (omp): Mismatching type between user-provided input " + "container and output ALP container." 
+ ); + + const size_t m = nrows( A ); + const size_t n = ncols( A ); + + if( ( end - start ) != static_cast< std::ptrdiff_t >( m * n ) ) { + + std::cerr << "alp::buildMatrix (omp): Mismatching sizes between " + "user-provided input container and output ALP container." << std::endl; + + return MISMATCH; + } + + internal::setInitialized(A, true); + + fwd_iterator it = start; + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + internal::access( A, internal::getStorageIndex( A, i, j ) ) = *( it++ ); + } + } + + return SUCCESS; + } + } // end namespace ``alp'' #undef NO_CAST_ASSERT diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index afb16d412..9d279ba7f 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_MATRIX #define _H_ALP_OMP_MATRIX +#include + #include #include @@ -37,6 +39,7 @@ #include // For AMFFactory #endif + namespace alp { // Currently no backend specific implementation @@ -56,10 +59,10 @@ namespace alp { typename SourceMatrix, typename ThreadCoords, std::enable_if_t< - is_matrix< SourceMatrix >::value + alp::is_matrix< SourceMatrix >::value > * = nullptr > - typename internal::new_container_type_from< + typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { @@ -68,7 +71,7 @@ namespace alp { const auto &distribution = getAmf( source ).getDistribution(); const size_t thread_id = distribution.getThreadId( t ); const size_t block_id = distribution.getLocalBlockId( t, br, bc ); - auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id ); + auto &container = getLocalContainer( getContainer( source ), thread_id, block_id ); // make an AMF // note: When making a view over a vector, the second imf must be imf::Zero @@ -92,7 +95,7 @@ namespace alp { ); // create a sequential container with the container and AMF above - using target_t = typename internal::new_container_type_from< + using target_t = typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type; diff --git a/include/alp/reference/blas3.hpp b/include/alp/reference/blas3.hpp index b0ad5ef3d..8a76ee6e3 100644 --- a/include/alp/reference/blas3.hpp +++ b/include/alp/reference/blas3.hpp @@ -490,46 +490,71 @@ namespace alp { * @param phase The execution phase. 
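 *
 * \par Example
 * A minimal usage sketch (mirroring the unit test added later in this
 * series; the element type and size are illustrative):
 * \code
 * const size_t n = 4;
 * alp::Semiring<
 * 	alp::operators::add< double >, alp::operators::mul< double >,
 * 	alp::identities::zero, alp::identities::one
 * > ring;
 * alp::Matrix< double, alp::structures::General > A( n, n ), B( n, n ), C( n, n );
 * // ... fill A, B, and C via alp::buildMatrix ...
 * alp::RC rc = alp::mxm( C, A, B, ring );
 * \endcode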
*/ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); } /** * Dense Matrix-Matrix multiply between structured matrices. * Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL ) { (void)phase; diff --git a/include/alp/reference/io.hpp b/include/alp/reference/io.hpp index c6de63f5d..da1e287c5 100644 --- a/include/alp/reference/io.hpp +++ b/include/alp/reference/io.hpp @@ -878,9 +878,6 @@ namespace alp { const fwd_iterator &start, const fwd_iterator &end ) noexcept { - (void)A; - (void)start; - (void)end; // Temporarily assuming 1-1 mapping with user container internal::setInitialized(A, true); diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 7e002cbff..d053f998d 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -300,15 +300,15 @@ namespace alp { /** * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder - * of division for type \a T. + * of division for type \a T within range [0, n). * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. */ template < typename T > - T modulus( const T x, const T n, const T zero) { + T modulus( const T x, const T n ) { - const T rem = std::modulus( _k, _n ); + const T rem = x % n; - return ( rem < zero ) ? rem + _n : rem; + return ( rem < static_cast< T >( 0 ) ) ? rem + n : rem; }; } // namespace utils diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp index f7f8b5d42..f57351285 100644 --- a/src/alp/omp/init.cpp +++ b/src/alp/omp/init.cpp @@ -36,7 +36,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data RC rc = alp::SUCCESS; // print output const auto T = config::OMP::threads(); - std::cerr << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; + std::cout << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; // sanity checks if( P > 1 ) { @@ -54,7 +54,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data template<> alp::RC alp::finalize< alp::omp >() { - std::cerr << "Info: alp::finalize (omp) called.\n"; + std::cout << "Info: alp::finalize (omp) called.\n"; return alp::SUCCESS; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b54e0a43b..3eed07ea7 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -269,6 +269,10 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp BACKENDS alp_omp ) +add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp + BACKENDS alp_reference alp_omp +) + add_grb_executables( dense_outer dense_outer.cpp BACKENDS alp_reference ) diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp new file mode 100644 index 000000000..25f6426ce --- /dev/null +++ b/tests/unit/dense_omp_mxm.cpp @@ -0,0 +1,367 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +//#include "../utils/print_alp_containers.hpp" + +using namespace alp; + + +template< typename T > +void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda ) { + + std::cout << "Vec " << name << ":" << std::endl; + for( size_t row = 0; row < m; ++row ) { + std::cout << "[\t"; + for( size_t col = 0; col < n; ++col ) { + std::cout << vA[ row * lda + col ] << "\t"; + } + std::cout << "]" << std::endl; + } +} + +template< typename T, typename Operator, typename Monoid > +void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, + const std::vector< T > & vA, const size_t lda, + const std::vector< T > & vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid ) { + + T temp; + + print_stdvec_as_matrix("vA", vA, n, n, n); + print_stdvec_as_matrix("vB", vB, n, n, n); + print_stdvec_as_matrix("vC - PRE", vC, n, n, n); + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + T & c_val { vC[ i * ldc + j ] }; + for( size_t l = 0; l < k; ++l ) { + const T & a_val { vA[ i * lda + l ] }; + const T & b_val { vB[ l * ldb + j ] }; + // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; + (void)internal::apply( temp, a_val, b_val, oper ); + // std::cout << "temp = " << temp << std::endl; + (void)internal::foldl( c_val, temp, monoid.getOperator() ); + } + } + } + + print_stdvec_as_matrix("vC - POST", vC, n, n, n); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { + + if( std::is_same< Structure, structures::General >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = one; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = vA[ col * lda + row ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > & vA, const T one ) { + + std::fill( vA.begin(), vA.end(), one ); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( 
std::vector< T > & vA, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper + for( auto & elem: vA ) { + elem = val; + val += inc; + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper + for( auto & elem: vA ) { + elem = val; + val += inc; + } + } + +} + +template< typename MatType, typename T > +void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold=1e-7 ) { + + if( std::is_same< typename MatType::structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } + +} + +void alp_program( const size_t & n, alp::RC & rc ) { + + typedef double T; + + alp::Semiring< alp::operators::add< T >, alp::operators::mul< T >, alp::identities::zero, alp::identities::one > ring; + + T one = ring.getOne< T >(); + T zero = ring.getZero< T >(); + + std::vector< T > A_data( n * n ); + std::vector< T > B_data( n * n ); + std::vector< T > C_data( n * n, zero ); + + // std::vector< T > A_packed( n * ( n + 1 ) / 2 ); + // std::vector< T > B_packed( n * ( n + 1 ) / 2 ); + // std::vector< T > C_packed( n * ( n + 1 ) / 2, zero ); + + std::vector< T > A_vec( n * n ); + std::vector< T > B_vec( n * n ); + std::vector< T > C_vec( n * n, zero ); + + std::cout << "\tTesting dense General mxm " << n << std::endl; + + stdvec_build_matrix< structures::General >( A_data, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, n, n, n, zero, one, one ); + + // initialize test + alp::Matrix< T, structures::General > A( n, n ); + alp::Matrix< T, structures::General > B( n, n ); + alp::Matrix< T, structures::General > C( n, n ); + + // Initialize input matrices + rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); + rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + print_matrix("A", A); + print_matrix("B", B); + print_matrix("C - PRE", C); + + rc = alp::mxm( C, A, B, ring ); + + print_matrix("C - POST", C); + + stdvec_build_matrix< 
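// Verification: rebuild the same inputs as plain std::vectors, run the
// scalar triple-loop reference mxm_stdvec_as_matrix, then compare
// element-wise against the ALP result with diff_stdvec_matrix.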
structures::General >( A_vec, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, n, n, n, zero, one, one ); + + mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + diff_stdvec_matrix( C_vec, n, n, n, C ); + + // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; + + // alp::Matrix< T, structures::UpperTriangular > UA( n ); + // alp::Matrix< T, structures::UpperTriangular > UB( n ); + // alp::Matrix< T, structures::UpperTriangular > UC( n ); + + // stdvec_build_matrix_packed< structures::UpperTriangular >( A_packed, one, one ); + // stdvec_build_matrix_packed< structures::UpperTriangular >( B_packed, one, one ); + + // rc = alp::buildMatrix( UA, A_packed.begin(), A_packed.end() ); + // rc = alp::buildMatrix( UB, B_packed.begin(), B_packed.end() ); + // rc = alp::buildMatrix( UC, C_packed.begin(), C_packed.end() ); + + // print_matrix("UA", UA); + // print_matrix("UB", UB); + // print_matrix("UC - PRE", UC); + // rc = alp::mxm( UC, UA, UB, ring ); + // print_matrix("UC - POST", UC); + + // stdvec_build_matrix< structures::UpperTriangular >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::UpperTriangular >( B_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, UC ); + + // std::cout << "\n\n=========== Testing Symmetric Output ============\n\n"; + + // alp::Matrix< T, structures::Symmetric > SC( n ); + + // stdvec_build_matrix< structures::Symmetric >( A_data, n, n, n, zero, one, one ); + + // rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); + // rc = alp::buildMatrix( SC, C_packed.begin(), C_packed.end() ); + + // print_matrix("A", A ); + // print_matrix("A^T", alp::get_view< alp::view::transpose >( A ) ); + // print_matrix("SC - PRE", SC); + // rc = alp::mxm( SC, A, alp::get_view< alp::view::transpose >( A ), ring ); + // print_matrix("SC - POST", SC); + + // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::Symmetric >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, A_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, SC ); + + // std::cout << "\n\n=========== Testing Symmetric x Symmetric Output ============\n\n"; + + // alp::Matrix< T, structures::Symmetric > SA( n ); + // alp::Matrix< T, structures::Symmetric > SB( n ); + + // stdvec_build_matrix_packed< structures::Symmetric >( A_packed, one, one ); + // stdvec_build_matrix_packed< structures::Symmetric >( B_packed, one, one + one ); + + // rc = alp::buildMatrix( SA, A_packed.begin(), A_packed.end() ); + // rc = alp::buildMatrix( SB, B_packed.begin(), B_packed.end() ); + // rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + // print_matrix("SA", SA); + // print_matrix("SB", SB); + // print_matrix("C - PRE", C); + // rc = alp::mxm( C, SA, SB, ring ); + // print_matrix("C - POST", C); + + // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); + // stdvec_build_matrix< structures::Symmetric >( B_vec, n, n, n, zero, one, one + one ); + // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + + // mxm_stdvec_as_matrix( C_vec, n, 
A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + // diff_stdvec_matrix( C_vec, n, n, n, C ); + + +} + +int main( int argc, char ** argv ) { + // defaults + bool printUsage = false; + size_t in = 4; + + // error checking + if( argc > 2 ) { + printUsage = true; + } + if( argc == 2 ) { + size_t read; + std::istringstream ss( argv[ 1 ] ); + if( ! ( ss >> read ) ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( ! ss.eof() ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( read % 2 != 0 ) { + std::cerr << "Given value for n is odd\n"; + printUsage = true; + } else { + // all OK + in = read; + } + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << " [n]\n"; + std::cerr << " -n (optional, default is 100): an even integer, the " + "test size.\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << " " << in << "\n"; + alp::Launcher< AUTOMATIC > launcher; + alp::RC out; + if( launcher.exec( &alp_program, in, out, true ) != SUCCESS ) { + std::cerr << "Launching test FAILED\n"; + return 255; + } + if( out != SUCCESS ) { + std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl; + } else { + std::cout << "Test OK" << std::endl; + } + return 0; +} + From 0e47854602a32f24e029575df5a77f4a05ca7438 Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 5/9] shared mem (ge) mxm passing functional tests --- include/alp/omp/blas3.hpp | 65 ++++++++++++++++++++---------------- include/alp/omp/config.hpp | 4 +-- include/alp/omp/matrix.hpp | 4 ++- tests/unit/dense_omp_mxm.cpp | 13 ++++++-- 4 files changed, 52 insertions(+), 34 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 539366bfc..d720a9e8e 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -39,7 +39,7 @@ #include #endif -#ifndef _NDEBUG +#ifdef _DEBUG #include "../../../tests/utils/print_alp_containers.hpp" #endif @@ -92,14 +92,14 @@ namespace alp { "void)" ); -#ifndef _NDEBUG +#ifdef _DEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif // Early exit checks - if( ! internal::getInitialized( A ) || - ! internal::getInitialized( B ) || - ! internal::getInitialized( C ) + if( ! internal::getInitialized( A ) || + ! internal::getInitialized( B ) || + ! internal::getInitialized( C ) ) { internal::setInitialized( C, false ); return SUCCESS; @@ -156,7 +156,7 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != set_block_grid_dims_b.first ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on set." << std::endl; @@ -164,7 +164,7 @@ namespace alp { local_rc = MISMATCH; } - // Broadcast Aij and Bij to all c-dimensional layers + // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); @@ -177,15 +177,10 @@ namespace alp { local_rc = local_rc ? 
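// first-error-wins chaining: a prior non-SUCCESS local_rc (which
// converts to true) is kept, and the set() in the unselected ternary
// branch is not evaluated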
local_rc : set( refAijk, refAij0 ); -#ifndef _NDEBUG - print_matrix("refAij0", refAij0); - print_matrix("refAijk", refAijk); -#endif - } } - } // End Broadcast of Aij + } // End Broadcast of A if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { @@ -194,17 +189,31 @@ namespace alp { for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { - auto refBij0 = internal::get_view( B, th_ij0_b, br, bc ); - auto refBijk = internal::get_view( B, th_ijk_b, br, bc ); + auto refBij0 = get_view( B, th_ij0_b, br, bc ); + auto refBijk = get_view( B, th_ijk_b, br, bc ); local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); } } - } + } // End Broadcast of B + + if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCijk = get_view( C, th_ijk_c, br, bc ); + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + local_rc = local_rc ? local_rc : set( refCijk, zero ); + } + } + } // End Zero-ing of C // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues replicating input matrices." << std::endl; @@ -212,7 +221,7 @@ namespace alp { rc = local_rc; } - // } // End Broadcast of Aij and Bij + // End Broadcast of A and B and zero-ing of C #pragma omp barrier if( rc == SUCCESS ) { @@ -237,7 +246,7 @@ namespace alp { || block_grid_dims_c.second != mxm_block_grid_dims_b.second || mxm_block_grid_dims_a.second != mxm_block_grid_dims_b.first ) { -#ifndef _NDEBUG +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on mxm." << std::endl; @@ -250,15 +259,15 @@ namespace alp { for( size_t bk = 0; bk < mxm_block_grid_dims_a.second; ++bk ) { for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - const auto refA_loc = internal::get_view( A, th_isk_a, br, bk ); + const auto refA_loc = get_view( A, th_isk_a, br, bk ); for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - const auto refB_loc = internal::get_view( B, th_sjk_b, bk, bc ); - auto refC_ijk = internal::get_view( C, th_ijk_c, br, bc ); + const auto refB_loc = get_view( B, th_sjk_b, bk, bc ); + auto refC_ijk = get_view( C, th_ijk_c, br, bc ); // Delegate the call to the sequential mxm implementation - local_rc = local_rc ? local_rc : internal::mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); + local_rc = local_rc ? local_rc : mxm_generic< allow_void >( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); } } @@ -270,9 +279,6 @@ namespace alp { s_b = utils::modulus( s_b + 1, tg_b.tr ); } - // else { - // break; - // } } // } @@ -280,9 +286,11 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with local mxm computations." << std::endl; +#endif rc = local_rc; } @@ -310,9 +318,6 @@ namespace alp { } } - // if( local_rc != SUCCESS ) { - // break; - // } } } @@ -320,9 +325,11 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). 
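			// At this point every replication layer rt > 0 holds private copies of
			// its (tr, tc) tiles of A and B, and its C tiles hold the additive
			// identity, while layer rt == 0 keeps the original tiles; each layer
			// can therefore accumulate an independent partial product that the
			// final c-dimension reduction folds back into layer 0.
			//
			// The compute phase below walks the local k-tiles in circular shifts:
			// every layer starts from a per-thread base offset staggered by
			// tg.tc / tg.rt and advances one tile per iteration, so the Rt layers
			// jointly visit each k-tile exactly once. A minimal stand-alone sketch
			// of that coverage property (assuming Rt divides Tc, which the series
			// later checks explicitly, and for any per-thread base offset):
			//
			//   bool covers_all_k_tiles( size_t base, size_t Tc, size_t Rt ) {
			//       std::vector< size_t > visits( Tc, 0 );
			//       for( size_t rt = 0; rt < Rt; ++rt ) {
			//           size_t s = ( base + rt * ( Tc / Rt ) ) % Tc;
			//           for( size_t r = 0; r < Tc / Rt; ++r ) {
			//               ++visits[ s ];
			//               s = ( s + 1 ) % Tc;
			//           }
			//       }
			//       for( const size_t v : visits ) {
			//           if( v != 1 ) { return false; } // a k-tile skipped or repeated
			//       }
			//       return true;
			//   }
			//
			// The Rt starts are spaced Tc / Rt apart and each walk covers Tc / Rt
			// consecutive positions, hence the union is a disjoint cover of all Tc
			// k-tiles for any base offset.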
if( local_rc != SUCCESS ) { +#ifdef _DEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with final reduction." << std::endl; +#endif rc = local_rc; } diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index dff95c18e..4a4ec7323 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,8 +34,8 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 2; - constexpr size_t BLOCK_COL_DIM = 2; + constexpr size_t BLOCK_ROW_DIM = 64; + constexpr size_t BLOCK_COL_DIM = 64; // Temporary solution to accomodate for 2.5D mxm constexpr size_t THREAD_ROW_DIM = 2; diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index 9d279ba7f..94e600979 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -99,7 +99,9 @@ namespace alp { typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type; - return target_t( container, amf ); + target_t blk_matrix( container, amf ); + internal::setInitialized( blk_matrix, internal::getInitialized( source ) ); + return blk_matrix; } } // namespace internal diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index 25f6426ce..415499595 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -20,7 +20,9 @@ #include #include -//#include "../utils/print_alp_containers.hpp" +#ifndef _DEBUG + #include "../utils/print_alp_containers.hpp" +#endif using namespace alp; @@ -48,9 +50,11 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, T temp; +#ifdef _DEBUG print_stdvec_as_matrix("vA", vA, n, n, n); print_stdvec_as_matrix("vB", vB, n, n, n); print_stdvec_as_matrix("vC - PRE", vC, n, n, n); +#endif for( size_t i = 0; i < m; ++i ) { for( size_t j = 0; j < n; ++j ) { @@ -66,8 +70,9 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, } } +#ifdef _DEBUG print_stdvec_as_matrix("vC - POST", vC, n, n, n); - +#endif } template< typename Structure, typename T > @@ -224,13 +229,17 @@ void alp_program( const size_t & n, alp::RC & rc ) { rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); +#ifdef _DEBUG print_matrix("A", A); print_matrix("B", B); print_matrix("C - PRE", C); +#endif rc = alp::mxm( C, A, B, ring ); +#ifdef _DEBUG print_matrix("C - POST", C); +#endif stdvec_build_matrix< structures::General >( A_vec, n, n, n, zero, one, one ); stdvec_build_matrix< structures::General >( B_vec, n, n, n, zero, one, one ); From 474d1075e25338bb9f0d8d87921c86d96b2ac57d Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 6/9] Tmp debugging info + fix in shift computation --- include/alp/omp/blas3.hpp | 134 +++++++++++++++++++++++++++-------- include/alp/omp/config.hpp | 10 +-- include/alp/omp/storage.hpp | 28 +++++--- tests/unit/dense_omp_mxm.cpp | 46 ++++++------ 4 files changed, 154 insertions(+), 64 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index d720a9e8e..8f194b82c 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -39,7 +39,7 @@ #include #endif -#ifdef _DEBUG +#ifndef NDEBUG #include "../../../tests/utils/print_alp_containers.hpp" #endif @@ -92,7 +92,7 @@ namespace alp { "void)" ); -#ifdef _DEBUG +#ifndef NDEBUG std::cout << "In alp::internal::mxm_generic (omp)\n"; #endif @@ -123,8 +123,9 @@ namespace alp { const auto tg_a = da.getThreadGridDims(); const auto tg_b = db.getThreadGridDims(); const auto tg_c = dc.getThreadGridDims(); - - if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt ) { + + // 2.5D factor ranges within 3D limits + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt || ( tg_a.tc % tg_a.rt ) != 0 ) { return MISMATCH; } @@ -136,10 +137,14 @@ namespace alp { RC rc = SUCCESS; +#ifndef NDEBUG + #pragma omp critical + std::cerr << "In alp::internal::mxm_generic (omp)\n" + "\tBeginning parallel region" << std::endl; +#endif + #pragma omp parallel { - // #pragma omp for - // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { const size_t thread = config::OMP::current_thread_ID(); const th_coord_t th_ijk_a = da.getThreadCoords( thread ); @@ -156,7 +161,7 @@ namespace alp { || block_grid_dims_c.second != set_block_grid_dims_b.second || set_block_grid_dims_a.second != set_block_grid_dims_b.first ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tMismatching local block grid size on set." << std::endl; @@ -167,6 +172,12 @@ namespace alp { // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying A." << std::endl; +#endif + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { @@ -184,6 +195,12 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying B." << std::endl; +#endif + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { @@ -199,13 +216,19 @@ namespace alp { if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tZeroing C." << std::endl; +#endif + + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - auto refCijk = get_view( C, th_ijk_c, br, bc ); - alp::Scalar< - OutputType, alp::structures::General, config::default_sequential_backend - > zero( monoid.template getIdentity< OutputType >() ); local_rc = local_rc ? 
local_rc : set( refCijk, zero ); } } @@ -213,7 +236,7 @@ namespace alp { // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues replicating input matrices." << std::endl; @@ -223,21 +246,38 @@ namespace alp { // End Broadcast of A and B and zero-ing of C #pragma omp barrier - + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tPassing barrier" << std::endl; +#endif + if( rc == SUCCESS ) { - // #pragma omp for - // for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { - // Initialize circular shifts at stride of Rt - size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); - size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + // size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + // size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); // Per-c-dimensional-layer partial computation for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { - const th_coord_t th_isk_a( th_ijk_a.tr, s_a, th_ijk_a.rt ); - const th_coord_t th_sjk_b( s_b, th_ijk_b.tc, th_ijk_b.rt ); + const th_coord_t th_isk_a( th_ijk_a.tr, c_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( r_b, th_ijk_b.tc, th_ijk_b.rt ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCompute iteration r=" << r << "\n" + "\tStarting from A( " << th_ijk_a.tr << ", " << c_a << ", " << th_ijk_a.rt << " )\n" + "\tStarting from B( " << r_b << ", " << th_ijk_b.tc << ", " << th_ijk_b.rt << " )" <( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + print_matrix("Cref post", refC_ijk ); + } + } +#endif + } } } // Circular shift rightwards for A - s_a = utils::modulus( s_a + 1, tg_a.tc ); + c_a = utils::modulus( c_a + 1, tg_a.tc ); // Circular shift downwards for B - s_b = utils::modulus( s_b + 1, tg_b.tr ); + r_b = utils::modulus( r_b + 1, tg_b.tr ); } } - // } - } + + } // End computation per layer of c-dimension // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with local mxm computations." << std::endl; @@ -299,12 +362,24 @@ namespace alp { if( rc == SUCCESS ) { - // Global c-dimension reduction + // Final c-dimension reduction // (Consider if rt > 0 critical section?) if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." 
<< std::endl; +#endif + for( size_t r = 1; r < tg_c.rt; ++r ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; +#endif + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { @@ -321,11 +396,12 @@ namespace alp { } } - } + + } // End of final reduction along c-dimension // Different values for rc could converge here (eg, MISMATCH, FAILED). if( local_rc != SUCCESS ) { -#ifdef _DEBUG +#ifndef NDEBUG #pragma omp critical std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" "\tIssues with final reduction." << std::endl; diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 4a4ec7323..d313a3a6d 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,14 +34,14 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 64; - constexpr size_t BLOCK_COL_DIM = 64; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; // Temporary solution to accomodate for 2.5D mxm - constexpr size_t THREAD_ROW_DIM = 2; - constexpr size_t THREAD_COL_DIM = 2; + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; - constexpr size_t REPLICATION_FACTOR_THREADS = 2; + constexpr size_t REPLICATION_FACTOR_THREADS = 1; } // namespace config diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 6bd333a70..47a728e9c 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -156,14 +156,14 @@ namespace alp { /** Row and column dimensions of the associated container */ const size_t m; const size_t n; - /** The row and column dimensions of the thread grid */ - const size_t Tr; - const size_t Tc; /** Replication factor in thread-coordinate space */ static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; /** The row and column dimensions of the global block grid */ const size_t Br; const size_t Bc; + /** The row and column dimensions of the thread grid */ + const size_t Tr; + const size_t Tc; public: @@ -172,13 +172,14 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( config::THREAD_ROW_DIM ), // Temporary - Tc( config::THREAD_COL_DIM ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), - Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { - - if( num_threads != Tr * Tc * Rt ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n"; + Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), // Temporary + Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) + { + if( ( Tr > 1 ) && ( Tc > 1 ) && ( num_threads != Tr * Tc * Rt ) ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid:\n" + "\t" << Tr << " x " << Tc << " x " << Rt << std::endl; } } @@ -303,6 +304,15 @@ namespace alp { } ThreadCoords getThreadCoords( const size_t thread_id ) const { + // const size_t _Tr = ( Tr == 1 ) ? Tc : Tr; + // const size_t _Tc = ( Tc == 1 ) ? 
Tr : Tc;
+
+			// const size_t rt = thread_id / ( _Tr * _Tc );
+			// const size_t tr = ( thread_id % ( _Tr * _Tc ) ) / _Tc;
+			// const size_t tc = ( thread_id % ( _Tr * _Tc ) ) % _Tc;
+
+			// return { tr % Tr, tc % Tc, rt };
+
 			const size_t rt = thread_id / ( Tr * Tc );
 			const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc;
 			const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc;
diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp
index 415499595..be05c340b 100644
--- a/tests/unit/dense_omp_mxm.cpp
+++ b/tests/unit/dense_omp_mxm.cpp
@@ -20,7 +20,7 @@
 #include 
 #include 
 
-#ifndef _DEBUG
+#ifndef NDEBUG
 	#include "../utils/print_alp_containers.hpp"
 #endif
 
@@ -50,7 +50,7 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc,
 
 	T temp;
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_stdvec_as_matrix("vA", vA, n, n, n);
 	print_stdvec_as_matrix("vB", vB, n, n, n);
 	print_stdvec_as_matrix("vC - PRE", vC, n, n, n);
@@ -70,7 +70,7 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc,
 		}
 	}
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_stdvec_as_matrix("vC - POST", vC, n, n, n);
 #endif
 }
 
@@ -193,6 +193,10 @@ void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size
 
 }
 
+#define M ( alp::config::BLOCK_ROW_DIM * n )
+#define K ( alp::config::BLOCK_COL_DIM * n )
+#define N ( alp::config::BLOCK_COL_DIM * n )
+
 void alp_program( const size_t & n, alp::RC & rc ) {
 
 	typedef double T;
@@ -202,34 +206,34 @@ void alp_program( const size_t & n, alp::RC & rc ) {
 	T one = ring.getOne< T >();
 	T zero = ring.getZero< T >();
 
-	std::vector< T > A_data( n * n );
-	std::vector< T > B_data( n * n );
-	std::vector< T > C_data( n * n, zero );
+	std::vector< T > A_data( M * K );
+	std::vector< T > B_data( K * N );
+	std::vector< T > C_data( M * N, zero );
 
 	// std::vector< T > A_packed( n * ( n + 1 ) / 2 );
 	// std::vector< T > B_packed( n * ( n + 1 ) / 2 );
 	// std::vector< T > C_packed( n * ( n + 1 ) / 2, zero );
 
-	std::vector< T > A_vec( n * n );
-	std::vector< T > B_vec( n * n );
-	std::vector< T > C_vec( n * n, zero );
+	std::vector< T > A_vec( M * K );
+	std::vector< T > B_vec( K * N );
+	std::vector< T > C_vec( M * N, zero );
 
-	std::cout << "\tTesting dense General mxm " << n << std::endl;
+	std::cout << "\tTesting dense General mxm " << M << " " << K << " " << N << std::endl;
 
-	stdvec_build_matrix< structures::General >( A_data, n, n, n, zero, one, one );
-	stdvec_build_matrix< structures::General >( B_data, n, n, n, zero, one, one );
+	stdvec_build_matrix< structures::General >( A_data, M, K, K, zero, one, one );
+	stdvec_build_matrix< structures::General >( B_data, K, N, N, zero, one, one );
 
 	// initialize test
-	alp::Matrix< T, structures::General > A( n, n );
-	alp::Matrix< T, structures::General > B( n, n );
-	alp::Matrix< T, structures::General > C( n, n );
+	alp::Matrix< T, structures::General > A( M, K );
+	alp::Matrix< T, structures::General > B( K, N );
+	alp::Matrix< T, structures::General > C( M, N );
 
 	// Initialize input matrices
 	rc = alp::buildMatrix( A, A_data.begin(), A_data.end() );
 	rc = alp::buildMatrix( B, B_data.begin(), B_data.end() );
 	rc = alp::buildMatrix( C, C_data.begin(), C_data.end() );
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_matrix("A", A);
 	print_matrix("B", B);
 	print_matrix("C - PRE", C);
@@ -237,16 +241,16 @@ void alp_program( const size_t & n, alp::RC & rc ) {
 
 	rc = alp::mxm( C, A, B, ring );
 
-#ifdef _DEBUG
+#ifndef NDEBUG
 	print_matrix("C - POST", C);
 #endif
 
-	stdvec_build_matrix< structures::General >( A_vec, n, n, n, zero, one, one );
-	stdvec_build_matrix< 
structures::General >( B_vec, n, n, n, zero, one, one ); + stdvec_build_matrix< structures::General >( A_vec, M, K, K, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, K, N, N, zero, one, one ); - mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + mxm_stdvec_as_matrix( C_vec, N, A_vec, K, B_vec, N, M, K, N, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - diff_stdvec_matrix( C_vec, n, n, n, C ); + diff_stdvec_matrix( C_vec, M, N, N, C ); // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; From 47d879bc959bc7d58a4bd7c2e17f1fb0ee27e49e Mon Sep 17 00:00:00 2001 From: "Daniele G. Spampinato" Date: Fri, 2 Dec 2022 17:42:43 +0100 Subject: [PATCH 7/9] Refactoring includes --- include/alp/omp/blas3.hpp | 11 +++++++---- include/alp/scalar.hpp | 2 -- tests/unit/CMakeLists.txt | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 8f194b82c..eae9fd9e6 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -26,15 +26,18 @@ #include // for std::min/max #include // for std::enable_if -#include #include -#include -#include #include -#include + +// #include +#include + +#include "matrix.hpp" +#include "storage.hpp" // Include backend to which sequential work is delegated #ifdef _ALP_OMP_WITH_REFERENCE + #include #include #include #endif diff --git a/include/alp/scalar.hpp b/include/alp/scalar.hpp index 95c73cfbc..33210aba2 100644 --- a/include/alp/scalar.hpp +++ b/include/alp/scalar.hpp @@ -24,8 +24,6 @@ #include "base/scalar.hpp" -#include - // now include all specialisations contained in the backend directories: #ifdef _ALP_WITH_REFERENCE #include diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 3eed07ea7..f16504294 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -270,7 +270,7 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp ) add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp - BACKENDS alp_reference alp_omp + BACKENDS alp_omp ) add_grb_executables( dense_outer dense_outer.cpp From 68995561f3674d7ebb7b882abfc318683d8900cc Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Mon, 5 Dec 2022 18:39:50 +0100 Subject: [PATCH 8/9] - Fixing mismatching new/delete bug - Enabling allocation/computation on sub-grid of threads - Added unit tests for non-cubic mxm --- include/alp/omp/blas3.hpp | 85 +++++++++--------- include/alp/omp/config.hpp | 2 +- include/alp/omp/storage.hpp | 136 +++++++++++++++-------------- include/alp/omp/vector.hpp | 82 +++++++++++------- tests/unit/dense_omp_mxm.cpp | 162 ++++++++++++++--------------------- 5 files changed, 226 insertions(+), 241 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index eae9fd9e6..0bba4444b 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -154,26 +154,22 @@ namespace alp { const th_coord_t th_ijk_b = db.getThreadCoords( thread ); const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); - const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); - const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); - const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); - RC local_rc = SUCCESS; - if( block_grid_dims_c.first != set_block_grid_dims_a.first - || block_grid_dims_c.second != set_block_grid_dims_b.second - || set_block_grid_dims_a.second != set_block_grid_dims_b.first - ) { -#ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tMismatching local block grid size on set." << std::endl; -#endif - local_rc = MISMATCH; - } +// if( block_grid_dims_c.first != set_block_grid_dims_a.first +// || block_grid_dims_c.second != set_block_grid_dims_b.second +// || set_block_grid_dims_a.second != set_block_grid_dims_b.first +// ) { +// #ifndef NDEBUG +// #pragma omp critical +// std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" +// "\tMismatching local block grid size on set." << std::endl; +// #endif +// local_rc = MISMATCH; +// } // Broadcast A and B to all c-dimensional layers - if( local_rc == SUCCESS && th_ijk_a.rt > 0 ) { + if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -181,6 +177,8 @@ namespace alp { "\tCopying A." << std::endl; #endif + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { @@ -196,7 +194,7 @@ namespace alp { } // End Broadcast of A - if( local_rc == SUCCESS && th_ijk_b.rt > 0 ) { + if( local_rc == SUCCESS && db.isActiveThread( th_ijk_b ) && th_ijk_b.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -204,6 +202,8 @@ namespace alp { "\tCopying B." << std::endl; #endif + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { @@ -217,7 +217,7 @@ namespace alp { } } // End Broadcast of B - if( local_rc == SUCCESS && th_ijk_c.rt > 0 ) { + if( local_rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt > 0 ) { #ifndef NDEBUG #pragma omp critical @@ -225,6 +225,8 @@ namespace alp { "\tZeroing C." 
<< std::endl; #endif + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + alp::Scalar< OutputType, alp::structures::General, config::default_sequential_backend > zero( monoid.template getIdentity< OutputType >() ); @@ -256,11 +258,11 @@ namespace alp { "\tPassing barrier" << std::endl; #endif - if( rc == SUCCESS ) { - + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) ) { + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + // Initialize circular shifts at stride of Rt - // size_t s_a = utils::modulus( th_ijk_a.tc - th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); - // size_t s_b = utils::modulus( th_ijk_b.tr - th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); @@ -363,39 +365,36 @@ namespace alp { // End layer-by-layer partial computation #pragma omp barrier - if( rc == SUCCESS ) { + // Final c-dimension reduction + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt == 0 ) { - // Final c-dimension reduction - // (Consider if rt > 0 critical section?) - if ( local_rc == SUCCESS && th_ijk_c.rt == 0 ) { - #ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tEntering reduction." << std::endl; + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." << std::endl; #endif - for( size_t r = 1; r < tg_c.rt; ++r ) { + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + for( size_t r = 1; r < tg_c.rt; ++r ) { #ifndef NDEBUG - #pragma omp critical - std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" - "\tReduction iteration r=" << r << std::endl; + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; #endif - const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); - for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 - auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); - // Final result in C at layer 0 - local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); - } + // Final result in C at layer 0 + local_rc = local_rc ? 
local_rc : foldl( refCij0, refCijr, monoid ); } - } } diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index d313a3a6d..79200558f 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -37,7 +37,7 @@ namespace alp { constexpr size_t BLOCK_ROW_DIM = 2; constexpr size_t BLOCK_COL_DIM = 2; - // Temporary solution to accomodate for 2.5D mxm + // Temporary squared solution to accomodate for 2.5D mxm constexpr size_t THREAD_ROW_DIM = 4; constexpr size_t THREAD_COL_DIM = 4; diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 47a728e9c..8bfe855fc 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -131,25 +131,25 @@ namespace alp { }; - /** Type encapsulating the local block coordinate. */ - struct LocalBlockCoord { - - const size_t tr; - const size_t tc; - const size_t rt; - const size_t br; - const size_t bc; - - LocalBlockCoord( - const size_t tr, const size_t tc, - const size_t rt, - const size_t br, const size_t bc - ) : - tr( tr ), tc( tc ), - rt( rt ), - br( br ), bc( bc ) {} - - }; + // /** Type encapsulating the local block coordinate. */ + // struct LocalBlockCoord { + + // const size_t tr; + // const size_t tc; + // const size_t rt; + // const size_t br; + // const size_t bc; + + // LocalBlockCoord( + // const size_t tr, const size_t tc, + // const size_t rt, + // const size_t br, const size_t bc + // ) : + // tr( tr ), tc( tc ), + // rt( rt ), + // br( br ), bc( bc ) {} + + // }; private: @@ -174,25 +174,25 @@ namespace alp { m( m ), n( n ), Br( static_cast< size_t >( std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), - Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), // Temporary + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) { - if( ( Tr > 1 ) && ( Tc > 1 ) && ( num_threads != Tr * Tc * Rt ) ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid:\n" - "\t" << Tr << " x " << Tc << " x " << Rt << std::endl; + if( num_threads != config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 2.5D grid:\n" + "\t" << num_threads << " != " << config::THREAD_ROW_DIM << " x " << config::THREAD_COL_DIM << " x " << config::REPLICATION_FACTOR_THREADS << std::endl; } } - LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { - (void) g; - return LocalBlockCoord( 0, 0, 0, 0, 0 ); - } + // LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { + // (void) g; + // return LocalBlockCoord( 0, 0, 0, 0, 0 ); + // } - GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { - const size_t block_id_r = l.br * Tr + l.tr; - const size_t block_id_c = l.bc * Tc + l.tc; - return GlobalBlockCoord( block_id_r, block_id_c ); - } + // GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { + // const size_t block_id_r = l.br * Tr + l.tr; + // const size_t block_id_c = l.bc * Tc + l.tc; + // return GlobalBlockCoord( block_id_r, block_id_c );// Temporary + // } LocalCoord mapGlobalToLocal( const GlobalCoord &g ) const { const size_t global_br = g.i / config::BLOCK_ROW_DIM; @@ -213,23 +213,26 @@ namespace alp { ); } - /** - * Maps coordinates from local to global space. 
- * - * \todo Add implementation + // /** + // * Maps coordinates from local to global space. + // * + // * \todo Add implementation + // */ + // GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { + // (void) l; + // return GlobalCoord( 0, 0 ); + // } + + /** + * Returns the thread ID corresponding to the given thread coordinates. + * The fixed thread grid enables to map left-over threads. */ - GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { - (void) l; - return GlobalCoord( 0, 0 ); - } - - /** Returns the thread ID corresponding to the given thread coordinates. */ size_t getThreadId( const ThreadCoords t ) const { - return t.rt * Tr * Tc + t.tr * Tc + t.tc; + return t.rt * config::THREAD_ROW_DIM * config::THREAD_COL_DIM + t.tr * config::THREAD_COL_DIM + t.tc; } size_t getNumberOfThreads() const { - return Tr * Tc * Rt; + return config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS; } /** Returns the thread grid size */ @@ -251,17 +254,17 @@ namespace alp { return { blocks_r, blocks_c }; } - /** Returns the global block coordinates based on the thread and local block coordinates */ - std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const size_t global_br = br * Tr + tr % Tr; - const size_t global_bc = bc * Tc + tc % Tc; - return { global_br, global_bc }; - } + // /** Returns the global block coordinates based on the thread and local block coordinates */ + // std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const size_t global_br = br * Tr + tr % Tr; + // const size_t global_bc = bc * Tc + tc % Tc; + // return { global_br, global_bc }; + // } - size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); - return global_coords.first * Bc + global_coords.second; - } + // size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); + // return global_coords.first * Bc + global_coords.second; + // } size_t getLocalBlockId( const LocalCoord &local ) const { return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc; @@ -297,27 +300,28 @@ namespace alp { } /** For a given block, returns its offset from the beginning of the buffer in which it is stored */ - size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const { + size_t getBlocksOffset( const ThreadCoords &t, const size_t br, const size_t bc ) const { // The offset is calculated as the sum of sizes of all previous blocks const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc; return block_coord_1D * getBlockSize(); } ThreadCoords getThreadCoords( const size_t thread_id ) const { - // const size_t _Tr = ( Tr == 1 ) ? Tc : Tr; - // const size_t _Tc = ( Tc == 1 ) ? 
Tr : Tc; - - // const size_t rt = thread_id / ( _Tr * _Tc ); - // const size_t tr = ( thread_id % ( _Tr * _Tc ) ) / _Tc; - // const size_t tc = ( thread_id % ( _Tr * _Tc ) ) % _Tc; + const size_t rt = thread_id / ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ); + const size_t tr = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) / config::THREAD_COL_DIM; + const size_t tc = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) % config::THREAD_COL_DIM; + return { tr, tc, rt }; + } - // return { tr % Tr, tc % Tc, rt }; + bool isActiveThread(const ThreadCoords &t ) const { + return t.tr < Tr && t.tc < Tc && t.rt < Rt; + } - const size_t rt = thread_id / ( Tr * Tc ); - const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc; - const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc; - return { tr, tc, rt }; + bool isActiveThread(const size_t thread_id ) const { + const auto th_coords = getThreadCoords( thread_id ); + return isActiveThread( th_coords ); } + }; namespace storage { diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp index 8116f1f9f..b653de004 100644 --- a/include/alp/omp/vector.hpp +++ b/include/alp/omp/vector.hpp @@ -104,6 +104,9 @@ namespace alp { private: + /** The distribution of the vector among threads. */ + Distribution_2_5D distr; + /** The number of buffers. */ size_t num_buffers; @@ -149,7 +152,8 @@ namespace alp { Vector( const Distribution_2_5D &d, const size_t cap = 0 - ) : num_buffers( d.getNumberOfThreads() ), + ) : distr( d ), + num_buffers( d.getNumberOfThreads() ), containers( num_buffers ), initialized( false ) { @@ -165,47 +169,54 @@ namespace alp { throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); } - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + const auto t_coords = d.getThreadCoords( thread ); - const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); - // Assuming that all blocks are of the same size - const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); + if ( d.isActiveThread( t_coords ) ) { + + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); + + // Assuming that all blocks are of the same size + const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); #ifdef DEBUG - #pragma omp critical - { - if( thread != config::OMP::current_thread_ID() ) { - std::cout << "Warning: thread != OMP::current_thread_id()\n"; + #pragma omp critical + { + if( thread != config::OMP::current_thread_ID() ) { + std::cout << "Warning: thread != OMP::current_thread_id()\n"; + } + std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc + << " on OpenMP thread " << config::OMP::current_thread_ID() + << " allocating buffer of " << alloc_size << " elements " + << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; } - std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc - << " on OpenMP thread " << config::OMP::current_thread_ID() - << " allocating buffer of " << alloc_size << " elements " - << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; - } #endif - // TODO: Implement allocation properly - buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; + // TODO: Implement allocation properly + 
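+					// Per-thread allocation: each buffer is requested by the thread
+					// that owns it, so that when that thread later initialises its
+					// blocks, first-touch page placement (where the OS applies it)
+					// can keep the data local to that thread's NUMA node. The
+					// matching release is the per-thread delete [] in ~Vector().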
buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; - if( buffers[ thread ] == nullptr ) { - throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); - } + if( buffers[ thread ] == nullptr ) { + throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); + } - // Reserve space for all internal container wrappers to avoid re-allocation - containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); + // Reserve space for all internal container wrappers to avoid re-allocation + containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); - // Populate the array of internal container wrappers - for( size_t br = 0; br < block_grid_dims.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - const size_t offset = d.getBlocksOffset( t_coords, br, bc ); - containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + // Populate the array of internal container wrappers + for( size_t br = 0; br < block_grid_dims.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + const size_t offset = d.getBlocksOffset( t_coords, br, bc ); + containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + } } - } - // Ensure that the array contains the expected number of containers - assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + // Ensure that the array contains the expected number of containers + assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + + } // End active threads allocation } } @@ -277,9 +288,14 @@ namespace alp { */ ~Vector() { if( buffers != nullptr ) { - for( size_t i = 0; i < num_buffers; ++i ) { - if( buffers[ i ] != nullptr ) { - delete buffers[ i ]; + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + if ( distr.isActiveThread( thread ) ) { + if( buffers[ thread ] != nullptr ) { + delete [] buffers[ thread ]; + } } } delete [] buffers; diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index be05c340b..157b71494 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -193,45 +193,44 @@ void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size } -#define M ( alp::config::BLOCK_ROW_DIM * n ) -#define K ( alp::config::BLOCK_COL_DIM * n ) -#define N ( alp::config::BLOCK_COL_DIM * n ) - -void alp_program( const size_t & n, alp::RC & rc ) { - - typedef double T; +template < typename T, typename SemiringT > +void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { - alp::Semiring< alp::operators::add< T >, alp::operators::mul< T >, alp::identities::zero, alp::identities::one > ring; + const SemiringT ring; + const T one = ring.template getOne< T >(); + const T zero = ring.template getZero< T >(); - T one = ring.getOne< T >(); - T zero = ring.getZero< T >(); + std::vector< T > A_data( m * k ); + std::vector< T > B_data( k * n ); + std::vector< T > C_data( m * n, zero ); - std::vector< T > A_data( M * K ); - std::vector< T > B_data( K * N ); - std::vector< T > C_data( M * N, zero ); + std::vector< T > A_vec( m * k ); + std::vector< T > B_vec( k * n ); + std::vector< T > C_vec( m * n, zero ); - // std::vector< T > A_packed( n * ( n + 1 ) / 2 ); - // std::vector< T > B_packed( n * ( n + 1 ) / 2 ); - // std::vector< T > C_packed( n * ( n + 1 ) / 2, zero ); + std::cout << "\tTesting dense 
General mxm " << m << " " << k << " " << n << std::endl; - std::vector< T > A_vec( M * K ); - std::vector< T > B_vec( K * N ); - std::vector< T > C_vec( M * N, zero ); - - std::cout << "\tTesting dense General mxm " << M << " " << K << " " << N << std::endl; - - stdvec_build_matrix< structures::General >( A_data, M, K, K, zero, one, one ); - stdvec_build_matrix< structures::General >( B_data, K, N, N, zero, one, one ); + stdvec_build_matrix< structures::General >( A_data, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, k, n, n, zero, one, one ); // initialize test - alp::Matrix< T, structures::General > A( M, K ); - alp::Matrix< T, structures::General > B( K, N ); - alp::Matrix< T, structures::General > C( M, N ); + alp::Matrix< T, structures::General > A( m, k ); + alp::Matrix< T, structures::General > B( k, n ); + alp::Matrix< T, structures::General > C( m, n ); // Initialize input matrices - rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); - rc = alp::buildMatrix( B, B_data.begin(), B_data.end() ); - rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + rc = rc ? rc : alp::buildMatrix( A, A_data.begin(), A_data.end() ); + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building A" << std::endl; + return; + } + rc = rc ? rc : alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = rc ? rc : alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building matrices" << std::endl; + return; + } #ifndef NDEBUG print_matrix("A", A); @@ -239,94 +238,61 @@ void alp_program( const size_t & n, alp::RC & rc ) { print_matrix("C - PRE", C); #endif - rc = alp::mxm( C, A, B, ring ); + rc = rc ? rc : alp::mxm( C, A, B, ring ); #ifndef NDEBUG print_matrix("C - POST", C); #endif - stdvec_build_matrix< structures::General >( A_vec, M, K, K, zero, one, one ); - stdvec_build_matrix< structures::General >( B_vec, K, N, N, zero, one, one ); - - mxm_stdvec_as_matrix( C_vec, N, A_vec, K, B_vec, N, M, K, N, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - diff_stdvec_matrix( C_vec, M, N, N, C ); - - // std::cout << "\n\n=========== Testing Uppertriangular ============\n\n"; + if ( rc != alp::SUCCESS ) + return; - // alp::Matrix< T, structures::UpperTriangular > UA( n ); - // alp::Matrix< T, structures::UpperTriangular > UB( n ); - // alp::Matrix< T, structures::UpperTriangular > UC( n ); + stdvec_build_matrix< structures::General >( A_vec, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, k, n, n, zero, one, one ); - // stdvec_build_matrix_packed< structures::UpperTriangular >( A_packed, one, one ); - // stdvec_build_matrix_packed< structures::UpperTriangular >( B_packed, one, one ); + mxm_stdvec_as_matrix( C_vec, n, A_vec, k, B_vec, n, m, k, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - // rc = alp::buildMatrix( UA, A_packed.begin(), A_packed.end() ); - // rc = alp::buildMatrix( UB, B_packed.begin(), B_packed.end() ); - // rc = alp::buildMatrix( UC, C_packed.begin(), C_packed.end() ); + diff_stdvec_matrix( C_vec, m, n, n, C ); - // print_matrix("UA", UA); - // print_matrix("UB", UB); - // print_matrix("UC - PRE", UC); - // rc = alp::mxm( UC, UA, UB, ring ); - // print_matrix("UC - POST", UC); - // stdvec_build_matrix< structures::UpperTriangular >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::UpperTriangular >( B_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::General >( C_vec, n, n, n, 
zero, zero ); + std::cout << "\tDone." << std::endl; - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - // diff_stdvec_matrix( C_vec, n, n, n, UC ); - - // std::cout << "\n\n=========== Testing Symmetric Output ============\n\n"; - - // alp::Matrix< T, structures::Symmetric > SC( n ); - - // stdvec_build_matrix< structures::Symmetric >( A_data, n, n, n, zero, one, one ); - - // rc = alp::buildMatrix( A, A_data.begin(), A_data.end() ); - // rc = alp::buildMatrix( SC, C_packed.begin(), C_packed.end() ); - - // print_matrix("A", A ); - // print_matrix("A^T", alp::get_view< alp::view::transpose >( A ) ); - // print_matrix("SC - PRE", SC); - // rc = alp::mxm( SC, A, alp::get_view< alp::view::transpose >( A ), ring ); - // print_matrix("SC - POST", SC); - - // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::Symmetric >( C_vec, n, n, n, zero, zero ); - - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, A_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); - - // diff_stdvec_matrix( C_vec, n, n, n, SC ); +} - // std::cout << "\n\n=========== Testing Symmetric x Symmetric Output ============\n\n"; +#define M ( alp::config::BLOCK_ROW_DIM * n ) +#define K ( alp::config::BLOCK_COL_DIM * 2 * n ) +#define N ( alp::config::BLOCK_COL_DIM * 3 * n ) - // alp::Matrix< T, structures::Symmetric > SA( n ); - // alp::Matrix< T, structures::Symmetric > SB( n ); +void alp_program( const size_t &n, alp::RC &rc ) { - // stdvec_build_matrix_packed< structures::Symmetric >( A_packed, one, one ); - // stdvec_build_matrix_packed< structures::Symmetric >( B_packed, one, one + one ); + using T = double; - // rc = alp::buildMatrix( SA, A_packed.begin(), A_packed.end() ); - // rc = alp::buildMatrix( SB, B_packed.begin(), B_packed.end() ); - // rc = alp::buildMatrix( C, C_data.begin(), C_data.end() ); + using SemiringT = alp::Semiring< + alp::operators::add< T >, alp::operators::mul< T >, + alp::identities::zero, alp::identities::one + >; - // print_matrix("SA", SA); - // print_matrix("SB", SB); - // print_matrix("C - PRE", C); - // rc = alp::mxm( C, SA, SB, ring ); - // print_matrix("C - POST", C); + rc = alp::SUCCESS; - // stdvec_build_matrix< structures::Symmetric >( A_vec, n, n, n, zero, one, one ); - // stdvec_build_matrix< structures::Symmetric >( B_vec, n, n, n, zero, one, one + one ); - // stdvec_build_matrix< structures::General >( C_vec, n, n, n, zero, zero ); + /** + * Testing cubic mxm. + */ + run_mxm< T, SemiringT >( M, M, M, rc ); - // mxm_stdvec_as_matrix( C_vec, n, A_vec, n, B_vec, n, n, n, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + /** + * Testing rectangular mxm + */ + run_mxm< T, SemiringT >( M, K, N, rc ); - // diff_stdvec_matrix( C_vec, n, n, n, C ); + /** + * Testing outer-prod of blocks mxm + */ + run_mxm< T, SemiringT >( M, alp::config::BLOCK_COL_DIM, N, rc ); + /** + * Testing dot-prod of blocks mxm + */ + run_mxm< T, SemiringT >( alp::config::BLOCK_ROW_DIM, M, alp::config::BLOCK_COL_DIM, rc ); } From 16f4c97387a28008c186a7ba99528aaafd1bb0bd Mon Sep 17 00:00:00 2001 From: "Daniele G. 
Spampinato" Date: Tue, 13 Dec 2022 15:15:15 +0100 Subject: [PATCH 9/9] Fixed style and typos --- include/alp/omp/blas3.hpp | 35 +++++++--------------- include/alp/omp/io.hpp | 2 +- include/alp/omp/storage.hpp | 2 +- tests/unit/dense_omp_mxm.cpp | 56 +++++++++++++++++++----------------- 4 files changed, 41 insertions(+), 54 deletions(-) diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp index 0bba4444b..4954e00b5 100644 --- a/include/alp/omp/blas3.hpp +++ b/include/alp/omp/blas3.hpp @@ -29,7 +29,6 @@ #include #include -// #include #include #include "matrix.hpp" @@ -77,12 +76,12 @@ namespace alp { const Operator &oper, const Monoid &monoid, const MulMonoid &mulMonoid, - const typename std::enable_if< !alp::is_object< OutputType >::value && + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && ! alp::is_object< InputType2 >::value && alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, - void >::type * const = NULL + void > * const = NULL ) { static_assert( @@ -100,18 +99,17 @@ namespace alp { #endif // Early exit checks - if( ! internal::getInitialized( A ) || - ! internal::getInitialized( B ) || + if( ! internal::getInitialized( A ) || ! internal::getInitialized( B ) || ! internal::getInitialized( C ) ) { internal::setInitialized( C, false ); return SUCCESS; } - const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; - const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; - const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; @@ -156,18 +154,6 @@ namespace alp { RC local_rc = SUCCESS; -// if( block_grid_dims_c.first != set_block_grid_dims_a.first -// || block_grid_dims_c.second != set_block_grid_dims_b.second -// || set_block_grid_dims_a.second != set_block_grid_dims_b.first -// ) { -// #ifndef NDEBUG -// #pragma omp critical -// std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" -// "\tMismatching local block grid size on set." 
<< std::endl; -// #endif -// local_rc = MISMATCH; -// } - // Broadcast A and B to all c-dimensional layers if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { @@ -308,7 +294,6 @@ namespace alp { for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { - const auto refB_loc = get_view( B, th_sjk_b, bk, bc ); auto refC_ijk = get_view( C, th_ijk_c, br, bc ); @@ -465,13 +450,13 @@ namespace alp { Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const typename std::enable_if< + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && !alp::is_object< InputType2 >::value && alp::is_semiring< Semiring >::value, void - >::type * const = NULL + > * const = NULL ) { (void)phase; @@ -506,13 +491,13 @@ namespace alp { const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const typename std::enable_if< + const std::enable_if_t< !alp::is_object< OutputType >::value && !alp::is_object< InputType1 >::value && !alp::is_object< InputType2 >::value && alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, void - >::type * const = NULL + > * const = NULL ) { (void)phase; diff --git a/include/alp/omp/io.hpp b/include/alp/omp/io.hpp index 2794b1336..9fc66e815 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -152,7 +152,7 @@ namespace alp { ) noexcept { static_assert( - ( std::is_same< InputType, typename fwd_iterator::value_type >::value ), + std::is_same< InputType, typename fwd_iterator::value_type >::value, "alp::buildMatrix (omp): Mismatching type between user-provided input " "container and output ALP container." ); diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index 8bfe855fc..6e3e23cf6 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -314,7 +314,7 @@ namespace alp { } bool isActiveThread(const ThreadCoords &t ) const { - return t.tr < Tr && t.tc < Tc && t.rt < Rt; + return ( t.tr < Tr ) && ( t.tc < Tc ) && ( t.rt < Rt ); } bool isActiveThread(const size_t thread_id ) const { diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp index 157b71494..3628d5d45 100644 --- a/tests/unit/dense_omp_mxm.cpp +++ b/tests/unit/dense_omp_mxm.cpp @@ -28,7 +28,7 @@ using namespace alp; template< typename T > -void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda ) { +void print_stdvec_as_matrix( std::string name, const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda ) { std::cout << "Vec " << name << ":" << std::endl; for( size_t row = 0; row < m; ++row ) { @@ -41,27 +41,29 @@ void print_stdvec_as_matrix( std::string name, const std::vector< T > & vA, cons } template< typename T, typename Operator, typename Monoid > -void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, - const std::vector< T > & vA, const size_t lda, - const std::vector< T > & vB, const size_t ldb, - const size_t m, const size_t k, const size_t n, - const Operator oper, - const Monoid monoid ) { +void mxm_stdvec_as_matrix( + std::vector< T > &vC, const size_t ldc, + const std::vector< T > &vA, const size_t lda, + const std::vector< T > &vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid +) { T temp; #ifndef NDEBUG - print_stdvec_as_matrix("vA", vA, n, n, n); - print_stdvec_as_matrix("vB", vB, 
n, n, n); - print_stdvec_as_matrix("vC - PRE", vC, n, n, n); + print_stdvec_as_matrix( "vA", vA, n, n, n ); + print_stdvec_as_matrix( "vB", vB, n, n, n ); + print_stdvec_as_matrix( "vC - PRE", vC, n, n, n ); #endif for( size_t i = 0; i < m; ++i ) { for( size_t j = 0; j < n; ++j ) { - T & c_val { vC[ i * ldc + j ] }; + T &c_val { vC[ i * ldc + j ] }; for( size_t l = 0; l < k; ++l ) { - const T & a_val { vA[ i * lda + l ] }; - const T & b_val { vB[ l * ldb + j ] }; + const T &a_val { vA[ i * lda + l ] }; + const T &b_val { vB[ l * ldb + j ] }; // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; (void)internal::apply( temp, a_val, b_val, oper ); // std::cout << "temp = " << temp << std::endl; @@ -71,12 +73,12 @@ void mxm_stdvec_as_matrix( std::vector< T > & vC, const size_t ldc, } #ifndef NDEBUG - print_stdvec_as_matrix("vC - POST", vC, n, n, n); + print_stdvec_as_matrix( "vC - POST", vC, n, n, n ); #endif } template< typename Structure, typename T > -void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { if( std::is_same< Structure, structures::General >::value ) { std::fill( vA.begin(), vA.end(), one ); @@ -96,7 +98,7 @@ void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, } template< typename Structure, typename T > -void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { T val = one; if( std::is_same< Structure, structures::General >::value ) { @@ -128,23 +130,23 @@ void stdvec_build_matrix( std::vector< T > & vA, const size_t m, const size_t n, } template< typename Structure, typename T > -void stdvec_build_matrix_packed( std::vector< T > & vA, const T one ) { +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one ) { std::fill( vA.begin(), vA.end(), one ); } template< typename Structure, typename T > -void stdvec_build_matrix_packed( std::vector< T > & vA, const T one, const T inc ) { +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one, const T inc ) { T val = one; if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper - for( auto & elem: vA ) { + for( auto &elem: vA ) { elem = val; val += inc; } } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper - for( auto & elem: vA ) { + for( auto &elem: vA ) { elem = val; val += inc; } @@ -153,8 +155,8 @@ void stdvec_build_matrix_packed( std::vector< T > & vA, const T one, const T inc } template< typename MatType, typename T > -void diff_stdvec_matrix( const std::vector< T > & vA, const size_t m, const size_t n, const size_t lda, - const MatType & mA, double threshold=1e-7 ) { +void diff_stdvec_matrix( const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold = 1e-7 ) { if( std::is_same< typename MatType::structure, structures::General >::value ) { for( size_t row = 0; row < m; ++row ) { @@ -233,15 +235,15 @@ void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { } #ifndef NDEBUG - print_matrix("A", A); - print_matrix("B", B); - print_matrix("C - PRE", C); + print_matrix( 
"A", A ); + print_matrix( "B", B ); + print_matrix( "C - PRE", C ); #endif rc = rc ? rc : alp::mxm( C, A, B, ring ); #ifndef NDEBUG - print_matrix("C - POST", C); + print_matrix( "C - POST", C ); #endif if ( rc != alp::SUCCESS ) @@ -296,7 +298,7 @@ void alp_program( const size_t &n, alp::RC &rc ) { } -int main( int argc, char ** argv ) { +int main( int argc, char **argv ) { // defaults bool printUsage = false; size_t in = 4;