diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp index e1f6d0baa..976d3ccb7 100644 --- a/include/alp/blas3.hpp +++ b/include/alp/blas3.hpp @@ -26,6 +26,9 @@ #ifdef _ALP_WITH_REFERENCE #include #endif +#ifdef _ALP_WITH_OMP + #include +#endif #endif // end _H_ALP_BLAS3 diff --git a/include/alp/omp/blas3.hpp b/include/alp/omp/blas3.hpp new file mode 100644 index 000000000..4954e00b5 --- /dev/null +++ b/include/alp/omp/blas3.hpp @@ -0,0 +1,511 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 14th of January 2022 + */ + +#ifndef _H_ALP_OMP_BLAS3 +#define _H_ALP_OMP_BLAS3 + +#include // for std::min/max +#include // for std::enable_if + +#include +#include + +#include + +#include "matrix.hpp" +#include "storage.hpp" + +// Include backend to which sequential work is delegated +#ifdef _ALP_OMP_WITH_REFERENCE + #include + #include + #include +#endif + +#ifndef NDEBUG +#include "../../../tests/utils/print_alp_containers.hpp" +#endif + + +namespace alp { + + namespace internal { + + /** + * \internal general mxm implementation that all mxm variants using + * structured matrices refer to. 
+ */ + template< + bool allow_void, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + class Operator, class Monoid, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2 + > + RC mxm_generic( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const std::enable_if_t< !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && ! + alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && + alp::is_monoid< Monoid >::value, + void > * const = NULL + ) { + + static_assert( + !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ), + "alp::internal::mxm_generic: the operator-monoid version of mxm cannot be " + "used if either of the input matrices is a pattern matrix (of type " + "void)" + ); + +#ifndef NDEBUG + std::cout << "In alp::internal::mxm_generic (omp)\n"; +#endif + + // Early exit checks + if( ! internal::getInitialized( A ) || ! internal::getInitialized( B ) || + ! 
internal::getInitialized( C ) + ) { + internal::setInitialized( C, false ); + return SUCCESS; + } + + const std::ptrdiff_t m { static_cast< std::ptrdiff_t >( nrows( C ) ) }; + const std::ptrdiff_t n { static_cast< std::ptrdiff_t >( ncols( C ) ) }; + const std::ptrdiff_t m_a { static_cast< std::ptrdiff_t >( nrows( A ) ) }; + const std::ptrdiff_t k { static_cast< std::ptrdiff_t >( ncols( A ) ) }; + const std::ptrdiff_t k_b { static_cast< std::ptrdiff_t >( nrows( B ) ) }; + const std::ptrdiff_t n_b { static_cast< std::ptrdiff_t >( ncols( B ) ) }; + + if( m != m_a || k != k_b || n != n_b ) { + return MISMATCH; + } + + const Distribution_2_5D &da = internal::getAmf( A ).getDistribution(); + const Distribution_2_5D &db = internal::getAmf( B ).getDistribution(); + const Distribution_2_5D &dc = internal::getAmf( C ).getDistribution(); + + const auto tg_a = da.getThreadGridDims(); + const auto tg_b = db.getThreadGridDims(); + const auto tg_c = dc.getThreadGridDims(); + + // 2.5D factor ranges within 3D limits + if( tg_c.rt != tg_a.rt || tg_a.rt != tg_b.rt || ( tg_a.tc % tg_a.rt ) != 0 ) { + return MISMATCH; + } + + if( tg_c.tr != tg_a.tr || tg_c.tc != tg_b.tc || tg_a.tc != tg_b.tr ) { + return MISMATCH; + } + + using th_coord_t = typename Distribution_2_5D::ThreadCoords; + + RC rc = SUCCESS; + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "In alp::internal::mxm_generic (omp)\n" + "\tBeginning parallel region" << std::endl; +#endif + + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + const th_coord_t th_ijk_a = da.getThreadCoords( thread ); + const th_coord_t th_ijk_b = db.getThreadCoords( thread ); + const th_coord_t th_ijk_c = dc.getThreadCoords( thread ); + + RC local_rc = SUCCESS; + + // Broadcast A and B to all c-dimensional layers + if( local_rc == SUCCESS && da.isActiveThread( th_ijk_a ) && th_ijk_a.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic 
(omp)\n" + "\tCopying A." << std::endl; +#endif + + const auto set_block_grid_dims_a = da.getLocalBlockGridDims( th_ijk_a ); + + th_coord_t th_ij0_a( th_ijk_a.tr, th_ijk_a.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_a.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_a.second; ++bc ) { + + auto refAij0 = get_view( A, th_ij0_a, br, bc ); + auto refAijk = get_view( A, th_ijk_a, br, bc ); + + local_rc = local_rc ? local_rc : set( refAijk, refAij0 ); + + } + } + + } // End Broadcast of A + + if( local_rc == SUCCESS && db.isActiveThread( th_ijk_b ) && th_ijk_b.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCopying B." << std::endl; +#endif + + const auto set_block_grid_dims_b = db.getLocalBlockGridDims( th_ijk_b ); + + th_coord_t th_ij0_b( th_ijk_b.tr, th_ijk_b.tc, 0 ); + + for( size_t br = 0; br < set_block_grid_dims_b.first; ++br ) { + for( size_t bc = 0; bc < set_block_grid_dims_b.second; ++bc ) { + + auto refBij0 = get_view( B, th_ij0_b, br, bc ); + auto refBijk = get_view( B, th_ijk_b, br, bc ); + + local_rc = local_rc ? local_rc : set( refBijk, refBij0 ); + } + } + } // End Broadcast of B + + if( local_rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt > 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tZeroing C." << std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + alp::Scalar< + OutputType, alp::structures::General, config::default_sequential_backend + > zero( monoid.template getIdentity< OutputType >() ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + auto refCijk = get_view( C, th_ijk_c, br, bc ); + local_rc = local_rc ? 
local_rc : set( refCijk, zero ); + } + } + } // End Zero-ing of C + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues replicating input matrices." << std::endl; +#endif + rc = local_rc; + } + + // End Broadcast of A and B and zero-ing of C + #pragma omp barrier + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tPassing barrier" << std::endl; +#endif + + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) ) { + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + // Initialize circular shifts at stride of Rt + size_t c_a = utils::modulus( th_ijk_a.tc + th_ijk_a.tr + th_ijk_a.rt * tg_a.tc / tg_a.rt, tg_a.tc ); + size_t r_b = utils::modulus( th_ijk_b.tr + th_ijk_b.tc + th_ijk_b.rt * tg_b.tr / tg_b.rt, tg_b.tr ); + + // Per-c-dimensional-layer partial computation + for( size_t r = 0; r < tg_a.tc / tg_a.rt; ++r ) { + + const th_coord_t th_isk_a( th_ijk_a.tr, c_a, th_ijk_a.rt ); + const th_coord_t th_sjk_b( r_b, th_ijk_b.tc, th_ijk_b.rt ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tCompute iteration r=" << r << "\n" + "\tStarting from A( " << th_ijk_a.tr << ", " << c_a << ", " << th_ijk_a.rt << " )\n" + "\tStarting from B( " << r_b << ", " << th_ijk_b.tc << ", " << th_ijk_b.rt << " )" <( refC_ijk, refA_loc, refB_loc, oper, monoid, mulMonoid ); + +#ifndef NDEBUG + if( thread == 1 ) { + #pragma omp critical + { + print_matrix("Cref post", refC_ijk ); + } + } +#endif + + } + } + } + + // Circular shift rightwards for A + c_a = utils::modulus( c_a + 1, tg_a.tc ); + // Circular shift downwards for B + r_b = utils::modulus( r_b + 1, tg_b.tr ); + + } + + } + + } // End computation per layer of 
c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with local mxm computations." << std::endl; +#endif + rc = local_rc; + } + + // End layer-by-layer partial computation + #pragma omp barrier + + // Final c-dimension reduction + if( rc == SUCCESS && dc.isActiveThread( th_ijk_c ) && th_ijk_c.rt == 0 ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tEntering reduction." << std::endl; +#endif + + const auto block_grid_dims_c = dc.getLocalBlockGridDims( th_ijk_c ); + + for( size_t r = 1; r < tg_c.rt; ++r ) { + +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tReduction iteration r=" << r << std::endl; +#endif + + const th_coord_t th_ijr_c( th_ijk_c.tr, th_ijk_c.tc, r ); + + for( size_t br = 0; br < block_grid_dims_c.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims_c.second; ++bc ) { + + auto refCij0 = internal::get_view( C, th_ijk_c, br, bc ); // k == 0 + auto refCijr = internal::get_view( C, th_ijr_c, br, bc ); + + // Final result in C at layer 0 + local_rc = local_rc ? local_rc : foldl( refCij0, refCijr, monoid ); + } + } + + } + + } // End of final reduction along c-dimension + + // Different values for rc could converge here (eg, MISMATCH, FAILED). + if( local_rc != SUCCESS ) { +#ifndef NDEBUG + #pragma omp critical + std::cerr << "Thread " << thread << " in alp::internal::mxm_generic (omp)\n" + "\tIssues with final reduction." << std::endl; +#endif + rc = local_rc; + } + + } + + return rc; + + } + + } // namespace internal + + /** + * Dense Matrix-Matrix multiply between structured matrices. + * Version with semiring parameter. + * + * @tparam descr The descriptors under which to perform the computation. 
+ * @tparam OutputStructMatT The structured matrix type of the output matrix. + * @tparam InputStructMatT1 The structured matrix type of the left-hand side input + * matrix. + * @tparam InputStructMatT2 The structured matrix type of the right-hand side input + * matrix. + * @tparam Semiring The semiring under which to perform the + * multiplication. + * + * @returns SUCCESS If the computation completed as intended. + * @returns MISMATCH Whenever the structures or dimensions of \a A, \a B, and \a C do + * not match. All input data containers are left + * untouched if this exit code is returned; it will be + * as though this call was never made. + * + * @param[out] C The output matrix \f$ C = AB \f$ when the function returns + * #SUCCESS. + * @param[in] A The left-hand side input matrix \f$ A \f$. + * @param[in] B The right-hand side input matrix \f$ B \f$. + * @param[in] ring (Optional.) The semiring under which the computation should + * proceed. + * @param phase The execution phase. 
+ */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Semiring + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Semiring & ring = Semiring(), + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); + + } + + /** + * Dense Matrix-Matrix multiply between structured matrices. 
+ * Version with additive monoid and multiplicative operator + */ + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, + class Operator, class Monoid + > + RC mxm( + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, omp > &C, + alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, omp > &A, + alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, omp > &B, + const Operator & mulOp, + const Monoid & addM, + const PHASE &phase = NUMERICAL, + const std::enable_if_t< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + > * const = NULL + ) { + (void)phase; + + return internal::mxm_generic< false >( C, A, B, mulOp, addM, Monoid() ); + + } + +} // end namespace ``alp'' + +#endif // end ``_H_ALP_OMP_BLAS3'' + diff --git a/include/alp/omp/config.hpp b/include/alp/omp/config.hpp index 3bdb700ed..79200558f 100644 --- a/include/alp/omp/config.hpp +++ b/include/alp/omp/config.hpp @@ -34,8 +34,12 @@ namespace alp { class OMP : public grb::config::OMP {}; // Dimensions of blocks counted in number of elements per dimension - constexpr size_t BLOCK_ROW_DIM = 16; - constexpr size_t BLOCK_COL_DIM = 16; + constexpr size_t BLOCK_ROW_DIM = 2; + constexpr size_t BLOCK_COL_DIM = 2; + + // Temporary squared solution to accomodate for 2.5D mxm + constexpr size_t THREAD_ROW_DIM = 4; + constexpr size_t THREAD_COL_DIM = 4; constexpr size_t REPLICATION_FACTOR_THREADS = 1; diff --git a/include/alp/omp/io.hpp 
b/include/alp/omp/io.hpp index 0f5793ac6..9fc66e815 100644 --- a/include/alp/omp/io.hpp +++ b/include/alp/omp/io.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_IO #define _H_ALP_OMP_IO +#include + #include #include "matrix.hpp" @@ -136,6 +138,49 @@ namespace alp { return rc; } + /** + * Temporarily assuming 1-1, row-major mapping with user container. + */ + template< + typename InputType, typename Structure, typename View, + typename ImfR, typename ImfC, typename fwd_iterator + > + RC buildMatrix( + Matrix< InputType, Structure, Density::Dense, View, ImfR, ImfC, omp > &A, + const fwd_iterator &start, + const fwd_iterator &end + ) noexcept { + + static_assert( + std::is_same< InputType, typename fwd_iterator::value_type >::value, + "alp::buildMatrix (omp): Mismatching type between user-provided input " + "container and output ALP container." + ); + + const size_t m = nrows( A ); + const size_t n = ncols( A ); + + if( ( end - start ) != static_cast< std::ptrdiff_t >( m * n ) ) { + + std::cerr << "alp::buildMatrix (omp): Mismatching sizes between " + "user-provided input container and output ALP container." 
<< std::endl; + + return MISMATCH; + } + + internal::setInitialized(A, true); + + fwd_iterator it = start; + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + internal::access( A, internal::getStorageIndex( A, i, j ) ) = *( it++ ); + } + } + + return SUCCESS; + } + } // end namespace ``alp'' #undef NO_CAST_ASSERT diff --git a/include/alp/omp/matrix.hpp b/include/alp/omp/matrix.hpp index afb16d412..94e600979 100644 --- a/include/alp/omp/matrix.hpp +++ b/include/alp/omp/matrix.hpp @@ -23,6 +23,8 @@ #ifndef _H_ALP_OMP_MATRIX #define _H_ALP_OMP_MATRIX +#include + #include #include @@ -37,6 +39,7 @@ #include // For AMFFactory #endif + namespace alp { // Currently no backend specific implementation @@ -56,10 +59,10 @@ namespace alp { typename SourceMatrix, typename ThreadCoords, std::enable_if_t< - is_matrix< SourceMatrix >::value + alp::is_matrix< SourceMatrix >::value > * = nullptr > - typename internal::new_container_type_from< + typename new_container_type_from< typename SourceMatrix::template view_type< view::gather >::type >::template change_backend< config::default_sequential_backend >::type get_view( SourceMatrix &source, const ThreadCoords t, const size_t br, const size_t bc ) { @@ -68,7 +71,7 @@ namespace alp { const auto &distribution = getAmf( source ).getDistribution(); const size_t thread_id = distribution.getThreadId( t ); const size_t block_id = distribution.getLocalBlockId( t, br, bc ); - auto &container = internal::getLocalContainer( internal::getContainer( source ), thread_id, block_id ); + auto &container = getLocalContainer( getContainer( source ), thread_id, block_id ); // make an AMF // note: When making a view over a vector, the second imf must be imf::Zero @@ -92,11 +95,13 @@ namespace alp { ); // create a sequential container with the container and AMF above - using target_t = typename internal::new_container_type_from< + using target_t = typename new_container_type_from< typename SourceMatrix::template view_type< 
view::gather >::type >::template change_backend< config::default_sequential_backend >::type; - return target_t( container, amf ); + target_t blk_matrix( container, amf ); + internal::setInitialized( blk_matrix, internal::getInitialized( source ) ); + return blk_matrix; } } // namespace internal diff --git a/include/alp/omp/storage.hpp b/include/alp/omp/storage.hpp index fb24da0bf..6e3e23cf6 100644 --- a/include/alp/omp/storage.hpp +++ b/include/alp/omp/storage.hpp @@ -131,39 +131,39 @@ namespace alp { }; - /** Type encapsulating the local block coordinate. */ - struct LocalBlockCoord { - - const size_t tr; - const size_t tc; - const size_t rt; - const size_t br; - const size_t bc; - - LocalBlockCoord( - const size_t tr, const size_t tc, - const size_t rt, - const size_t br, const size_t bc - ) : - tr( tr ), tc( tc ), - rt( rt ), - br( br ), bc( bc ) {} - - }; + // /** Type encapsulating the local block coordinate. */ + // struct LocalBlockCoord { + + // const size_t tr; + // const size_t tc; + // const size_t rt; + // const size_t br; + // const size_t bc; + + // LocalBlockCoord( + // const size_t tr, const size_t tc, + // const size_t rt, + // const size_t br, const size_t bc + // ) : + // tr( tr ), tc( tc ), + // rt( rt ), + // br( br ), bc( bc ) {} + + // }; private: /** Row and column dimensions of the associated container */ const size_t m; const size_t n; - /** The row and column dimensions of the thread grid */ - const size_t Tr; - const size_t Tc; /** Replication factor in thread-coordinate space */ static constexpr size_t Rt = config::REPLICATION_FACTOR_THREADS; /** The row and column dimensions of the global block grid */ const size_t Br; const size_t Bc; + /** The row and column dimensions of the thread grid */ + const size_t Tr; + const size_t Tc; public: @@ -172,26 +172,27 @@ namespace alp { const size_t num_threads ) : m( m ), n( n ), - Tr( static_cast< size_t >( sqrt( num_threads/ Rt ) ) ), - Tc( num_threads / Rt / Tr ), Br( static_cast< size_t >( 
std::ceil( static_cast< double >( m ) / config::BLOCK_ROW_DIM ) ) ), - Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ) { - - if( num_threads != Tr * Tc * Rt ) { - std::cerr << "Warning: Provided number of threads cannot be factorized in a 3D grid.\n"; + Bc( static_cast< size_t >( std::ceil( static_cast< double >( n ) / config::BLOCK_COL_DIM ) ) ), + Tr( ( Br > config::THREAD_ROW_DIM ) ? config::THREAD_ROW_DIM : Br ), + Tc( ( Bc > config::THREAD_COL_DIM ) ? config::THREAD_COL_DIM : Bc ) + { + if( num_threads != config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS ) { + std::cerr << "Warning: Provided number of threads cannot be factorized in a 2.5D grid:\n" + "\t" << num_threads << " != " << config::THREAD_ROW_DIM << " x " << config::THREAD_COL_DIM << " x " << config::REPLICATION_FACTOR_THREADS << std::endl; } } - LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { - (void) g; - return LocalBlockCoord( 0, 0, 0, 0, 0 ); - } + // LocalBlockCoord mapBlockGlobalToLocal( const GlobalBlockCoord &g ) const { + // (void) g; + // return LocalBlockCoord( 0, 0, 0, 0, 0 ); + // } - GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { - const size_t block_id_r = l.br * Tr + l.tr; - const size_t block_id_c = l.bc * Tc + l.tc; - return GlobalBlockCoord( block_id_r, block_id_c ); - } + // GlobalBlockCoord mapBlockLocalToGlobal( const LocalBlockCoord &l ) const { + // const size_t block_id_r = l.br * Tr + l.tr; + // const size_t block_id_c = l.bc * Tc + l.tc; + // return GlobalBlockCoord( block_id_r, block_id_c );// Temporary + // } LocalCoord mapGlobalToLocal( const GlobalCoord &g ) const { const size_t global_br = g.i / config::BLOCK_ROW_DIM; @@ -212,23 +213,31 @@ namespace alp { ); } - /** - * Maps coordinates from local to global space. - * - * \todo Add implementation + // /** + // * Maps coordinates from local to global space. 
+ // * + // * \todo Add implementation + // */ + // GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { + // (void) l; + // return GlobalCoord( 0, 0 ); + // } + + /** + * Returns the thread ID corresponding to the given thread coordinates. + * The fixed thread grid enables to map left-over threads. */ - GlobalCoord mapLocalToGlobal( const LocalCoord &l ) const { - (void) l; - return GlobalCoord( 0, 0 ); - } - - /** Returns the thread ID corresponding to the given thread coordinates. */ size_t getThreadId( const ThreadCoords t ) const { - return t.rt * Tr * Tc + t.tr * Tc + t.tc; + return t.rt * config::THREAD_ROW_DIM * config::THREAD_COL_DIM + t.tr * config::THREAD_COL_DIM + t.tc; } size_t getNumberOfThreads() const { - return Tr * Tc * Rt; + return config::THREAD_ROW_DIM * config::THREAD_COL_DIM * config::REPLICATION_FACTOR_THREADS; + } + + /** Returns the thread grid size */ + ThreadCoords getThreadGridDims() const { + return { Tr, Tc, Rt }; } /** Returns the total global amount of blocks */ @@ -245,17 +254,17 @@ namespace alp { return { blocks_r, blocks_c }; } - /** Returns the global block coordinates based on the thread and local block coordinates */ - std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const size_t global_br = br * Tr + tr % Tr; - const size_t global_bc = bc * Tc + tc % Tc; - return { global_br, global_bc }; - } + // /** Returns the global block coordinates based on the thread and local block coordinates */ + // std::pair< size_t, size_t > getGlobalBlockCoords( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const size_t global_br = br * Tr + tr % Tr; + // const size_t global_bc = bc * Tc + tc % Tc; + // return { global_br, global_bc }; + // } - size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { - const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); - return 
global_coords.first * Bc + global_coords.second; - } + // size_t getGlobalBlockId( const size_t tr, const size_t tc, const size_t br, const size_t bc ) const { + // const auto global_coords = getGlobalBlockCoords( tr, tc, br, bc ); + // return global_coords.first * Bc + global_coords.second; + // } size_t getLocalBlockId( const LocalCoord &local ) const { return local.br * getLocalBlockGridDims( local.getThreadCoords() ).second + local.bc; @@ -291,18 +300,28 @@ namespace alp { } /** For a given block, returns its offset from the beginning of the buffer in which it is stored */ - size_t getBlocksOffset( const ThreadCoords t, const size_t br, const size_t bc ) const { + size_t getBlocksOffset( const ThreadCoords &t, const size_t br, const size_t bc ) const { // The offset is calculated as the sum of sizes of all previous blocks const size_t block_coord_1D = br * getLocalBlockGridDims( t ).second + bc; return block_coord_1D * getBlockSize(); } ThreadCoords getThreadCoords( const size_t thread_id ) const { - const size_t rt = thread_id / ( Tr * Tc ); - const size_t tr = ( thread_id % ( Tr * Tc ) ) / Tc; - const size_t tc = ( thread_id % ( Tr * Tc ) ) % Tc; + const size_t rt = thread_id / ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ); + const size_t tr = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) / config::THREAD_COL_DIM; + const size_t tc = ( thread_id % ( config::THREAD_ROW_DIM * config::THREAD_COL_DIM ) ) % config::THREAD_COL_DIM; return { tr, tc, rt }; } + + bool isActiveThread(const ThreadCoords &t ) const { + return ( t.tr < Tr ) && ( t.tc < Tc ) && ( t.rt < Rt ); + } + + bool isActiveThread(const size_t thread_id ) const { + const auto th_coords = getThreadCoords( thread_id ); + return isActiveThread( th_coords ); + } + }; namespace storage { diff --git a/include/alp/omp/vector.hpp b/include/alp/omp/vector.hpp index 8116f1f9f..b653de004 100644 --- a/include/alp/omp/vector.hpp +++ b/include/alp/omp/vector.hpp @@ -104,6 +104,9 @@ 
namespace alp { private: + /** The distribution of the vector among threads. */ + Distribution_2_5D distr; + /** The number of buffers. */ size_t num_buffers; @@ -149,7 +152,8 @@ namespace alp { Vector( const Distribution_2_5D &d, const size_t cap = 0 - ) : num_buffers( d.getNumberOfThreads() ), + ) : distr( d ), + num_buffers( d.getNumberOfThreads() ), containers( num_buffers ), initialized( false ) { @@ -165,47 +169,54 @@ namespace alp { throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); } - #pragma omp parallel for - for( size_t thread = 0; thread < config::OMP::current_threads(); ++thread ) { + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + const auto t_coords = d.getThreadCoords( thread ); - const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); - // Assuming that all blocks are of the same size - const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); + if ( d.isActiveThread( t_coords ) ) { + + const auto block_grid_dims = d.getLocalBlockGridDims( t_coords ); + + // Assuming that all blocks are of the same size + const size_t alloc_size = block_grid_dims.first * block_grid_dims.second * d.getBlockSize(); #ifdef DEBUG - #pragma omp critical - { - if( thread != config::OMP::current_thread_ID() ) { - std::cout << "Warning: thread != OMP::current_thread_id()\n"; + #pragma omp critical + { + if( thread != config::OMP::current_thread_ID() ) { + std::cout << "Warning: thread != OMP::current_thread_id()\n"; + } + std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc + << " on OpenMP thread " << config::OMP::current_thread_ID() + << " allocating buffer of " << alloc_size << " elements " + << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; } - std::cout << "Thread with global coordinates tr = " << t_coords.tr << " tc = " << t_coords.tc - << " on OpenMP thread " << 
config::OMP::current_thread_ID() - << " allocating buffer of " << alloc_size << " elements " - << " holding " << block_grid_dims.first << " x " << block_grid_dims.second << " blocks.\n"; - } #endif - // TODO: Implement allocation properly - buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; + // TODO: Implement allocation properly + buffers[ thread ] = new ( std::nothrow ) value_type[ alloc_size ]; - if( buffers[ thread ] == nullptr ) { - throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); - } + if( buffers[ thread ] == nullptr ) { + throw std::runtime_error( "Could not allocate memory during alp::Vector construction." ); + } - // Reserve space for all internal container wrappers to avoid re-allocation - containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); + // Reserve space for all internal container wrappers to avoid re-allocation + containers[ thread ].reserve( block_grid_dims.first * block_grid_dims.second ); - // Populate the array of internal container wrappers - for( size_t br = 0; br < block_grid_dims.first; ++br ) { - for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { - const size_t offset = d.getBlocksOffset( t_coords, br, bc ); - containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + // Populate the array of internal container wrappers + for( size_t br = 0; br < block_grid_dims.first; ++br ) { + for( size_t bc = 0; bc < block_grid_dims.second; ++bc ) { + const size_t offset = d.getBlocksOffset( t_coords, br, bc ); + containers[ thread ].emplace_back( &( buffers[ thread ][ offset ] ), d.getBlockSize() ); + } } - } - // Ensure that the array contains the expected number of containers - assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + // Ensure that the array contains the expected number of containers + assert( containers[ thread ].size() == block_grid_dims.first * block_grid_dims.second ); + + 
} // End active threads allocation } } @@ -277,9 +288,14 @@ namespace alp { */ ~Vector() { if( buffers != nullptr ) { - for( size_t i = 0; i < num_buffers; ++i ) { - if( buffers[ i ] != nullptr ) { - delete buffers[ i ]; + #pragma omp parallel + { + const size_t thread = config::OMP::current_thread_ID(); + + if ( distr.isActiveThread( thread ) ) { + if( buffers[ thread ] != nullptr ) { + delete [] buffers[ thread ]; + } } } delete [] buffers; diff --git a/include/alp/reference/blas3.hpp b/include/alp/reference/blas3.hpp index b0ad5ef3d..8a76ee6e3 100644 --- a/include/alp/reference/blas3.hpp +++ b/include/alp/reference/blas3.hpp @@ -490,46 +490,71 @@ namespace alp { * @param phase The execution phase. */ template< - typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Semiring > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Semiring & ring = Semiring(), const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && alp::is_semiring< Semiring >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_semiring< Semiring >::value, + void + >::type * const = NULL ) { (void)phase; - return internal::mxm_generic< false >( C, A, B, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() ); + return internal::mxm_generic< false >( + C, A, B, + ring.getMultiplicativeOperator(), ring.getAdditiveMonoid(), ring.getMultiplicativeMonoid() + ); } /** * Dense Matrix-Matrix multiply between structured matrices. * Version with additive monoid and multiplicative operator */ - template< typename OutputStructMatT, - typename InputStructMatT1, - typename InputStructMatT2, + template< + typename OutputType, typename InputType1, typename InputType2, + typename OutputStructure, typename OutputView, + typename OutputImfR, typename OutputImfC, + typename InputStructure1, typename InputView1, + typename InputImfR1, typename InputImfC1, + typename InputStructure2, typename InputView2, + typename InputImfR2, typename InputImfC2, class Operator, class Monoid > RC mxm( - OutputStructMatT & C, - const InputStructMatT1 & A, - const InputStructMatT2 & B, + alp::Matrix< OutputType, OutputStructure, + Density::Dense, OutputView, OutputImfR, OutputImfC, reference > &C, + const alp::Matrix< InputType1, InputStructure1, + Density::Dense, InputView1, InputImfR1, InputImfC1, reference > &A, + const alp::Matrix< InputType2, InputStructure2, + Density::Dense, InputView2, InputImfR2, InputImfC2, reference > &B, const Operator & mulOp, const Monoid & addM, const PHASE &phase = NUMERICAL, - const std::enable_if_t< - ! alp::is_object< typename OutputStructMatT::value_type >::value && ! alp::is_object< typename InputStructMatT1::value_type >::value && ! 
alp::is_object< typename InputStructMatT2::value_type >::value && - alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value - > * const = NULL + const typename std::enable_if< + !alp::is_object< OutputType >::value && + !alp::is_object< InputType1 >::value && + !alp::is_object< InputType2 >::value && + alp::is_operator< Operator >::value && alp::is_monoid< Monoid >::value, + void + >::type * const = NULL ) { (void)phase; diff --git a/include/alp/reference/io.hpp b/include/alp/reference/io.hpp index c6de63f5d..da1e287c5 100644 --- a/include/alp/reference/io.hpp +++ b/include/alp/reference/io.hpp @@ -878,9 +878,6 @@ namespace alp { const fwd_iterator &start, const fwd_iterator &end ) noexcept { - (void)A; - (void)start; - (void)end; // Temporarily assuming 1-1 mapping with user container internal::setInitialized(A, true); diff --git a/include/alp/scalar.hpp b/include/alp/scalar.hpp index 95c73cfbc..33210aba2 100644 --- a/include/alp/scalar.hpp +++ b/include/alp/scalar.hpp @@ -24,8 +24,6 @@ #include "base/scalar.hpp" -#include - // now include all specialisations contained in the backend directories: #ifdef _ALP_WITH_REFERENCE #include diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp index 330c69b3b..d053f998d 100644 --- a/include/alp/utils.hpp +++ b/include/alp/utils.hpp @@ -26,6 +26,7 @@ #include #include //fabs +#include // modulus #include //numeric_limits #include @@ -295,7 +296,21 @@ namespace alp { } }; - + + + /** + * Computation of modulus operation \f$ mod(x, n) \f$ based on remainder + * of division for type \a T within range [0, n). + * Assumes \a T implements: \a operator%, \a operator<, and \a operator+. + */ + template < typename T > + T modulus( const T x, const T n ) { + + const T rem = x % n; + + return ( rem < static_cast< T >( 0 ) ) ? 
rem + n : rem; + }; + } // namespace utils } // namespace alp diff --git a/src/alp/omp/init.cpp b/src/alp/omp/init.cpp index f7f8b5d42..f57351285 100644 --- a/src/alp/omp/init.cpp +++ b/src/alp/omp/init.cpp @@ -36,7 +36,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data RC rc = alp::SUCCESS; // print output const auto T = config::OMP::threads(); - std::cerr << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; + std::cout << "Info: alp::init (omp) called. OpenMP is set to utilise " << T << " threads.\n"; // sanity checks if( P > 1 ) { @@ -54,7 +54,7 @@ alp::RC alp::init< alp::omp >( const size_t s, const size_t P, void * const data template<> alp::RC alp::finalize< alp::omp >() { - std::cerr << "Info: alp::finalize (omp) called.\n"; + std::cout << "Info: alp::finalize (omp) called.\n"; return alp::SUCCESS; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b54e0a43b..f16504294 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -269,6 +269,10 @@ add_grb_executables( dense_omp_matrix dense_omp_matrix.cpp BACKENDS alp_omp ) +add_grb_executables( dense_omp_mxm dense_omp_mxm.cpp + BACKENDS alp_omp +) + add_grb_executables( dense_outer dense_outer.cpp BACKENDS alp_reference ) diff --git a/tests/unit/dense_omp_mxm.cpp b/tests/unit/dense_omp_mxm.cpp new file mode 100644 index 000000000..3628d5d45 --- /dev/null +++ b/tests/unit/dense_omp_mxm.cpp @@ -0,0 +1,348 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#ifdef NDEBUG + #include "../utils/print_alp_containers.hpp" +#endif + +using namespace alp; + + +template< typename T > +void print_stdvec_as_matrix( std::string name, const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda ) { + + std::cout << "Vec " << name << ":" << std::endl; + for( size_t row = 0; row < m; ++row ) { + std::cout << "[\t"; + for( size_t col = 0; col < n; ++col ) { + std::cout << vA[ row * lda + col ] << "\t"; + } + std::cout << "]" << std::endl; + } +} + +template< typename T, typename Operator, typename Monoid > +void mxm_stdvec_as_matrix( + std::vector< T > &vC, const size_t ldc, + const std::vector< T > &vA, const size_t lda, + const std::vector< T > &vB, const size_t ldb, + const size_t m, const size_t k, const size_t n, + const Operator oper, + const Monoid monoid +) { + + T temp; + +#ifndef NDEBUG + print_stdvec_as_matrix( "vA", vA, n, n, n ); + print_stdvec_as_matrix( "vB", vB, n, n, n ); + print_stdvec_as_matrix( "vC - PRE", vC, n, n, n ); +#endif + + for( size_t i = 0; i < m; ++i ) { + for( size_t j = 0; j < n; ++j ) { + T &c_val { vC[ i * ldc + j ] }; + for( size_t l = 0; l < k; ++l ) { + const T &a_val { vA[ i * lda + l ] }; + const T &b_val { vB[ l * ldb + j ] }; + // std::cout << c_val << " += " << a_val << " * " << b_val << std::endl; + (void)internal::apply( temp, a_val, b_val, oper ); + // std::cout << "temp = " << temp << std::endl; + (void)internal::foldl( c_val, temp, monoid.getOperator() ); + } + } + } + +#ifndef NDEBUG + 
print_stdvec_as_matrix( "vC - POST", vC, n, n, n ); +#endif +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one ) { + + if( std::is_same< Structure, structures::General >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + std::fill( vA.begin(), vA.end(), one ); + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = one; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix( std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, const T zero, const T one, const T inc ) { + + T val = one; + if( std::is_same< Structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = vA[ col * lda + row ] = val; + val += inc; + } + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < row; ++col ) { + vA[ row * lda + col ] = zero; + } + for( size_t col = row; col < n; ++col ) { + vA[ row * lda + col ] = val; + val += inc; + } + } + } + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one ) { + + std::fill( vA.begin(), vA.end(), one ); + +} + +template< typename Structure, typename T > +void stdvec_build_matrix_packed( std::vector< T > &vA, const T one, const T inc ) 
{ + + T val = one; + if( std::is_same< Structure, structures::Symmetric >::value ) { // Assumes Packed Row - Upper + for( auto &elem: vA ) { + elem = val; + val += inc; + } + } else if( std::is_same< Structure, structures::UpperTriangular >::value ) { // Assumes Packed Row - Upper + for( auto &elem: vA ) { + elem = val; + val += inc; + } + } + +} + +template< typename MatType, typename T > +void diff_stdvec_matrix( const std::vector< T > &vA, const size_t m, const size_t n, const size_t lda, + const MatType & mA, double threshold = 1e-7 ) { + + if( std::is_same< typename MatType::structure, structures::General >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = 0; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::Symmetric >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " v " << vm << std::endl; + } + } + } + } else if( std::is_same< typename MatType::structure, structures::UpperTriangular >::value ) { + for( size_t row = 0; row < m; ++row ) { + for( size_t col = row; col < n; ++col ) { + double va = ( double )( vA[ row * lda + col ] ); + double vm = ( double )( internal::access( mA, internal::getStorageIndex( mA, row, col ) ) ); + double re = std::abs( ( va - vm ) / va ); + if( re > threshold ) { + std::cout << "Error ( " << row << ", " << col << " ): " << va << " 
v " << vm << std::endl; + } + } + } + } + +} + +template < typename T, typename SemiringT > +void run_mxm( const size_t m, const size_t k, const size_t n, alp::RC &rc ) { + + const SemiringT ring; + const T one = ring.template getOne< T >(); + const T zero = ring.template getZero< T >(); + + std::vector< T > A_data( m * k ); + std::vector< T > B_data( k * n ); + std::vector< T > C_data( m * n, zero ); + + std::vector< T > A_vec( m * k ); + std::vector< T > B_vec( k * n ); + std::vector< T > C_vec( m * n, zero ); + + std::cout << "\tTesting dense General mxm " << m << " " << k << " " << n << std::endl; + + stdvec_build_matrix< structures::General >( A_data, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_data, k, n, n, zero, one, one ); + + // initialize test + alp::Matrix< T, structures::General > A( m, k ); + alp::Matrix< T, structures::General > B( k, n ); + alp::Matrix< T, structures::General > C( m, n ); + + // Initialize input matrices + rc = rc ? rc : alp::buildMatrix( A, A_data.begin(), A_data.end() ); + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building A" << std::endl; + return; + } + rc = rc ? rc : alp::buildMatrix( B, B_data.begin(), B_data.end() ); + rc = rc ? rc : alp::buildMatrix( C, C_data.begin(), C_data.end() ); + + if ( rc != alp::SUCCESS ) { + std::cerr << "\tIssues building matrices" << std::endl; + return; + } + +#ifndef NDEBUG + print_matrix( "A", A ); + print_matrix( "B", B ); + print_matrix( "C - PRE", C ); +#endif + + rc = rc ? 
rc : alp::mxm( C, A, B, ring ); + +#ifndef NDEBUG + print_matrix( "C - POST", C ); +#endif + + if ( rc != alp::SUCCESS ) + return; + + stdvec_build_matrix< structures::General >( A_vec, m, k, k, zero, one, one ); + stdvec_build_matrix< structures::General >( B_vec, k, n, n, zero, one, one ); + + mxm_stdvec_as_matrix( C_vec, n, A_vec, k, B_vec, n, m, k, n, ring.getMultiplicativeOperator(), ring.getAdditiveMonoid() ); + + diff_stdvec_matrix( C_vec, m, n, n, C ); + + + std::cout << "\tDone." << std::endl; + +} + +#define M ( alp::config::BLOCK_ROW_DIM * n ) +#define K ( alp::config::BLOCK_COL_DIM * 2 * n ) +#define N ( alp::config::BLOCK_COL_DIM * 3 * n ) + +void alp_program( const size_t &n, alp::RC &rc ) { + + using T = double; + + using SemiringT = alp::Semiring< + alp::operators::add< T >, alp::operators::mul< T >, + alp::identities::zero, alp::identities::one + >; + + rc = alp::SUCCESS; + + /** + * Testing cubic mxm. + */ + run_mxm< T, SemiringT >( M, M, M, rc ); + + /** + * Testing rectangular mxm + */ + run_mxm< T, SemiringT >( M, K, N, rc ); + + /** + * Testing outer-prod of blocks mxm + */ + run_mxm< T, SemiringT >( M, alp::config::BLOCK_COL_DIM, N, rc ); + + /** + * Testing dot-prod of blocks mxm + */ + run_mxm< T, SemiringT >( alp::config::BLOCK_ROW_DIM, M, alp::config::BLOCK_COL_DIM, rc ); + +} + +int main( int argc, char **argv ) { + // defaults + bool printUsage = false; + size_t in = 4; + + // error checking + if( argc > 2 ) { + printUsage = true; + } + if( argc == 2 ) { + size_t read; + std::istringstream ss( argv[ 1 ] ); + if( ! ( ss >> read ) ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( ! 
ss.eof() ) { + std::cerr << "Error parsing first argument\n"; + printUsage = true; + } else if( read % 2 != 0 ) { + std::cerr << "Given value for n is odd\n"; + printUsage = true; + } else { + // all OK + in = read; + } + } + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << " [n]\n"; + std::cerr << " -n (optional, default is 100): an even integer, the " + "test size.\n"; + return 1; + } + + std::cout << "This is functional test " << argv[ 0 ] << " " << in << "\n"; + alp::Launcher< AUTOMATIC > launcher; + alp::RC out; + if( launcher.exec( &alp_program, in, out, true ) != SUCCESS ) { + std::cerr << "Launching test FAILED\n"; + return 255; + } + if( out != SUCCESS ) { + std::cerr << "Test FAILED (" << alp::toString( out ) << ")" << std::endl; + } else { + std::cout << "Test OK" << std::endl; + } + return 0; +} +