diff --git a/CREDITS b/CREDITS index 55c974f1bb..939351c007 100644 --- a/CREDITS +++ b/CREDITS @@ -74,6 +74,7 @@ but many others have contributed code and feedback, including @nagsingh Bhaskar Nallani @BhaskarNallani (AMD) Stepan Nassyr @stepannassyr (Jülich Supercomputing Centre) + Nisanth M P @nisanthmp Nisanth Padinharepatt (AMD) Ajay Panyala @ajaypanyala Marc-Antoine Parent @maparent (Conversence) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 51dceced28..d596950819 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -131,13 +131,10 @@ void bli_gemm_ker_var2 const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); const char* beta_cast = bli_obj_internal_scalar_buffer( c ); - // If 1m is being employed on a column- or row-stored matrix with a - // real-valued beta, we can use the real domain macro-kernel, which - // eliminates a little overhead associated with the 1m virtual - // micro-kernel. - // Only employ this optimization if the storage datatype of C is - // equal to the execution/computation datatype. #if 1 + // Under certain conditions, we can avoid the overhead of calling the 1m + // virtual microkernel by having the real-domain macrokernel execute with + // the real-domain microkernel. (See the function definition for details.) if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_gemm_ind_recast_1m_params @@ -149,7 +146,8 @@ void bli_gemm_ker_var2 &m, &n, &k, &pd_a, &ps_a, &pd_b, &ps_b, - &rs_c, &cs_c + &rs_c, &cs_c, + cntx ); } #endif diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h index 789d5895cf..a573255800 100644 --- a/frame/3/gemm/ind/bli_gemm_ind_opt.h +++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h @@ -34,27 +34,59 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( - num_t* dt_exec, - num_t* dt_c, - pack_t schema_a, - const obj_t* c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - inc_t* rs_c, inc_t* cs_c + num_t* dt_exec, + num_t* dt_c, + pack_t schema_a, + const obj_t* c, + dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + inc_t* rs_c, inc_t* cs_c, + const cntx_t* cntx ) { obj_t beta; - /* Detach the beta scalar from c so that we can test its imaginary - component. */ + // Detach the beta scalar from c so that we can test its imaginary + // component. bli_obj_scalar_detach( c, &beta ); - /* If beta is in the real domain, and c is row- or column-stored, - then we may proceed with the optimization. */ - if ( bli_obj_imag_is_zero( &beta ) && +#if 1 + // Determine whether the storage of C matches the IO preference of the + // microkernel. (We cannot utilize the optimization below if there is a + // mismatch.) + const ukr_t ukr_id = BLIS_GEMM_VIR_UKR; + + const bool row_stored = bli_is_row_stored( *rs_c, *cs_c ); + const bool col_stored = !row_stored; + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( *dt_c, ukr_id, cntx ); + const bool col_pref = !row_pref; + + const bool is_match = ( row_stored && row_pref ) || + ( col_stored && col_pref ); +#else + // This was the previous behavior, which resulted in buggy behavior + // when executing right-sided hemm, and: + // - the 1m method is enabled, + // - BLIS_DISABLE_HEMM_RIGHT is #defined, and + // - the storage of C matches the microkernel IO preference PRIOR to + // detecting the right-sidedness of the operation. + // See Issue #621 for details. + const bool is_match = TRUE; +#endif + + // If (a) the storage of C matches the IO pref of the ukernel, (b) beta is + // in the real domain, and (c) C is row- or column-stored, then we may + // proceed with the optimization below, which allows 1m to be induced by + // executing the real-domain macrokernel with the real-domain microkernel + // plus a few tweaked parameters. Otherwise, we must skip the optimization + // and allow 1m to execute via the complex-domain macrokernel calling the + // 1m virtual microkernel function, which will incur a little extra + // overhead. + if ( is_match && + bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); @@ -69,7 +101,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } - else /* if ( bli_is_1r_packed( schema_a ) ) */ + else // if ( bli_is_1r_packed( schema_a ) ) { *m *= 1; *n *= 2;