From 4e7ed3082e0a1eb5198987fc6e2a69537452fe82 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 10 Dec 2022 16:38:21 -0600 Subject: [PATCH 1/2] Fixed func calls in gemmlike/bls_l3_packm_var?.c. Details: - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and bli_thrinfo_thread_id(), respectively. This change probably should have been included in aeb5f0c. - This change is the first of multiple that are necessary to get the gemmlike sandbox operating at full performance again. (It appears each thread is packing every micropanel, and so the thrinfo_t tree needs to be reconfigured. Same goes for sup when optional packing is requested.) --- sandbox/gemmlike/bls_l3_packm_var1.c | 4 ++-- sandbox/gemmlike/bls_l3_packm_var2.c | 4 ++-- sandbox/gemmlike/bls_l3_packm_var3.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index e4d566b44c..7c2c4e9a90 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 3e7e7888a8..94ee0efcd8 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c index 4ccb1828dd..48cd6dd608 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ From d2ff04f5429183f0703dc3b722c412dcea0c59da Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 12 Dec 2022 18:35:49 -0600 Subject: [PATCH 2/2] Fixed perf of mt sup with packing, and mt gemmlike. Details: - Brought the gemmsup code path up to date relative to the latest thrinfo_t semantics introduced in the October Omnibus commit (aeb5f0c). This was done by passing in the prenode (instead of the current node) into the packm variant within bli_l3_sup_packm.c as well as creating the prenodes and attaching them to the thrinfo_t tree in bli_l3_sup_thrinfo_create(). These changes erase the performance degradation introduced in the omnibus when running multithreaded sup with optional packing enabled. Special thanks to Devin Matthews for sussing out this fix in short order. - Fixed the gemmlike sandbox in a manner similar to that of sup with packing, described above. This also involved passing the prenode into the local gemmlike packm variant. 
The gemmlike sandbox code currently reuses bli_l3_sup_thrinfo_create(), so no other changes were needed at this time. --- frame/3/bli_l3_sup_packm.c | 4 ++-- frame/3/bli_l3_thrinfo.c | 9 +++++++++ sandbox/gemmlike/bls_l3_packm_a.c | 2 +- sandbox/gemmlike/bls_l3_packm_b.c | 2 +- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c index 797335aeb5..890980da3b 100644 --- a/frame/3/bli_l3_sup_packm.c +++ b/frame/3/bli_l3_sup_packm.c @@ -394,7 +394,7 @@ void bli_packm_sup ( void* )a, rs_a, cs_a, *p, *rs_p, *cs_p, ( cntx_t* )cntx, - thread + bli_thrinfo_sub_prenode( thread ) ); } else // if ( schema == BLIS_PACKED_ROW_PANELS ) @@ -415,7 +415,7 @@ void bli_packm_sup *p, *rs_p, *cs_p, pd_p, *ps_p, ( cntx_t* )cntx, - thread + bli_thrinfo_sub_prenode( thread ) ); } diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 0b45abbf6d..95d2a54398 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -150,6 +150,15 @@ thrinfo_t* bli_l3_sup_thrinfo_create thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa ); thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr ); + const dim_t n_way_pb = bli_thrinfo_num_threads( thread_pb ); + const dim_t n_way_pa = bli_thrinfo_num_threads( thread_pa ); + + // Create and set the prenodes for the packb and packa thrinfo_t nodes. 
+ thrinfo_t* thread_pb_single = bli_thrinfo_split( n_way_pb, thread_pb ); + thrinfo_t* thread_pa_single = bli_thrinfo_split( n_way_pa, thread_pa ); + bli_thrinfo_set_sub_prenode( thread_pb_single, thread_pb ); + bli_thrinfo_set_sub_prenode( thread_pa_single, thread_pa ); + bli_thrinfo_set_sub_node( thread_jc, root ); bli_thrinfo_set_sub_node( thread_pc, thread_jc ); bli_thrinfo_set_sub_node( thread_pb, thread_pc ); diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 412c6c24ea..742c78bfb0 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ - thread \ + bli_thrinfo_sub_prenode( thread ) \ ); \ \ /* Barrier so that packing is done before computation. */ \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index cc9757b1de..db6bca8fca 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ - thread \ + bli_thrinfo_sub_prenode( thread ) \ ); \ \ /* Barrier so that packing is done before computation. */ \