diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index e3f47982ce..689471367f 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( rntm, thread_pb ); \ + bli_thread_barrier( thread_pb ); \ } \ } \ \ diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c index 1d6502884b..c69edec2d0 100644 --- a/addon/gemmd/bao_l3_packm_a.c +++ b/addon/gemmd/bao_l3_packm_a.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c index 8d020007c9..dc041d99f4 100644 --- a/addon/gemmd/bao_l3_packm_b.c +++ b/addon/gemmd/bao_l3_packm_b.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 8d29d73b25..db20ffbca4 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1,122 +1,69 @@ EXPORTS bli_abort bli_absqsc -bli_absqsc_check -bli_absqsc_qfp bli_acquire_mij bli_acquire_mpart bli_acquire_mpart_b2t bli_acquire_mpart_br2tl bli_acquire_mpart_l2r -bli_acquire_mpart_l2r_check bli_acquire_mpart_mdim bli_acquire_mpart_mndim bli_acquire_mpart_ndim bli_acquire_mpart_r2l bli_acquire_mpart_t2b -bli_acquire_mpart_t2b_check bli_acquire_mpart_tl2br -bli_acquire_mpart_tl2br_check bli_acquire_vi bli_acquire_vpart_b2f bli_acquire_vpart_f2b bli_addd -bli_addd_check bli_addd_ex -bli_addd_ex_qfp bli_addm -bli_addm_check bli_addm_ex -bli_addm_ex_qfp bli_addsc -bli_addsc_check -bli_addsc_qfp bli_addv -bli_addv_check bli_addv_ex -bli_addv_ex_qfp -bli_adjust_strides bli_align_dim_to_mult bli_align_dim_to_size bli_align_ptr_to_size bli_amaxv -bli_amaxv_check bli_amaxv_ex -bli_amaxv_ex_qfp -bli_apool_alloc_block -bli_apool_array_elem -bli_apool_checkin_array -bli_apool_checkout_array -bli_apool_finalize -bli_apool_free_block -bli_apool_grow -bli_apool_init bli_arch_query_id -bli_arch_set_id -bli_arch_set_id_once bli_arch_string -bli_array_elem -bli_array_finalize -bli_array_init -bli_array_resize -bli_array_set_elem bli_asumv -bli_asumv_check bli_asumv_ex -bli_asumv_ex_qfp bli_axpbyv -bli_axpbyv_check bli_axpbyv_ex -bli_axpbyv_ex_qfp bli_axpy2v -bli_axpy2v_check bli_axpy2v_ex -bli_axpy2v_ex_qfp bli_axpyd -bli_axpyd_check bli_axpyd_ex -bli_axpyd_ex_qfp bli_axpyf -bli_axpyf_check bli_axpyf_ex -bli_axpyf_ex_qfp bli_axpym -bli_axpym_check bli_axpym_ex -bli_axpym_ex_qfp bli_axpyv -bli_axpyv_check bli_axpyv_ex -bli_axpyv_ex_qfp bli_blksz_create bli_blksz_create_ed bli_blksz_free bli_blksz_init bli_blksz_init_easy bli_blksz_init_ed -bli_blksz_reduce_def_to 
-bli_blksz_reduce_max_to bli_cabsqsc bli_caddd bli_caddd_ex bli_caddm bli_caddm_ex -bli_caddm_unb_var1 bli_caddsc bli_caddv bli_caddv_ex -bli_calloc_intl bli_camaxv bli_camaxv_ex bli_castm -bli_castm_check bli_castnzm -bli_castnzm_check bli_castv -bli_castv_check bli_casumv bli_casumv_ex -bli_casumv_unb_var1 bli_caxpbyv bli_caxpbyv_ex bli_caxpy2v @@ -127,33 +74,24 @@ bli_caxpyf bli_caxpyf_ex bli_caxpym bli_caxpym_ex -bli_caxpym_unb_var1 bli_caxpyv bli_caxpyv_ex bli_cccastm bli_cccastnzm bli_cccastv bli_cccopysc -bli_ccgemm_ker_var2_md bli_ccopyd bli_ccopyd_ex bli_ccopym bli_ccopym_ex -bli_ccopym_unb_var1 bli_ccopyv bli_ccopyv_ex -bli_ccpackm_blk_var1_md -bli_ccpackm_cxk_1e_md -bli_ccpackm_cxk_1r_md -bli_ccpackm_struc_cxk_md bli_ccxpbym_md bli_ccxpbym_md_ex -bli_ccxpbym_md_unb_var1 bli_cdcastm bli_cdcastnzm bli_cdcastv bli_cdcopysc -bli_cdgemm_ker_var2_md bli_cdivsc bli_cdotaxpyv bli_cdotaxpyv_ex @@ -165,288 +103,111 @@ bli_cdotxf bli_cdotxf_ex bli_cdotxv bli_cdotxv_ex -bli_cdpackm_blk_var1_md -bli_cdpackm_cxk_1e_md -bli_cdpackm_cxk_1r_md -bli_cdpackm_struc_cxk_md bli_cdxpbym_md bli_cdxpbym_md_ex -bli_cdxpbym_md_unb_var1 +bli_ceqm +bli_ceqsc +bli_ceqv bli_cfprintm bli_cfprintv bli_cgemm -bli_cgemm1m -bli_cgemm3m1 -bli_cgemm3mh -bli_cgemm4m1 -bli_cgemm4mb -bli_cgemm4mb_ker_var2 -bli_cgemm4mh bli_cgemm_ex -bli_cgemm_ker_var2 -bli_cgemm_md_c2r_ref -bli_cgemmtrsm_l_ukernel -bli_cgemmtrsm_u_ukernel -bli_cgemm_ukernel +bli_cgemmt +bli_cgemmt_ex bli_cgemv bli_cgemv_ex -bli_cgemv_unb_var1 -bli_cgemv_unb_var2 -bli_cgemv_unf_var1 -bli_cgemv_unf_var2 bli_cger bli_cger_ex -bli_cger_unb_var1 -bli_cger_unb_var2 bli_cgetijm +bli_cgetijv bli_cgetsc -bli_check_alignment_is_mult_of_ptr_size -bli_check_alignment_is_power_of_two -bli_check_conformal_dims -bli_check_consistent_datatypes -bli_check_consistent_object_datatypes -bli_check_consistent_object_precisions -bli_check_consistent_precisions -bli_check_datatype_real_proj_of -bli_check_equal_vector_lengths 
bli_check_error_code_helper -bli_check_floating_datatype -bli_check_floating_object -bli_check_general_object -bli_check_hermitian_object -bli_check_if_exhausted_pool -bli_check_integer_datatype -bli_check_integer_object -bli_check_level3_dims -bli_check_matrix_object -bli_check_matrix_strides -bli_check_nonconstant_datatype -bli_check_nonconstant_object -bli_check_noninteger_datatype -bli_check_noninteger_object -bli_check_nonunit_diag -bli_check_null_pointer -bli_check_object_alias_of -bli_check_object_buffer -bli_check_object_diag_offset_equals -bli_check_object_length_equals -bli_check_object_real_proj_of -bli_check_object_struc -bli_check_object_valid_datatype -bli_check_object_width_equals -bli_check_packm_schema_on_unpack -bli_check_packv_schema_on_unpack -bli_check_real_datatype -bli_check_real_object -bli_check_real_valued_object -bli_check_scalar_object -bli_check_square_object -bli_check_sufficient_stack_buf_size -bli_check_symmetric_object -bli_check_triangular_object -bli_check_upper_or_lower_object -bli_check_valid_1x3_subpart -bli_check_valid_3x1_subpart -bli_check_valid_3x3_subpart -bli_check_valid_arch_id -bli_check_valid_cntl -bli_check_valid_datatype -bli_check_valid_diag -bli_check_valid_error_level -bli_check_valid_kc_mod_mult -bli_check_valid_malloc_buf -bli_check_valid_mc_mod_mult -bli_check_valid_nc_mod_mult -bli_check_valid_packbuf -bli_check_valid_side -bli_check_valid_trans -bli_check_valid_uplo -bli_check_vector_dim_equals -bli_check_vector_object bli_chemm -bli_chemm1m -bli_chemm3m1 -bli_chemm3mh -bli_chemm4m1 -bli_chemm4mh bli_chemm_ex bli_chemv bli_chemv_ex -bli_chemv_unb_var1 -bli_chemv_unb_var2 -bli_chemv_unb_var3 -bli_chemv_unb_var4 -bli_chemv_unf_var1 -bli_chemv_unf_var1a -bli_chemv_unf_var3 -bli_chemv_unf_var3a bli_cher bli_cher2 bli_cher2_ex bli_cher2k -bli_cher2k1m -bli_cher2k3m1 -bli_cher2k3mh -bli_cher2k4m1 -bli_cher2k4mh bli_cher2k_ex -bli_cher2_unb_var1 -bli_cher2_unb_var2 -bli_cher2_unb_var3 -bli_cher2_unb_var4 
-bli_cher2_unf_var1 -bli_cher2_unf_var4 bli_cher_ex bli_cherk -bli_cherk1m -bli_cherk3m1 -bli_cherk3mh -bli_cherk4m1 -bli_cherk4mh bli_cherk_ex -bli_cherk_l_ker_var2 -bli_cherk_u_ker_var2 -bli_cher_unb_var1 -bli_cher_unb_var2 bli_cinvertd bli_cinvertd_ex bli_cinvertsc bli_cinvertv bli_cinvertv_ex +bli_cinvscald +bli_cinvscald_ex +bli_cinvscalm +bli_cinvscalm_ex +bli_cinvscalv +bli_cinvscalv_ex bli_clock -bli_clock_helper bli_clock_min_diff bli_cmachval bli_cmkherm bli_cmkherm_ex -bli_cmkherm_unb_var1 bli_cmksymm bli_cmksymm_ex -bli_cmksymm_unb_var1 bli_cmktrim bli_cmktrim_ex -bli_cmktrim_unb_var1 bli_cmulsc bli_cnorm1m bli_cnorm1m_ex -bli_cnorm1m_unb_var1 bli_cnorm1v bli_cnorm1v_ex -bli_cnorm1v_unb_var1 bli_cnormfm bli_cnormfm_ex -bli_cnormfm_unb_var1 bli_cnormfsc bli_cnormfv bli_cnormfv_ex -bli_cnormfv_unb_var1 bli_cnormim bli_cnormim_ex -bli_cnormim_unb_var1 bli_cnormiv bli_cnormiv_ex -bli_cnormiv_unb_var1 -bli_cntl_calc_num_threads_in bli_cntl_clear_node bli_cntl_copy bli_cntl_create_node bli_cntl_free bli_cntl_free_node -bli_cntl_free_wo_thrinfo -bli_cntl_free_w_thrinfo bli_cntl_mark_family -bli_cntx_1m_stage -bli_cntx_3m1_stage -bli_cntx_3mh_stage -bli_cntx_4m1_stage -bli_cntx_4mb_stage -bli_cntx_4mh_stage bli_cntx_clear -bli_cntx_ind_stage -bli_cntx_nat_stage bli_cntx_print bli_cntx_set_blkszs bli_cntx_set_ind_blkszs -bli_cntx_set_l1f_kers -bli_cntx_set_l1v_kers -bli_cntx_set_l3_nat_ukrs -bli_cntx_set_packm_kers +bli_cntx_set_l3_sup_handlers +bli_cntx_set_ukr_prefs +bli_cntx_set_ukrs bli_copyd -bli_copyd_check bli_copyd_ex -bli_copyd_ex_qfp bli_copym -bli_copym_check bli_copym_ex -bli_copym_ex_qfp bli_copysc -bli_copysc_check bli_copyv -bli_copyv_check bli_copyv_ex -bli_copyv_ex_qfp -bli_cpackm_blk_var1 -bli_cpackm_cxk -bli_cpackm_cxk_1er -bli_cpackm_cxk_3mis -bli_cpackm_cxk_4mi -bli_cpackm_cxk_rih -bli_cpackm_herm_cxk -bli_cpackm_herm_cxk_1er -bli_cpackm_herm_cxk_3mis -bli_cpackm_herm_cxk_4mi -bli_cpackm_herm_cxk_rih -bli_cpackm_struc_cxk 
-bli_cpackm_struc_cxk_1er -bli_cpackm_struc_cxk_3mis -bli_cpackm_struc_cxk_4mi -bli_cpackm_struc_cxk_rih -bli_cpackm_tri_cxk -bli_cpackm_tri_cxk_1er -bli_cpackm_tri_cxk_3mis -bli_cpackm_tri_cxk_4mi -bli_cpackm_tri_cxk_rih -bli_cpackm_unb_var1 bli_cprintm -bli_cprintm_ex bli_cprintv -bli_cprintv_ex -bli_cpuid_is_bulldozer -bli_cpuid_is_excavator -bli_cpuid_is_haswell -bli_cpuid_is_knl -bli_cpuid_is_penryn -bli_cpuid_is_piledriver -bli_cpuid_is_sandybridge -bli_cpuid_is_skx -bli_cpuid_is_steamroller -bli_cpuid_is_zen -bli_cpuid_query -bli_cpuid_query_id bli_crandm bli_crandm_ex -bli_crandm_unb_var1 bli_crandnm bli_crandnm_ex -bli_crandnm_unb_var1 bli_crandnv bli_crandnv_ex -bli_crandnv_unb_var1 bli_crandv bli_crandv_ex -bli_crandv_unb_var1 bli_cscal2d bli_cscal2d_ex bli_cscal2m bli_cscal2m_ex -bli_cscal2m_unb_var1 bli_cscal2v bli_cscal2v_ex bli_cscald bli_cscald_ex bli_cscalm bli_cscalm_ex -bli_cscalm_unb_var1 bli_cscalv bli_cscalv_ex bli_cscastm @@ -458,42 +219,29 @@ bli_csetd_ex bli_csetid bli_csetid_ex bli_csetijm +bli_csetijv bli_csetm bli_csetm_ex -bli_csetm_unb_var1 bli_csetsc bli_csetv bli_csetv_ex -bli_csgemm_ker_var2_md bli_cshiftd bli_cshiftd_ex -bli_cspackm_blk_var1_md -bli_cspackm_cxk_1e_md -bli_cspackm_cxk_1r_md -bli_cspackm_struc_cxk_md bli_csqrtsc bli_csubd bli_csubd_ex bli_csubm bli_csubm_ex -bli_csubm_unb_var1 bli_csubsc bli_csubv bli_csubv_ex bli_csumsqv bli_csumsqv_ex -bli_csumsqv_unb_var1 bli_cswapv bli_cswapv_ex bli_csxpbym_md bli_csxpbym_md_ex -bli_csxpbym_md_unb_var1 bli_csymm -bli_csymm1m -bli_csymm3m1 -bli_csymm3mh -bli_csymm4m1 -bli_csymm4mh bli_csymm_ex bli_csymv bli_csymv_ex @@ -501,89 +249,39 @@ bli_csyr bli_csyr2 bli_csyr2_ex bli_csyr2k -bli_csyr2k1m -bli_csyr2k3m1 -bli_csyr2k3mh -bli_csyr2k4m1 -bli_csyr2k4mh bli_csyr2k_ex bli_csyr_ex bli_csyrk -bli_csyrk1m -bli_csyrk3m1 -bli_csyrk3mh -bli_csyrk4m1 -bli_csyrk4mh bli_csyrk_ex bli_ctrmm -bli_ctrmm1m bli_ctrmm3 -bli_ctrmm31m -bli_ctrmm33m1 -bli_ctrmm33mh -bli_ctrmm34m1 -bli_ctrmm34mh 
bli_ctrmm3_ex -bli_ctrmm3m1 -bli_ctrmm4m1 bli_ctrmm_ex -bli_ctrmm_ll_ker_var2 -bli_ctrmm_lu_ker_var2 -bli_ctrmm_rl_ker_var2 -bli_ctrmm_ru_ker_var2 bli_ctrmv bli_ctrmv_ex -bli_ctrmv_unb_var1 -bli_ctrmv_unb_var2 -bli_ctrmv_unf_var1 -bli_ctrmv_unf_var2 bli_ctrsm -bli_ctrsm1m -bli_ctrsm3m1 -bli_ctrsm4m1 bli_ctrsm_ex -bli_ctrsm_ll_ker_var2 -bli_ctrsm_l_ukernel -bli_ctrsm_lu_ker_var2 -bli_ctrsm_rl_ker_var2 -bli_ctrsm_ru_ker_var2 -bli_ctrsm_u_ukernel bli_ctrsv bli_ctrsv_ex -bli_ctrsv_unb_var1 -bli_ctrsv_unb_var2 -bli_ctrsv_unf_var1 -bli_ctrsv_unf_var2 -bli_cunpackm_blk_var1 -bli_cunpackm_cxk -bli_cunpackm_unb_var1 bli_cunzipsc bli_cxpbyd bli_cxpbyd_ex bli_cxpbym bli_cxpbym_ex -bli_cxpbym_unb_var1 bli_cxpbyv bli_cxpbyv_ex bli_czcastm bli_czcastnzm bli_czcastv bli_czcopysc -bli_czgemm_ker_var2_md bli_czipsc -bli_czpackm_blk_var1_md -bli_czpackm_cxk_1e_md -bli_czpackm_cxk_1r_md -bli_czpackm_struc_cxk_md bli_czxpbym_md bli_czxpbym_md_ex -bli_czxpbym_md_unb_var1 bli_dabsqsc bli_daddd bli_daddd_ex bli_daddm bli_daddm_ex -bli_daddm_unb_var1 bli_daddsc bli_daddv bli_daddv_ex @@ -591,7 +289,6 @@ bli_damaxv bli_damaxv_ex bli_dasumv bli_dasumv_ex -bli_dasumv_unb_var1 bli_daxpbyv bli_daxpbyv_ex bli_daxpy2v @@ -602,33 +299,24 @@ bli_daxpyf bli_daxpyf_ex bli_daxpym bli_daxpym_ex -bli_daxpym_unb_var1 bli_daxpyv bli_daxpyv_ex bli_dccastm bli_dccastnzm bli_dccastv bli_dccopysc -bli_dcgemm_ker_var2_md bli_dcopyd bli_dcopyd_ex bli_dcopym bli_dcopym_ex -bli_dcopym_unb_var1 bli_dcopyv bli_dcopyv_ex -bli_dcpackm_blk_var1_md -bli_dcpackm_cxk_1e_md -bli_dcpackm_cxk_1r_md -bli_dcpackm_struc_cxk_md bli_dcxpbym_md bli_dcxpbym_md_ex -bli_dcxpbym_md_unb_var1 bli_ddcastm bli_ddcastnzm bli_ddcastv bli_ddcopysc -bli_ddgemm_ker_var2_md bli_ddivsc bli_ddotaxpyv bli_ddotaxpyv_ex @@ -640,183 +328,99 @@ bli_ddotxf bli_ddotxf_ex bli_ddotxv bli_ddotxv_ex -bli_ddpackm_blk_var1_md -bli_ddpackm_cxk_1e_md -bli_ddpackm_cxk_1r_md -bli_ddpackm_struc_cxk_md bli_ddxpbym_md bli_ddxpbym_md_ex -bli_ddxpbym_md_unb_var1 
-bli_determine_blocksize -bli_determine_blocksize_b -bli_determine_blocksize_b_sub -bli_determine_blocksize_f -bli_determine_blocksize_f_sub +bli_deqm +bli_deqsc +bli_deqv bli_dfprintm bli_dfprintv bli_dgemm -bli_dgemm1m -bli_dgemm3m1 -bli_dgemm3mh -bli_dgemm4m1 -bli_dgemm4mb -bli_dgemm4mb_ker_var2 -bli_dgemm4mh bli_dgemm_ex -bli_dgemm_ker_var2 -bli_dgemmtrsm_l_ukernel -bli_dgemmtrsm_u_ukernel -bli_dgemm_ukernel +bli_dgemmt +bli_dgemmt_ex bli_dgemv bli_dgemv_ex -bli_dgemv_unb_var1 -bli_dgemv_unb_var2 -bli_dgemv_unf_var1 -bli_dgemv_unf_var2 bli_dger bli_dger_ex -bli_dger_unb_var1 -bli_dger_unb_var2 bli_dgetijm +bli_dgetijv bli_dgetsc bli_dhemm -bli_dhemm1m -bli_dhemm3m1 -bli_dhemm3mh -bli_dhemm4m1 -bli_dhemm4mh bli_dhemm_ex bli_dhemv bli_dhemv_ex -bli_dhemv_unb_var1 -bli_dhemv_unb_var2 -bli_dhemv_unb_var3 -bli_dhemv_unb_var4 -bli_dhemv_unf_var1 -bli_dhemv_unf_var1a -bli_dhemv_unf_var3 -bli_dhemv_unf_var3a bli_dher bli_dher2 bli_dher2_ex bli_dher2k -bli_dher2k1m -bli_dher2k3m1 -bli_dher2k3mh -bli_dher2k4m1 -bli_dher2k4mh bli_dher2k_ex -bli_dher2_unb_var1 -bli_dher2_unb_var2 -bli_dher2_unb_var3 -bli_dher2_unb_var4 -bli_dher2_unf_var1 -bli_dher2_unf_var4 bli_dher_ex bli_dherk -bli_dherk1m -bli_dherk3m1 -bli_dherk3mh -bli_dherk4m1 -bli_dherk4mh bli_dherk_ex -bli_dherk_l_ker_var2 -bli_dherk_u_ker_var2 -bli_dher_unb_var1 -bli_dher_unb_var2 bli_dinvertd bli_dinvertd_ex bli_dinvertsc bli_dinvertv bli_dinvertv_ex +bli_dinvscald +bli_dinvscald_ex +bli_dinvscalm +bli_dinvscalm_ex +bli_dinvscalv +bli_dinvscalv_ex bli_divsc -bli_divsc_check -bli_divsc_qfp -bli_dlamch bli_dmachval bli_dmkherm bli_dmkherm_ex -bli_dmkherm_unb_var1 bli_dmksymm bli_dmksymm_ex -bli_dmksymm_unb_var1 bli_dmktrim bli_dmktrim_ex -bli_dmktrim_unb_var1 bli_dmulsc bli_dnorm1m bli_dnorm1m_ex -bli_dnorm1m_unb_var1 bli_dnorm1v bli_dnorm1v_ex -bli_dnorm1v_unb_var1 bli_dnormfm bli_dnormfm_ex -bli_dnormfm_unb_var1 bli_dnormfsc bli_dnormfv bli_dnormfv_ex -bli_dnormfv_unb_var1 bli_dnormim bli_dnormim_ex 
-bli_dnormim_unb_var1 bli_dnormiv bli_dnormiv_ex -bli_dnormiv_unb_var1 bli_dotaxpyv -bli_dotaxpyv_check bli_dotaxpyv_ex -bli_dotaxpyv_ex_qfp bli_dotv -bli_dotv_check bli_dotv_ex -bli_dotv_ex_qfp bli_dotxaxpyf -bli_dotxaxpyf_check bli_dotxaxpyf_ex -bli_dotxaxpyf_ex_qfp bli_dotxf -bli_dotxf_check bli_dotxf_ex -bli_dotxf_ex_qfp bli_dotxv -bli_dotxv_check bli_dotxv_ex -bli_dotxv_ex_qfp -bli_dpackm_blk_var1 -bli_dpackm_cxk -bli_dpackm_herm_cxk -bli_dpackm_struc_cxk -bli_dpackm_tri_cxk -bli_dpackm_unb_var1 bli_dprintm -bli_dprintm_ex bli_dprintv -bli_dprintv_ex bli_drandm bli_drandm_ex -bli_drandm_unb_var1 bli_drandnm bli_drandnm_ex -bli_drandnm_unb_var1 bli_drandnv bli_drandnv_ex -bli_drandnv_unb_var1 bli_drandv bli_drandv_ex -bli_drandv_unb_var1 bli_dscal2d bli_dscal2d_ex bli_dscal2m bli_dscal2m_ex -bli_dscal2m_unb_var1 bli_dscal2v bli_dscal2v_ex bli_dscald bli_dscald_ex bli_dscalm bli_dscalm_ex -bli_dscalm_unb_var1 bli_dscalv bli_dscalv_ex bli_dscastm @@ -828,42 +432,29 @@ bli_dsetd_ex bli_dsetid bli_dsetid_ex bli_dsetijm +bli_dsetijv bli_dsetm bli_dsetm_ex -bli_dsetm_unb_var1 bli_dsetsc bli_dsetv bli_dsetv_ex -bli_dsgemm_ker_var2_md bli_dshiftd bli_dshiftd_ex -bli_dspackm_blk_var1_md -bli_dspackm_cxk_1e_md -bli_dspackm_cxk_1r_md -bli_dspackm_struc_cxk_md bli_dsqrtsc bli_dsubd bli_dsubd_ex bli_dsubm bli_dsubm_ex -bli_dsubm_unb_var1 bli_dsubsc bli_dsubv bli_dsubv_ex bli_dsumsqv bli_dsumsqv_ex -bli_dsumsqv_unb_var1 bli_dswapv bli_dswapv_ex bli_dsxpbym_md bli_dsxpbym_md_ex -bli_dsxpbym_md_unb_var1 bli_dsymm -bli_dsymm1m -bli_dsymm3m1 -bli_dsymm3mh -bli_dsymm4m1 -bli_dsymm4mh bli_dsymm_ex bli_dsymv bli_dsymv_ex @@ -871,301 +462,79 @@ bli_dsyr bli_dsyr2 bli_dsyr2_ex bli_dsyr2k -bli_dsyr2k1m -bli_dsyr2k3m1 -bli_dsyr2k3mh -bli_dsyr2k4m1 -bli_dsyr2k4mh bli_dsyr2k_ex bli_dsyr_ex bli_dsyrk -bli_dsyrk1m -bli_dsyrk3m1 -bli_dsyrk3mh -bli_dsyrk4m1 -bli_dsyrk4mh bli_dsyrk_ex bli_dtrmm -bli_dtrmm1m bli_dtrmm3 -bli_dtrmm31m -bli_dtrmm33m1 -bli_dtrmm33mh -bli_dtrmm34m1 -bli_dtrmm34mh 
bli_dtrmm3_ex -bli_dtrmm3m1 -bli_dtrmm4m1 bli_dtrmm_ex -bli_dtrmm_ll_ker_var2 -bli_dtrmm_lu_ker_var2 -bli_dtrmm_rl_ker_var2 -bli_dtrmm_ru_ker_var2 bli_dtrmv bli_dtrmv_ex -bli_dtrmv_unb_var1 -bli_dtrmv_unb_var2 -bli_dtrmv_unf_var1 -bli_dtrmv_unf_var2 bli_dtrsm -bli_dtrsm1m -bli_dtrsm3m1 -bli_dtrsm4m1 bli_dtrsm_ex -bli_dtrsm_ll_ker_var2 -bli_dtrsm_l_ukernel -bli_dtrsm_lu_ker_var2 -bli_dtrsm_rl_ker_var2 -bli_dtrsm_ru_ker_var2 -bli_dtrsm_u_ukernel bli_dtrsv bli_dtrsv_ex -bli_dtrsv_unb_var1 -bli_dtrsv_unb_var2 -bli_dtrsv_unf_var1 -bli_dtrsv_unf_var2 bli_dt_size -bli_dt_size_check bli_dt_string -bli_dt_string_check -bli_dt_union_check -bli_dunpackm_blk_var1 -bli_dunpackm_cxk -bli_dunpackm_unb_var1 bli_dunzipsc bli_dxpbyd bli_dxpbyd_ex bli_dxpbym bli_dxpbym_ex -bli_dxpbym_unb_var1 bli_dxpbyv bli_dxpbyv_ex bli_dzcastm bli_dzcastnzm bli_dzcastv bli_dzcopysc -bli_dzgemm_ker_var2_md bli_dzipsc -bli_dzpackm_blk_var1_md -bli_dzpackm_cxk_1e_md -bli_dzpackm_cxk_1r_md -bli_dzpackm_struc_cxk_md bli_dzxpbym_md bli_dzxpbym_md_ex -bli_dzxpbym_md_unb_var1 +bli_eqm +bli_eqsc +bli_eqv bli_error_checking_is_enabled bli_error_checking_level bli_error_checking_level_set -bli_error_string_for_code -bli_ffree_align -bli_ffree_noalign bli_finalize -bli_finalize_apis -bli_finalize_auto -bli_finalize_once -bli_find_area_trap_l -bli_fmalloc_align -bli_fmalloc_align_check -bli_fmalloc_noalign -bli_fmalloc_post_check bli_fprintm -bli_fprintm_check -bli_fprintm_ex -bli_fprintm_qfp bli_fprintv -bli_fprintv_check -bli_fprintv_ex -bli_fprintv_qfp -bli_free_intl bli_free_user -bli_func_create -bli_func_free -bli_func_init -bli_func_init_null -bli_func_is_null -bli_func_is_null_dt -bli_gcd bli_gemm -bli_gemm1m -bli_gemm3m1 -bli_gemm3mh -bli_gemm4m1 -bli_gemm4mb -bli_gemm4mb_ker_var2 -bli_gemm4mh -bli_gemm_basic_check -bli_gemm_blk_var1 -bli_gemm_blk_var2 -bli_gemm_blk_var3 -bli_gemmbp_cntl_create -bli_gemm_check -bli_gemm_cntl_create -bli_gemm_cntl_create_node -bli_gemm_cntl_free -bli_gemm_determine_kc 
-bli_gemm_determine_kc_b -bli_gemm_determine_kc_f -bli_gemm_direct bli_gemm_ex -bli_gemm_front -bli_gemmind -bli_gemmind_get_avail -bli_gemm_int -bli_gemm_ker_var2 -bli_gemm_ker_var2_md -bli_gemm_md -bli_gemm_md_ccc -bli_gemm_md_ccr -bli_gemm_md_crc -bli_gemm_md_crr -bli_gemm_md_rcc -bli_gemm_md_rcr -bli_gemm_md_rrc -bli_gemm_md_rrr -bli_gemmnat -bli_gemm_packa -bli_gemm_packb -bli_gemm_prune_unref_mparts_k -bli_gemm_prune_unref_mparts_m -bli_gemm_prune_unref_mparts_n -bli_gemmtrsm_l_ukernel_qfp +bli_gemmt +bli_gemmt_ex bli_gemmtrsm_ukernel -bli_gemmtrsm_u_ukernel_qfp bli_gemm_ukernel -bli_gemm_ukernel_qfp bli_gemv -bli_gemv_check bli_gemv_ex -bli_gemv_ex_qfp -bli_gemv_unb_var1 -bli_gemv_unb_var1_qfp -bli_gemv_unb_var2 -bli_gemv_unb_var2_qfp -bli_gemv_unf_var1 -bli_gemv_unf_var1_qfp -bli_gemv_unf_var2 -bli_gemv_unf_var2_qfp bli_ger -bli_ger_check bli_ger_ex -bli_ger_ex_qfp -bli_ger_unb_var1 -bli_ger_unb_var1_qfp -bli_ger_unb_var2 -bli_ger_unb_var2_qfp bli_getijm +bli_getijv bli_getopt bli_getopt_init_state bli_getsc -bli_getsc_check -bli_getsc_qfp -bli_gks_cntx_l3_nat_ukr_is_ref -bli_gks_finalize -bli_gks_init -bli_gks_init_index bli_gks_init_ref_cntx bli_gks_l3_ukr_impl_string bli_gks_l3_ukr_impl_type -bli_gks_lookup_ind_cntx -bli_gks_lookup_nat_cntx bli_gks_query_cntx -bli_gks_query_cntx_noinit bli_gks_query_ind_cntx bli_gks_query_nat_cntx -bli_gks_register_cntx bli_hemm -bli_hemm1m -bli_hemm3m1 -bli_hemm3mh -bli_hemm4m1 -bli_hemm4mh -bli_hemm_basic_check -bli_hemm_check bli_hemm_ex -bli_hemm_front -bli_hemmind -bli_hemmind_get_avail -bli_hemmnat bli_hemv -bli_hemv_check bli_hemv_ex -bli_hemv_ex_qfp -bli_hemv_unb_var1 -bli_hemv_unb_var1_qfp -bli_hemv_unb_var2 -bli_hemv_unb_var2_qfp -bli_hemv_unb_var3 -bli_hemv_unb_var3_qfp -bli_hemv_unb_var4 -bli_hemv_unb_var4_qfp -bli_hemv_unf_var1 -bli_hemv_unf_var1a -bli_hemv_unf_var1a_qfp -bli_hemv_unf_var1_qfp -bli_hemv_unf_var3 -bli_hemv_unf_var3a -bli_hemv_unf_var3a_qfp -bli_hemv_unf_var3_qfp bli_her bli_her2 
-bli_her2_check bli_her2_ex -bli_her2_ex_qfp bli_her2k -bli_her2k1m -bli_her2k3m1 -bli_her2k3mh -bli_her2k4m1 -bli_her2k4mh -bli_her2k_basic_check -bli_her2k_check bli_her2k_ex -bli_her2k_front -bli_her2kind -bli_her2kind_get_avail -bli_her2knat -bli_her2_unb_var1 -bli_her2_unb_var1_qfp -bli_her2_unb_var2 -bli_her2_unb_var2_qfp -bli_her2_unb_var3 -bli_her2_unb_var3_qfp -bli_her2_unb_var4 -bli_her2_unb_var4_qfp -bli_her2_unf_var1 -bli_her2_unf_var1_qfp -bli_her2_unf_var4 -bli_her2_unf_var4_qfp -bli_her_check bli_her_ex -bli_her_ex_qfp bli_herk -bli_herk1m -bli_herk3m1 -bli_herk3mh -bli_herk4m1 -bli_herk4mh -bli_herk_basic_check -bli_herk_check -bli_herk_determine_kc -bli_herk_determine_kc_b -bli_herk_determine_kc_f -bli_herk_direct bli_herk_ex -bli_herk_front -bli_herkind -bli_herkind_get_avail -bli_herk_l_ker_var2 -bli_herknat -bli_herk_prune_unref_mparts_k -bli_herk_prune_unref_mparts_m -bli_herk_prune_unref_mparts_n -bli_herk_u_ker_var2 -bli_herk_x_ker_var2 -bli_her_unb_var1 -bli_her_unb_var1_qfp -bli_her_unb_var2 -bli_her_unb_var2_qfp bli_ifprintm bli_ifprintv bli_igetsc @@ -1175,13 +544,8 @@ bli_ind_disable_all_dt bli_ind_disable_dt bli_ind_enable bli_ind_enable_dt -bli_ind_finalize -bli_ind_get_impl_string -bli_ind_init -bli_ind_map_cdt_to_index bli_ind_oper_enable_only bli_ind_oper_find_avail -bli_ind_oper_get_avail bli_ind_oper_get_avail_impl_string bli_ind_oper_is_impl bli_info_get_blas_int_type_size @@ -1189,13 +553,15 @@ bli_info_get_enable_blas bli_info_get_enable_cblas bli_info_get_enable_memkind bli_info_get_enable_openmp +bli_info_get_enable_openmp_as_default bli_info_get_enable_pba_pools bli_info_get_enable_pthreads +bli_info_get_enable_pthreads_as_default bli_info_get_enable_sandbox bli_info_get_enable_sba_pools -bli_info_get_enable_stay_auto_init bli_info_get_enable_threading bli_info_get_gemm_impl_string +bli_info_get_gemmt_impl_string bli_info_get_gemmtrsm_l_ukr_impl_string bli_info_get_gemmtrsm_u_ukr_impl_string bli_info_get_gemm_ukr_impl_string 
@@ -1209,7 +575,14 @@ bli_info_get_int_type_size_str bli_info_get_max_type_size bli_info_get_num_fp_types bli_info_get_page_size -bli_info_get_pool_addr_align_size +bli_info_get_pool_addr_align_size_a +bli_info_get_pool_addr_align_size_b +bli_info_get_pool_addr_align_size_c +bli_info_get_pool_addr_align_size_gen +bli_info_get_pool_addr_offset_size_a +bli_info_get_pool_addr_offset_size_b +bli_info_get_pool_addr_offset_size_c +bli_info_get_pool_addr_offset_size_gen bli_info_get_simd_align_size bli_info_get_simd_num_registers bli_info_get_simd_size @@ -1227,152 +600,57 @@ bli_info_get_trsm_l_ukr_impl_string bli_info_get_trsm_u_ukr_impl_string bli_info_get_version_str bli_init -bli_init_apis -bli_init_auto -bli_init_once bli_invertd -bli_invertd_check bli_invertd_ex -bli_invertd_ex_qfp bli_invertsc -bli_invertsc_check -bli_invertsc_qfp bli_invertv -bli_invertv_check bli_invertv_ex -bli_invertv_ex_qfp -bli_ipow +bli_invscald +bli_invscald_ex +bli_invscalm +bli_invscalm_ex +bli_invscalv +bli_invscalv_ex bli_iprintm -bli_iprintm_ex bli_iprintv -bli_iprintv_ex bli_isetsc -bli_l0_xsc_check -bli_l0_xx2sc_check -bli_l0_xxsc_check -bli_l1d_ax_check -bli_l1d_axy_check -bli_l1d_x_check -bli_l1d_xy_check -bli_l1m_ax_check -bli_l1m_axy_check -bli_l1m_xy_check -bli_l1v_axby_check -bli_l1v_ax_check -bli_l1v_axy_check -bli_l1v_dot_check -bli_l1v_xby_check -bli_l1v_x_check -bli_l1v_xi_check -bli_l1v_xy_check -bli_l3_basic_check -bli_l3_cntl_create_if bli_l3_cntl_free -bli_l3_determine_kc -bli_l3_direct -bli_l3_ind_oper_enable_only -bli_l3_ind_oper_find_avail -bli_l3_ind_oper_get_enable -bli_l3_ind_oper_get_func -bli_l3_ind_oper_set_enable -bli_l3_ind_oper_set_enable_all -bli_l3_ind_set_enable_dt -bli_l3_packm -bli_l3_prune_unref_mparts_k -bli_l3_prune_unref_mparts_m -bli_l3_prune_unref_mparts_n -bli_l3_thread_decorator -bli_l3_thread_entry -bli_l3_thrinfo_create_root -bli_l3_thrinfo_free -bli_l3_thrinfo_free_paths -bli_l3_thrinfo_init_single -bli_l3_thrinfo_print_gemm_paths 
-bli_l3_thrinfo_print_trsm_paths -bli_lcm -bli_lsame +bli_l3_thrinfo_create bli_machval -bli_malloc_intl bli_malloc_user -bli_mbool_create -bli_mbool_free -bli_mbool_init -bli_pba_acquire_m -bli_pba_compute_pool_block_sizes -bli_pba_compute_pool_block_sizes_dt -bli_pba_finalize -bli_pba_finalize_pools -bli_pba_init -bli_pba_init_pools -bli_pba_pool_size -bli_pba_query -bli_pba_release -bli_memsys_finalize -bli_memsys_init bli_mkherm -bli_mkherm_check bli_mkherm_ex -bli_mkherm_ex_qfp bli_mksymm -bli_mksymm_check bli_mksymm_ex -bli_mksymm_ex_qfp bli_mktrim -bli_mktrim_check bli_mktrim_ex -bli_mktrim_ex_qfp bli_mulsc -bli_mulsc_check -bli_mulsc_qfp -bli_next_prime_factor bli_norm1m -bli_norm1m_check bli_norm1m_ex -bli_norm1m_ex_qfp bli_norm1v -bli_norm1v_check bli_norm1v_ex -bli_norm1v_ex_qfp bli_normfm -bli_normfm_check bli_normfm_ex -bli_normfm_ex_qfp bli_normfsc -bli_normfsc_check -bli_normfsc_qfp bli_normfv -bli_normfv_check bli_normfv_ex -bli_normfv_ex_qfp bli_normim -bli_normim_check bli_normim_ex -bli_normim_ex_qfp bli_normiv -bli_normiv_check bli_normiv_ex -bli_normiv_ex_qfp bli_obj_alloc_buffer -bli_obj_alloc_buffer_check bli_obj_attach_buffer -bli_obj_attach_buffer_check bli_obj_create bli_obj_create_1x1 bli_obj_create_1x1_with_attached_buffer -bli_obj_create_check bli_obj_create_conf_to -bli_obj_create_const_check -bli_obj_create_scalar_check bli_obj_create_with_attached_buffer bli_obj_create_without_buffer -bli_obj_create_without_buffer_check bli_obj_equals bli_obj_free -bli_obj_free_check bli_obj_imag_equals bli_obj_imag_is_zero bli_obj_print -bli_obj_print_check bli_obj_scalar_apply_scalar bli_obj_scalar_attach bli_obj_scalar_cast_to @@ -1382,21 +660,16 @@ bli_obj_scalar_has_nonzero_imag bli_obj_scalar_init_detached bli_obj_scalar_init_detached_copy_of bli_obj_scalar_reset -bli_packm_acquire_mpart_l2r -bli_packm_acquire_mpart_t2b -bli_packm_acquire_mpart_tl2br +bli_pack_get_pack_a +bli_pack_get_pack_b +bli_packm_alloc +bli_packm_alloc_ex 
bli_packm_blk_var1 -bli_packm_blk_var1_md bli_packm_cntl_create_node bli_packm_init -bli_packm_init_check -bli_packm_init_pack -bli_packm_int -bli_packm_int_check -bli_packm_offset_to_panel_for -bli_packm_thrinfo_init -bli_packm_thrinfo_init_single -bli_packm_unb_var1 +bli_packm_scalar +bli_pack_set_pack_a +bli_pack_set_pack_b bli_param_map_blis_to_char_conj bli_param_map_blis_to_char_diag bli_param_map_blis_to_char_dt @@ -1414,33 +687,11 @@ bli_param_map_char_to_blis_dt bli_param_map_char_to_blis_side bli_param_map_char_to_blis_trans bli_param_map_char_to_blis_uplo -bli_param_map_netlib_to_blis_diag -bli_param_map_netlib_to_blis_side -bli_param_map_netlib_to_blis_trans -bli_param_map_netlib_to_blis_uplo -bli_partition_2x2 -bli_pblk_print -bli_pool_alloc_block -bli_pool_checkin_block -bli_pool_checkout_block -bli_pool_finalize -bli_pool_free_block -bli_pool_grow -bli_pool_init -bli_pool_print -bli_pool_reinit -bli_pool_shrink -bli_prime_factorization +bli_pba_query bli_printm -bli_printm_ex -bli_print_msg bli_printv -bli_printv_ex bli_projm -bli_projm_check bli_projv -bli_projv_check -bli_prune_unref_mparts bli_pthread_barrier_destroy bli_pthread_barrier_init bli_pthread_barrier_wait @@ -1457,30 +708,22 @@ bli_pthread_mutex_trylock bli_pthread_mutex_unlock bli_pthread_once bli_randm -bli_randm_check bli_randm_ex -bli_randm_ex_qfp bli_randnm -bli_randnm_check bli_randnm_ex -bli_randnm_ex_qfp bli_randnv -bli_randnv_check bli_randnv_ex -bli_randnv_ex_qfp bli_randv -bli_randv_check bli_randv_ex -bli_randv_ex_qfp -bli_rntm_print +bli_rntm_init_from_global +bli_rntm_set_num_threads +bli_rntm_set_ways bli_rntm_set_ways_for_op -bli_rntm_set_ways_from_rntm bli_sabsqsc bli_saddd bli_saddd_ex bli_saddm bli_saddm_ex -bli_saddm_unb_var1 bli_saddsc bli_saddv bli_saddv_ex @@ -1488,7 +731,6 @@ bli_samaxv bli_samaxv_ex bli_sasumv bli_sasumv_ex -bli_sasumv_unb_var1 bli_saxpbyv bli_saxpbyv_ex bli_saxpy2v @@ -1499,65 +741,36 @@ bli_saxpyf bli_saxpyf_ex bli_saxpym bli_saxpym_ex 
-bli_saxpym_unb_var1 bli_saxpyv bli_saxpyv_ex -bli_sba_acquire -bli_sba_checkin_array -bli_sba_checkout_array -bli_sba_finalize -bli_sba_init -bli_sba_query -bli_sba_release -bli_sba_rntm_set_pool bli_scal2d -bli_scal2d_check bli_scal2d_ex -bli_scal2d_ex_qfp bli_scal2m -bli_scal2m_check bli_scal2m_ex -bli_scal2m_ex_qfp bli_scal2v -bli_scal2v_check bli_scal2v_ex -bli_scal2v_ex_qfp bli_scald -bli_scald_check bli_scald_ex -bli_scald_ex_qfp bli_scalm -bli_scalm_check bli_scalm_ex -bli_scalm_ex_qfp bli_scalv -bli_scalv_check bli_scalv_ex -bli_scalv_ex_qfp bli_sccastm bli_sccastnzm bli_sccastv bli_sccopysc -bli_scgemm_ker_var2_md bli_scopyd bli_scopyd_ex bli_scopym bli_scopym_ex -bli_scopym_unb_var1 bli_scopyv bli_scopyv_ex -bli_scpackm_blk_var1_md -bli_scpackm_cxk_1e_md -bli_scpackm_cxk_1r_md -bli_scpackm_struc_cxk_md bli_scxpbym_md bli_scxpbym_md_ex -bli_scxpbym_md_unb_var1 bli_sdcastm bli_sdcastnzm bli_sdcastv bli_sdcopysc -bli_sdgemm_ker_var2_md bli_sdivsc bli_sdotaxpyv bli_sdotaxpyv_ex @@ -1569,187 +782,107 @@ bli_sdotxf bli_sdotxf_ex bli_sdotxv bli_sdotxv_ex -bli_sdpackm_blk_var1_md -bli_sdpackm_cxk_1e_md -bli_sdpackm_cxk_1r_md -bli_sdpackm_struc_cxk_md bli_sdxpbym_md bli_sdxpbym_md_ex -bli_sdxpbym_md_unb_var1 +bli_seqm +bli_seqsc +bli_seqv bli_setd -bli_setd_check bli_setd_ex -bli_setd_ex_qfp bli_setid -bli_setid_check bli_setid_ex -bli_setid_ex_qfp bli_setijm +bli_setijv bli_setim bli_setiv bli_setm -bli_setm_check bli_setm_ex -bli_setm_ex_qfp bli_setrm bli_setrv bli_setsc -bli_setsc_check -bli_setsc_qfp bli_setv -bli_setv_check bli_setv_ex -bli_setv_ex_qfp bli_sfprintm bli_sfprintv bli_sgemm -bli_sgemm1m -bli_sgemm3m1 -bli_sgemm3mh -bli_sgemm4m1 -bli_sgemm4mb -bli_sgemm4mb_ker_var2 -bli_sgemm4mh bli_sgemm_ex -bli_sgemm_ker_var2 -bli_sgemmtrsm_l_ukernel -bli_sgemmtrsm_u_ukernel -bli_sgemm_ukernel +bli_sgemmt +bli_sgemmt_ex bli_sgemv bli_sgemv_ex -bli_sgemv_unb_var1 -bli_sgemv_unb_var2 -bli_sgemv_unf_var1 -bli_sgemv_unf_var2 bli_sger bli_sger_ex -bli_sger_unb_var1 
-bli_sger_unb_var2 bli_sgetijm +bli_sgetijv bli_sgetsc bli_shemm -bli_shemm1m -bli_shemm3m1 -bli_shemm3mh -bli_shemm4m1 -bli_shemm4mh bli_shemm_ex bli_shemv bli_shemv_ex -bli_shemv_unb_var1 -bli_shemv_unb_var2 -bli_shemv_unb_var3 -bli_shemv_unb_var4 -bli_shemv_unf_var1 -bli_shemv_unf_var1a -bli_shemv_unf_var3 -bli_shemv_unf_var3a bli_sher bli_sher2 bli_sher2_ex bli_sher2k -bli_sher2k1m -bli_sher2k3m1 -bli_sher2k3mh -bli_sher2k4m1 -bli_sher2k4mh bli_sher2k_ex -bli_sher2_unb_var1 -bli_sher2_unb_var2 -bli_sher2_unb_var3 -bli_sher2_unb_var4 -bli_sher2_unf_var1 -bli_sher2_unf_var4 bli_sher_ex bli_sherk -bli_sherk1m -bli_sherk3m1 -bli_sherk3mh -bli_sherk4m1 -bli_sherk4mh bli_sherk_ex -bli_sherk_l_ker_var2 -bli_sherk_u_ker_var2 -bli_sher_unb_var1 -bli_sher_unb_var2 bli_shiftd -bli_shiftd_check bli_shiftd_ex -bli_shiftd_ex_qfp bli_sinvertd bli_sinvertd_ex bli_sinvertsc bli_sinvertv bli_sinvertv_ex -bli_slamch +bli_sinvscald +bli_sinvscald_ex +bli_sinvscalm +bli_sinvscalm_ex +bli_sinvscalv +bli_sinvscalv_ex bli_sleep bli_smachval bli_smkherm bli_smkherm_ex -bli_smkherm_unb_var1 bli_smksymm bli_smksymm_ex -bli_smksymm_unb_var1 bli_smktrim bli_smktrim_ex -bli_smktrim_unb_var1 bli_smulsc bli_snorm1m bli_snorm1m_ex -bli_snorm1m_unb_var1 bli_snorm1v bli_snorm1v_ex -bli_snorm1v_unb_var1 bli_snormfm bli_snormfm_ex -bli_snormfm_unb_var1 bli_snormfsc bli_snormfv bli_snormfv_ex -bli_snormfv_unb_var1 bli_snormim bli_snormim_ex -bli_snormim_unb_var1 bli_snormiv bli_snormiv_ex -bli_snormiv_unb_var1 -bli_spackm_blk_var1 -bli_spackm_cxk -bli_spackm_herm_cxk -bli_spackm_struc_cxk -bli_spackm_tri_cxk -bli_spackm_unb_var1 bli_sprintm -bli_sprintm_ex bli_sprintv -bli_sprintv_ex bli_sqrtsc -bli_sqrtsc_check -bli_sqrtsc_qfp bli_srandm bli_srandm_ex -bli_srandm_unb_var1 bli_srandnm bli_srandnm_ex -bli_srandnm_unb_var1 bli_srandnv bli_srandnv_ex -bli_srandnv_unb_var1 bli_srandv bli_srandv_ex -bli_srandv_unb_var1 bli_sscal2d bli_sscal2d_ex bli_sscal2m bli_sscal2m_ex -bli_sscal2m_unb_var1 
bli_sscal2v bli_sscal2v_ex bli_sscald bli_sscald_ex bli_sscalm bli_sscalm_ex -bli_sscalm_unb_var1 bli_sscalv bli_sscalv_ex bli_sscastm @@ -1761,42 +894,29 @@ bli_ssetd_ex bli_ssetid bli_ssetid_ex bli_ssetijm +bli_ssetijv bli_ssetm bli_ssetm_ex -bli_ssetm_unb_var1 bli_ssetsc bli_ssetv bli_ssetv_ex -bli_ssgemm_ker_var2_md bli_sshiftd bli_sshiftd_ex -bli_sspackm_blk_var1_md -bli_sspackm_cxk_1e_md -bli_sspackm_cxk_1r_md -bli_sspackm_struc_cxk_md bli_ssqrtsc bli_ssubd bli_ssubd_ex bli_ssubm bli_ssubm_ex -bli_ssubm_unb_var1 bli_ssubsc bli_ssubv bli_ssubv_ex bli_ssumsqv bli_ssumsqv_ex -bli_ssumsqv_unb_var1 bli_sswapv bli_sswapv_ex bli_ssxpbym_md bli_ssxpbym_md_ex -bli_ssxpbym_md_unb_var1 bli_ssymm -bli_ssymm1m -bli_ssymm3m1 -bli_ssymm3mh -bli_ssymm4m1 -bli_ssymm4mh bli_ssymm_ex bli_ssymv bli_ssymv_ex @@ -1804,330 +924,99 @@ bli_ssyr bli_ssyr2 bli_ssyr2_ex bli_ssyr2k -bli_ssyr2k1m -bli_ssyr2k3m1 -bli_ssyr2k3mh -bli_ssyr2k4m1 -bli_ssyr2k4mh bli_ssyr2k_ex bli_ssyr_ex bli_ssyrk -bli_ssyrk1m -bli_ssyrk3m1 -bli_ssyrk3mh -bli_ssyrk4m1 -bli_ssyrk4mh bli_ssyrk_ex -bli_string_mkupper bli_strmm -bli_strmm1m bli_strmm3 -bli_strmm31m -bli_strmm33m1 -bli_strmm33mh -bli_strmm34m1 -bli_strmm34mh bli_strmm3_ex -bli_strmm3m1 -bli_strmm4m1 bli_strmm_ex -bli_strmm_ll_ker_var2 -bli_strmm_lu_ker_var2 -bli_strmm_rl_ker_var2 -bli_strmm_ru_ker_var2 bli_strmv bli_strmv_ex -bli_strmv_unb_var1 -bli_strmv_unb_var2 -bli_strmv_unf_var1 -bli_strmv_unf_var2 bli_strsm -bli_strsm1m -bli_strsm3m1 -bli_strsm4m1 bli_strsm_ex -bli_strsm_ll_ker_var2 -bli_strsm_l_ukernel -bli_strsm_lu_ker_var2 -bli_strsm_rl_ker_var2 -bli_strsm_ru_ker_var2 -bli_strsm_u_ukernel bli_strsv bli_strsv_ex -bli_strsv_unb_var1 -bli_strsv_unb_var2 -bli_strsv_unf_var1 -bli_strsv_unf_var2 bli_subd -bli_subd_check bli_subd_ex -bli_subd_ex_qfp bli_subm -bli_subm_check bli_subm_ex -bli_subm_ex_qfp bli_subsc -bli_subsc_check -bli_subsc_qfp bli_subv -bli_subv_check bli_subv_ex -bli_subv_ex_qfp bli_sumsqv -bli_sumsqv_check bli_sumsqv_ex 
-bli_sumsqv_ex_qfp -bli_sunpackm_blk_var1 -bli_sunpackm_cxk -bli_sunpackm_unb_var1 bli_sunzipsc bli_swapv -bli_swapv_check bli_swapv_ex -bli_swapv_ex_qfp bli_sxpbyd bli_sxpbyd_ex bli_sxpbym bli_sxpbym_ex -bli_sxpbym_unb_var1 bli_sxpbyv bli_sxpbyv_ex bli_symm -bli_symm1m -bli_symm3m1 -bli_symm3mh -bli_symm4m1 -bli_symm4mh -bli_symm_check bli_symm_ex -bli_symm_front -bli_symmind -bli_symmind_get_avail -bli_symmnat bli_symv -bli_symv_check bli_symv_ex -bli_symv_ex_qfp bli_syr bli_syr2 -bli_syr2_check bli_syr2_ex -bli_syr2_ex_qfp bli_syr2k -bli_syr2k1m -bli_syr2k3m1 -bli_syr2k3mh -bli_syr2k4m1 -bli_syr2k4mh -bli_syr2k_check bli_syr2k_ex -bli_syr2k_front -bli_syr2kind -bli_syr2kind_get_avail -bli_syr2knat -bli_syr_check bli_syr_ex -bli_syr_ex_qfp bli_syrk -bli_syrk1m -bli_syrk3m1 -bli_syrk3mh -bli_syrk4m1 -bli_syrk4mh -bli_syrk_check bli_syrk_ex -bli_syrk_front -bli_syrkind -bli_syrkind_get_avail -bli_syrknat bli_szcastm bli_szcastnzm bli_szcastv bli_szcopysc -bli_szgemm_ker_var2_md bli_szipsc -bli_szpackm_blk_var1_md -bli_szpackm_cxk_1e_md -bli_szpackm_cxk_1r_md -bli_szpackm_struc_cxk_md bli_szxpbym_md bli_szxpbym_md_ex -bli_szxpbym_md_unb_var1 bli_thrcomm_barrier -bli_thrcomm_barrier_atomic bli_thrcomm_bcast -bli_thrcomm_cleanup -bli_thrcomm_create -bli_thrcomm_free -bli_thrcomm_init -bli_thread_finalize -bli_thread_get_env bli_thread_get_ic_nt bli_thread_get_ir_nt bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_init -bli_thread_init_rntm -bli_thread_init_rntm_from_env -bli_thread_range_b2t -bli_thread_range_l2r -bli_thread_range_mdim -bli_thread_range_ndim -bli_thread_range_r2l +bli_thread_get_thread_impl +bli_thread_get_thread_impl_str bli_thread_range_sub -bli_thread_range_t2b -bli_thread_range_weighted_b2t -bli_thread_range_weighted_l2r -bli_thread_range_weighted_r2l -bli_thread_range_weighted_sub -bli_thread_range_weighted_t2b -bli_thread_range_width_l bli_thread_set_num_threads bli_thread_set_num_threads_ 
+bli_thread_set_thread_impl bli_thread_set_ways bli_thread_set_ways_ -bli_thrinfo_create -bli_thrinfo_create_for_cntl -bli_thrinfo_create_for_cntl_prenode bli_thrinfo_free -bli_thrinfo_grow -bli_thrinfo_init -bli_thrinfo_init_single -bli_thrinfo_rgrow -bli_thrinfo_rgrow_prenode bli_trmm -bli_trmm1m bli_trmm3 -bli_trmm31m -bli_trmm33m1 -bli_trmm33mh -bli_trmm34m1 -bli_trmm34mh bli_trmm3_ex -bli_trmm3_front -bli_trmm3ind -bli_trmm3ind_get_avail -bli_trmm3m1 -bli_trmm3nat -bli_trmm4m1 -bli_trmm_check -bli_trmm_determine_kc -bli_trmm_determine_kc_b -bli_trmm_determine_kc_f -bli_trmm_direct bli_trmm_ex -bli_trmm_front -bli_trmmind -bli_trmmind_get_avail -bli_trmm_ll_ker_var2 -bli_trmm_lu_ker_var2 -bli_trmmnat -bli_trmm_prune_unref_mparts_k -bli_trmm_prune_unref_mparts_m -bli_trmm_prune_unref_mparts_n -bli_trmm_rl_ker_var2 -bli_trmm_ru_ker_var2 -bli_trmm_xx_ker_var2 bli_trmv -bli_trmv_check bli_trmv_ex -bli_trmv_ex_qfp -bli_trmv_unb_var1 -bli_trmv_unb_var1_qfp -bli_trmv_unb_var2 -bli_trmv_unb_var2_qfp -bli_trmv_unf_var1 -bli_trmv_unf_var1_qfp -bli_trmv_unf_var2 -bli_trmv_unf_var2_qfp bli_trsm -bli_trsm1m -bli_trsm3m1 -bli_trsm4m1 -bli_trsm_blk_var1 -bli_trsm_blk_var2 -bli_trsm_blk_var3 -bli_trsm_check -bli_trsm_cntl_create -bli_trsm_cntl_create_node -bli_trsm_cntl_free -bli_trsm_determine_kc -bli_trsm_determine_kc_b -bli_trsm_determine_kc_f -bli_trsm_direct bli_trsm_ex -bli_trsm_front -bli_trsmind -bli_trsmind_get_avail -bli_trsm_int -bli_trsm_l_cntl_create -bli_trsm_ll_ker_var2 -bli_trsm_l_ukernel_qfp -bli_trsm_lu_ker_var2 -bli_trsmnat -bli_trsm_packa -bli_trsm_packb -bli_trsm_prune_unref_mparts_k -bli_trsm_prune_unref_mparts_m -bli_trsm_prune_unref_mparts_n -bli_trsm_r_cntl_create -bli_trsm_rl_ker_var2 -bli_trsm_ru_ker_var2 bli_trsm_ukernel -bli_trsm_u_ukernel_qfp -bli_trsm_xx_ker_var2 bli_trsv -bli_trsv_check bli_trsv_ex -bli_trsv_ex_qfp -bli_trsv_unb_var1 -bli_trsv_unb_var1_qfp -bli_trsv_unb_var2 -bli_trsv_unb_var2_qfp -bli_trsv_unf_var1 -bli_trsv_unf_var1_qfp 
-bli_trsv_unf_var2 -bli_trsv_unf_var2_qfp -bli_unpackm_blk_var1 -bli_unpackm_cntl_create_node -bli_unpackm_int -bli_unpackm_int_check -bli_unpackm_unb_var1 bli_unzipsc -bli_unzipsc_check -bli_unzipsc_qfp -bli_utilm_fprint_check -bli_utilm_mkhst_check -bli_utilm_norm_check -bli_utilm_rand_check -bli_utilv_norm_check -bli_utilv_sumsqv_check -bli_utilv_xa_check bli_xpbyd -bli_xpbyd_check bli_xpbyd_ex -bli_xpbyd_ex_qfp bli_xpbym -bli_xpbym_check bli_xpbym_ex -bli_xpbym_ex_qfp bli_xpbym_md bli_xpbym_md_ex -bli_xpbym_md_ex_qfp2 bli_xpbyv -bli_xpbyv_check bli_xpbyv_ex -bli_xpbyv_ex_qfp -bli_xxmv_check -bli_xxr_check bli_zabsqsc bli_zaddd bli_zaddd_ex bli_zaddm bli_zaddm_ex -bli_zaddm_unb_var1 bli_zaddsc bli_zaddv bli_zaddv_ex @@ -2135,7 +1024,6 @@ bli_zamaxv bli_zamaxv_ex bli_zasumv bli_zasumv_ex -bli_zasumv_unb_var1 bli_zaxpbyv bli_zaxpbyv_ex bli_zaxpy2v @@ -2146,33 +1034,24 @@ bli_zaxpyf bli_zaxpyf_ex bli_zaxpym bli_zaxpym_ex -bli_zaxpym_unb_var1 bli_zaxpyv bli_zaxpyv_ex bli_zccastm bli_zccastnzm bli_zccastv bli_zccopysc -bli_zcgemm_ker_var2_md bli_zcopyd bli_zcopyd_ex bli_zcopym bli_zcopym_ex -bli_zcopym_unb_var1 bli_zcopyv bli_zcopyv_ex -bli_zcpackm_blk_var1_md -bli_zcpackm_cxk_1e_md -bli_zcpackm_cxk_1r_md -bli_zcpackm_struc_cxk_md bli_zcxpbym_md bli_zcxpbym_md_ex -bli_zcxpbym_md_unb_var1 bli_zdcastm bli_zdcastnzm bli_zdcastv bli_zdcopysc -bli_zdgemm_ker_var2_md bli_zdivsc bli_zdotaxpyv bli_zdotaxpyv_ex @@ -2184,174 +1063,89 @@ bli_zdotxf bli_zdotxf_ex bli_zdotxv bli_zdotxv_ex -bli_zdpackm_blk_var1_md -bli_zdpackm_cxk_1e_md -bli_zdpackm_cxk_1r_md -bli_zdpackm_struc_cxk_md bli_zdxpbym_md bli_zdxpbym_md_ex -bli_zdxpbym_md_unb_var1 +bli_zeqm +bli_zeqsc +bli_zeqv bli_zfprintm bli_zfprintv bli_zgemm -bli_zgemm1m -bli_zgemm3m1 -bli_zgemm3mh -bli_zgemm4m1 -bli_zgemm4mb -bli_zgemm4mb_ker_var2 -bli_zgemm4mh bli_zgemm_ex -bli_zgemm_ker_var2 -bli_zgemm_md_c2r_ref -bli_zgemmtrsm_l_ukernel -bli_zgemmtrsm_u_ukernel -bli_zgemm_ukernel +bli_zgemmt +bli_zgemmt_ex bli_zgemv 
bli_zgemv_ex -bli_zgemv_unb_var1 -bli_zgemv_unb_var2 -bli_zgemv_unf_var1 -bli_zgemv_unf_var2 bli_zger bli_zger_ex -bli_zger_unb_var1 -bli_zger_unb_var2 bli_zgetijm +bli_zgetijv bli_zgetsc bli_zhemm -bli_zhemm1m -bli_zhemm3m1 -bli_zhemm3mh -bli_zhemm4m1 -bli_zhemm4mh bli_zhemm_ex bli_zhemv bli_zhemv_ex -bli_zhemv_unb_var1 -bli_zhemv_unb_var2 -bli_zhemv_unb_var3 -bli_zhemv_unb_var4 -bli_zhemv_unf_var1 -bli_zhemv_unf_var1a -bli_zhemv_unf_var3 -bli_zhemv_unf_var3a bli_zher bli_zher2 bli_zher2_ex bli_zher2k -bli_zher2k1m -bli_zher2k3m1 -bli_zher2k3mh -bli_zher2k4m1 -bli_zher2k4mh bli_zher2k_ex -bli_zher2_unb_var1 -bli_zher2_unb_var2 -bli_zher2_unb_var3 -bli_zher2_unb_var4 -bli_zher2_unf_var1 -bli_zher2_unf_var4 bli_zher_ex bli_zherk -bli_zherk1m -bli_zherk3m1 -bli_zherk3mh -bli_zherk4m1 -bli_zherk4mh bli_zherk_ex -bli_zherk_l_ker_var2 -bli_zherk_u_ker_var2 -bli_zher_unb_var1 -bli_zher_unb_var2 bli_zinvertd bli_zinvertd_ex bli_zinvertsc bli_zinvertv bli_zinvertv_ex +bli_zinvscald +bli_zinvscald_ex +bli_zinvscalm +bli_zinvscalm_ex +bli_zinvscalv +bli_zinvscalv_ex bli_zipsc -bli_zipsc_check -bli_zipsc_qfp bli_zmachval bli_zmkherm bli_zmkherm_ex -bli_zmkherm_unb_var1 bli_zmksymm bli_zmksymm_ex -bli_zmksymm_unb_var1 bli_zmktrim bli_zmktrim_ex -bli_zmktrim_unb_var1 bli_zmulsc bli_znorm1m bli_znorm1m_ex -bli_znorm1m_unb_var1 bli_znorm1v bli_znorm1v_ex -bli_znorm1v_unb_var1 bli_znormfm bli_znormfm_ex -bli_znormfm_unb_var1 bli_znormfsc bli_znormfv bli_znormfv_ex -bli_znormfv_unb_var1 bli_znormim bli_znormim_ex -bli_znormim_unb_var1 bli_znormiv bli_znormiv_ex -bli_znormiv_unb_var1 -bli_zpackm_blk_var1 -bli_zpackm_cxk -bli_zpackm_cxk_1er -bli_zpackm_cxk_3mis -bli_zpackm_cxk_4mi -bli_zpackm_cxk_rih -bli_zpackm_herm_cxk -bli_zpackm_herm_cxk_1er -bli_zpackm_herm_cxk_3mis -bli_zpackm_herm_cxk_4mi -bli_zpackm_herm_cxk_rih -bli_zpackm_struc_cxk -bli_zpackm_struc_cxk_1er -bli_zpackm_struc_cxk_3mis -bli_zpackm_struc_cxk_4mi -bli_zpackm_struc_cxk_rih -bli_zpackm_tri_cxk 
-bli_zpackm_tri_cxk_1er -bli_zpackm_tri_cxk_3mis -bli_zpackm_tri_cxk_4mi -bli_zpackm_tri_cxk_rih -bli_zpackm_unb_var1 bli_zprintm -bli_zprintm_ex bli_zprintv -bli_zprintv_ex bli_zrandm bli_zrandm_ex -bli_zrandm_unb_var1 bli_zrandnm bli_zrandnm_ex -bli_zrandnm_unb_var1 bli_zrandnv bli_zrandnv_ex -bli_zrandnv_unb_var1 bli_zrandv bli_zrandv_ex -bli_zrandv_unb_var1 bli_zscal2d bli_zscal2d_ex bli_zscal2m bli_zscal2m_ex -bli_zscal2m_unb_var1 bli_zscal2v bli_zscal2v_ex bli_zscald bli_zscald_ex bli_zscalm bli_zscalm_ex -bli_zscalm_unb_var1 bli_zscalv bli_zscalv_ex bli_zscastm @@ -2363,42 +1157,29 @@ bli_zsetd_ex bli_zsetid bli_zsetid_ex bli_zsetijm +bli_zsetijv bli_zsetm bli_zsetm_ex -bli_zsetm_unb_var1 bli_zsetsc bli_zsetv bli_zsetv_ex -bli_zsgemm_ker_var2_md bli_zshiftd bli_zshiftd_ex -bli_zspackm_blk_var1_md -bli_zspackm_cxk_1e_md -bli_zspackm_cxk_1r_md -bli_zspackm_struc_cxk_md bli_zsqrtsc bli_zsubd bli_zsubd_ex bli_zsubm bli_zsubm_ex -bli_zsubm_unb_var1 bli_zsubsc bli_zsubv bli_zsubv_ex bli_zsumsqv bli_zsumsqv_ex -bli_zsumsqv_unb_var1 bli_zswapv bli_zswapv_ex bli_zsxpbym_md bli_zsxpbym_md_ex -bli_zsxpbym_md_unb_var1 bli_zsymm -bli_zsymm1m -bli_zsymm3m1 -bli_zsymm3mh -bli_zsymm4m1 -bli_zsymm4mh bli_zsymm_ex bli_zsymv bli_zsymv_ex @@ -2406,85 +1187,37 @@ bli_zsyr bli_zsyr2 bli_zsyr2_ex bli_zsyr2k -bli_zsyr2k1m -bli_zsyr2k3m1 -bli_zsyr2k3mh -bli_zsyr2k4m1 -bli_zsyr2k4mh bli_zsyr2k_ex bli_zsyr_ex bli_zsyrk -bli_zsyrk1m -bli_zsyrk3m1 -bli_zsyrk3mh -bli_zsyrk4m1 -bli_zsyrk4mh bli_zsyrk_ex bli_ztrmm -bli_ztrmm1m bli_ztrmm3 -bli_ztrmm31m -bli_ztrmm33m1 -bli_ztrmm33mh -bli_ztrmm34m1 -bli_ztrmm34mh bli_ztrmm3_ex -bli_ztrmm3m1 -bli_ztrmm4m1 bli_ztrmm_ex -bli_ztrmm_ll_ker_var2 -bli_ztrmm_lu_ker_var2 -bli_ztrmm_rl_ker_var2 -bli_ztrmm_ru_ker_var2 bli_ztrmv bli_ztrmv_ex -bli_ztrmv_unb_var1 -bli_ztrmv_unb_var2 -bli_ztrmv_unf_var1 -bli_ztrmv_unf_var2 bli_ztrsm -bli_ztrsm1m -bli_ztrsm3m1 -bli_ztrsm4m1 bli_ztrsm_ex -bli_ztrsm_ll_ker_var2 -bli_ztrsm_l_ukernel -bli_ztrsm_lu_ker_var2 
-bli_ztrsm_rl_ker_var2 -bli_ztrsm_ru_ker_var2 -bli_ztrsm_u_ukernel bli_ztrsv bli_ztrsv_ex -bli_ztrsv_unb_var1 -bli_ztrsv_unb_var2 -bli_ztrsv_unf_var1 -bli_ztrsv_unf_var2 -bli_zunpackm_blk_var1 -bli_zunpackm_cxk -bli_zunpackm_unb_var1 bli_zunzipsc bli_zxpbyd bli_zxpbyd_ex bli_zxpbym bli_zxpbym_ex -bli_zxpbym_unb_var1 bli_zxpbyv bli_zxpbyv_ex bli_zzcastm bli_zzcastnzm bli_zzcastv bli_zzcopysc -bli_zzgemm_ker_var2_md bli_zzipsc -bli_zzpackm_blk_var1_md -bli_zzpackm_cxk_1e_md -bli_zzpackm_cxk_1r_md -bli_zzpackm_struc_cxk_md bli_zzxpbym_md bli_zzxpbym_md_ex -bli_zzxpbym_md_unb_var1 sasum_ sasumsub_ +saxpby_ saxpy_ scabs1_ scasum_ @@ -2498,6 +1231,8 @@ sdsdot_ sdsdotsub_ sgbmv_ sgemm_ +sgemm_batch_ +sgemmt_ sgemv_ sger_ snrm2_ @@ -2528,6 +1263,7 @@ strsm_ strsv_ dasum_ dasumsub_ +daxpby_ daxpy_ dcabs1_ dcopy_ @@ -2535,6 +1271,8 @@ ddot_ ddotsub_ dgbmv_ dgemm_ +dgemm_batch_ +dgemmt_ dgemv_ dger_ dnrm2_ @@ -2569,6 +1307,7 @@ dzasum_ dzasumsub_ dznrm2_ dznrm2sub_ +caxpby_ caxpy_ ccopy_ cdotc_ @@ -2577,6 +1316,9 @@ cdotu_ cdotusub_ cgbmv_ cgemm_ +cgemm3m_ +cgemm_batch_ +cgemmt_ cgemv_ cgerc_ cgeru_ @@ -2606,6 +1348,7 @@ ctrmm_ ctrmv_ ctrsm_ ctrsv_ +zaxpby_ zaxpy_ zcopy_ zdotc_ @@ -2616,6 +1359,9 @@ zdrot_ zdscal_ zgbmv_ zgemm_ +zgemm3m_ +zgemm_batch_ +zgemmt_ zgemv_ zgerc_ zgeru_ @@ -2651,12 +1397,16 @@ isamax_ isamaxsub_ izamax_ izamaxsub_ +cblas_caxpby cblas_caxpy cblas_ccopy cblas_cdotc_sub cblas_cdotu_sub cblas_cgbmv cblas_cgemm +cblas_cgemm3m +cblas_cgemm_batch +cblas_cgemmt cblas_cgemv cblas_cgerc cblas_cgeru @@ -2685,11 +1435,14 @@ cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_dasum +cblas_daxpby cblas_daxpy cblas_dcopy cblas_ddot cblas_dgbmv cblas_dgemm +cblas_dgemm_batch +cblas_dgemmt cblas_dgemv cblas_dger cblas_dnrm2 @@ -2725,6 +1478,7 @@ cblas_idamax cblas_isamax cblas_izamax cblas_sasum +cblas_saxpby cblas_saxpy cblas_scasum cblas_scnrm2 @@ -2733,6 +1487,8 @@ cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm +cblas_sgemm_batch +cblas_sgemmt cblas_sgemv cblas_sger 
cblas_snrm2 @@ -2761,6 +1517,7 @@ cblas_strmv cblas_strsm cblas_strsv cblas_xerbla +cblas_zaxpby cblas_zaxpy cblas_zcopy cblas_zdotc_sub @@ -2768,6 +1525,9 @@ cblas_zdotu_sub cblas_zdscal cblas_zgbmv cblas_zgemm +cblas_zgemm3m +cblas_zgemm_batch +cblas_zgemmt cblas_zgemv cblas_zgerc cblas_zgeru diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 325ed0ecff..4888cbdaa7 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,9 +48,8 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* a, \ obj_t* p, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - const thrinfo_t* thread \ + const cntl_t* cntl, \ + thrinfo_t* thread \ ); GENTDEF( packm ) diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index 0a641cf9e5..5547cfadf8 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -78,7 +78,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -143,7 +143,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -217,7 +217,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -306,7 +306,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -374,7 +374,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )alpha, \ x, rs_x, cs_x, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -423,7 +423,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ ( 
cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ return; \ @@ -443,7 +443,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )beta, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -525,7 +525,7 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( ctype_y* )beta, \ y, rs_y, cs_y, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 7d73bf903e..80878fba01 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -39,6 +39,7 @@ #include "bli_packm_init.h" #include "bli_packm_int.h" #include "bli_packm_scalar.h" +#include "bli_packm_thrinfo.h" #include "bli_packm_part.h" diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index 07f54de786..8e4ede2597 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -38,9 +38,8 @@ void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Query the pack buffer type from the control tree node. @@ -50,35 +49,32 @@ void* bli_packm_alloc ( size_needed, pack_buf_type, - rntm, - cntl, thread ); } void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ) { // Query the address of the mem_t entry within the control tree node. 
- mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); + mem_t* cntl_mem_p = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); mem_t* local_mem_p; mem_t local_mem_s; - siz_t cntl_mem_size = 0; + siz_t cntl_mem_size = 0; if ( bli_mem_is_alloc( cntl_mem_p ) ) cntl_mem_size = bli_mem_size( cntl_mem_p ); if ( cntl_mem_size < size_needed ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a @@ -87,14 +83,14 @@ void* bli_packm_alloc_ex { bli_pba_release ( - rntm, + pba, cntl_mem_p ); } bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, &local_mem_s @@ -103,7 +99,7 @@ void* bli_packm_alloc_ex // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_broadcast( rntm, thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in // this thread's control tree node. @@ -111,7 +107,7 @@ void* bli_packm_alloc_ex // Barrier so that the master thread doesn't return from the function // before we are done reading. 
- bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); } return bli_mem_buffer( cntl_mem_p ); diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index aec2e1af53..e308709b0e 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -35,17 +35,14 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 601f2c05c5..71aaeb67f6 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -35,7 +35,6 @@ #include "blis.h" - static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { /* float (0) scomplex (1) double (2) dcomplex (3) */ @@ -57,9 +56,8 @@ void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Extract various fields from the control tree. @@ -71,7 +69,7 @@ void bli_packm_blk_var1 // Every thread initializes p and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t // entry in the control tree node). Return early if no packing is required. - if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) + if ( !bli_packm_init( c, p, cntx, cntl, thread ) ) return; // Check parameters. @@ -161,8 +159,8 @@ void bli_packm_blk_var1 // Query the number of threads and thread ids from the current thread's // packm thrinfo_t node. 
- const dim_t nt = bli_thread_n_way( thread ); - const dim_t tid = bli_thread_work_id( thread ); + const dim_t nt = bli_thread_num_threads( thread ); + const dim_t tid = bli_thread_thread_id( thread ); // Determine the thread range and increment using the current thread's // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() @@ -272,7 +270,7 @@ void bli_packm_blk_var1 p_use, ldp, is_p_use, ( cntx_t* )cntx, - params ); + bli_cntl_params( cntl ) ); } // NOTE: This value is usually LESS than ps_p because triangular @@ -304,7 +302,7 @@ void bli_packm_blk_var1 c_begin, incc, ldc, p_begin, ldp, is_p, ( cntx_t* )cntx, - params ); + bli_cntl_params( cntl ) ); } } diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 5797e3b941..aad0800784 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -52,8 +52,7 @@ BLIS_EXPORT_BLIS void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* t + const cntl_t* cntl, + thrinfo_t* t ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index e99ed9cf3d..7f74010451 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -37,7 +37,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, @@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node #endif // Allocate a packm_params_t struct. - params = bli_sba_acquire( rntm, sizeof( packm_params_t ) ); + params = bli_sba_acquire( pool, sizeof( packm_params_t ) ); // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); @@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node // sync with the cntl_t tree. 
cntl = bli_cntl_create_node ( - rntm, + pool, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index be0fc8fdef..8a43f711d1 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl ) cntl_t* bli_packm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 67e02ac0e5..d4480f2c1f 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -40,9 +40,8 @@ bool bli_packm_init const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -179,7 +178,7 @@ bool bli_packm_init // Update the buffer address in p to point to the buffer associated // with the mem_t entry acquired from the memory broker (now cached in // the control tree node). 
- void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + void* buffer = bli_packm_alloc( size_p, cntl, thread ); bli_obj_set_buffer( buffer, p ); return true; diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 6f9b472736..b34bd53799 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -37,8 +37,7 @@ BLIS_EXPORT_BLIS bool bli_packm_init const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index ae788e671d..b918d50bd7 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,9 +39,8 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -51,7 +50,7 @@ void bli_packm_int // Barrier so that we know threads are done with previous computation // with the same packing buffer before starting to pack. - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); // Invoke the variant with kappa_use. f @@ -59,12 +58,11 @@ void bli_packm_int a, p, cntx, - rntm, cntl, thread ); // Barrier so that packing is done before computation. 
- bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index a4cf17d592..b7720cd3e6 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,7 +37,6 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c deleted file mode 100644 index 4b57971ef2..0000000000 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ) -{ - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, work_id, - FALSE, - BLIS_NO_PART, - sub_node - ); -} - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_packm_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - BLIS_NO_PART, - NULL - ); -} - diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 85b61931c1..a49ff94991 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -64,42 +64,3 @@ #endif - -// -// thrinfo_t APIs specific to packm. 
-// - -#if 0 -thrinfo_t* bli_packm_thrinfo_create - ( - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); -#endif - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ); - -#if 0 -void bli_packm_thrinfo_free - ( - thrinfo_t* thread - ); -#endif - diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 95d0545bec..d879e9dfec 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -37,7 +37,7 @@ cntl_t* bli_unpackm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node @@ -64,7 +64,7 @@ cntl_t* bli_unpackm_cntl_create_node // sync with the cntl_t tree. cntl = bli_cntl_create_node ( - rntm, + pool, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 5c41d94657..87298826ab 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -48,7 +48,7 @@ typedef struct unpackm_params_s unpackm_params_t; cntl_t* bli_unpackm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 3b542b0617..82fa6ccc42 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -61,7 +61,7 @@ void bli_unpackm_int f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { f ( @@ -74,6 +74,6 @@ void bli_unpackm_int } // Barrier so that unpacking is done before computation. 
- bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); } diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index d7fd9649e8..35cabf6b80 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -44,8 +44,8 @@ void bli_l3_cntl_create_if const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, + pool_t* pool, + const cntl_t* cntl_orig, cntl_t** cntl_use ) { @@ -59,7 +59,7 @@ void bli_l3_cntl_create_if { *cntl_use = bli_gemm_cntl_create ( - rntm, + pool, family, schema_a, schema_b, @@ -75,7 +75,7 @@ void bli_l3_cntl_create_if *cntl_use = bli_trsm_cntl_create ( - rntm, + pool, side, schema_a, schema_b, @@ -88,7 +88,7 @@ void bli_l3_cntl_create_if // If the user provided a control tree, create a copy and use it // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). - *cntl_use = bli_cntl_copy( rntm, cntl_orig ); + *cntl_use = bli_cntl_copy( pool, cntl_orig ); // Recursively set the family fields of the newly copied control tree // nodes. 
@@ -98,9 +98,8 @@ void bli_l3_cntl_create_if void bli_l3_cntl_free ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl_use ) { // NOTE: We don't actually need to call separate _cntl_free() functions @@ -114,11 +113,11 @@ void bli_l3_cntl_free family == BLIS_GEMMT || family == BLIS_TRMM ) { - bli_gemm_cntl_free( rntm, cntl_use, thread ); + bli_gemm_cntl_free( pool, cntl_use ); } else // if ( family == BLIS_TRSM ) { - bli_trsm_cntl_free( rntm, cntl_use, thread ); + bli_trsm_cntl_free( pool, cntl_use ); } } diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index eb4321ecd7..337f82c319 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -46,15 +46,15 @@ void bli_l3_cntl_create_if const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, + pool_t* pool, + const cntl_t* cntl_orig, cntl_t** cntl_use ); +BLIS_EXPORT_BLIS void bli_l3_cntl_free ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl_use ); diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index b9d3898391..07c5e6dcb8 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -42,8 +42,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -68,9 +67,9 @@ void bli_l3_int if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); return; } @@ -82,9 +81,9 @@ void bli_l3_int // This should never execute. 
bli_abort(); - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); return; } @@ -130,9 +129,6 @@ void bli_l3_int if ( !bli_obj_equals( beta, &BLIS_ONE ) ) bli_obj_scalar_apply_scalar( beta, &c_local ); - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - // Extract the function pointer from the current control tree node. l3_var_oft f = bli_cntl_var_func( cntl ); @@ -143,7 +139,6 @@ void bli_l3_int &b_local, &c_local, cntx, - rntm, cntl, thread ); diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index 65485206de..8364d91e4f 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -40,8 +40,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 16e5f15de3..2599b0c2c1 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -50,7 +50,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -90,8 +90,8 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -122,7 +122,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) bli_gemm_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. 
- bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_gemm_front( alpha, a, b, beta, c, cntx, &rntm_l, NULL ); } #endif @@ -136,7 +136,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -157,8 +157,8 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -186,7 +186,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) bli_gemmt_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_gemmt_front( alpha, a, b, beta, c, cntx, &rntm_l, NULL ); } @@ -198,7 +198,7 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -244,7 +244,7 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -277,7 +277,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -285,8 +285,8 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -314,7 +314,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) bli_hemm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_hemm_front( side, alpha, a, b, beta, c, cntx, &rntm_l, NULL ); } @@ -327,7 +327,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -335,8 +335,8 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -364,7 +364,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) bli_symm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_symm_front( side, alpha, a, b, beta, c, cntx, &rntm_l, NULL ); } @@ -377,7 +377,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -385,8 +385,8 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -414,7 +414,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) bli_trmm3_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_trmm3_front( side, alpha, a, b, beta, c, cntx, &rntm_l, NULL ); } @@ -425,7 +425,7 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -459,7 +459,7 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -484,7 +484,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -492,8 +492,8 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( b ); @@ -520,7 +520,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) bli_trmm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. 
- bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); + bli_trmm_front( side, alpha, a, b, cntx, &rntm_l, NULL ); } @@ -531,7 +531,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -539,8 +539,8 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( b ); @@ -567,5 +567,5 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) bli_trsm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); + bli_trsm_front( side, alpha, a, b, cntx, &rntm_l, NULL ); } diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 58091704b4..dd7624d929 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( gemm ) @@ -70,7 +70,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( hemm ) @@ -88,7 +88,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( herk ) @@ -105,7 +105,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ 
); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index 997ade58e0..67fa2c75de 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -54,7 +54,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( gemm ) @@ -77,7 +77,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( hemm ) @@ -97,7 +97,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( herk ) @@ -116,7 +116,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( trmm ) diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index ee529b115a..b295b5812a 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -49,8 +49,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index 6f18169b28..65776d49fb 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -40,8 +40,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -60,7 +59,6 @@ void bli_l3_packa &a_local, &a_pack, cntx, - rntm, cntl, thread ); @@ -74,7 +72,6 @@ void bli_l3_packa &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); @@ -88,8 +85,7 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -112,7 +108,6 @@ void bli_l3_packb &bt_local, &bt_pack, cntx, 
- rntm, cntl, thread ); @@ -129,7 +124,6 @@ void bli_l3_packb &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h index f03b7f62ce..e58a08e4b4 100644 --- a/frame/3/bli_l3_packab.h +++ b/frame/3/bli_l3_packab.h @@ -38,8 +38,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); @@ -49,8 +48,7 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index eedbd9ec51..fa064d74f6 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -42,7 +42,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -127,7 +127,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", beta, c, cntx, - rntm + &rntm_l ); } @@ -140,7 +140,7 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. 
@@ -196,7 +196,7 @@ err_t bli_gemmtsup beta, c, cntx, - rntm + &rntm_l ); } diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h index 33b3f8ca74..77ff02d912 100644 --- a/frame/3/bli_l3_sup.h +++ b/frame/3/bli_l3_sup.h @@ -40,7 +40,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); err_t bli_gemmtsup @@ -51,6 +51,6 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 3ff13bdb59..d944c3ad50 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -42,7 +42,7 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -94,12 +94,12 @@ err_t bli_gemmsup_int const dim_t n = bli_obj_width( c ); const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - const bool auto_factor = bli_rntm_auto_factor( rntm ); - const dim_t n_threads = bli_rntm_num_threads( rntm ); + const bool auto_factor = bli_rntm_auto_factor( rntm ); + const dim_t n_threads = bli_rntm_num_threads( rntm ); bool use_bp = TRUE; dim_t jc_new; dim_t ic_new; - + rntm_t rntm_l = *rntm; if ( is_primary ) { @@ -137,32 +137,31 @@ err_t bli_gemmsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } - if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); } else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); // *requires nudging of nc up to be a multiple of mr. } } @@ -202,32 +201,32 @@ err_t bli_gemmsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); } else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); // *requires nudging of mc up to be a multiple of nr. } } @@ -246,7 +245,7 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -273,7 +272,7 @@ err_t bli_gemmtsup_int bool use_bp = TRUE; dim_t jc_new; dim_t ic_new; - + rntm_t rntm_l = *rntm; if ( is_primary ) { @@ -311,35 +310,35 @@ err_t bli_gemmtsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() #if 0 bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); #endif } else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() #if 0 bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); #endif // *requires nudging of nc up to be a multiple of mr. } @@ -380,35 +379,35 @@ err_t bli_gemmtsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans #if 0 bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); #endif } else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans #if 0 bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); + stor_id, cntx, &rntm_l, bli_thrinfo_sub_node( thread ) ); #endif // *requires nudging of mc up to be a multiple of nr. 
} diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index 195e3ca405..e76f21360c 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -40,7 +40,7 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); @@ -52,6 +52,6 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index ba60035b78..c36197201e 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -53,7 +53,7 @@ typedef err_t (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c index 5ed7700dcb..e08657757b 100644 --- a/frame/3/bli_l3_sup_packm.c +++ b/frame/3/bli_l3_sup_packm.c @@ -43,8 +43,6 @@ void bli_packm_sup_init_mem dim_t m, dim_t k, dim_t mr, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -54,6 +52,9 @@ void bli_packm_sup_init_mem } else // if ( will_pack == TRUE ) { + mem_t* mem = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); + // NOTE: This "rounding up" of the last upanel is actually optional // for the rrc/crc cases, but absolutely necessary for the other cases // since we NEED that last micropanel to have the same ldim (cs_p) as @@ -64,7 +65,7 @@ void bli_packm_sup_init_mem // Barrier to make sure all threads are caught up and ready to begin // the packm stage. - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); // Compute the size of the memory block eneded. siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; @@ -73,7 +74,7 @@ void bli_packm_sup_init_mem // then we need to acquire a block from the pba. 
if ( bli_mem_is_unalloc( mem ) ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { // Acquire directly to the chief thread's mem_t that was // passed in. It needs to be that mem_t struct, and not a @@ -85,7 +86,7 @@ void bli_packm_sup_init_mem // then again, I prefer to keep barriers to a minimum.) bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, mem @@ -94,13 +95,13 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); + mem_t* mem_p = bli_thread_broadcast( thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The // chief thread already has the mem_t, so it does not need to // perform any copy.) - if ( !bli_thread_am_ochief( thread ) ) + if ( !bli_thread_am_chief( thread ) ) { *mem = *mem_p; } @@ -119,7 +120,7 @@ void bli_packm_sup_init_mem if ( mem_size < size_needed ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { // The chief thread releases the existing block associated // with the mem_t, and then re-acquires a new block, saving @@ -129,12 +130,12 @@ void bli_packm_sup_init_mem // (temporary) mem_t. bli_pba_release ( - rntm, + pba, mem ); bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, mem @@ -143,13 +144,13 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); + mem_t* mem_p = bli_thread_broadcast( thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The // chief thread already has the mem_t, so it does not need to // perform any copy.) 
- if ( !bli_thread_am_ochief( thread ) ) + if ( !bli_thread_am_chief( thread ) ) { *mem = *mem_p; } @@ -166,8 +167,6 @@ void bli_packm_sup_init_mem void bli_packm_sup_finalize_mem ( bool did_pack, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -178,8 +177,11 @@ void bli_packm_sup_finalize_mem } else // if ( did_pack == TRUE ) { + mem_t* mem = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); + if ( thread != NULL ) - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { // Check the mem_t entry provided by the caller. Only proceed if it // is allocated, which it should be. @@ -187,7 +189,7 @@ void bli_packm_sup_finalize_mem { bli_pba_release ( - rntm, + pba, mem ); } @@ -197,18 +199,18 @@ void bli_packm_sup_finalize_mem void bli_packm_sup_init ( - bool will_pack, - stor3_t stor_id, - pack_t* schema, - dim_t m, - dim_t k, - dim_t mr, - dim_t* m_max, - dim_t* k_max, - const void* x, inc_t rs_x, inc_t cs_x, - void** p, inc_t* rs_p, inc_t* cs_p, - dim_t* pd_p, inc_t* ps_p, - mem_t* mem + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + thrinfo_t* thread ) { // Inspect whether we are going to be packing matrix A. @@ -277,7 +279,7 @@ void bli_packm_sup_init // Set the buffer address provided by the caller to point to the // memory associated with the mem_t entry acquired from the pba. 
- *p = bli_mem_buffer( mem ); + *p = bli_mem_buffer( bli_thread_mem( thread ) ); } } @@ -334,8 +336,6 @@ void bli_packm_sup void** p, inc_t* rs_p, inc_t* cs_p, inc_t* ps_p, const cntx_t* cntx, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -351,8 +351,6 @@ void bli_packm_sup will_pack, pack_buf_type, dt, m_alloc, k_alloc, mr, - rntm, - mem, thread ); @@ -369,7 +367,7 @@ void bli_packm_sup a, rs_a, cs_a, p, rs_p, cs_p, &pd_p, ps_p, - mem + thread ); // Inspect whether we are going to be packing matrix A. @@ -422,7 +420,7 @@ void bli_packm_sup } // Barrier so that packing is done before computation. - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); } } diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h index a84d4e45c3..032ba0afe9 100644 --- a/frame/3/bli_l3_sup_packm.h +++ b/frame/3/bli_l3_sup_packm.h @@ -42,16 +42,12 @@ void bli_packm_sup_init_mem dim_t m, dim_t k, dim_t mr, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); void bli_packm_sup_finalize_mem ( bool did_pack, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); @@ -68,7 +64,7 @@ void bli_packm_sup_init const void* x, inc_t rs_x, inc_t cs_x, void** p, inc_t* rs_p, inc_t* cs_p, dim_t* pd_p, inc_t* ps_p, - mem_t* mem + thrinfo_t* thread ); void bli_packm_sup @@ -88,8 +84,6 @@ void bli_packm_sup void** p, inc_t* rs_p, inc_t* cs_p, inc_t* ps_p, const cntx_t* cntx, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 71357cec4b..3572510022 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -244,7 +244,7 @@ if ( col_stored ) { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -255,7 +255,7 @@ 
bli_thread_barrier( rntm, thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thread_barrier( thread ); \ } \ else { \ if ( bli_thread_work_id( thread ) == 0 ) \ @@ -268,7 +268,7 @@ else { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -279,7 +279,7 @@ bli_thread_barrier( rntm, thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thread_barrier( thread ); \ } \ */ /* diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c index 76314aba76..7756d539d6 100644 --- a/frame/3/bli_l3_sup_ref.c +++ b/frame/3/bli_l3_sup_ref.c @@ -42,7 +42,7 @@ err_t bli_gemmsup_ref const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // This function implements the default gemmsup handler. If you are a @@ -54,25 +54,6 @@ err_t bli_gemmsup_ref if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); -#if 0 - // NOTE: This special case handling is done within the variants. - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // If A or B has a zero dimension, scale C by beta and return early. - if ( bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - bli_scalm( beta, c ); - return BLIS_SUCCESS; - } -#endif - const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); // Don't use the small/unpacked implementation if one of the matrices @@ -89,20 +70,21 @@ err_t bli_gemmsup_ref // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. 
+ rntm_t rntm_l = *rntm; bli_rntm_factorize_sup ( bli_obj_length( c ), bli_obj_width( c ), bli_obj_width( a ), - rntm + &rntm_l ); #if 0 - printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) ); - printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) ); + printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( &rntm_l ) ); + printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( &rntm_l ) ); - //bli_rntm_set_pack_a( 0, rntm ); - //bli_rntm_set_pack_b( 0, rntm ); + //bli_rntm_set_pack_a( 0, &rntm_l ); + //bli_rntm_set_pack_b( 0, &rntm_l ); #endif return @@ -116,7 +98,7 @@ err_t bli_gemmsup_ref beta, c, cntx, - rntm + &rntm_l ); } @@ -130,7 +112,7 @@ err_t bli_gemmtsup_ref const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // This function implements the default gemmtsup handler. If you are a @@ -163,12 +145,13 @@ err_t bli_gemmtsup_ref // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. 
+ rntm_t rntm_l = *rntm; bli_rntm_factorize_sup ( bli_obj_length( c ), bli_obj_width( c ), bli_obj_width( a ), - rntm + &rntm_l ); return @@ -182,7 +165,7 @@ err_t bli_gemmtsup_ref beta, c, cntx, - rntm + &rntm_l ); } diff --git a/frame/3/bli_l3_sup_ref.h b/frame/3/bli_l3_sup_ref.h index 4d4811db34..fbd7054df5 100644 --- a/frame/3/bli_l3_sup_ref.h +++ b/frame/3/bli_l3_sup_ref.h @@ -40,7 +40,7 @@ err_t bli_gemmsup_ref const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); err_t bli_gemmtsup_ref @@ -51,6 +51,6 @@ err_t bli_gemmtsup_ref const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index e4858621aa..c58ede3952 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -48,7 +48,7 @@ void bli_gemmsup_ref_var1n const obj_t* c, stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -230,55 +230,15 @@ void bli_gemmsup_ref_var1n auxinfo_t aux; - mem_t mem_a = BLIS_MEM_INITIALIZER; - mem_t mem_b = BLIS_MEM_INITIALIZER; - - // Define an array of bszid_t ids, which will act as our substitute for - // the cntl_t tree. - // NOTE: These bszid_t values, and their order, match that of the bp - // algorithm (variant 2) because they are not used to query actual - // blocksizes but rather query the ways of parallelism for the various - // loops. For example, the 2nd loop in variant 1 partitions in the m - // dimension (in increments of MR), but parallelizes that m dimension - // with BLIS_JR_NT. - // Note that this panel-block algorithm partitions an NC x KC submatrix - // of A to be packed in the 4th loop, and a KC x MC submatrix of B to be - // packed in the 3rd loop. 
- // 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop - bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; - // Determine whether we are using more than one thread. const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); - thrinfo_t* thread_jc = NULL; - thrinfo_t* thread_pc = NULL; - thrinfo_t* thread_pa = NULL; - thrinfo_t* thread_ic = NULL; - thrinfo_t* thread_pb = NULL; - thrinfo_t* thread_jr = NULL; - - // Pre-grow the thrinfo_t tree. - bszid_t* bszids_jc = bszids; - thread_jc = thread; - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); - - bszid_t* bszids_pc = &bszids_jc[1]; - thread_pc = bli_thrinfo_sub_node( thread_jc ); - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); - - bszid_t* bszids_pa = &bszids_pc[1]; - thread_pa = bli_thrinfo_sub_node( thread_pc ); - - bszid_t* bszids_ic = &bszids_pa[1]; - thread_ic = bli_thrinfo_sub_node( thread_pa ); - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); - - bszid_t* bszids_pb = &bszids_ic[1]; - thread_pb = bli_thrinfo_sub_node( thread_ic ); - - bszid_t* bszids_jr = &bszids_pb[1]; - thread_jr = bli_thrinfo_sub_node( thread_pb ); - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + thrinfo_t* thread_jc = thread; + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pa ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pb ); // Compute the JC loop thread range for the current thread. 
dim_t jc_start, jc_end; @@ -344,8 +304,6 @@ void bli_gemmsup_ref_var1n ( void** )&a_use, &rs_a_use, &cs_a_use, &ps_a_use, cntx, - rntm, - &mem_a, thread_pa ); @@ -402,8 +360,6 @@ void bli_gemmsup_ref_var1n ( void** )&b_use, &cs_b_use, &rs_b_use, &ps_b_use, cntx, - rntm, - &mem_b, thread_pb ); @@ -472,7 +428,7 @@ void bli_gemmsup_ref_var1n // NOTE: This barrier is only needed if we are packing A (since // that matrix is packed within the pc loop of this variant). - if ( packa ) bli_thread_barrier( rntm, thread_pa ); + if ( packa ) bli_thread_barrier( thread_pa ); } } @@ -480,15 +436,11 @@ void bli_gemmsup_ref_var1n bli_packm_sup_finalize_mem ( packa, - rntm, - &mem_a, thread_pa ); bli_packm_sup_finalize_mem ( packb, - rntm, - &mem_b, thread_pb ); @@ -514,7 +466,7 @@ void bli_gemmsup_ref_var2m const obj_t* c, stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -680,46 +632,15 @@ void bli_gemmsup_ref_var2m auxinfo_t aux; - mem_t mem_a = BLIS_MEM_INITIALIZER; - mem_t mem_b = BLIS_MEM_INITIALIZER; - - // Define an array of bszid_t ids, which will act as our substitute for - // the cntl_t tree. - // 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop - bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; - // Determine whether we are using more than one thread. const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); - thrinfo_t* thread_jc = NULL; - thrinfo_t* thread_pc = NULL; - thrinfo_t* thread_pb = NULL; - thrinfo_t* thread_ic = NULL; - thrinfo_t* thread_pa = NULL; - thrinfo_t* thread_jr = NULL; - - // Pre-grow the thrinfo_t tree. 
- bszid_t* bszids_jc = bszids; - thread_jc = thread; - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); - - bszid_t* bszids_pc = &bszids_jc[1]; - thread_pc = bli_thrinfo_sub_node( thread_jc ); - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); - - bszid_t* bszids_pb = &bszids_pc[1]; - thread_pb = bli_thrinfo_sub_node( thread_pc ); - - bszid_t* bszids_ic = &bszids_pb[1]; - thread_ic = bli_thrinfo_sub_node( thread_pb ); - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); - - bszid_t* bszids_pa = &bszids_ic[1]; - thread_pa = bli_thrinfo_sub_node( thread_ic ); - - bszid_t* bszids_jr = &bszids_pa[1]; - thread_jr = bli_thrinfo_sub_node( thread_pa ); - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + thrinfo_t* thread_jc = thread; + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pa ); // Compute the JC loop thread range for the current thread. dim_t jc_start, jc_end; @@ -783,8 +704,6 @@ void bli_gemmsup_ref_var2m ( void** )&b_use, &cs_b_use, &rs_b_use, &ps_b_use, cntx, - rntm, - &mem_b, thread_pb ); @@ -839,8 +758,6 @@ void bli_gemmsup_ref_var2m ( void** )&a_use, &rs_a_use, &cs_a_use, &ps_a_use, cntx, - rntm, - &mem_a, thread_pa ); @@ -909,7 +826,7 @@ void bli_gemmsup_ref_var2m // NOTE: This barrier is only needed if we are packing B (since // that matrix is packed within the pc loop of this variant). 
- if ( packb ) bli_thread_barrier( rntm, thread_pb ); + if ( packb ) bli_thread_barrier( thread_pb ); } } @@ -917,15 +834,11 @@ void bli_gemmsup_ref_var2m bli_packm_sup_finalize_mem ( packa, - rntm, - &mem_a, thread_pa ); bli_packm_sup_finalize_mem ( packb, - rntm, - &mem_b, thread_pb ); diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index be6b17f390..8bbb73ca94 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -50,7 +50,7 @@ void PASTEMAC0(opname) \ const obj_t* c, \ stor3_t eff_id, \ const cntx_t* cntx, \ - rntm_t* rntm, \ + const rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index c934ba9493..130237ee4d 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -55,7 +55,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -115,7 +115,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -178,7 +178,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -236,7 +236,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -298,7 +298,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -355,7 +355,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* 
rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -418,7 +418,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -481,7 +481,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -545,7 +545,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index eb142af05d..d8610dee82 100644 --- a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -54,7 +54,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -76,7 +76,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -97,7 +97,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -139,7 +139,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + 
const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -161,7 +161,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -186,7 +186,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -207,7 +207,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index f866cfd4c5..97f70bbf2b 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -34,139 +34,137 @@ */ #include "blis.h" -#include "assert.h" -void bli_l3_thrinfo_init_single +thrinfo_t* bli_l3_thrinfo_create ( - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ) { - bli_thrinfo_init_single( thread ); -} + pool_t* pool = NULL; + if ( array != NULL ) + pool = bli_apool_array_elem( id, array ); -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); -} + // Create the root thrinfo_t node. 
+ thrinfo_t* root = bli_thrinfo_create_root + ( + gl_comm, + id, + pool, + bli_pba_query() + ); -void bli_l3_sup_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); -} + thrinfo_t* thread = bli_l3_thrinfo_grow( root, rntm, cntl ); + bli_thrinfo_set_sub_node( thread, root ); -// ----------------------------------------------------------------------------- + return root; +} -void bli_l3_thrinfo_create_root +thrinfo_t* bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; - - // Use the blocksize id of the current (root) control tree node to - // query the top-most ways of parallelism to obtain. - bszid_t bszid = bli_cntl_bszid( cntl ); - dim_t xx_way = bli_rntm_ways_for( bszid, rntm ); - - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - - // Create the root thrinfo_t node. - *thread = bli_thrinfo_create - ( - rntm, - gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL - ); + const cntl_t* sub_prenode = bli_cntl_sub_prenode( cntl ); + const cntl_t* sub_node = bli_cntl_sub_node( cntl ); + const bszid_t bszid = bli_cntl_bszid( cntl ); + const dim_t n_way = bli_rntm_ways_for( bszid, rntm ); + + thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par ); + + if ( sub_prenode != NULL ) + { + // A pre-node is only used in the IC loop of trsm. In this case, + // we cannot actually thread in the m dimension due to data dependencies + // and so all parallelism must be moved down to the JR loop. 
+ rntm_t rntm_l = *rntm; + const dim_t ic_nway = bli_rntm_ic_ways( &rntm_l ); + const dim_t jr_nway = bli_rntm_jr_ways( &rntm_l ); + bli_rntm_set_ic_ways_only( 1, &rntm_l ); + bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l ); + + // Use thread_par instead of thread_cur since we *don't* want to + // do any parallelism at this level. + thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_par, &rntm_l, sub_prenode ); + bli_thrinfo_set_sub_prenode( thread_chl, thread_cur ); + } + + if ( sub_node != NULL ) + { + thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_cur, rntm, sub_node ); + bli_thrinfo_set_sub_node( thread_chl, thread_cur ); + } + + return thread_cur; } // ----------------------------------------------------------------------------- -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + pool_t* pool, + const rntm_t* rntm ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; - - // Use the BLIS_NC blocksize id to query the top-most ways of parallelism - // to obtain. Note that hard-coding BLIS_NC like this is a little bit of a - // hack, but it works fine since both of the sup algorithms (bp and pb) use - // the cache blocksizes down to the 3rd loop. (See the definitions of - // bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb() for - // a concise enumeration of these bszid_t ids.) - const bszid_t bszid = BLIS_NC; - dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - // Create the root thrinfo_t node. 
- *thread = bli_thrinfo_create + thrinfo_t* root = bli_thrinfo_create_root ( - rntm, gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL + id, + pool, + bli_pba_query() ); -} -// ----------------------------------------------------------------------------- + const dim_t n_way_jc = bli_rntm_ways_for( BLIS_NC, rntm ); + const dim_t n_way_pc = bli_rntm_ways_for( BLIS_KC, rntm ); + const dim_t n_way_ic = bli_rntm_ways_for( BLIS_MC, rntm ); + const dim_t n_way_jr = bli_rntm_ways_for( BLIS_NR, rntm ); + const dim_t n_way_ir = bli_rntm_ways_for( BLIS_MR, rntm ); + + thrinfo_t* thread_jc = bli_thrinfo_split( n_way_jc, root ); + thrinfo_t* thread_pc = bli_thrinfo_split( n_way_pc, thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_split( 1, thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_split( n_way_ic, thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_split( 1, thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa ); + thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr ); + + bli_thrinfo_set_sub_node( thread_jc, root ); + bli_thrinfo_set_sub_node( thread_pc, thread_jc ); + bli_thrinfo_set_sub_node( thread_pb, thread_pc ); + bli_thrinfo_set_sub_node( thread_ic, thread_pb ); + bli_thrinfo_set_sub_node( thread_pa, thread_ic ); + bli_thrinfo_set_sub_node( thread_jr, thread_pa ); + bli_thrinfo_set_sub_node( thread_ir, thread_jr ); + + return root; +} -void bli_l3_sup_thrinfo_update_root +void bli_l3_sup_thrinfo_update ( - rntm_t* rntm, - thrinfo_t* thread + const rntm_t* rntm, + thrinfo_t** root ) { - // Query the current root for the total number of threads to use. - const dim_t n_threads = bli_thread_num_threads( thread ); - - // Query the current root for the (global) comm id. - const dim_t gl_comm_id = bli_thread_ocomm_id( thread ); - - // Query the rntm_t for the updated number of ways of parallelism. 
- const dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Recompute the work id for this thrinfo_t node using the updated - // number of ways of parallelism. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - - // Save the updated ways of parallelism and work id to the thrinfo_t node. - bli_thrinfo_set_n_way( xx_way, thread ); - bli_thrinfo_set_work_id( work_id, thread ); + thrcomm_t* gl_comm = bli_thrinfo_comm( *root ); + dim_t tid = bli_thread_thread_id( *root ); + pool_t* pool = bli_thread_sba_pool( *root ); + dim_t nt = bli_thread_num_threads( *root ); + + // Return early in single-threaded execution + // since the thread control tree may not have been + // allocated normally + if ( nt == 1 ) return; + + bli_thrinfo_free( *root ); + *root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm ); } // ----------------------------------------------------------------------------- @@ -283,43 +281,43 @@ void bli_l3_thrinfo_print_gemm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = 
bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: @@ -493,25 +491,25 @@ void bli_l3_thrinfo_print_trsm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); @@ -520,38 +518,38 @@ void bli_l3_thrinfo_print_trsm_paths if ( !pa_info0 ) goto check_thrinfo_node; - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + pa_comm_id0 = bli_thread_thread_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_thrinfo_node; - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_comm_id0 = bli_thread_thread_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) 
goto check_thrinfo_node; - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_comm_id0 = bli_thread_thread_id( ir_info0 ); ir_work_id0 = bli_thread_work_id( ir_info0 ); check_thrinfo_node: if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: @@ -584,7 +582,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); @@ -595,7 +593,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); @@ -606,7 +604,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); @@ -617,7 +615,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); @@ -630,7 +628,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + pa_comm_id0 = 
bli_thread_thread_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); @@ -641,7 +639,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_comm_id0 = bli_thread_thread_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); @@ -652,7 +650,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_comm_id0 = bli_thread_thread_id( ir_info0 ); ir_work_id0 = bli_thread_work_id( ir_info0 ); } } @@ -666,7 +664,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); @@ -677,7 +675,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); @@ -688,7 +686,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); } } @@ -724,7 +722,6 @@ void bli_l3_thrinfo_print_trsm_paths void bli_l3_thrinfo_free_paths ( - rntm_t* rntm, thrinfo_t** threads ) { @@ -732,7 +729,7 @@ void bli_l3_thrinfo_free_paths dim_t i; for ( i = 0; i < n_threads; ++i ) - bli_l3_thrinfo_free( rntm, threads[i] ); + bli_thrinfo_free( threads[i] ); bli_free_intl( threads ); } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 37a3909fd6..35e26ec01b 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -68,60 +68,37 @@ \ ( index % thread->n_way == thread->work_id % thread->n_way ) -// -// thrinfo_t APIs specific to level-3 operations. 
-// - -void bli_l3_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); - -void bli_l3_thrinfo_init_single - ( - thrinfo_t* thread - ); - -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); +// ----------------------------------------------------------------------------- -void bli_l3_sup_thrinfo_free +BLIS_EXPORT_BLIS +thrinfo_t* bli_l3_thrinfo_create ( - rntm_t* rntm, - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ); -// ----------------------------------------------------------------------------- - -void bli_l3_thrinfo_create_root +thrinfo_t* bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ); -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + pool_t* pool, + const rntm_t* rntm ); -void bli_l3_sup_thrinfo_update_root +void bli_l3_sup_thrinfo_update ( - rntm_t* rntm, - thrinfo_t* thread + const rntm_t* rntm, + thrinfo_t** root ); void bli_l3_thrinfo_print_gemm_paths @@ -138,7 +115,6 @@ void bli_l3_thrinfo_print_trsm_paths void bli_l3_thrinfo_free_paths ( - rntm_t* rntm, thrinfo_t** threads ); diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index ddd88e1633..1018f119d3 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -32,11 +32,12 @@ */ +// bli_gemm_var.h must be included before bli_gemm_cntl.h +#include "bli_gemm_var.h" + #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" -#include "bli_gemm_var.h" - #include "bli_gemm_ind_opt.h" // Mixed datatype support. 
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 485779a902..5a713738c6 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -41,8 +41,7 @@ void bli_gemm_blk_var1 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -88,7 +87,6 @@ void bli_gemm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 254a310648..dbaf04a67c 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -41,8 +41,7 @@ void bli_gemm_blk_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -88,7 +87,6 @@ void bli_gemm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index cb20b7f367..1083b4da04 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -40,8 +40,7 @@ void bli_gemm_blk_var3 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -83,12 +82,11 @@ void bli_gemm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); - bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 052c812a33..9f658fa4dc 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -37,21 +37,21 @@ cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ) { - return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker ); + return bli_gemmbp_cntl_create( pool, family, schema_a, schema_b, ker ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, @@ -73,7 +73,7 @@ cntl_t* bli_gemmbp_cntl_create // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( - rntm, // the thread's runtime structure + pool, // the pool handed to bli_cntl_create_node() family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -82,7 +82,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( - rntm, // the thread's runtime structure + pool, // the pool handed to bli_cntl_create_node() family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_fp, @@ -92,7 +92,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, // pack the left-hand operand BLIS_MR, BLIS_KR, @@ -107,7 +107,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_gemm_blk_var1, @@ -117,7 +117,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packb, // pack the right-hand operand BLIS_NR, BLIS_KR, @@ -132,7 +132,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_gemm_blk_var3, @@ -142,7 +142,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the n dimension by NC. cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_gemm_blk_var2, @@ -253,25 +253,24 @@ cntl_t* bli_gemmpb_cntl_create void bli_gemm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ) { - bli_cntl_free( rntm, cntl, thread ); + bli_cntl_free( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5fa213ac41..a971f05d3d 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -35,7 +35,7 @@ cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, @@ -46,7 +46,7 @@ cntl_t* bli_gemm_cntl_create cntl_t* bli_gemmbp_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, @@ -64,16 +64,15 @@ cntl_t* bli_gemmpb_cntl_create void bli_gemm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp 
var_func, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 5f992bd679..8c10066aa9 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -44,7 +44,7 @@ void bli_gemm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 744f88d1b2..f237fb9492 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -41,7 +41,7 @@ void bli_gemm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX @@ -53,7 +53,7 @@ err_t bli_gemm_small const obj_t* beta, const obj_t* c, const cntx_t* cntx, - cntl_t* cntl + const cntl_t* cntl ); #endif diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 199e72cb65..4308d62eff 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -81,8 +81,7 @@ void bli_gemm_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h index d71d97987a..2ff126c342 100644 --- a/frame/3/gemm/bli_gemm_md.h +++ b/frame/3/gemm/bli_gemm_md.h @@ -69,8 +69,8 @@ void bli_gemm_md_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); void bli_gemm_md_zgemm @@ -81,8 +81,8 @@ void bli_gemm_md_zgemm const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index d3109e6003..ef7139efac 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -57,8 +57,7 @@ 
void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 49b32c9762..e86b2408ae 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -44,7 +44,7 @@ void bli_gemmt_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h index 0f2a9ada2b..310cf7c8cb 100644 --- a/frame/3/gemmt/bli_gemmt_front.h +++ b/frame/3/gemmt/bli_gemmt_front.h @@ -42,5 +42,5 @@ void bli_gemmt_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index aed0359ecb..dd699406dc 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -53,7 +53,6 @@ typedef void (*FUNCPTR_T) void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -66,8 +65,7 @@ void bli_gemmt_l_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -127,7 +125,6 @@ void bli_gemmt_l_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -152,7 +149,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 87d77ee554..50f034118b 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -53,7 +53,6 @@ typedef void (*FUNCPTR_T) void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -66,8 +65,7 @@ void 
bli_gemmt_u_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -127,7 +125,6 @@ void bli_gemmt_u_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -152,7 +149,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index 98d8f55633..eb6e160180 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* ah, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -81,7 +80,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 76fe106b08..0ccefc2498 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -46,8 +46,7 @@ void bli_gemmt_x_ker_var2 const obj_t* ah, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -68,7 +67,6 @@ void bli_gemmt_x_ker_var2 ah, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c39703503d..22ed8a51f4 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -44,7 +44,7 @@ void bli_hemm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 63eb91cd3a..372ab91b28 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -42,5 +42,5 @@ void bli_hemm_front const obj_t* c, const cntx_t* cntx, 
rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index c9aada9893..ab32d3b386 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -44,7 +44,7 @@ void bli_symm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 417cb9acb2..98832346c2 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -42,5 +42,5 @@ void bli_symm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index edd4ce1efb..00f6d99279 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -43,7 +43,7 @@ void bli_trmm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index cfefdd39bc..f2d98106d3 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -40,5 +40,5 @@ void bli_trmm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index f5476b2cad..0497733a05 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trmm_ll_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -121,7 +119,6 @@ void bli_trmm_ll_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -144,7 +141,6 @@ void 
PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index df5b2dac55..45b31787ff 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trmm_lu_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -121,7 +119,6 @@ void bli_trmm_lu_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -144,7 +141,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 89f86aa3a8..682ee90207 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trmm_rl_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -121,7 +119,6 @@ void bli_trmm_rl_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -144,7 +141,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 4ed38e7610..dfaecb3f17 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* beta, 
void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trmm_ru_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -121,7 +119,6 @@ void bli_trmm_ru_ker_var2 ( void* )buf_beta, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -144,7 +141,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 2f0642ca8f..f8c3d7ee20 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -87,7 +86,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index d42bc88c2d..efbd67dc72 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -47,8 +47,7 @@ void bli_trmm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -82,7 +81,6 @@ void bli_trmm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 9681eb6406..022e3da354 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -44,7 +44,7 @@ void bli_trmm3_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index b5dde34cd0..53f4920f49 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ 
b/frame/3/trmm3/bli_trmm3_front.h @@ -42,5 +42,5 @@ void bli_trmm3_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 413b128187..40364c2194 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -43,8 +43,7 @@ void bli_trsm_blk_var1 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -105,7 +104,6 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1_1, cntx, - rntm, bli_cntl_sub_prenode( cntl ), bli_thrinfo_sub_prenode( thread ) ); @@ -118,7 +116,7 @@ void bli_trsm_blk_var1 // We must execute a barrier here because the upcoming rank-k update // requires the packed matrix B to be fully updated by the trsm // subproblem. - bli_thread_barrier( rntm, thread ); + bli_thread_barrier( thread ); // Isolate the remaining part of the column panel matrix A, which we do by // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending @@ -177,7 +175,6 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 88db57e519..b04d87f55e 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -41,8 +41,7 @@ void bli_trsm_blk_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -88,7 +87,6 @@ void bli_trsm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 229259a952..556eebb515 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -40,8 +40,7 @@ void bli_trsm_blk_var3 const obj_t* b, const obj_t* c, const cntx_t* cntx, - 
rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -83,13 +82,12 @@ void bli_trsm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); //bli_thread_ibarrier( thread ); - bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 0a3be87f74..2a6db6547a 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -37,7 +37,7 @@ cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, + pool_t* pool, side_t side, pack_t schema_a, pack_t schema_b, @@ -45,14 +45,14 @@ cntl_t* bli_trsm_cntl_create ) { if ( bli_is_left( side ) ) - return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_l_cntl_create( pool, schema_a, schema_b, ker ); else - return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_r_cntl_create( pool, schema_a, schema_b, ker ); } cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -73,7 +73,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, // the thread's runtime structure + pool, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -82,7 +82,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -92,7 +92,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, // trsm operation's packm function for A. 
BLIS_MR, BLIS_MR, @@ -110,7 +110,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, // the thread's runtime structure + pool, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -119,7 +119,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -129,7 +129,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, @@ -151,7 +151,7 @@ cntl_t* bli_trsm_l_cntl_create // NOTE: We attach the gemm sub-tree as the main branch. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_trsm_blk_var1, @@ -166,7 +166,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packb, BLIS_NR, BLIS_MR, @@ -181,7 +181,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_trsm_blk_var3, @@ -191,7 +191,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the n dimension by NC. cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_trsm_blk_var2, @@ -203,7 +203,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -220,7 +220,7 @@ cntl_t* bli_trsm_r_cntl_create // Create two nodes for the macro-kernel. 
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -229,7 +229,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -239,7 +239,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, BLIS_NR, BLIS_MR, @@ -254,7 +254,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_trsm_blk_var1, @@ -264,7 +264,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packb, BLIS_MR, BLIS_MR, @@ -279,7 +279,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_trsm_blk_var3, @@ -289,7 +289,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the n dimension by NC. 
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_trsm_blk_var2, @@ -301,25 +301,24 @@ cntl_t* bli_trsm_r_cntl_create void bli_trsm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ) { - bli_cntl_free( rntm, cntl, thread ); + bli_cntl_free( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 86f4a29b2a..a23120ff89 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -35,7 +35,7 @@ cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, + pool_t* pool, side_t side, pack_t schema_a, pack_t schema_b, @@ -44,7 +44,7 @@ cntl_t* bli_trsm_cntl_create cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -52,7 +52,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -60,16 +60,15 @@ cntl_t* bli_trsm_r_cntl_create void bli_trsm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index b94a129d99..9af9b1381b 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -43,7 +43,7 @@ void bli_trsm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + 
const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index b31e88b041..cd58a1d410 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -41,7 +41,7 @@ void bli_trsm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX @@ -52,7 +52,7 @@ err_t bli_trsm_small obj_t* a, obj_t* b, cntx_t* cntx, - cntl_t* cntl + const cntl_t* cntl ); #endif diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 075b403362..8fc51f0796 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trsm_ll_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -126,7 +124,6 @@ void bli_trsm_ll_ker_var2 ( void* )buf_alpha2, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -149,7 +146,6 @@ void PASTEMAC(ch,varname) \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 799fdd1013..5578fa1e86 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trsm_lu_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -126,7 +124,6 @@ void bli_trsm_lu_ker_var2 ( void* )buf_alpha2, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -149,7 +146,6 @@ void PASTEMAC(ch,varname) \ 
void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 721203df72..c9010378f5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trsm_rl_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -126,7 +124,6 @@ void bli_trsm_rl_ker_var2 ( void* )buf_alpha2, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -149,7 +146,6 @@ void PASTEMAC(ch,varname) \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 447fbf8cd5..ea9e351d30 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -51,7 +51,6 @@ typedef void (*FUNCPTR_T) void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, - rntm_t* rntm, thrinfo_t* thread ); @@ -64,8 +63,7 @@ void bli_trsm_ru_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -126,7 +124,6 @@ void bli_trsm_ru_ker_var2 ( void* )buf_alpha2, buf_c, rs_c, cs_c, ( cntx_t* )cntx, - rntm, thread ); } @@ -149,7 +146,6 @@ void PASTEMAC(ch,varname) \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 7e747b4a88..5ab9a43bfd 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ 
- rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -87,7 +86,6 @@ void PASTEMAC(ch,varname) \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index a0a59c0a85..f5d25bbbe4 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -47,8 +47,7 @@ void bli_trsm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -82,7 +81,6 @@ void bli_trsm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index b22ddbee0b..b4a5870c9b 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -37,7 +37,7 @@ cntl_t* bli_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, @@ -46,14 +46,13 @@ cntl_t* bli_cntl_create_node ) { cntl_t* cntl; - mem_t* pack_mem; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_create_node(): " ); #endif // Allocate the cntl_t struct. - cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) ); + cntl = bli_sba_acquire( pool, sizeof( cntl_t ) ); bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); @@ -62,19 +61,12 @@ cntl_t* bli_cntl_create_node bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( sub_node, cntl ); - // Query the address of the node's packed mem_t entry so we can initialize - // key fields (to NULL or 0). - // NOTE: This initialization is important, since it allows threads to - // discern whether blocks have been acquired from the memory allocator. 
- pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); - return cntl; } void bli_cntl_free_node ( - rntm_t* rntm, + pool_t* pool, cntl_t* cntl ) { @@ -82,7 +74,7 @@ void bli_cntl_free_node printf( "bli_cntl_free_node(): " ); #endif - bli_sba_release( rntm, cntl ); + bli_sba_release( pool, cntl ); } void bli_cntl_clear_node @@ -90,118 +82,19 @@ void bli_cntl_clear_node cntl_t* cntl ) { - mem_t* pack_mem; - // Clear various fields in the control tree. Clearing these fields // actually is not needed, but we do it for debugging/completeness. bli_cntl_set_var_func( NULL, cntl ); bli_cntl_set_params( NULL, cntl ); bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( NULL, cntl ); - - // Clearing these fields is potentially more important if the control - // tree is cached somewhere and reused. - pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); } // ----------------------------------------------------------------------------- void bli_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread ); - else bli_cntl_free_wo_thrinfo( rntm, cntl ); -} - -void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - // Base case: simply return when asked to free NULL nodes. - if ( cntl == NULL ) return; - - cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); - cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); - void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); - - // Don't immediately dereference the prenode and subnode of the thrinfo_t - // node. In some cases, the thrinfo_t tree is not built out all the way, - // perhaps because there are more ways of parallelization than micropanels - // of data in this dimension, or because the problem is small enough that - // there is no gemm subproblem in bli_trsm_blk_var1(). 
Thus, we start with - // NULL values for these variables and only dereference the fields of the - // thrinfo_t struct if the thrinfo_t exists (ie: is non-NULL). We will also - // have to check the thrinfo_t pointer for NULLness before using it below, - // when checking if we need to free the pack_mem field of the cntl_t node - // (see below). - thrinfo_t* thread_sub_prenode = NULL; - thrinfo_t* thread_sub_node = NULL; - - if ( thread != NULL ) - { - thread_sub_prenode = bli_thrinfo_sub_prenode( thread ); - thread_sub_node = bli_thrinfo_sub_node( thread ); - } - - // Only recurse into prenode branch if it exists. - if ( cntl_sub_prenode != NULL ) - { - // Recursively free all memory associated with the sub-prenode and its - // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_prenode, thread_sub_prenode ); - } - - // Only recurse into the child node if it exists. - if ( cntl_sub_node != NULL ) - { - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node ); - } - - // Free the current node's params field, if it is non-NULL. - if ( cntl_params != NULL ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_w_thrinfo(): " ); - #endif - - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated, and only if the current thread is chief for its group. - // Also note that we don't proceed with either of the above tests if - // the thrinfo_t pointer is NULL. (See above for background on when - // this can happen.) - if ( thread != NULL ) - if ( bli_thread_am_ochief( thread ) ) - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); - #endif - - bli_pba_release( rntm, cntl_pack_mem ); - } - - // Free the current node. 
- bli_cntl_free_node( rntm, cntl ); -} - -void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, + pool_t* pool, cntl_t* cntl ) { @@ -211,44 +104,35 @@ void bli_cntl_free_wo_thrinfo cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); { // Recursively free all memory associated with the sub-prenode and its // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_prenode ); + bli_cntl_free( pool, cntl_sub_prenode ); } { // Recursively free all memory associated with the sub-node and its // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node ); + bli_cntl_free( pool, cntl_sub_node ); } // Free the current node's params field, if it is non-NULL. if ( cntl_params != NULL ) { - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated. - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - bli_pba_release( rntm, cntl_pack_mem ); + bli_sba_release( pool, cntl_params ); } // Free the current node. - bli_cntl_free_node( rntm, cntl ); + bli_cntl_free_node( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + const cntl_t* cntl ) { // Make a copy of the current node. Notice that the source node @@ -257,7 +141,7 @@ cntl_t* bli_cntl_copy // field. cntl_t* cntl_copy = bli_cntl_create_node ( - rntm, + pool, bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), @@ -273,7 +157,7 @@ cntl_t* bli_cntl_copy // struct. 
uint64_t params_size = bli_cntl_params_size( cntl ); void* params_orig = bli_cntl_params( cntl ); - void* params_copy = bli_sba_acquire( rntm, ( size_t )params_size ); + void* params_copy = bli_sba_acquire( pool, ( size_t )params_size ); // Copy the original params struct to the new memory region. memcpy( params_copy, params_orig, params_size ); @@ -288,7 +172,7 @@ cntl_t* bli_cntl_copy { cntl_t* sub_prenode_copy = bli_cntl_copy ( - rntm, + pool, bli_cntl_sub_prenode( cntl ) ); @@ -302,7 +186,7 @@ cntl_t* bli_cntl_copy { cntl_t* sub_node_copy = bli_cntl_copy ( - rntm, + pool, bli_cntl_sub_node( cntl ) ); diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 406a350eec..7e6a1d2f83 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -50,9 +50,6 @@ struct cntl_s // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; }; typedef struct cntl_s cntl_t; */ @@ -62,7 +59,7 @@ typedef struct cntl_s cntl_t; BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, @@ -72,7 +69,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node BLIS_EXPORT_BLIS void bli_cntl_free_node ( - rntm_t* rntm, + pool_t* pool, cntl_t* cntl ); @@ -85,28 +82,14 @@ BLIS_EXPORT_BLIS void bli_cntl_clear_node BLIS_EXPORT_BLIS void bli_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + const cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family @@ -163,11 +146,6 @@ BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl ) return *( ( uint64_t* )(cntl->params) ); } 
-BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) -{ - return &(cntl->pack_mem); -} - // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl ) @@ -220,8 +198,3 @@ BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) cntl->params = params; } -BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) -{ - cntl->pack_mem = *pack_mem; -} - diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index fab6af89eb..68b759168d 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -67,8 +67,8 @@ gint_t bli_env_get_var( const char* env, gint_t fallback ) { - gint_t r_val; - const char* str; + gint_t r_val; + const char* str; // Query the environment variable and store the result in str. str = getenv( env ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index cabaf4ff6a..6b3456dfbe 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -92,7 +92,7 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -115,10 +115,6 @@ void bli_pba_acquire_m #endif #endif - // Query the memory broker from the runtime. - pba_t* pba = bli_rntm_pba( rntm ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { malloc_ft malloc_fp = bli_pba_malloc_fp( pba ); @@ -197,15 +193,16 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ) { - // Query the memory broker from the runtime. - pba_t* pba = bli_rntm_pba( rntm ); + packbuf_t buf_type; + pool_t* pool; + pblk_t* pblk; // Extract the buffer type so we know what kind of memory was allocated. - packbuf_t buf_type = bli_mem_buf_type( mem ); + buf_type = bli_mem_buf_type( mem ); #ifndef BLIS_ENABLE_PBA_POOLS #ifdef BLIS_ENABLE_MEM_TRACING @@ -227,10 +224,10 @@ void bli_pba_release { // Extract the address of the pool from which the memory was // allocated. 
- pool_t* pool = bli_mem_pool( mem ); + pool = bli_mem_pool( mem ); // Extract the address of the pblk_t struct within the mem_t struct. - pblk_t* pblk = bli_mem_pblk( mem ); + pblk = bli_mem_pblk( mem ); // Acquire the mutex associated with the pba object. bli_pba_lock( pba ); @@ -340,8 +337,8 @@ void bli_pba_init_pools // For blocks of A and panels of B, start off with block_ptrs arrays that // are of a decent length. For C, we can start off with an empty array. - const dim_t block_ptrs_len_a = 80; - const dim_t block_ptrs_len_b = 80; + const dim_t block_ptrs_len_a = 1;//80; + const dim_t block_ptrs_len_b = 1;//80; const dim_t block_ptrs_len_c = 0; // Use the address alignment sizes designated (at configure-time) for pools. @@ -410,10 +407,12 @@ void bli_pba_compute_pool_block_sizes siz_t bs_cand_b = 0; siz_t bs_cand_c = 0; + num_t dt; + // Compute pool block sizes for each datatype and find the maximum // size for each pool. This is done so that new pools do not need // to be allocated if the user switches datatypes. - for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { siz_t bs_dt_a; siz_t bs_dt_b; @@ -450,36 +449,64 @@ void bli_pba_compute_pool_block_sizes_dt const cntx_t* cntx ) { + siz_t size_dt = bli_dt_size( dt ); + + const blksz_t* mr; + const blksz_t* nr; + + const blksz_t* mc; + const blksz_t* kc; + const blksz_t* nc; + + dim_t mr_dt; + dim_t nr_dt; + dim_t max_mnr_dt; + + dim_t mc_max_dt; + dim_t kc_max_dt; + dim_t nc_max_dt; + + dim_t packmr_dt; + dim_t packnr_dt; + dim_t max_packmnr_dt; + + dim_t scale_num_dt; + dim_t scale_den_dt; + + dim_t pool_mc_dt, left_mc_dt; + dim_t pool_nc_dt, left_nc_dt; + dim_t pool_kc_dt; + // // Find the larger of the two register blocksizes. // // Query the mr and nr blksz_t objects for the given method of // execution. 
- const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, cntx ); - const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, cntx ); + mr = bli_cntx_get_blksz( BLIS_MR, cntx ); + nr = bli_cntx_get_blksz( BLIS_NR, cntx ); // Extract the mr and nr values specific to the current datatype. - dim_t mr_dt = bli_blksz_get_def( dt, mr ); - dim_t nr_dt = bli_blksz_get_def( dt, nr ); + mr_dt = bli_blksz_get_def( dt, mr ); + nr_dt = bli_blksz_get_def( dt, nr ); // Find the maximum of mr and nr. - dim_t max_mnr_dt = bli_max( mr_dt, nr_dt ); + max_mnr_dt = bli_max( mr_dt, nr_dt ); // // Define local maximum cache blocksizes. // // Query the mc, kc, and nc blksz_t objects for native execution. - const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, cntx ); - const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, cntx ); - const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, cntx ); + mc = bli_cntx_get_blksz( BLIS_MC, cntx ); + kc = bli_cntx_get_blksz( BLIS_KC, cntx ); + nc = bli_cntx_get_blksz( BLIS_NC, cntx ); // Extract the maximum mc, kc, and nc values specific to the current // datatype. - dim_t mc_max_dt = bli_blksz_get_max( dt, mc ); - dim_t kc_max_dt = bli_blksz_get_max( dt, kc ); - dim_t nc_max_dt = bli_blksz_get_max( dt, nc ); + mc_max_dt = bli_blksz_get_max( dt, mc ); + kc_max_dt = bli_blksz_get_max( dt, kc ); + nc_max_dt = bli_blksz_get_max( dt, nc ); // Add max(mr,nr) to kc to make room for the nudging of kc at // runtime to be a multiple of mr or nr for triangular operations @@ -511,11 +538,8 @@ void bli_pba_compute_pool_block_sizes_dt // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as // our scaling factors. Otherwise, we'll use packnr and nr. 
- dim_t packmr_dt = bli_blksz_get_max( dt, mr ); - dim_t packnr_dt = bli_blksz_get_max( dt, nr ); - - dim_t scale_num_dt; - dim_t scale_den_dt; + packmr_dt = bli_blksz_get_max( dt, mr ); + packnr_dt = bli_blksz_get_max( dt, nr ); if ( packmr_dt * nr_dt >= packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; @@ -527,13 +551,13 @@ void bli_pba_compute_pool_block_sizes_dt // Compute pool block dimensions. // - dim_t pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; - dim_t left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; + pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; + left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; - dim_t pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; - dim_t left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; + pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; + left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; - dim_t pool_kc_dt = ( kc_max_dt ); + pool_kc_dt = ( kc_max_dt ); if ( left_mc_dt > 0 ) pool_mc_dt += 1; if ( left_nc_dt > 0 ) pool_nc_dt += 1; @@ -542,12 +566,10 @@ void bli_pba_compute_pool_block_sizes_dt // Compute pool block sizes // - siz_t size_dt = bli_dt_size( dt ); - // We add an extra micro-panel of space to the block sizes for A and B // just to be sure any pre-loading performed by the micro-kernel does // not cause a segmentation fault. 
- dim_t max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); + max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index dfda530902..0adde19414 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -132,7 +132,7 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -140,20 +140,10 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ); -BLIS_INLINE void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - siz_t bli_pba_pool_size ( const pba_t* pba, diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 891f770aef..cbd9fd7dfc 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -334,10 +334,6 @@ void bli_pool_checkin_block // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); - // Check for double-free and other conditions which may prematurely - // exhaust the memory pool. - if ( top_index == 0 ) bli_abort(); - #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkin_block(): checking in block %d of size %d " "(align %d, offset %d).\n", @@ -347,6 +343,10 @@ void bli_pool_checkin_block fflush( stdout ); #endif + // Check for double-free and other conditions which may prematurely + // exhaust the memory pool. + if ( top_index == 0 ) bli_abort(); + // Copy the caller's pblk_t struct to the block at top_index - 1. 
block_ptrs[ top_index - 1 ] = *block; diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index f6756c5890..f20f8452e3 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -52,10 +52,6 @@ typedef struct rntm_s bool pack_a; bool pack_b; bool l3_sup; - - pool_t* sba_pool; - pba_t* pba; - } rntm_t; */ @@ -80,7 +76,7 @@ BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm ) { - return rntm->thrloop[ bszid ]; + return bszid == BLIS_NO_PART ? 1 : rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm ) @@ -122,20 +118,6 @@ BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm ) return rntm->l3_sup; } -// -// -- rntm_t query (internal use only) ----------------------------------------- -// - -BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm ) -{ - return rntm->sba_pool; -} - -BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) -{ - return rntm->pba; -} - // // -- rntm_t modification (internal use only) ---------------------------------- // @@ -196,16 +178,6 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, bli_rntm_set_pr_ways_only( 1, rntm ); } -BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) -{ - rntm->sba_pool = sba_pool; -} - -BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) -{ - rntm->pba = pba; -} - BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( 1, rntm ); @@ -276,15 +248,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) bli_rntm_set_l3_sup( TRUE, rntm ); } -BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) -{ - bli_rntm_set_sba_pool( NULL, rntm ); -} -BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) -{ - bli_rntm_set_pba( NULL, rntm ); -} - // // -- rntm_t initialization ---------------------------------------------------- // @@ -302,8 +265,6 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* 
rntm ) .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ - .sba_pool = NULL, \ - .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) @@ -317,9 +278,6 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); - - bli_rntm_clear_sba_pool( rntm ); - bli_rntm_clear_pba( rntm ); } // diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 776622bb4a..0eaedbed5a 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -57,7 +57,7 @@ void bli_sba_finalize( void ) void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ) { @@ -65,50 +65,40 @@ void* bli_sba_acquire err_t r_val; #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + pblk_t pblk; + + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. (There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) { - block = bli_malloc_intl( req_size, &r_val ); + block = bli_malloc_intl( req_size, &r_val ); } else { - pblk_t pblk; - - // Query the small block pool from the rntm. - pool_t* pool = bli_rntm_sba_pool( rntm ); - - // We don't expect NULL sba_pool pointers in the normal course of BLIS - // operation. However, there are rare instances where it is convenient - // to support use of bli_sba_acquire() without having to pass in a valid - // sba pool data structure. The case that inspired this branch was the - // gemm_ukr and related test modules in the BLIS testsuite. 
(There, it - // is convenient to not have to checkout an array_t from the sba, and it - // does no harm since the malloc() happens outside of the region that - // would be timed.) - if ( pool == NULL ) - { - block = bli_malloc_intl( req_size, &r_val ); - } - else + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) { - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) - { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); - } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); } #else @@ -122,44 +112,34 @@ void* bli_sba_acquire void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ) { #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + pblk_t pblk; + + if ( pool == NULL ) { - bli_free_intl( block ); + bli_free_intl( block ); } else { - pblk_t pblk; - - // Query the small block pool from the rntm. 
- pool_t* pool = bli_rntm_sba_pool( rntm ); - - if ( pool == NULL ) - { - bli_free_intl( block ); - } - else - { - // Query the block_size field from the pool. This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); - } + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) 
+ bli_pool_checkin_block( &pblk, pool ); } #else @@ -192,23 +172,4 @@ void bli_sba_checkin_array bli_apool_checkin_array( array, &sba ); } -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ) -{ - #ifndef BLIS_ENABLE_SBA_POOLS - bli_rntm_set_sba_pool( NULL, rntm ); - return; - #endif - - // Query the pool_t* in the array_t corresponding to index. - pool_t* pool = bli_apool_array_elem( index, array ); - - // Embed the pool_t* into the rntm_t. - bli_rntm_set_sba_pool( pool, rntm ); -} - diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h index 4fc3aaaeea..edda199856 100644 --- a/frame/base/bli_sba.h +++ b/frame/base/bli_sba.h @@ -52,21 +52,15 @@ void bli_sba_checkin_array array_t* array ); -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ); - void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ); + void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ); diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 42ad9c72ba..71a6096e10 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -44,7 +44,5 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index d802635973..2ef0cae182 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -55,5 +55,5 @@ // warnings by the compiler. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; + const rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 7252fd7fff..b150b89fca 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index 6a7e195abe..f7a85855cf 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -55,5 +55,5 @@ // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; + const rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index f12be24b89..e7665e779a 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d37e62f8a4..f78926d780 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1077,9 +1077,6 @@ struct cntl_s // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; }; typedef struct cntl_s cntl_t; @@ -1184,9 +1181,8 @@ typedef void (*obj_pack_fn_t) const struct obj_s* a, struct obj_s* ap, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) @@ -1195,9 +1191,8 @@ typedef void (*obj_ker_fn_t) const struct obj_s* b, const struct obj_s* c, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef struct obj_s @@ -1458,14 +1453,6 @@ typedef struct rntm_s bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. - // "Internal" fields: these should not be exposed to the end-user. - - // The small block pool, which is attached in the l3 thread decorator. - pool_t* sba_pool; - - // The packing block allocator, which is attached in the l3 thread decorator. 
- pba_t* pba; - } rntm_t; diff --git a/frame/thread/bli_l3_decor.c b/frame/thread/bli_l3_decor.c index 33fb834be4..226608a023 100644 --- a/frame/thread/bli_l3_decor.c +++ b/frame/thread/bli_l3_decor.c @@ -71,8 +71,8 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { rntm_t rntm_l; @@ -146,7 +146,7 @@ void bli_l3_thread_decorator void bli_l3_thread_decorator_check ( - rntm_t* rntm + const rntm_t* rntm ) { //err_t e_val; diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h index 087eda8745..bb80764e14 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/thread/bli_l3_decor.h @@ -45,8 +45,7 @@ typedef void (*l3int_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); @@ -61,8 +60,8 @@ typedef void (*l3_decor_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); // Level-3 thread decorator prototype. @@ -76,13 +75,13 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); void bli_l3_thread_decorator_check ( - rntm_t* rntm + const rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 890c174cff..e1ac633451 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -50,8 +50,8 @@ void bli_l3_thread_decorator_openmp const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { // Query the total number of threads from the rntm_t object. @@ -77,19 +77,9 @@ void bli_l3_thread_decorator_openmp // resize the array_t, if necessary. 
array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, ti, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { @@ -105,21 +95,12 @@ void bli_l3_thread_decorator_openmp // Check for a somewhat obscure OpenMP thread-mistmatch issue. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. + obj_t a_t, b_t, c_t; bli_obj_alias_to( a, &a_t ); bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); @@ -136,13 +117,14 @@ void bli_l3_thread_decorator_openmp bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); // Create a default control tree for the operation, if needed. 
+ cntl_t* cntl_use; + pool_t* pool = bli_apool_array_elem( tid, array ); bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); + &a_t, &b_t, &c_t, pool, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm_p, cntl_use ); -#if 1 func ( alpha, @@ -151,40 +133,31 @@ void bli_l3_thread_decorator_openmp beta, &c_t, cntx, - rntm_p, cntl_use, - thread + bli_thrinfo_sub_node( thread ) ); -#else - bli_thrinfo_grow_tree - ( - rntm_p, - cntl_use, - thread - ); -#endif - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); + // Free the thread's local control tree. + bli_l3_cntl_free( pool, cntl_use ); #ifdef PRINT_THRINFO threads[tid] = thread; #else // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); #endif } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - #ifdef PRINT_THRINFO if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads ); else bli_l3_thrinfo_print_trsm_paths( threads ); exit(1); #endif + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. 
diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_l3_decor_openmp.h index 95e1582e59..2821e7eb0c 100644 --- a/frame/thread/bli_l3_decor_openmp.h +++ b/frame/thread/bli_l3_decor_openmp.h @@ -49,8 +49,8 @@ void bli_l3_thread_decorator_openmp const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); void bli_l3_thread_decorator_thread_check diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index d31414d3bc..27ab805b88 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -48,8 +48,8 @@ typedef struct thread_data const obj_t* beta; const obj_t* c; const cntx_t* cntx; - rntm_t* rntm; - cntl_t* cntl; + const rntm_t* rntm; + const cntl_t* cntl; dim_t tid; thrcomm_t* gl_comm; array_t* array; @@ -68,33 +68,18 @@ void* bli_l3_thread_entry( void* data_void ) const obj_t* beta = data->beta; const obj_t* c = data->c; const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - cntl_t* cntl = data->cntl; + const rntm_t* rntm = data->rntm; + const cntl_t* cntl = data->cntl; const dim_t tid = data->tid; array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. 
Making thread-local // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. + obj_t a_t, b_t, c_t; bli_obj_alias_to( a, &a_t ); bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); @@ -111,11 +96,15 @@ void* bli_l3_thread_entry( void* data_void ) bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); // Create a default control tree for the operation, if needed. + cntl_t* cntl_use; + pool_t* pool = bli_apool_array_elem( tid, array ); bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); + &a_t, &b_t, &c_t, pool, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + // The root node is the *parent* of the node corresponding to the first + // control tree node. + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl_use ); func ( @@ -125,16 +114,15 @@ void* bli_l3_thread_entry( void* data_void ) beta, &c_t, cntx, - rntm_p, cntl_use, - thread + bli_thrinfo_sub_node( thread ) ); // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); + bli_l3_cntl_free( pool, cntl_use ); // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -151,8 +139,8 @@ void bli_l3_thread_decorator_pthreads const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { err_t r_val; @@ -173,20 +161,11 @@ void bli_l3_thread_decorator_pthreads // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. 
We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); + array_t* array = bli_sba_checkout_array( n_threads ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, ti, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -227,16 +206,16 @@ void bli_l3_thread_decorator_pthreads bli_l3_thread_entry( ( void* )(&datas[0]) ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - // Thread 0 waits for additional threads to finish. for ( dim_t tid = 1; tid < n_threads; tid++ ) { bli_pthread_join( pthreads[tid], NULL ); } + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. 
diff --git a/frame/thread/bli_l3_decor_pthreads.h b/frame/thread/bli_l3_decor_pthreads.h index edf36cf6ec..f58581402b 100644 --- a/frame/thread/bli_l3_decor_pthreads.h +++ b/frame/thread/bli_l3_decor_pthreads.h @@ -51,8 +51,8 @@ void bli_l3_thread_decorator_pthreads const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); #endif diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index 6f0f8603bb..3055fc9530 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -47,8 +47,8 @@ void bli_l3_thread_decorator_single const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { // For sequential execution, we use only one thread. @@ -84,31 +84,10 @@ void bli_l3_thread_decorator_single // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we can create the global comm below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#if 0 - timpl_t ti2 = bli_rntm_thread_impl( rntm ); - printf( "l3_decor_single: created thrcomm_t.ti = %s\n", - ( ti2 == BLIS_SINGLE ? "single" : - ( ti2 == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); -#endif - + // Use the single-threaded communicator + thrcomm_t* gl_comm = &BLIS_SINGLE_COMM; { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. 
- rntm_t* rntm_p = rntm; - - cntl_t* cntl_use; - thrinfo_t* thread; - const dim_t tid = 0; // Use the thread id to access the appropriate pool_t* within the @@ -127,11 +106,13 @@ void bli_l3_thread_decorator_single // elsewhere. // Create a default control tree for the operation, if needed. + cntl_t* cntl_use; + pool_t* pool = bli_apool_array_elem( tid, array ); bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, c, rntm_p, cntl, &cntl_use ); + &a_t, &b_t, c, pool, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl_use ); func ( @@ -141,16 +122,15 @@ void bli_l3_thread_decorator_single beta, c, cntx, - rntm_p, cntl_use, - thread + bli_thrinfo_sub_node( thread ) ); // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); + bli_l3_cntl_free( pool, cntl_use ); // Free the current thread's thrinfo_t structure. 
- bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } // We shouldn't free the global communicator since it was already freed diff --git a/frame/thread/bli_l3_decor_single.h b/frame/thread/bli_l3_decor_single.h index c118ad7beb..08044f07c2 100644 --- a/frame/thread/bli_l3_decor_single.h +++ b/frame/thread/bli_l3_decor_single.h @@ -45,8 +45,8 @@ void bli_l3_thread_decorator_single const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); #endif diff --git a/frame/thread/bli_l3_sup_decor.c b/frame/thread/bli_l3_sup_decor.c index 53c7b41be3..f4330ed720 100644 --- a/frame/thread/bli_l3_sup_decor.c +++ b/frame/thread/bli_l3_sup_decor.c @@ -71,7 +71,7 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { rntm_t rntm_l; @@ -129,9 +129,9 @@ err_t bli_l3_sup_thread_decorator void bli_l3_sup_thread_decorator_check ( - rntm_t* rntm + const rntm_t* rntm ) { - bli_l3_sup_thread_decorator_check( rntm ); + bli_l3_thread_decorator_check( rntm ); } diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index a271920b43..e7a3985e97 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -47,7 +47,7 @@ typedef err_t (*l3supint_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); @@ -62,7 +62,7 @@ typedef err_t (*l3_sup_decor_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); // Level-3 sup thread decorator prototype. 
@@ -76,12 +76,12 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); void bli_l3_sup_thread_decorator_check ( - rntm_t* rntm + const rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 7d06ad622e..9e55f36520 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -49,7 +49,7 @@ err_t bli_l3_sup_thread_decorator_openmp const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. @@ -63,19 +63,9 @@ err_t bli_l3_sup_thread_decorator_openmp // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, ti, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { @@ -86,23 +76,16 @@ err_t bli_l3_sup_thread_decorator_openmp rntm_t* rntm_p = &rntm_l; // Query the thread's id from OpenMP. - const dim_t tid = omp_get_thread_num(); + const dim_t tid = omp_get_thread_num(); + pool_t* pool = bli_apool_array_elem( tid, array ); // Check for a somewhat obscure OpenMP thread-mistmatch issue. 
// NOTE: This calls the same function used for the conventional/large // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm_p ); func ( @@ -117,12 +100,12 @@ err_t bli_l3_sup_thread_decorator_openmp ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); // Check the array_t back into the small block allocator. 
Similar to the // check-out, this is done using a lock embedded within the sba to ensure diff --git a/frame/thread/bli_l3_sup_decor_openmp.h b/frame/thread/bli_l3_sup_decor_openmp.h index 4c5059d003..f226b959fa 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.h +++ b/frame/thread/bli_l3_sup_decor_openmp.h @@ -48,7 +48,7 @@ err_t bli_l3_sup_thread_decorator_openmp const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); #endif diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 7be5cf8fb9..c2aa7e965b 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -48,7 +48,7 @@ typedef struct thread_data const obj_t* beta; const obj_t* c; const cntx_t* cntx; - rntm_t* rntm; + const rntm_t* rntm; dim_t tid; thrcomm_t* gl_comm; array_t* array; @@ -67,7 +67,7 @@ void* bli_l3_sup_thread_entry( void* data_void ) const obj_t* beta = data->beta; const obj_t* c = data->c; const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; + const rntm_t* rntm = data->rntm; dim_t tid = data->tid; array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; @@ -79,17 +79,10 @@ void* bli_l3_sup_thread_entry( void* data_void ) // small block pool_t as it executes down the function stack. rntm_t rntm_l = *rntm; rntm_t* rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; + pool_t* pool = bli_apool_array_elem( tid, array ); // Create the root node of the current thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm_p ); func ( @@ -104,7 +97,7 @@ void* bli_l3_sup_thread_entry( void* data_void ) ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -119,7 +112,7 @@ err_t bli_l3_sup_thread_decorator_pthreads const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { err_t r_val; @@ -135,18 +128,9 @@ err_t bli_l3_sup_thread_decorator_pthreads // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, ti, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -186,16 +170,16 @@ err_t bli_l3_sup_thread_decorator_pthreads bli_l3_sup_thread_entry( ( void* )(&datas[0]) ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - // Thread 0 waits for additional threads to finish. 
for ( dim_t tid = 1; tid < n_threads; tid++ ) { bli_pthread_join( pthreads[tid], NULL ); } + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. diff --git a/frame/thread/bli_l3_sup_decor_pthreads.h b/frame/thread/bli_l3_sup_decor_pthreads.h index 310ea4e8b4..f8b64c7bae 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.h +++ b/frame/thread/bli_l3_sup_decor_pthreads.h @@ -51,7 +51,7 @@ err_t bli_l3_sup_thread_decorator_pthreads const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); #endif diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index a419154e7c..62a6c72c27 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -35,8 +35,6 @@ #include "blis.h" -#define SKIP_THRINFO_TREE - err_t bli_l3_sup_thread_decorator_single ( l3supint_ft func, @@ -47,7 +45,7 @@ err_t bli_l3_sup_thread_decorator_single const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // For sequential execution, we use only one thread. @@ -61,50 +59,11 @@ err_t bli_l3_sup_thread_decorator_single // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE - // Allcoate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - - { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - // There is only one thread id (for the thief thread). - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif + // Create a special thrinfo_t structure which indicates + // single-threaded execution for all nodes. + pool_t* sba_pool = bli_apool_array_elem( 0, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( 0, &BLIS_SINGLE_COMM, sba_pool, rntm ); func ( @@ -114,20 +73,14 @@ err_t bli_l3_sup_thread_decorator_single beta, c, cntx, - rntm_p, + rntm, thread ); -#ifndef SKIP_THRINFO_TREE // Free the current thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif + bli_thrinfo_free( thread ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_l3_sup_decor_single.h index 8ca279baf0..945597a23e 100644 --- a/frame/thread/bli_l3_sup_decor_single.h +++ b/frame/thread/bli_l3_sup_decor_single.h @@ -45,7 +45,7 @@ err_t bli_l3_sup_thread_decorator_single const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); #endif diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 6cd4325dfb..58eb891a4a 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -37,34 +37,30 @@ // -- Method-agnostic functions ------------------------------------------------ -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, timpl_t ti, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); - - const timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof( thrcomm_t ) ); bli_thrcomm_init( ti, n_threads, comm ); return comm; } -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ) { if ( comm == NULL ) return; - const timpl_t ti = bli_rntm_thread_impl( rntm ); - - bli_thrcomm_cleanup( ti, comm ); + bli_thrcomm_cleanup( comm ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_free(): " ); #endif - bli_sba_release( rntm, comm ); + bli_sba_release( sba_pool, comm ); } // -- Method-specific functions 
------------------------------------------------ @@ -154,8 +150,9 @@ void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm ) comm->ti = ti; } -void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { + const timpl_t ti = bli_thrcomm_thread_impl( comm ); const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ]; if ( fp == NULL ) bli_abort(); @@ -181,8 +178,9 @@ void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ) fp( comm ); } -void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm ) +void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm ) { + const timpl_t ti = bli_thrcomm_thread_impl( comm ); const thrcomm_barrier_ft fp = barrier_fpa[ ti ]; if ( fp == NULL ) bli_abort(); @@ -211,19 +209,18 @@ void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm ) void* bli_thrcomm_bcast ( - timpl_t ti, dim_t id, void* to_send, thrcomm_t* comm ) -{ +{ if ( comm == NULL || comm->n_threads == 1 ) return to_send; if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( ti, id, comm ); + bli_thrcomm_barrier( id, comm ); void* object = comm->sent_object; - bli_thrcomm_barrier( ti, id, comm ); + bli_thrcomm_barrier( id, comm ); return object; } @@ -257,7 +254,7 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) // the current barrier. The first n-1 threads will spin on this variable // until it changes. The sense variable gets incremented by the last // thread to enter the barrier, just before it exits. But it turns out - // that you don't need many unique IDs before you can wrap around. In + // that you don't need many unique IDs before you can wrap around. In // fact, if everything else is working, a binary variable is sufficient, // which is what we do here (i.e., 0 is incremented to 1, which is then // decremented back to 0, and so forth). 
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 4532fd00d8..8492b4707e 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -120,22 +120,23 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) return comm->n_threads; } +BLIS_INLINE timpl_t bli_thrcomm_thread_impl( thrcomm_t* comm ) +{ + return comm->ti; +} + // Threading method-agnostic function prototypes. -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, timpl_t ti, dim_t n_threads ); +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ); // Threading method-specific function prototypes. -// NOTE: These are the prototypes to the dispatcher functions and thus they -// require the timpl_t as an argument. The threading-specific functions can -// (and do) omit the timpl_t from their function signatures since their -// threading implementation is intrinsically known. void bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm ); -void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ); -BLIS_EXPORT_BLIS void bli_thrcomm_barrier( timpl_t ti, dim_t thread_id, thrcomm_t* comm ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); // Other function prototypes. 
-BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( timpl_t ti, dim_t inside_id, void* to_send, thrcomm_t* comm ); +BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index a42dabe180..b229e33c63 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -155,10 +155,10 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ kid->dad = me; leaf_index += threads_this_kid; - } + } me->count = arity; me->arity = arity; - } + } return me; } diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index eefc20fdd9..91f172c181 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,9 +35,7 @@ #include "blis.h" -thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; -thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrcomm_t BLIS_SINGLE_COMM = {}; +thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; @@ -51,8 +49,6 @@ extern bli_pthread_mutex_t global_rntm_mutex; void bli_thread_init( void ) { bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM ); - bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 88bdccda54..1636dfef37 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -42,11 +42,8 @@ // Include thread info (thrinfo_t) object definitions and prototypes. #include "bli_thrinfo.h" -#include "bli_thrinfo_sup.h" // Include some operation-specific thrinfo_t prototypes. -// Note that the bli_packm_thrinfo.h must be included before the others! 
-#include "bli_packm_thrinfo.h" #include "bli_l3_thrinfo.h" // Include the level-3 thread decorator and related definitions and prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 3730ab9465..2613cb0222 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -35,101 +35,83 @@ #include "blis.h" +#define BLIS_NUM_STATIC_COMMS 80 + +thrinfo_t* bli_thrinfo_create_root + ( + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba + ) +{ + return bli_thrinfo_create + ( + comm, + thread_id, + 1, + 0, + FALSE, + sba_pool, + pba + ); +} + thrinfo_t* bli_thrinfo_create ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + pool_t* sba_pool, + pba_t* pba ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_create(): " ); #endif - thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); - - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, work_id, - free_comm, - bszid, - sub_node - ); - - return thread; -} + thrinfo_t* thread = bli_sba_acquire( sba_pool, sizeof( thrinfo_t ) ); -void bli_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ) -{ - bli_thrinfo_set_ocomm( ocomm, thread ); - bli_thrinfo_set_ocomm_id( ocomm_id, thread ); + bli_thrinfo_set_comm( comm, thread ); + bli_thrinfo_set_thread_id( thread_id, thread ); bli_thrinfo_set_n_way( n_way, thread ); bli_thrinfo_set_work_id( work_id, thread ); bli_thrinfo_set_free_comm( free_comm, thread ); - bli_thrinfo_set_bszid( bszid, thread ); + bli_thrinfo_set_sba_pool( sba_pool, thread ); + bli_thrinfo_set_pba( pba, thread ); + bli_mem_clear( bli_thread_mem( thread ) ); - bli_thrinfo_set_sub_node( sub_node, thread ); + bli_thrinfo_set_sub_node( NULL, thread ); bli_thrinfo_set_sub_prenode( NULL, thread ); -} 
-void bli_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - FALSE, - BLIS_NO_PART, - thread - ); + return thread; } void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ) { - if ( thread == NULL || - thread == &BLIS_PACKM_SINGLE_THREADED || - thread == &BLIS_GEMM_SINGLE_THREADED - ) return; + if ( thread == NULL ) return; thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread ); thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); + pool_t* sba_pool = bli_thread_sba_pool( thread ); + mem_t* cntl_mem_p = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_prenode != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_prenode ); + bli_thrinfo_free( thrinfo_sub_prenode ); } // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_node != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_node ); + bli_thrinfo_free( thrinfo_sub_node ); } // Free the communicators, but only if the current thrinfo_t struct @@ -139,198 +121,47 @@ void bli_thrinfo_free if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator. - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) ); + if ( bli_thread_am_chief( thread ) ) + bli_thrcomm_free( sba_pool, bli_thrinfo_comm( thread ) ); } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_free(): " ); #endif - // Free the thrinfo_t struct. - bli_sba_release( rntm, thread ); -} - -// ----------------------------------------------------------------------------- - -void bli_thrinfo_grow - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - // First, consider the prenode branch of the thrinfo_t tree, which should be - // expanded only if there exists a prenode branch in the cntl_t tree. 
- - if ( bli_cntl_sub_prenode( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_prenode( thread ) == NULL ) - { - // Assertion / sanity check. - if ( bli_cntl_bszid( cntl ) != BLIS_MC ) - { - printf( "Assertion failed: Expanding prenode for non-IC loop?\n" ); - bli_abort(); - } - - // Now we must create the packa, jr, and ir nodes that make up - // the prenode branch of current cntl_t node. - - // Create a new node (or, if needed, multiple nodes) along the - // prenode branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_prenode = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl, - bli_cntl_sub_prenode( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the secondary branch to its - // parent structure. - bli_thrinfo_set_sub_prenode( thread_prenode, thread ); - } - } - - // Now, grow the primary branch of the thrinfo_t tree. - - // NOTE: If bli_thrinfo_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) + // Free any allocated memory from the pba. + if ( bli_mem_is_alloc( cntl_mem_p ) && bli_thread_am_chief( thread ) ) { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_child = bli_thrinfo_rgrow - ( - rntm, - cntl, - bli_cntl_sub_node( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. 
- bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl + bli_pba_release ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); - } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_rgrow - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. 
- thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_cur ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node + pba, + cntl_mem_p ); } - return thread_cur; + // Free the thrinfo_t struct. + bli_sba_release( sba_pool, thread ); } -#define BLIS_NUM_STATIC_COMMS 80 +// ----------------------------------------------------------------------------- -thrinfo_t* bli_thrinfo_create_for_cntl +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, + dim_t n_way, thrinfo_t* thread_par ) { - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. - if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - return thread_chl; - } - - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); + const timpl_t ti = bli_thrcomm_thread_impl( bli_thrinfo_comm( thread_par ) ); + const dim_t parent_num_threads = bli_thread_num_threads( thread_par ); + const dim_t parent_thread_id = bli_thread_thread_id( thread_par ); + pool_t* sba_pool = bli_thread_sba_pool( thread_par ); + pba_t* pba = bli_thread_pba( thread_par ); // Sanity check: make sure the number of threads in the parent's // communicator is divisible by the number of new sub-groups. 
- if ( parent_nt_in % parent_n_way != 0 ) + if ( parent_num_threads % n_way != 0 ) { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); + printf( "Assertion failed: parent_num_threads %% n_way != 0\n" ); bli_abort(); } @@ -339,312 +170,105 @@ thrinfo_t* bli_thrinfo_create_for_cntl // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. - const dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - const dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); + const dim_t child_num_threads = parent_num_threads / n_way; + const dim_t child_thread_id = parent_thread_id % child_num_threads; + const dim_t child_work_id = parent_thread_id / child_num_threads; - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; - - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } - - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); - - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. 
- if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( rntm, thread_par ); + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + thrcomm_t* my_comm = NULL; + bool free_comm = FALSE; + + if ( n_way == 1 ) + { + my_comm = bli_thrinfo_comm( thread_par ); + } + else if ( n_way == parent_num_threads ) + { + my_comm = &BLIS_SINGLE_COMM; + } + else + { + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_chief( thread_par ) ) + { + err_t r_val; + + if ( n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( n_way * sizeof( thrcomm_t* ), &r_val ); + else + new_comms = static_comms; + } + + // Broadcast the temporary array to all threads in the parent's + // communicator. + new_comms = bli_thread_broadcast( thread_par, new_comms ); + + // Chiefs in the child communicator allocate the communicator + // object and store it in the array element corresponding to the + // parent's work id. + if ( child_thread_id == 0 ) + new_comms[ child_work_id ] = bli_thrcomm_create( sba_pool, ti, child_num_threads ); + + bli_thread_barrier( thread_par ); + + my_comm = new_comms[ child_work_id ]; + free_comm = TRUE; + } // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node + my_comm, + child_thread_id, + n_way, + child_work_id, + free_comm, + sba_pool, + pba ); - bli_thread_barrier( rntm, thread_par ); + bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. 
- if ( bli_thread_am_ochief( thread_par ) ) - { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); + if ( bli_thread_am_chief( thread_par ) && + new_comms != static_comms ) + { + bli_free_intl( new_comms ); } return thread_chl; } -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl_prenode - ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); - } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. 
- thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_par ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -thrinfo_t* bli_thrinfo_create_for_cntl_prenode +void bli_thrinfo_print ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ) -{ - // NOTE: This function only has to work for the ic -> (pa -> jr) - // thrinfo_t tree branch extension. After that, the function - // bli_thrinfo_create_for_cntl() will be called for the last jr->ir - // branch extension. - - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - //const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in (%d) parent_n_way (%d) != 0\n", - ( int )parent_nt_in, ( int )parent_n_way ); - bli_abort(); - } - - //dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - //dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_nt_in = parent_nt_in; - const dim_t child_n_way = parent_nt_in; - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - bli_thread_barrier( rntm, thread_par ); - - // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the - // parent's chief-ness is equivalent to checking for chief-ness in the new - // about-to-be-created communicator group. 
- thrcomm_t* new_comm = NULL; - if ( bli_thread_am_ochief( thread_par ) ) - new_comm = bli_thrcomm_create( rntm, child_nt_in ); - - // Broadcast the new thrcomm_t address to the other threads in the - // parent's group. - new_comm = bli_thread_broadcast( rntm, thread_par, new_comm ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comm, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( rntm, thread_par ); - - return thread_chl; -} - -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ) { - cntl_t* cntl_jc = cntl; - thrinfo_t* thrinfo_jc = thread; - - bli_thrinfo_grow( rntm, cntl_jc, thrinfo_jc ); - - // inside jc loop: - cntl_t* cntl_pc = bli_cntl_sub_node( cntl_jc ); - thrinfo_t* thrinfo_pc = bli_thrinfo_sub_node( thrinfo_jc ); - - bli_thrinfo_grow( rntm, cntl_pc, thrinfo_pc ); - - // inside pc loop: - cntl_t* cntl_pb = bli_cntl_sub_node( cntl_pc ); - thrinfo_t* thrinfo_pb = bli_thrinfo_sub_node( thrinfo_pc ); - - bli_thrinfo_grow( rntm, cntl_pb, thrinfo_pb ); - - // after pb packing: - cntl_t* cntl_ic = bli_cntl_sub_node( cntl_pb ); - thrinfo_t* thrinfo_ic = bli_thrinfo_sub_node( thrinfo_pb ); - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr 
loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); + printf( " lvl nt tid nway wkid free\n" ); + bli_thrinfo_print_sub( thread, 0 ); } -void bli_thrinfo_grow_tree_ic +void bli_thrinfo_print_sub ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + thrinfo_t* thread, + gint_t level ) { - cntl_t* cntl_ic = cntl; - thrinfo_t* thrinfo_ic = thread; - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr 
loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); + if ( thread == NULL ) return; + + printf( "%4ld %4ld %4ld %4ld %4ld %4ld\n", + ( unsigned long )level, + ( unsigned long )bli_thread_num_threads( thread ), + ( unsigned long )bli_thread_thread_id( thread ), + ( unsigned long )bli_thread_n_way( thread ), + ( unsigned long )bli_thread_work_id( thread ), + ( unsigned long )bli_thrinfo_needs_free_comm( thread )); + + bli_thrinfo_print_sub( bli_thrinfo_sub_prenode( thread ), level+1 ); + bli_thrinfo_print_sub( bli_thrinfo_sub_node( thread ), level+1 ); } -#endif + diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 9d234bc91c..d7d8190bce 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -41,15 +41,16 @@ struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. - thrcomm_t* ocomm; + thrcomm_t* comm; - // Our thread id within the ocomm thread communicator. - dim_t ocomm_id; + // Our thread id within the thread communicator. + dim_t thread_id; - // The number of distinct threads used to parallelize the loop. + // The number of communicators which are "siblings" of our communicator dim_t n_way; - // What we're working on. + // What we're working on. This is the same for all threads in the same + // communicator, and 0 <= work_id < n_way. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, @@ -58,9 +59,14 @@ struct thrinfo_s // to false. bool free_comm; - // The bszid_t to help identify the node. This is mostly only useful when - // debugging or tracing the allocation and release of thrinfo_t nodes. - bszid_t bszid; + // The small block pool. + pool_t* sba_pool; + + // The packing block allocator. + pba_t* pba; + + // Storage for allocated memory obtained from the PBA. 
+ mem_t mem; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; @@ -77,12 +83,12 @@ typedef struct thrinfo_s thrinfo_t; BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t ) { - return (t->ocomm)->n_threads; + return (t->comm)->n_threads; } -BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_thread_id( const thrinfo_t* t ) { - return t->ocomm_id; + return t->thread_id; } BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t ) @@ -95,9 +101,9 @@ BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t ) return t->work_id; } -BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t ) +BLIS_INLINE thrcomm_t* bli_thrinfo_comm( const thrinfo_t* t ) { - return t->ocomm; + return t->comm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) @@ -105,9 +111,19 @@ BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) return t->free_comm; } -BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t ) +BLIS_INLINE pool_t* bli_thread_sba_pool( const thrinfo_t* t ) +{ + return t->sba_pool; +} + +BLIS_INLINE pba_t* bli_thread_pba( const thrinfo_t* t ) +{ + return t->pba; +} + +BLIS_INLINE mem_t* bli_thread_mem( thrinfo_t* t ) { - return t->bszid; + return &t->mem; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t ) @@ -122,21 +138,21 @@ BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t ) // thrinfo_t query (complex) -BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t ) +BLIS_INLINE bool bli_thread_am_chief( const thrinfo_t* t ) { - return t->ocomm_id == 0; + return t->thread_id == 0; } // thrinfo_t modification -BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_comm( thrcomm_t* comm, thrinfo_t* t ) { - t->ocomm = ocomm; + t->comm = comm; } -BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_thread_id( dim_t thread_id, thrinfo_t* t ) { - 
t->ocomm_id = ocomm_id; + t->thread_id = thread_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) @@ -154,9 +170,14 @@ BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) t->free_comm = free_comm; } -BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_sba_pool( pool_t* sba_pool, thrinfo_t* t ) +{ + t->sba_pool = sba_pool; +} + +BLIS_INLINE void bli_thrinfo_set_pba( pba_t* pba, thrinfo_t* t ) { - t->bszid = bszid; + t->pba = pba; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) @@ -171,22 +192,14 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* // other thrinfo_t-related functions -BLIS_INLINE void* bli_thread_broadcast( const rntm_t* rntm, const thrinfo_t* t, void* p ) +BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p ) { - // We can't use any bli_rntm_*() APIs here because they haven't been - // defined yet. So we have to manually access the timpl_t field (le ugh). - //const timpl_t ti = bli_rntm_thread_impl( rntm ); - - return bli_thrcomm_bcast( rntm->thread_impl, t->ocomm_id, p, t->ocomm ); + return bli_thrcomm_bcast( t->thread_id, p, t->comm ); } -BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t ) +BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) { - // We can't use any bli_rntm_*() APIs here because they haven't been - // defined yet. So we have to manually access the timpl_t field (le ugh). - //const timpl_t ti = bli_rntm_thread_impl( rntm ); - - bli_thrcomm_barrier( rntm->thread_impl, t->ocomm_id, t->ocomm ); + bli_thrcomm_barrier( t->thread_id, t->comm ); } @@ -194,98 +207,48 @@ BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t ) // Prototypes for level-3 thrinfo functions not specific to any operation. 
// -thrinfo_t* bli_thrinfo_create +thrinfo_t* bli_thrinfo_create_root ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba ); -void bli_thrinfo_init +thrinfo_t* bli_thrinfo_create ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_thrinfo_init_single - ( - thrinfo_t* thread + pool_t* sba_pool, + pba_t* pba ); +BLIS_EXPORT_BLIS void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- -void bli_thrinfo_grow +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, + dim_t n_way, thrinfo_t* thread_par ); -thrinfo_t* bli_thrinfo_create_for_cntl_prenode +void bli_thrinfo_print ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ); -void bli_thrinfo_grow_tree_ic +void bli_thrinfo_print_sub ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + thrinfo_t* thread, + gint_t level ); -#endif #endif diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c deleted file mode 100644 index 26a40e00fd..0000000000 --- a/frame/thread/bli_thrinfo_sup.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - - 
BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ) -{ - if ( thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED ) return; - - // NOTE: If bli_thrinfo_sup_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_child = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_par[1], - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. - bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( *bszid_cur != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_sup_create_for_cntl - ( - rntm, - bszid_par, - bszid_cur, - thread_par - ); - } - else // if ( *bszid_cur == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. 
- thrinfo_t* thread_seg = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_cur[1], - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -#define BLIS_NUM_STATIC_COMMS 80 - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ) -{ - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. - if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - - return thread_chl; - } - - // The remainder of this function handles the cases involving the use of - // multiple BLIS threads. 
- - if ( bli_rntm_pack_a( rntm ) == FALSE && - bli_rntm_pack_b( rntm ) == FALSE ) - { - // If we are packing neither A nor B, there are no broadcasts or barriers - // needed to synchronize threads (since all threads can work completely - // independently). In this special case situation, the thrinfo_t can be - // created with much simpler logic. - - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - NULL, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - return thread_chl; - } - else - { - // If we are packing at least one of A or B, then we use the general - // approach that employs broadcasts and barriers. - - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. 
- if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); - bli_abort(); - } - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); - - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; - - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } - - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); - - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. - if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( rntm, thread_par ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. 
- thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( rntm, thread_par ); - - // The parent's chief thread frees the temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); - } - - return thread_chl; - } -} - diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h deleted file mode 100644 index 1afcd3337e..0000000000 --- a/frame/thread/bli_thrinfo_sup.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_THRINFO_SUP_H -#define BLIS_THRINFO_SUP_H - -// -// Prototypes for level-3 thrinfo sup functions. -// - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ); - -#endif diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index abc9c90890..5bd03882ac 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -74,7 +74,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, incx, \ asum, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -110,7 +110,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ m, \ a, rs_a, cs_a, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -153,7 +153,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, incx, \ norm, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -204,7 +204,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ norm, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, incx, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* Check the 1-norm of the randomzied vector. 
In the unlikely event that @@ -310,7 +310,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, rs_x, cs_x, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* Check the 1-norm of the randomzied matrix. In the unlikely event that @@ -366,7 +366,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ scale, \ sumsq, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 3c501d1075..0e52b83783 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -34,7 +34,6 @@ */ #include "blis.h" -#include // // Define BLAS-like interfaces with typed operands. diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c index fe220e6031..00d4448c6b 100644 --- a/sandbox/gemmlike/bli_gemm_ex.c +++ b/sandbox/gemmlike/bli_gemm_ex.c @@ -52,7 +52,7 @@ void bli_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index 1e567a114b..5ce2dcc2f1 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -33,6 +33,7 @@ */ #include "blis.h" +#include "thread/bls_l3_decor.h" // // -- Define the gemm-like operation's object API ------------------------------ @@ -67,7 +68,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -75,8 +76,8 @@ void bls_gemm_ex // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Set the .pack_a and .pack_b fields to TRUE. 
This is only needed because // this sandbox uses bli_thrinfo_sup_grow(), which calls @@ -87,8 +88,8 @@ void bls_gemm_ex // while this sandbox implementation executes (and it also reinforces the // fact that we *are* indeed packing A and B, albeit not in the sup context // originally envisioned for the .pack_a and .pack_b fields). - bli_rntm_set_pack_a( TRUE, rntm ); - bli_rntm_set_pack_b( TRUE, rntm ); + bli_rntm_set_pack_a( TRUE, &rntm_l ); + bli_rntm_set_pack_b( TRUE, &rntm_l ); // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before calling the _check() function, since @@ -166,7 +167,7 @@ void bls_gemm_ex bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), - rntm + &rntm_l ); // Spawn threads (if applicable), where bls_gemm_int() is the thread entry @@ -182,7 +183,7 @@ void bls_gemm_ex ( obj_t* )beta, ( obj_t* )&c_local, ( cntx_t* )cntx, - rntm + &rntm_l ); } diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h index d01c6647ee..7380f02add 100644 --- a/sandbox/gemmlike/bls_gemm.h +++ b/sandbox/gemmlike/bls_gemm.h @@ -53,7 +53,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index c8fd500839..dac38bab0b 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -186,42 +186,13 @@ void PASTECH2(bls_,ch,varname) \ \ auxinfo_t aux; \ \ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. 
*/ \ - bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ - BLIS_KC, /* 4th loop */ \ - BLIS_NO_PART, /* pack B */ \ - BLIS_MC, /* 3rd loop */ \ - BLIS_NO_PART, /* pack A */ \ - BLIS_NR, /* 2nd loop */ \ - BLIS_MR, /* 1st loop */ \ - BLIS_KR }; /* microkernel loop */ \ -\ - bszid_t* restrict bszids_jc = &bszids[0]; \ - bszid_t* restrict bszids_pc = &bszids[1]; \ - /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ - bszid_t* restrict bszids_ic = &bszids[3]; \ - /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ - bszid_t* restrict bszids_jr = &bszids[5]; \ - /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ -\ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ - thrinfo_t* restrict thread_ir = NULL; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ + thrinfo_t* restrict thread_jc = thread; \ + thrinfo_t* restrict thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + thrinfo_t* restrict thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + thrinfo_t* restrict thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + thrinfo_t* restrict thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + thrinfo_t* restrict thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + thrinfo_t* restrict thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Compute the JC loop thread range for the current thread. */ \ dim_t jc_start, jc_end; \ @@ -240,10 +211,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* restrict b_jc = b_00 + jj * jcstep_b; \ ctype* restrict c_jc = c_00 + jj * jcstep_c; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ const dim_t pc_start = 0, pc_end = k; \ @@ -267,14 +234,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. to the next bszid that is - a normal bszid_t value). */ \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ \ /* Determine the packing buffer and related parameters for matrix B. Then call the packm implementation. */ \ @@ -288,18 +247,12 @@ void PASTECH2(bls_,ch,varname) \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ - rntm, \ - &mem_b, \ thread_pb \ ); \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ ctype* restrict b_pc_use = b_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ dim_t ic_start, ic_end; \ @@ -321,14 +274,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. to the next bszid that is - a normal bszid_t value). */ \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ \ /* Determine the packing buffer and related parameters for matrix A. Then call the packm implementation. 
*/ \ @@ -342,18 +287,12 @@ void PASTECH2(bls_,ch,varname) \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ - rntm, \ - &mem_a, \ thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ ctype* restrict a_ic_use = a_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next @@ -381,9 +320,6 @@ void PASTECH2(bls_,ch,varname) \ /* Assume for now that our next panel of B to be the current panel of B. */ \ ctype* restrict b2 = b_jr; \ -\ - /* Identify the current thrinfo_t node. */ \ - thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Query the number of threads and thread ids for the IR loop. NOTE: These values are only needed when computing the next @@ -446,23 +382,9 @@ void PASTECH2(bls_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( rntm, thread_pb ); \ + bli_thread_barrier( thread_pb ); \ } \ } \ -\ - /* Release any memory that was acquired for packing matrices A and B. 
*/ \ - PASTECH2(bls_,ch,packm_finalize_mem_a) \ - ( \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTECH2(bls_,ch,packm_finalize_mem_b) \ - ( \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 9e1f67fc5b..326f83b0cd 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + mem_t* mem = bli_thread_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thread_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_a ) GENTFUNC( dcomplex, z, packm_init_mem_a ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. 
Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) -GENTFUNC( float, s, packm_finalize_mem_a ) -GENTFUNC( double, d, packm_finalize_mem_a ) -GENTFUNC( scomplex, c, packm_finalize_mem_a ) -GENTFUNC( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ m_alloc, k_alloc, mr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &m_max, &k_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thread_mem( thread ) \ ); \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed @@ -317,7 +280,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h index 201a24efae..2ab53dcbf2 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.h +++ b/sandbox/gemmlike/bls_l3_packm_a.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_a ) GENTPROT( dcomplex, z, packm_init_mem_a ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) -GENTPROT( float, s, packm_finalize_mem_a ) -GENTPROT( double, d, packm_finalize_mem_a ) -GENTPROT( scomplex, c, packm_finalize_mem_a ) -GENTPROT( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index cb8275fae9..4ebe1062ba 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. 
*/ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + mem_t* mem = bli_thread_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thread_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_b ) GENTFUNC( dcomplex, z, packm_init_mem_b ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) -GENTFUNC( float, s, packm_finalize_mem_b ) -GENTFUNC( double, d, packm_finalize_mem_b ) -GENTFUNC( scomplex, c, packm_finalize_mem_b ) -GENTFUNC( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ k_alloc, n_alloc, nr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &k_max, &n_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thread_mem( thread ) \ ); \ \ /* Pack matrix B to the destination buffer chosen above. 
Here, the packed @@ -317,7 +280,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thread_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h index 728d21aed5..791cf9b712 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.h +++ b/sandbox/gemmlike/bls_l3_packm_b.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_b ) GENTPROT( dcomplex, z, packm_init_mem_b ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) -GENTPROT( float, s, packm_finalize_mem_b ) -GENTPROT( double, d, packm_finalize_mem_b ) -GENTPROT( scomplex, c, packm_finalize_mem_b ) -GENTPROT( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h index 98300536bc..4c6db2cac6 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.h +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -41,7 +41,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index c0649a9ec4..263ee8bbeb 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ 
b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 8d2b90cac1..b3dddd72ac 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c index 5ea80ff424..62c1a895fb 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -45,7 +45,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -75,11 +75,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c index 9c29ef27e7..d8ad17e94e 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -62,44 +62,25 @@ void bls_l3_thread_decorator_openmp // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. 
We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, BLIS_OPENMP, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. - bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, &rntm_l ); // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, &rntm_l ); func ( @@ -109,12 +90,12 @@ void bls_l3_thread_decorator_openmp beta, c, cntx, - rntm_p, - thread + &rntm_l, + bli_thrinfo_sub_node( thread ) ); // Free the current thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } // We shouldn't free the global communicator since it was already freed diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c index 95d0e968ec..9f57dc4e61 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -76,19 +76,11 @@ void* bls_l3_thread_entry( void* data_void ) // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + // Create the root node of the thread's thrinfo_t structure. + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, &rntm_l ); func ( @@ -98,12 +90,12 @@ void* bls_l3_thread_entry( void* data_void ) beta, c, cntx, - rntm_p, - thread + &rntm_l, + bli_thrinfo_sub_node( thread ) ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -132,20 +124,10 @@ void bls_l3_thread_decorator_pthreads // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. 
- array_t* restrict array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); + array_t* array = bli_sba_checkout_array( n_threads ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, BLIS_POSIX, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c index b5f5a66692..118712a062 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c @@ -62,50 +62,16 @@ void bls_l3_thread_decorator_single // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - + thrcomm_t* gl_comm = &BLIS_SINGLE_COMM; { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - // There is only one thread id (for the thief thread). 
const dim_t tid = 0; - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif + // Create the root node of the thread's thrinfo_t structure. + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm ); func ( @@ -115,14 +81,12 @@ void bls_l3_thread_decorator_single beta, c, cntx, - rntm_p, - thread + rntm, + bli_thrinfo_sub_node( thread ) ); -#ifndef SKIP_THRINFO_TREE // Free the current thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif + bli_thrinfo_free( thread ); } // We shouldn't free the global communicator since it was already freed diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 69ee4339da..f3b5f7b520 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -231,16 +231,12 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); // Create pack objects for a and b, and pack them to ap and bp, // respectively. - cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_KR, @@ -248,10 +244,9 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); - cntl_t* cntl_b = libblis_test_pobj_create + thrinfo_t* thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_KR, @@ -259,8 +254,7 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -293,8 +287,8 @@ void libblis_test_gemm_ukr_experiment // Free the control tree nodes and release their cached mem_t entries // back to the pba. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); + bli_thrinfo_free( thread_b ); // Free the test objects. 
bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 44ba51587c..cf2b9d0409 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,13 +283,9 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. - cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_MR, @@ -297,8 +293,7 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); // Set the diagonal offset of ap. @@ -315,7 +310,7 @@ bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - cntl_t* cntl_b = NULL; + thrinfo_t* thread_b = NULL; // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -325,7 +320,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); - cntl_b = libblis_test_pobj_create + thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_MR, @@ -333,8 +328,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -362,9 +356,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // to perform the correctness check later. if ( i < n_repeats - 1 ) { - // Free the control tree nodes and release their cached mem_t entries + // Free the thread control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_b ); } } @@ -401,11 +395,11 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. 
//libblis_test_check_empty_problem( &c11, perf, resid ); - // Free the control tree nodes and release their cached mem_t entries + // Free the thread control tree nodes and release their cached mem_t entries // back to the pba. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - if ( cntl_b ) - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); + if ( thread_b ) + bli_thrinfo_free( thread_b ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index a355385a30..7eaa7621df 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -2652,17 +2652,20 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ) +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { - bool does_inv_diag; + bool does_inv_diag; if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; else does_inv_diag = TRUE; + rntm_t rntm; + bli_rntm_init( &rntm ); + // Create a control tree node for the packing operation. cntl_t* cntl = bli_packm_cntl_create_node ( - NULL, // we don't need the small block allocator from the runtime. + NULL, // pass NULL as the pool so that malloc() is used. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bmult_id_m, bmult_id_n, @@ -2674,12 +2677,17 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia NULL // no child node needed ); + thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, cntl ); + // Pack the contents of A to P. 
- bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( a, p, cntx, cntl, thread ); + + // Free the control tree. + bli_l3_cntl_free( NULL, cntl ); - // Return the control tree pointer so the caller can free the cntl_t and its + // Return the thread control tree pointer so the caller can free the thrinfo_t and its // mem_t entry later on. - return cntl; + return thread; } diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 9e38964ee7..93c892c4f2 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -420,7 +420,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ); +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 5f4988e1c7..df8c2b8eab 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -232,13 +232,9 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. 
- cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_MR, @@ -246,8 +242,7 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); // Set the diagonal offset of ap. @@ -271,7 +266,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); - cntl_t* cntl_b = libblis_test_pobj_create + thrinfo_t* thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_MR, @@ -279,8 +274,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -297,7 +291,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_b ); } // Estimate the performance of the best experiment repeat. @@ -312,7 +306,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); // Free the test objects. bli_obj_free( &a );