diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 97146a7861..8d29d73b25 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1307,7 +1307,6 @@ bli_pba_init_pools bli_pba_pool_size bli_pba_query bli_pba_release -bli_pba_rntm_set_pba bli_memsys_finalize bli_memsys_init bli_mkherm diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 1146ca7d2c..2e813cf4a6 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -50,21 +50,23 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTDEF( packm ) diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 15e9dae6f5..0b60d4e2f6 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ + rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index e8aa363288..88657a7128 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -33,15 +33,15 @@ */ +#include "bli_packm_alloc.h" #include "bli_packm_cntl.h" #include "bli_packm_check.h" #include "bli_packm_init.h" #include "bli_packm_int.h" +#include "bli_packm_scalar.h" #include "bli_packm_part.h" -#include "bli_packm_var.h" - #include "bli_packm_struc_cxk.h" #include 
"bli_packm_struc_cxk_1er.h" @@ -50,6 +50,8 @@ // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD -#include "bli_packm_md.h" +#include "bli_packm_struc_cxk_md.h" #endif +#include "bli_packm_blk_var1.h" + diff --git a/frame/1m/packm/bli_packm_var.h b/frame/1m/packm/bli_packm_alloc.c similarity index 50% rename from frame/1m/packm/bli_packm_var.h rename to frame/1m/packm/bli_packm_alloc.c index 723e6fdb4a..df6750d7ac 100644 --- a/frame/1m/packm/bli_packm_var.h +++ b/frame/1m/packm/bli_packm_alloc.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,78 +33,67 @@ */ -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* c, \ - obj_t* p, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* t \ - ); - -GENPROT( packm_unb_var1 ) -GENPROT( packm_blk_var1 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_blk_var1 ) +#include "blis.h" + +void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Query the pack buffer type from the control tree node. + packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + + // Query the address of the mem_t entry within the control tree node. + mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); + + mem_t* local_mem_p; + mem_t local_mem_s; + + siz_t cntl_mem_size = 0; + + if ( bli_mem_is_alloc( cntl_mem_p ) ) + cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. 
+ if ( bli_mem_is_alloc( cntl_mem_p ) ) + { + bli_pba_release + ( + rntm, + cntl_mem_p + ); + } + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. + *cntl_mem_p = *local_mem_p; + + // Barrier so that the master thread doesn't return from the function + // before we are done reading. + bli_thread_barrier( thread ); + } + + return bli_mem_buffer( cntl_mem_p ); +} diff --git a/frame/3/bli_l3_packm.h b/frame/1m/packm/bli_packm_alloc.h similarity index 88% rename from frame/3/bli_l3_packm.h rename to frame/1m/packm/bli_packm_alloc.h index 696dabf593..b433be350a 100644 --- a/frame/3/bli_l3_packm.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -5,7 +5,6 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,13 +32,11 @@ */ -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); +BLIS_EXPORT_BLIS void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 5073f78127..edeeae2b98 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -35,35 +35,6 @@ #include "blis.h" -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T) - ( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool invdiag, - bool revifup, - bool reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void_fp packm_ker, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); - static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { @@ -79,614 +50,265 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = NULL, bli_zpackm_struc_cxk_1er, } }, }; +static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, - thrinfo_t* t + thrinfo_t* thread ) { -#ifdef BLIS_ENABLE_GEMM_MD - // Call a different packm implementation when the storage and target - // datatypes differ. - if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) ) - { - bli_packm_blk_var1_md( c, p, cntx, cntl, t ); + // Extract various fields from the control tree. 
+ pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl ); + bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + + // Every thread initializes p and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). Return early if no packing is required. + if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) return; - } -#endif - num_t dt_p = bli_obj_dt( p ); + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_packm_int_check( c, p, cntx ); - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool invdiag = bli_obj_has_inverted_diag( p ); - bool revifup = bli_obj_is_pack_rev_if_upper( p ); - bool reviflo = bli_obj_is_pack_rev_if_lower( p ); + num_t dt_c = bli_obj_dt( c ); + dim_t dt_c_size = bli_dt_size( dt_c ); - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); + num_t dt_p = bli_obj_dt( p ); + dim_t dt_p_size = bli_dt_size( dt_p ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + struc_t strucc = bli_obj_struc( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); + diag_t diagc = bli_obj_diag( c ); + uplo_t uploc = bli_obj_uplo( c ); + conj_t conjc = bli_obj_conj_status( c ); - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); + dim_t iter_dim = 
bli_obj_length( p ); + dim_t panel_len_full = bli_obj_width( p ); + dim_t panel_len_max = bli_obj_padded_width( p ); - obj_t kappa; - void* buf_kappa; + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t incc = bli_obj_row_stride( c ); + inc_t ldc = bli_obj_col_stride( c ); + dim_t panel_dim_off = bli_obj_row_off( c ); + dim_t panel_len_off = bli_obj_col_off( c ); - func_t* packm_kers; - void_fp packm_ker; + char* p_cast = bli_obj_buffer( p ); + inc_t ldp = bli_obj_col_stride( p ); + inc_t is_p = bli_obj_imag_stride( p ); + dim_t panel_dim_max = bli_obj_panel_dim( p ); + inc_t ps_p = bli_obj_panel_stride( p ); - FUNCPTR_T f; + doff_t diagoffc_inc = ( doff_t )panel_dim_max; + obj_t kappa_local; + char* kappa_cast = bli_packm_scalar( &kappa_local, p ); - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. - if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - //printf( "applying non-zero imag kappa\n" ); + // we use the default lookup table to determine the right func_t + // for the current schema. 
+ func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + // Query the datatype-specific function pointer from the func_t object. + packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // For mixed-precision gemm, select the proper kernel (only dense panels). + if ( dt_c != dt_p ) + { + packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; + } - kappa_p = κ - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + // Query the address of the packm params field of the obj_t. The user might + // have set this field in order to specify a custom packm kernel. + packm_blk_var1_params_t* params = bli_obj_pack_params( c ); - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); + if ( params && params->ukr_fn[ dt_c ][ dt_p ] ) + { + // Query the user-provided packing kernel from the obj_t. If provided, + // this overrides the kernel determined above. + packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; } + /* Compute the total number of iterations we'll need. */ + dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); - // The original idea here was to read the packm_ukr from the context - // if it is non-NULL. The problem is, it requires that we be able to - // assume that the packm_ukr field is initialized to NULL, which it - // currently is not. - - //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. 
*/ + dim_t ic0, ip0; + doff_t ic_inc, ip_inc; - //if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) ) + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || + ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) { - // If the packm structure-aware kernel func_t in the context is - // NULL (which is the default value after the context is created), - // we use the default lookup table to determine the right func_t - // for the current schema. - const dim_t i = bli_pack_schema_index( schema ); - - packm_kers = &packm_struc_cxk_kers[ i ]; + ic0 = (n_iter - 1) * panel_dim_max; + ic_inc = -panel_dim_max; + ip0 = n_iter - 1; + ip_inc = -1; } -#if 0 - else // cntx's packm func_t overrides + else { - // If the packm structure-aware kernel func_t in the context is - // non-NULL (ie: assumed to be valid), we use that instead. - //packm_kers = bli_cntx_packm_ukrs( cntx ); - packm_kers = cntx_packm_kers; + ic0 = 0; + ic_inc = panel_dim_max; + ip0 = 0; + ip_inc = 1; } -#endif - // Query the datatype-specific function pointer from the func_t object. - packm_ker = bli_func_get_dt( dt_p, packm_kers ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_p]; - - // Invoke the function. - f( strucc, - diagoffc, - diagc, - uploc, - transc, - schema, - invdiag, - revifup, - reviflo, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - packm_ker, - cntx, - t ); -} + // Query the number of threads and thread ids from the current thread's + // packm thrinfo_t node. + const dim_t nt = bli_thread_n_way( thread ); + const dim_t tid = bli_thread_work_id( thread ); + // Determine the thread range and increment using the current thread's + // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + // will depend on whether slab or round-robin partitioning was requested + // at configure-time. 
+ dim_t it_start, it_end, it_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); -#undef GENTFUNCR -#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \ -\ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - dim_t panel_off_i; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ - inc_t is_p_use; \ -\ - ctype* restrict c_use; \ - ctype* restrict p_use; \ - doff_t diagoffp_i; \ -\ -\ - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ \ - if ( bli_is_zeros( uploc ) && \ - bli_is_triangular( strucc ) ) return; \ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. 
*/ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_uplo( &uploc ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t )panel_dim_max; \ - ldp = rs_p; \ - m_panel_full = &m; \ - n_panel_full = &panel_dim_i; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim_max; \ - ldp = cs_p; \ - m_panel_full = &panel_dim_i; \ - n_panel_full = &n; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. 
*/ \ - if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ - ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ - { \ - ic0 = (n_iter - 1) * panel_dim_max; \ - ic_inc = -panel_dim_max; \ - ip0 = n_iter - 1; \ - ip_inc = -1; \ - } \ - else \ - { \ - ic0 = 0; \ - ic_inc = panel_dim_max; \ - ip0 = 0; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - dim_t it_start, it_end, it_inc; \ -\ - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - /* Iterate over every logical micropanel in the source matrix. */ \ - for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ -\ - if ( bli_is_triangular( strucc ) && \ - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is completely unstored (ie: zero). If the panel - is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ \ -\ - continue; \ - } \ - else if ( bli_is_triangular( strucc ) && \ - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is diagonal-intersecting. 
Notice that we - cannot bury the following conditional logic into - packm_struc_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( col_stored && diagoffc_i < 0 ) || \ - ( row_stored && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ - panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \ - panel_len_max ); \ - diagoffp_i = diagoffc_i; \ - } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ - { \ - panel_off_i = bli_abs( diagoffc_i ); \ - panel_len_i = panel_len_full - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp_i = 0; \ - } \ -\ - c_use = c_begin + (panel_off_i )*ldc; \ - p_use = p_begin; \ -\ - /* We need to re-compute the imaginary stride as a function of - panel_len_max_i since triangular packed matrices have panels - of varying lengths. NOTE: This imaginary stride value is - only referenced by the packm kernels for induced methods. */ \ - is_p_use = ldp * panel_len_max_i; \ -\ - /* We nudge the imaginary stride up by one if it is odd. */ \ - is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ -\ - /* NOTE: We MUST use round-robin partitioning when packing - micropanels of a triangular matrix. Hermitian/symmetric - and general packing may use slab or round-robin, depending - on which was selected at configure-time. 
*/ \ - if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - /* NOTE: This value is usually LESS than ps_p because triangular - matrices usually have several micro-panels that are shorter - than a "full" micro-panel. */ \ - p_inc = is_p_use; \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* This case executes if the panel belongs to a Hermitian or - symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffc_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - p_inc = ps_p; \ - } \ - else \ - { \ - /* This case executes if the panel is general, or, if the - panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. 
*/ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( BLIS_GENERAL, \ - 0, \ - diagc, \ - BLIS_DENSE, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - /* NOTE: This value is equivalent to ps_p. */ \ - p_inc = ps_p; \ - } \ -\ - p_begin += p_inc; \ -\ - } \ -} + char* p_begin = p_cast; -INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) + // Iterate over every logical micropanel in the source matrix. + for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) + { + dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_off_i = panel_dim_off + ic; + + doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; + char* c_begin = c_cast + (ic )*incc*dt_c_size; + + inc_t p_inc = ps_p; + + // NOTE: We MUST use round-robin partitioning when packing + // micropanels of a triangular matrix. Hermitian/symmetric + // and general packing may use slab or round-robin, depending + // on which was selected at configure-time. + // The definition of bli_packm_my_iter() will depend on whether slab + // or round-robin partitioning was requested at configure-time. + bool my_iter = bli_is_triangular( strucc ) + ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) + : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); + + if ( bli_is_triangular( strucc ) && + bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) + { + // This case executes if the panel belongs to a triangular + // matrix AND is completely unstored (ie: zero). If the panel + // is unstored, we do nothing. (Notice that we don't even + // increment p_begin.) 
+ continue; + } + else if ( bli_is_triangular( strucc ) && + bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) + { + // This case executes if the panel belongs to a triangular + // matrix AND is diagonal-intersecting. Notice that we + // cannot bury the following conditional logic into + // packm_struc_cxk() because we need to know the value of + // panel_len_max_i so we can properly increment p_inc. + + // Sanity check. Diagonals should not intersect the short end of + // a micro-panel. If they do, then somehow the constraints on + // cache blocksizes being a whole multiple of the register + // blocksizes was somehow violated. + if ( diagoffc_i < 0 ) + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + dim_t panel_off_i; + dim_t panel_len_i; + dim_t panel_len_max_i; + + if ( bli_is_lower( uploc ) ) + { + panel_off_i = 0; + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; + panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, + panel_len_max ); + } + else // if ( bli_is_upper( uploc ) ) + { + panel_off_i = bli_abs( diagoffc_i ); + panel_len_i = panel_len_full - panel_off_i; + panel_len_max_i = panel_len_max - panel_off_i; + } + + dim_t panel_len_off_i = panel_off_i + panel_len_off; + + char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; + char* p_use = p_begin; + + // We need to re-compute the imaginary stride as a function of + // panel_len_max_i since triangular packed matrices have panels + // of varying lengths. NOTE: This imaginary stride value is + // only referenced by the packm kernels for induced methods. + inc_t is_p_use = ldp * panel_len_max_i; + + // We nudge the imaginary stride up by one if it is odd. + is_p_use += ( bli_is_odd( is_p_use ) ? 
1 : 0 ); + + if ( my_iter ) + { + packm_ker_cast( strucc, + diagc, + uploc, + conjc, + schema, + invdiag, + panel_dim_i, + panel_len_i, + panel_dim_max, + panel_len_max_i, + panel_dim_off_i, + panel_len_off_i, + kappa_cast, + c_use, incc, ldc, + p_use, ldp, + is_p_use, + cntx, + params ); + } + + // NOTE: This value is usually LESS than ps_p because triangular + // matrices usually have several micro-panels that are shorter + // than a "full" micro-panel. + p_inc = is_p_use; + } + else + { + // This case executes if the panel is either dense, or belongs + // to a Hermitian or symmetric matrix, which includes stored, + // unstored, and diagonal-intersecting panels. + + if ( my_iter ) + { + packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, + diagc, + uploc, + conjc, + schema, + invdiag, + panel_dim_i, + panel_len_full, + panel_dim_max, + panel_len_max, + panel_dim_off_i, + panel_len_off, + kappa_cast, + c_begin, incc, ldc, + p_begin, ldp, is_p, + cntx, + params ); + } + } + p_begin += p_inc*dt_p_size; + } +} -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -*/ -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ -/* -if ( col_stored ) { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ 
- ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ -else { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ -*/ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ -*/ -/* - if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, 
"packm_var2: b_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - inc_t is_b = rs_p * *m_panel_max; \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ -/* - if ( col_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h new file mode 100644 index 0000000000..9cda5828b5 --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// packm params types. +// + +typedef struct +{ + // Type of C Type of P + packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; +} packm_blk_var1_params_t; + +// +// Prototype object-based interfaces. +// + +BLIS_EXPORT_BLIS void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* t + ); + diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c deleted file mode 100644 index a7c694e4fc..0000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_GEMM_MD - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - trans_t transc, - pack_t schema, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md); - - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ) -{ - num_t dt_c = bli_obj_dt( c ); - num_t dt_p = bli_obj_dt( p ); - - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); - - obj_t kappa; - void* buf_kappa; - - FUNCPTR_T f; - - - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. - if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. 
If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); - } - - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_c][dt_p]; - - // Invoke the function.
- f( - transc, - schema, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - cntx, - t ); -} - - -#undef GENTFUNC2 -#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - ctype_p* restrict kappa_cast = kappa; \ - ctype_c* restrict c_cast = c; \ - ctype_p* restrict p_cast = p; \ - ctype_c* restrict c_begin; \ - ctype_p* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - doff_t ic_inc, ip_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t p_inc; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ -\ - ctype_c* restrict c_use; \ - ctype_p* restrict p_use; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - ( void )col_stored; \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - vs_c = cs_c; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - vs_c = rs_c; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - { \ - ic_inc = panel_dim_max; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - /* Suppress unused variable warnings when slab partitioning is enabled, - since the slab-based definition of bli_packm_my_iter() does not - actually use tid or nt. */ \ - ( void )nt; ( void )tid; \ -\ - dim_t it_start, it_end, it_inc; \ -\ - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. 
*/ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - for ( ic = 0, ip = 0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - c_begin = c_cast + (ic )*vs_c; \ -\ - { \ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - PASTEMAC2(chc,chp,packm_struc_cxk_md) \ - ( \ - conjc, \ - schema, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p, \ - cntx \ - ); \ - } \ -\ - p_inc = ps_p; \ - } \ -\ -/* -if ( row_stored ) \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ - p_begin += p_inc; \ -\ - } \ -} - -INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md ) -INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md ) - -#endif diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h deleted file mode 100644 index e6bf151d07..0000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ); - - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( packm_blk_var1_md ) -INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md ) - diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index fc6ba8052c..e99ed9cf3d 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -35,11 +35,10 @@ #include "blis.h" -cntl_t* bli_packm_cntl_create_node +BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, @@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); - params->var_func = packm_var_func; params->bmid_m = bmid_m; params->bmid_n = bmid_n; params->does_invert_diag = does_invert_diag; diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 17aa196e8d..14bfe1ce85 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -36,7 +36,6 @@ struct packm_params_s { uint64_t size; // size field must be present and come first. 
- packm_var_oft var_func; bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; @@ -47,11 +46,6 @@ struct packm_params_s }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func; -} - BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; @@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 739fd5f1d2..5a7d716fe6 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -35,12 +35,14 @@ #include "blis.h" -siz_t bli_packm_init +bool bli_packm_init ( - obj_t* a, + obj_t* c, obj_t* p, cntx_t* cntx, - cntl_t* cntl + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -51,139 +53,27 @@ siz_t bli_packm_init // suitable block of memory from the memory allocator (if such a block // of memory has not already been allocated previously). - bszid_t bmult_id_m; - bszid_t bmult_id_n; - bool does_invert_diag; - bool rev_iter_if_upper; - bool rev_iter_if_lower; - pack_t schema; - //packbuf_t pack_buf_type; - siz_t size_needed; - // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_packm_init_check( a, p, cntx ); - - // Extract various fields from the control tree. 
- bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); - bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); - does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); - rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); - rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - schema = bli_cntl_packm_params_pack_schema( cntl ); - //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); - -#if 0 - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can alias the object and return. - // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - bli_obj_alias_to( a, p ); - return 0; - } + bli_packm_init_check( c, p, cntx ); - // Now we check if the object has already been packed to the desired - // schema (as encoded in the control tree). If so, we can alias and - // return 0. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( a ) == pack_schema ) - { - bli_obj_alias_to( a, p ); - return 0; - } -#endif + // We begin by copying the fields of A. + bli_obj_alias_to( c, p ); // If the object is marked as being filled with zeros, then we can skip // the packm operation entirely and alias. 
- if ( bli_obj_is_zeros( a ) ) - { - bli_obj_alias_to( a, p ); - return 0; - } - - // Prepare a few other variables based on properties of the control - // tree. - - invdiag_t invert_diag; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - - if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; - - if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; - - if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; - - // Initialize object p for the final packed matrix. - size_needed - = - bli_packm_init_pack - ( - invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - bmult_id_m, - bmult_id_n, - a, - p, - cntx - ); - - // Return the size needed for memory allocation of the packed buffer. - return size_needed; -} + if ( bli_obj_is_zeros( c ) ) + return false; - -siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx - ) -{ - bli_init_once(); - - num_t dt_tar = bli_obj_target_dt( a ); - num_t dt_scalar = bli_obj_scalar_dt( a ); - trans_t transa = bli_obj_onlytrans_status( a ); - dim_t m_a = bli_obj_length( a ); - dim_t n_a = bli_obj_width( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); - - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - - - // We begin by copying the fields of A. - bli_obj_alias_to( a, p ); + // Extract various fields from the control tree. 
+ bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + num_t dt_tar = bli_obj_target_dt( c ); + num_t dt_scalar = bli_obj_scalar_dt( c ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we @@ -195,51 +85,21 @@ siz_t bli_packm_init_pack // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); + bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); - // Update the dimension fields to explicitly reflect a transposition, - // if needed. - // Then, clear the conjugation and transposition fields from the object - // since matrix packing in BLIS is deemed to take care of all conjugation - // and transposition necessary. - // Then, we adjust the properties of P when A needs a transposition. - // We negate the diagonal offset, and if A is upper- or lower-stored, - // we either toggle the uplo of P. - // Finally, if we mark P as dense since we assume that all matrices, - // regardless of structure, will be densified. - bli_obj_set_dims_with_trans( transa, m_a, n_a, p ); - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p ); - if ( bli_does_trans( transa ) ) - { - bli_obj_negate_diag_offset( p ); - if ( bli_obj_is_upper_or_lower( a ) ) - bli_obj_toggle_uplo( p ); - } + // Store the pack schema to the object. + bli_obj_set_pack_schema( schema, p ); - // If we are packing micropanels, mark P as dense. 
Otherwise, we are - // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of P (inherited - // from A) with BLIS_DENSE because that information may be needed by - // the level-2 operation's unblocked variant to decide whether to - // execute a "lower" or "upper" branch of code. - if ( bli_is_panel_packed( schema ) ) - { - bli_obj_set_uplo( BLIS_DENSE, p ); - } + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); + + // Since we are packing micropanels, mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); - // Set the invert diagonal field. - bli_obj_set_invert_diag( invert_diag, p ); - - // Set the pack status of P to the pack schema prescribed in the control - // tree node. - bli_obj_set_pack_schema( schema, p ); - - // Set the packing order bits. - bli_obj_set_pack_order_if_upper( pack_ord_if_up, p ); - bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. @@ -247,10 +107,10 @@ siz_t bli_packm_init_pack // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. - m_p = bli_obj_length( p ); - n_p = bli_obj_width( p ); - m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); - n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); // Save the padded dimensions into the packed object. 
It is important // to save these dimensions since they represent the actual dimensions @@ -258,177 +118,70 @@ siz_t bli_packm_init_pack bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. The caller - // will then use that value to acquire an appropriate block of memory - // from the memory allocator. + // total number of bytes needed for the packed buffer. Then we use + // that value to acquire an appropriate block of memory from the + // memory allocator. // Extract the element size for the packed object. - elem_size_p = bli_obj_elem_size( p ); - - // Set the row and column strides of p based on the pack schema. - if ( bli_is_row_packed( schema ) && - !bli_is_panel_packed( schema ) ) - { - // For regular row storage, the padded width of our matrix - // should be used for the row stride, with the column stride set - // to one. By using the WIDTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the right edge of - // the matrix. - rs_p = n_p_pad; - cs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc rows begin at - // aligned addresses. - rs_p = bli_align_dim_to_size( rs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer. - size_p = m_p_pad * rs_p * elem_size_p; - } - else if ( bli_is_col_packed( schema ) && - !bli_is_panel_packed( schema ) ) - { - // For regular column storage, the padded length of our matrix - // should be used for the column stride, with the row stride set - // to one. By using the LENGTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the bottom edge of - // the matrix. 
- cs_p = m_p_pad; - rs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc columns begin at - // aligned addresses. - cs_p = bli_align_dim_to_size( cs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer. - size_p = cs_p * n_p_pad * elem_size_p; - } - else if ( bli_is_row_packed( schema ) && - bli_is_panel_packed( schema ) ) - { - dim_t m_panel; - dim_t ps_p; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the m dimension. - m_panel = bmult_m_def; - - // The "column stride" of a row-micropanel packed object is interpreted - // as the column stride WITHIN a micropanel. Thus, this is equal to the - // packing (storage) blocksize multiple, which may be equal to the - // default (logical) blocksize multiple). - cs_p = bmult_m_pack; - - // The "row stride" of a row-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. Thus, it is unit. - rs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded width computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the right edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the m - // dimension of the matrix is not a whole multiple of MR. - ps_p = cs_p * n_p_pad; - - // As a general rule, we don't want micropanel strides to be odd. - // NOTE: This safety feature *may* not be necessary anymore, but was - // definitely needed to support certain variations of the 3m method. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Set the imaginary stride (in units of fundamental elements). 
- // This is the number of real elements that must be traversed before - // reaching the imaginary part of the packed micropanel. NOTE: the - // imaginary stride is mostly vestigial and left over from the 3m - // and 4m implementations. - is_p = 1; - - // Store the strides and panel dimension in P. - bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( m_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_panel, p ); - bli_obj_set_panel_width( n_p, p ); - - // Compute the size of the packed buffer. - size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; - } - else if ( bli_is_col_packed( schema ) && - bli_is_panel_packed( schema ) ) - { - dim_t n_panel; - dim_t ps_p; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the n dimension. - n_panel = bmult_n_def; - - // The "row stride" of a column-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. Thus, this is equal to the - // packing (storage) blocksize multiple (which may be equal to the - // default (logical) blocksize multiple. - rs_p = bmult_n_pack; - - // The "column stride" of a column-micropanel packed object is - // interpreted as the column stride WITHIN a micropanel. Thus, it is - // unit. - cs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded length computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the bottom edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the n - // dimension of the matrix is not a whole multiple of NR. - ps_p = m_p_pad * rs_p; - - // As a general rule, we don't want micropanel strides to be odd. 
- // NOTE: This safety feature *may* not be necessary anymore, but was - // definitely needed to support certain variations of the 3m method. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Set the imaginary stride (in units of fundamental elements). - // This is the number of real elements that must be traversed before - // reaching the imaginary part of the packed micropanel. NOTE: the - // imaginary stride is mostly vestigial and left over from the 3m - // and 4m implementations. - is_p = 1; - - // Store the strides and panel dimension in P. - bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( n_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_p, p ); - bli_obj_set_panel_width( n_panel, p ); - - // Compute the size of the packed buffer. - size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p; - } - else - { - // NOTE: When implementing block storage, we only need to implement - // the following two cases: - // - row-stored blocks in row-major order - // - column-stored blocks in column-major order - // The other two combinations coincide with that of packed row-panel - // and packed column- panel storage. - - size_p = 0; - } - - return size_p; + siz_t elem_size_p = bli_obj_elem_size( p ); + + // The panel dimension (for each datatype) should be equal to the + // default (logical) blocksize multiple in the m dimension. + dim_t m_panel = bmult_m_def; + + // The "column stride" of a row-micropanel packed object is interpreted + // as the column stride WITHIN a micropanel. Thus, this is equal to the + // packing (storage) blocksize multiple, which may be equal to the + // default (logical) blocksize multiple). + inc_t cs_p = bmult_m_pack; + + // The "row stride" of a row-micropanel packed object is interpreted + // as the row stride WITHIN a micropanel. Thus, it is unit. 
+ inc_t rs_p = 1; + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + inc_t ps_p = cs_p * n_p_pad; + + // As a general rule, we don't want micropanel strides to be odd. There + // are very few instances where this can happen, but we've seen it happen + // more than zero times (such as for certain small problems), and so we + // check for it here. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + + // Set the imaginary stride (in units of fundamental elements). + // This is the number of real elements that must be traversed before + // reaching the imaginary part of the packed micropanel. NOTE: the + // imaginary stride is mostly vestigial and left over from the 3m + // and 4m implementations. + inc_t is_p = 1; + + // Store the strides and panel dimension in P. + bli_obj_set_strides( rs_p, cs_p, p ); + bli_obj_set_imag_stride( is_p, p ); + bli_obj_set_panel_dim( m_panel, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( m_panel, p ); + bli_obj_set_panel_width( n_p, p ); + + // Compute the size of the packed buffer. + siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; + + // If the requested size is zero, then we don't need to do any allocation. + if ( size_p == 0 ) + return false; + + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). 
+ void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + bli_obj_set_buffer( buffer, p ); + + return true; } diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 9365a131ef..152c6f15cd 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,24 +32,13 @@ */ -siz_t bli_packm_init +BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 6dc9ec85af..c9a2bb9db2 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,59 +39,19 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_init_once(); - packm_var_oft f; + // Extract the function pointer from the object. + packm_var_oft f = bli_obj_pack_fn( a ); - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_packm_int_check( a, p, cntx ); - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( a ) ) bli_abort(); - - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can return, since by now aliasing has already - // taken place in packm_init(). 
- // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - return; - } - - // At this point, we can be assured that cntl is not NULL. Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can return, as above. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) ) - { - return; - } - - // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely. - if ( bli_obj_is_zeros( a ) ) - { - return; - } - - // Extract the function pointer from the current control tree node. - f = bli_cntl_packm_params_var_func( cntl ); + // Barrier so that we know threads are done with previous computation + // with the same packing buffer before starting to pack. + bli_thread_barrier( thread ); // Invoke the variant with kappa_use. f @@ -99,8 +59,12 @@ void bli_packm_int a, p, cntx, + rntm, cntl, thread ); + + // Barrier so that packing is done before computation. 
+ bli_thread_barrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 573a299d67..16a5c2c34d 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,6 +37,7 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/1m/packm/bli_packm_scalar.c similarity index 53% rename from frame/3/trsm/bli_trsm_packab.c rename to frame/1m/packm/bli_packm_scalar.c index 841230d80d..f613028c93 100644 --- a/frame/3/trsm/bli_trsm_packab.c +++ b/frame/1m/packm/bli_packm_scalar.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,83 +35,42 @@ #include "blis.h" -void bli_trsm_packa - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) +void* bli_packm_scalar( obj_t* kappa, obj_t* p ) { - obj_t a_pack; + num_t dt_p = bli_obj_dt( p ); + pack_t schema = bli_obj_pack_schema( p ); - // Pack matrix A according to the control tree node. - bli_l3_packm - ( - a, - &a_pack, - cntx, - rntm, - cntl, - thread - ); + // The value for kappa we use will depend on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing induced complex domain algorithms in terms of + // real domain micro-kernels. (In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.)
+ if ( bli_obj_scalar_has_nonzero_imag( p ) && + !bli_is_nat_packed( schema ) ) + { + //printf( "applying non-zero imag kappa\n_p" ); - // Proceed with execution using packed matrix A. - bli_trsm_int - ( - &BLIS_ONE, - &a_pack, - b, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} - -// ----------------------------------------------------------------------------- - -void bli_trsm_packb - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t b_pack; + // Detach the scalar. + bli_obj_scalar_detach( p, kappa ); - // Pack matrix B according to the control tree node. - bli_l3_packm - ( - b, - &b_pack, - cntx, - rntm, - cntl, - thread - ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - // Proceed with execution using packed matrix B. - bli_trsm_int - ( - &BLIS_ONE, - a, - &b_pack, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); + return bli_obj_buffer_for_1x1( dt_p, kappa ); + } + // This branch is also for native execution, where we assume that + // the micro-kernel will always apply the alpha scalar of the + // higher-level operation. Thus, we use BLIS_ONE for kappa so + // that the underlying packm implementation does not perform + // any scaling during packing. + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. 
+ return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); + } } diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_scalar.h similarity index 95% rename from frame/1m/packm/bli_packm_md.h rename to frame/1m/packm/bli_packm_scalar.h index bb9d6d6135..3745accf9d 100644 --- a/frame/1m/packm/bli_packm_md.h +++ b/frame/1m/packm/bli_packm_scalar.h @@ -32,6 +32,5 @@ */ -#include "bli_packm_blk_var1_md.h" -#include "bli_packm_struc_cxk_md.h" +BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index a3b2d66e63..2a52c42def 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -40,57 +40,24 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -118,23 +85,21 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk) \ ( \ strucc, \ - diagoffc, \ + diagc, \ uploc, \ conjc, \ schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ + invdiag, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx \ ); \ } \ @@ -145,130 +110,24 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ schema, \ invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx \ ); \ } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. - This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. 
*/ \ -\ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_col_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ - else if ( bli_is_row_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ } INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) @@ -282,42 +141,31 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t i, j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t i, j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -325,10 +173,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -350,7 +198,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -370,14 +218,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -393,8 +239,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -402,16 +248,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -486,8 +331,8 @@ void PASTEMAC(ch,varname) \ transc, \ p11_m, \ p11_n, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, \ + c11, incc, ldc, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -503,7 +348,7 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC(ch,seti0s)( *pi11 ); \ \ - pi11 += rs_p + cs_p; \ + pi11 += 1 + ldp; \ } \ } \ \ @@ -519,7 +364,7 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ kappa, \ - p11, rs_p, cs_p, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -539,28 +384,26 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - 
dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. */ \ PASTEMAC(ch,kername) \ ( \ @@ -584,11 +427,11 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ kappa, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -599,10 +442,10 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ ( \ - diagoffp, \ - m_panel, \ - n_panel, \ - p, rs_p, cs_p, \ + diagoffc, \ + panel_dim, \ + panel_len, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -621,23 +464,53 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ } \ \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. 
Those + 1.0's end up getting multiplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t i = panel_dim; \ + dim_t j = panel_len; \ + dim_t m_br = panel_dim_max - i; \ + dim_t n_br = panel_len_max - j; \ + ctype* p_br = p + (i ) + (j )*ldp; \ +\ + PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + one, \ + p_br, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ } INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index 08afb19bde..973a02612b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -38,84 +38,25 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_struc_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \
- struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ dim_t panel_dim_max, \ - dim_t panel_len, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROT_BASIC0( packm_struc_cxk ) +INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c index a66ba5ff6b..b3be9dff95 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.c +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -40,57 +40,25 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -108,7 +76,7 @@ void PASTEMAC(ch,varname) \ kappa, \ c, incc, ldc, \ p, ldp, \ - cntx \ + cntx \ ); \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ @@ -118,24 +86,23 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_1er) \ ( \ strucc, \ - diagoffc, \ + diagc, \ uploc, \ conjc, \ schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ + invdiag, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -145,125 +112,25 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_1er) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ schema, \ invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. 
- This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. */ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = m_panel; \ - dim_t offn = 0; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = 0; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx, \ + params \ ); \ } \ -*/ \ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this micro-panel is an edge case in both panel dimension and - length, then it must be a bottom-right corner case, which - typically only happens for micro-panels being packed for trsm. - (It also happens for trmm if kr > 1.) Here, we set the part of - the diagonal that extends into the zero-padded region to - identity. This prevents NaNs and Infs from creeping into the - computation. If this code does execute for trmm, it is okay, - because those 1.0's that extend into the bottom-right region - end up getting muliplied by the 0.0's in the zero-padded region - of the other matrix. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t offm = m_panel; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn_diag) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - one, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_1r_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ - \ - if ( bli_is_1e_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ } INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) @@ -277,42 +144,32 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -320,10 +177,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -345,7 +202,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -366,14 +223,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -389,8 +244,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -398,16 +253,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -478,8 +332,8 @@ void PASTEMAC(ch,varname) \ conjc, \ panel_dim, \ kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, ldp \ + c11, incc, ldc, \ + p11, 1, ldp, ldp \ ); \ \ /* If we are packing a micro-panel with Hermitian structure, @@ -495,8 +349,8 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ { \ ctype_r* restrict c11_r = ( ctype_r* )c11; \ - const dim_t rs_c2 = 2*rs_c; \ - const dim_t cs_c2 = 2*cs_c; \ + const dim_t incc2 = 2*incc; \ + const dim_t ldc2 = 2*ldc; \ \ PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ ( \ @@ -504,8 +358,8 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - c11_r, rs_c2, cs_c2, \ - p11, rs_p, cs_p, ldp \ + c11_r, 
incc2, ldc2, \ + p11, 1, ldp, ldp \ ); \ } \ } \ @@ -523,30 +377,28 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffp_abs = bli_abs( diagoffp ); \ - ctype* p11 = p + (diagoffp_abs )*ldp; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs = bli_abs( diagoffc ); \ + ctype* p11 = p + (diagoffc_abs )*ldp; \ \ \ /* Pack the panel. */ \ @@ -579,7 +431,7 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -594,7 +446,7 @@ void PASTEMAC(ch,varname) \ 0, \ panel_dim, \ panel_dim, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -610,11 +462,11 @@ void PASTEMAC(ch,varname) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ uplo_t uplop = uploc; \ - doff_t diagoffp11_0 = 0; \ + doff_t diagoffc11_0 = 0; \ dim_t p11_0_dim = panel_dim - 1; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \ \ /* Note that this macro works a little differently than the setm operation. 
Here, we pass in the dimensions of only p11, rather @@ -622,20 +474,51 @@ void PASTEMAC(ch,varname) \ "shrunken" dimensions of p11, corresponding to the toggling and shrinking of the diagonal above. The macro will do the right thing, incrementing the pointer to p11 by the appropriate - leading dimension (cs_p or rs_p), and setting only the lower + leading dimension (ldp or rs_p), and setting only the lower or upper triangle to zero. */ \ PASTEMAC(ch,set1ms_mxn_uplo) \ ( \ schema, \ - diagoffp11_0, \ + diagoffc11_0, \ uplop, \ p11_0_dim, \ p11_0_dim, \ zero, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ } \ +\ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting multiplied by the 0.0's in the zero-padded region + of the other matrix. 
*/ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = panel_dim; \ + dim_t offn = panel_len; \ + dim_t m_edge = panel_dim_max - panel_dim; \ + dim_t n_edge = panel_len_max - panel_len; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, 1, ldp, ldp \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h index 6e62d8f69e..a953e93673 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.h +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -38,84 +38,26 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t 
conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ dim_t panel_dim_max, \ - dim_t panel_len, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 52a1f9817f..650b6178c9 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -41,53 +41,26 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ if ( bli_is_nat_packed( schema ) ) \ { \ /* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha @@ -318,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ @@ -445,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h index 72ca67937f..f493838b3a 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.h +++ b/frame/1m/packm/bli_packm_struc_cxk_md.h @@ -37,17 +37,24 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) diff --git 
a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c deleted file mode 100644 index 6e72b3e9d0..0000000000 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); - - -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_cp = bli_obj_dt( c ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_kappa; - - FUNCPTR_T f; - - - // This variant assumes that the computational kernel will always apply - // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE - // for kappa so that the underlying packm implementation does not scale - // during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_cp]; - - if( bli_thread_am_ochief( thread ) ) { - // Invoke the function. 
- f - ( - strucc, - diagoffc, - diagc, - uploc, - transc, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - cntx - ); - } -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - /* We begin by packing the region indicated by the parameters. If - matrix c is dense (either because the structure is general or - because the structure has already been "densified"), this ends - up being the only action we take. Note that if kappa is unit, - the data is simply copied (rather than scaled by one). */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - diagc, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ -\ - /* If uploc is upper or lower, then the structure of c is necessarily - non-dense (ie: Hermitian, symmetric, or triangular, where part of the - matrix is unstored). In these cases, we want to fill in the unstored - part of the matrix. How this is done depends on the structure of c. */ \ - if ( bli_is_upper_or_lower( uploc ) ) \ - { \ - /* The Hermitian and symmetric cases are almost identical, so we - handle them in one conditional block. */ \ - if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \ - { \ - /* First we must reflect the region referenced to the opposite - side of the diagonal. 
*/ \ - c_cast = c_cast + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_trans( &transc ); \ - if ( bli_is_upper( uploc ) ) diagoffc += 1; \ - else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \ -\ - /* If c is Hermitian, we need to apply a conjugation when - copying the region opposite the diagonal. */ \ - if ( bli_is_hermitian( strucc ) ) \ - transc = bli_trans_toggled_conj( transc ); \ -\ - /* Copy the data from the region opposite the diagonal of c - (as specified by the original value of diagoffc). Notice - that we use a diag parameter of non-unit since we can - assume nothing about the neighboring off-diagonal. */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - else /* if ( bli_is_triangular( strucc ) ) */ \ - { \ - doff_t diagoffp = diagoffc; \ - uplo_t uplop = uploc; \ -\ - /* For this step we need the uplo and diagonal offset of p, which - we can derive from the parameters given. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_negate_diag_offset( &diagoffp ); \ - bli_toggle_uplo( &uplop ); \ - } \ -\ - /* For triangular matrices, we wish to reference the region - strictly opposite the diagonal of C. This amounts to - toggling uploc and then shifting the diagonal offset to - shrink the stored region (by one diagonal). */ \ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ -\ - /* Set the region opposite the diagonal of p to zero. 
*/ \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - m, \ - n, \ - zero, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - eithe region exists, we set them to zero. This simplifies the - register level micro kernel in that it does not need to support - different register blockings for the edge cases. */ \ - if ( m != m_max ) \ - { \ - ctype* p_edge = p_cast + (m )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max - m, \ - n_max, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n != n_max ) \ - { \ - ctype* p_edge = p_cast + (n )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max, \ - n_max - n, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_unb_var1 ) - diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h deleted file mode 100644 index 8960c8661a..0000000000 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index b32d02d9ba..5e45428410 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -36,8 +36,6 @@ #include "bli_unpackm_check.h" #include "bli_unpackm_int.h" -#include "bli_unpackm_unb_var1.h" - #include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c deleted file mode 100644 index c1033c2cb9..0000000000 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( - doff_t diagoffp, - uplo_t uplop, - trans_t transp, - dim_t m, - dim_t n, - void* p, inc_t rs_p, inc_t cs_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); - - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_pc = bli_obj_dt( p ); - - doff_t diagoffp = bli_obj_diag_offset( p ); - uplo_t uplop = bli_obj_uplo( p ); - trans_t transc = bli_obj_onlytrans_status( c ); - - dim_t m_c = bli_obj_length( c ); - dim_t n_c = bli_obj_width( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_pc]; - - // Invoke the function. 
- f( diagoffp, - uplop, - transc, - m_c, - n_c, - buf_p, rs_p, cs_p, - buf_c, rs_c, cs_c, - cntx - ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* p_cast = p; \ - ctype* c_cast = c; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffp,\ - BLIS_NONUNIT_DIAG, \ - uplop, \ - transp, \ - m, \ - n, \ - p_cast, rs_p, cs_p, \ - c_cast, rs_c, cs_c, \ - cntx, \ - NULL \ - ); \ -} - -INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 ) - diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h deleted file mode 100644 index 5119aaa7ff..0000000000 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_unb_var1 ) - diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index da93488444..4dc1a9d545 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -35,6 +35,8 @@ #include "bli_l3_cntl.h" #include "bli_l3_check.h" +#include "bli_l3_int.h" +#include "bli_l3_packab.h" // Define function types. //#include "bli_l3_ft_ex.h" @@ -45,7 +47,6 @@ #include "bli_l3_blocksize.h" #include "bli_l3_direct.h" #include "bli_l3_prune.h" -#include "bli_l3_packm.h" #include "bli_l3_schema.h" // Prototype object APIs (basic and expert). diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 50da4627ca..3e7882bc39 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -53,7 +53,7 @@ void bli_gemm_check // Check object structure. 
// NOTE: Can't perform these checks as long as bli_gemm_check() is called - // from bli_gemm_int(), which is in the execution path for structured + // from bli_l3_int(), which is in the execution path for structured // level-3 operations such as hemm. //e_val = bli_check_general_object( a ); @@ -109,7 +109,7 @@ void bli_hemm_check } void bli_herk_check - ( + ( obj_t* alpha, obj_t* a, obj_t* beta, @@ -197,7 +197,7 @@ void bli_symm_check } void bli_syrk_check - ( + ( obj_t* alpha, obj_t* a, obj_t* beta, diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/bli_l3_int.c similarity index 74% rename from frame/3/trsm/bli_trsm_int.c rename to frame/3/bli_l3_int.c index 53a22c3556..d4b974030c 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/bli_l3_int.c @@ -34,7 +34,7 @@ #include "blis.h" -void bli_trsm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, @@ -47,10 +47,9 @@ void bli_trsm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - trsm_var_oft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; // Return early if the current control tree node is NULL. if ( bli_cntl_is_null( cntl ) ) return; @@ -60,72 +59,82 @@ void bli_trsm_int bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) return; + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); + bli_scalm( beta, c ); bli_thread_barrier( thread ); return; } - // Alias A and B in case we need to update attached scalars. + // If A or B is marked as being filled with zeros, scale C by beta and + // return early. + if ( bli_obj_is_zeros( a ) || + bli_obj_is_zeros( b ) ) + { + // This should never execute. 
+ bli_abort(); + + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_barrier( thread ); + return; + } + + // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); - - // Alias C in case we need to induce a transposition. bli_obj_alias_to( c, &c_local ); + // Ensure that a valid packing function is set on A and B. + if ( !bli_obj_pack_fn( &a_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local ); + + if ( !bli_obj_pack_fn( &b_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local ); + // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. - if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + //if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + if ( bli_obj_has_trans( c ) ) { bli_obj_induce_trans( &c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); } - // If beta is non-unit, apply it to the scalar attached to C. - if ( !bli_obj_equals( beta, &BLIS_ONE ) ) + // If alpha is non-unit, typecast and apply it to the scalar attached + // to B, unless it happens to be triangular. + if ( bli_obj_root_is_triangular( b ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); - } - - // Set two bools: one based on the implied side parameter (the structure - // of the root object) and one based on the uplo field of the triangular - // matrix's root object (whether that is matrix A or matrix B). - if ( bli_obj_root_is_triangular( a ) ) - { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &b_local ); - } + bli_obj_scalar_apply_scalar( alpha, &a_local ); } else // if ( bli_obj_root_is_triangular( b ) ) { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &a_local ); - } + bli_obj_scalar_apply_scalar( alpha, &b_local ); } - // FGVZ->TMS: Is this barrier still needed? - bli_thread_barrier( thread ); + // If beta is non-unit, typecast and apply it to the scalar attached + // to C. + if ( !bli_obj_equals( beta, &BLIS_ONE ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); + l3_var_oft f = bli_cntl_var_func( cntl ); // Invoke the variant. f diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/bli_l3_int.h similarity index 99% rename from frame/3/gemm/bli_gemm_int.h rename to frame/3/bli_l3_int.h index 2bbe5480a6..d76b0ac3e2 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/bli_l3_int.h @@ -32,7 +32,7 @@ */ -void bli_gemm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index 1456f8eff3..ea10d80904 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ thrinfo_t* thread \ ); -GENTDEF( gemm ) - - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( trsm ) +GENTDEF( l3 ) diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/bli_l3_packab.c similarity index 80% rename from frame/3/gemm/bli_gemm_packab.c rename to 
frame/3/bli_l3_packab.c index a15192994e..d911819429 100644 --- a/frame/3/gemm/bli_gemm_packab.c +++ b/frame/3/bli_l3_packab.c @@ -34,7 +34,7 @@ #include "blis.h" -void bli_gemm_packa +void bli_l3_packa ( obj_t* a, obj_t* b, @@ -45,12 +45,19 @@ void bli_gemm_packa thrinfo_t* thread ) { - obj_t a_pack; + obj_t a_local, a_pack; + + bli_obj_alias_to( a, &a_local ); + if ( bli_obj_has_trans( a ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } // Pack matrix A according to the control tree node. - bli_l3_packm + bli_packm_int ( - a, + &a_local, &a_pack, cntx, rntm, @@ -59,7 +66,7 @@ void bli_gemm_packa ); // Proceed with execution using packed matrix A. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a_pack, @@ -75,7 +82,7 @@ void bli_gemm_packa // ----------------------------------------------------------------------------- -void bli_gemm_packb +void bli_l3_packb ( obj_t* a, obj_t* b, @@ -86,25 +93,39 @@ void bli_gemm_packb thrinfo_t* thread ) { - obj_t b_pack; + obj_t bt_local, bt_pack; + + // We always pass B^T to bli_packm_int. + bli_obj_alias_to( b, &bt_local ); + if ( bli_obj_has_trans( b ) ) + { + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); + } + else + { + bli_obj_induce_trans( &bt_local ); + } // Pack matrix B according to the control tree node. - bli_l3_packm + bli_packm_int ( - b, - &b_pack, + &bt_local, + &bt_pack, cntx, rntm, cntl, thread ); + // Transpose packed object back to B. + bli_obj_induce_trans( &bt_pack ); + // Proceed with execution using packed matrix B.
- bli_gemm_int + bli_l3_int ( &BLIS_ONE, a, - &b_pack, + &bt_pack, &BLIS_ONE, c, cntx, diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/bli_l3_packab.h similarity index 90% rename from frame/3/trsm/bli_trsm_int.h rename to frame/3/bli_l3_packab.h index aabb2a8aa6..380ca72123 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/bli_l3_packab.h @@ -32,12 +32,21 @@ */ -void bli_trsm_int +void bli_l3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_l3_packb ( - obj_t* alpha, obj_t* a, obj_t* b, - obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c deleted file mode 100644 index 48f55c3602..0000000000 --- a/frame/3/bli_l3_packm.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - packbuf_t pack_buf_type; - mem_t* cntl_mem_p; - siz_t size_needed; - - // FGVZ: Not sure why we need this barrier, but we do. - bli_thread_barrier( thread ); - - // Every thread initializes x_pack and determines the size of memory - // block needed (which gets embedded into the otherwise "blank" mem_t - // entry in the control tree node). - size_needed - = - bli_packm_init - ( - x, - x_pack, - cntx, - cntl - ); - - // If zero was returned, no memory needs to be allocated and so we can - // return early. - if ( size_needed == 0 ) return; - - // Query the pack buffer type from the control tree node. - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); - - // Query the address of the mem_t entry within the control tree node. - cntl_mem_p = bli_cntl_pack_mem( cntl ); - - // Check the mem_t field in the control tree. If it is unallocated, then - // we need to acquire a block from the memory broker and broadcast it to - // all threads in the chief's thread group. 
- if ( bli_mem_is_unalloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; - - if ( bli_thread_am_ochief( thread ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_packm(): acquiring mem pool block\n" ); - #endif - - // The chief thread acquires a block from the memory broker - // and saves the associated mem_t entry to local_mem_s. - bli_pba_acquire_m - ( - rntm, - size_needed, - pack_buf_type, - &local_mem_s - ); - } - - // Broadcast the address of the chief thread's local mem_t entry to - // all threads. - local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - - // Save the contents of the chief thread's local mem_t entry to the - // mem_t field in this thread's control tree node. - *cntl_mem_p = *local_mem_p; - } - else // ( bli_mem_is_alloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; - - // If the mem_t entry in the control tree does NOT contain a NULL - // buffer, then a block has already been acquired from the memory - // broker and cached in the control tree. - - // As a sanity check, we should make sure that the mem_t object isn't - // associated with a block that is too small compared to the size of - // the packed matrix buffer that is needed, according to the return - // value from packm_init(). - siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - - if ( cntl_mem_size < size_needed ) - { - if ( bli_thread_am_ochief( thread ) ) - { - // The chief thread releases the existing block associated with - // the mem_t entry in the control tree, and then re-acquires a - // new block, saving the associated mem_t entry to local_mem_s. - bli_pba_release - ( - rntm, - cntl_mem_p - ); - bli_pba_acquire_m - ( - rntm, - size_needed, - pack_buf_type, - &local_mem_s - ); - } - - // Broadcast the address of the chief thread's local mem_t entry to - // all threads. 
- local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - - // Save the chief thread's local mem_t entry to the mem_t field in - // this thread's control tree node. - *cntl_mem_p = *local_mem_p; - } - else - { - // If the mem_t entry is already allocated and sufficiently large, - // then we use it as-is. No action is needed, because all threads - // will already have the cached values in their local control - // trees' mem_t entries, currently pointed to by cntl_mem_p. - - bli_thread_barrier( thread ); - } - } - - - // Update the buffer address in x_pack to point to the buffer associated - // with the mem_t entry acquired from the memory broker (now cached in - // the control tree node). - void* buf = bli_mem_buffer( cntl_mem_p ); - bli_obj_set_buffer( buf, x_pack ); - - - // Pack the contents of object x to object x_pack. - bli_packm_int - ( - x, - x_pack, - cntx, - cntl, - thread - ); - - // Barrier so that packing is done before computation. - bli_thread_barrier( thread ); -} - diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index a6f8b4e1e0..ddd88e1633 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -34,7 +34,6 @@ #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" -#include "bli_gemm_int.h" #include "bli_gemm_var.h" diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 3b7634338e..de077e5adc 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var1 i, b_alg, c, &c1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index d89a710534..53943e47cd 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var2 i, b_alg, c, &c1 ); // Perform gemm subproblem. 
- bli_gemm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 7883dfd6de..28029777de 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -71,7 +71,7 @@ void bli_gemm_blk_var3 i, b_alg, b, &b1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 27678e0bf8..72d78efe16 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create ) { void_fp macro_kernel_fp; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. @@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( @@ -93,8 +88,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_gemm_packa, // pack the left-hand operand - packa_fp, + bli_l3_packa, // pack the left-hand operand BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -119,10 +113,9 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_gemm_packb, // pack the right-hand operand - packb_fp, - BLIS_KR, + bli_l3_packb, // pack the right-hand operand BLIS_NR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? @@ -194,8 +187,8 @@ cntl_t* bli_gemmpb_cntl_create ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, - BLIS_KR, BLIS_MR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? 
FALSE, // reverse iteration if lower? diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 792d69af5f..a9ea21dc43 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,13 +87,14 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); -#ifdef BLIS_ENABLE_GEMM_MD - // Don't perform the following optimization for ccr or crc cases, as - // those cases are sensitive to the ukernel storage preference (ie: - // transposing the operation would break them). - if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && - !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) ) -#endif + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the @@ -251,7 +252,7 @@ void bli_gemm_front // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c deleted file mode 100644 index 208e9bdca3..0000000000 --- a/frame/3/gemm/bli_gemm_int.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_var_oft f; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); - - // If C has a zero dimension, return early. 
- if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If A or B has a zero dimension, scale C by beta and return early. - if ( bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_barrier( thread ); - return; - } - - // If A or B is marked as being filled with zeros, scale C by beta and - // return early. - if ( bli_obj_is_zeros( a ) || - bli_obj_is_zeros( b ) ) - { - // This should never execute. - bli_abort(); - - if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_barrier( thread ); - return; - } - - // Alias A, B, and C in case we need to update attached scalars. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - - // If alpha is non-unit, typecast and apply it to the scalar attached - // to B. - if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &b_local ); - } - - // If beta is non-unit, typecast and apply it to the scalar attached - // to C. - if ( !bli_obj_equals( beta, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( beta, &c_local ); - } - - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - - // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); - - // Invoke the variant. 
- f - ( - &a_local, - &b_local, - &c_local, - cntx, - rntm, - cntl, - thread - ); -} - diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 7bcc8a013b..e7befc5b46 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -55,11 +55,8 @@ void PASTEMAC0(opname) \ GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) -GENPROT( gemm_packa ) -GENPROT( gemm_packb ) GENPROT( gemm_ker_var1 ) - GENPROT( gemm_ker_var2 ) diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 9f18a717df..2a9d91759b 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -73,7 +73,14 @@ void bli_gemmt_front bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); + + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel @@ -107,7 +114,7 @@ void bli_gemmt_front // Invoke the internal back-end via the thread handler. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMMT, // operation family id alpha, &a_local, diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 6d24ea4969..3a1d681c3b 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2] = +static l3_var_oft vars[2] = { bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; @@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2 thrinfo_t* thread ) { - dim_t uplo; - gemm_var_oft f; + dim_t uplo; + l3_var_oft f; // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c ) ) uplo = 0; diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 7869f800ac..9835de9c15 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -65,6 +65,14 @@ void bli_hemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + #ifdef BLIS_DISABLE_HEMM_RIGHT // NOTE: This case casts right-side hemm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel @@ -129,13 +137,6 @@ void bli_hemm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! 
- bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -152,7 +153,7 @@ void bli_hemm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 52ef4cf36b..be94c44c1b 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -65,6 +65,14 @@ void bli_symm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + #ifdef BLIS_DISABLE_SYMM_RIGHT // NOTE: This case casts right-side symm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel @@ -128,13 +136,6 @@ void bli_symm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -151,7 +152,7 @@ void bli_symm_front // Invoke the internal back-end. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index fac7349f5c..1de28958eb 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -64,6 +64,14 @@ void bli_trmm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -147,13 +155,6 @@ void bli_trmm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -170,7 +171,7 @@ void bli_trmm_front // Invoke the internal back-end. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index b9c176d973..898cfe2423 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - gemm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 0ce961d1cd..3b97539603 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -65,6 +65,14 @@ void bli_trmm3_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -139,13 +147,6 @@ void bli_trmm3_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! 
- bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -162,7 +163,7 @@ void bli_trmm3_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 00b604de6e..964422d017 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -34,7 +34,5 @@ #include "bli_trsm_cntl.h" #include "bli_trsm_front.h" -#include "bli_trsm_int.h" - #include "bli_trsm_var.h" diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 578c37c329..30bf6921cd 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -58,7 +58,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. - const dim_t kc = bli_obj_width( a ); + const dim_t kc = bli_obj_width_after_trans( a ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, 0, kc, a, &a11 ); @@ -96,7 +96,7 @@ void bli_trsm_blk_var1 #endif // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11_1, @@ -169,7 +169,7 @@ void bli_trsm_blk_var1 // Perform gemm subproblem. (Note that we use the same backend // function as before, since we're calling the same macrokernel.) 
- bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11, diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 23fd3ed4ca..5691c964ad 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -60,7 +60,7 @@ void bli_trsm_blk_var2 bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, - &my_start, &my_end + &my_start, &my_end ); // Partition along the n dimension. @@ -77,7 +77,7 @@ void bli_trsm_blk_var2 i, b_alg, c, &c1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index a68cc853b5..43fc25f16d 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -71,7 +71,7 @@ void bli_trsm_blk_var3 i, b_alg, b, &b1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 4a7a4de8fd..a8196ebb93 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create ) { void_fp macro_kernel_p; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointer to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. macro_kernel_p = bli_trsm_xx_ker_var2; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // @@ -95,8 +90,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. - packa_fp, + bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -133,8 +127,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. - packa_fp, + bli_l3_packa, // trsm operation's packm function for A. 
BLIS_MR, BLIS_MR, #ifdef BLIS_ENABLE_TRSM_PREINVERSION @@ -171,10 +164,9 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, - packb_fp, - BLIS_MR, + bli_l3_packb, BLIS_NR, + BLIS_MR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? @@ -208,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + rntm_t* rntm, pack_t schema_a, pack_t schema_b ) @@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create // NOTE: trsm macrokernels are presently disabled for right-side execution. void_fp macro_kernel_p = bli_trsm_xx_ker_var2; - void_fp packa_fp = bli_packm_blk_var1; - void_fp packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. @@ -244,8 +233,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, - packa_fp, + bli_l3_packa, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -270,8 +258,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, - packb_fp, + bli_l3_packb, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 68a60b5bdb..7f3d17aeff 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -71,6 +71,14 @@ void bli_trsm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. 
// However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -121,13 +129,6 @@ void bli_trsm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -144,7 +145,7 @@ void bli_trsm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_trsm_int, + bli_l3_int, BLIS_TRSM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index de7c65936f..8322a8b5b6 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -55,8 +55,6 @@ void PASTEMAC0(opname) \ GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) -GENPROT( trsm_packa ) -GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index e30e6d7510..c30a5828a3 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static trsm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - trsm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular diff 
--git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 43e5101b5f..23fbb4cd10 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -118,6 +118,11 @@ void bli_obj_create_without_buffer bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); + bli_obj_set_pack_fn( NULL, obj ); + bli_obj_set_pack_params( NULL, obj ); + bli_obj_set_ker_fn( NULL, obj ); + bli_obj_set_ker_params( NULL, obj ); + // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); s = bli_obj_internal_scalar_buffer( obj ); @@ -356,7 +361,7 @@ void bli_obj_free buf_a = bli_obj_buffer_at_off( a ); - bli_zzsets( 0.0, 0.0, value ); + bli_zzsets( 0.0, 0.0, value ); if ( bli_obj_is_float( a ) ) { @@ -500,7 +505,7 @@ void bli_adjust_strides // Set the column stride to indicate that this is a column vector // stored in column-major order. This is done for legacy reasons, // because we at one time we had to satisify the error checking - // in the underlying BLAS library, which expects the leading + // in the underlying BLAS library, which expects the leading // dimension to be set to at least m, even if it will never be // used for indexing since it is a vector and thus only has one // column of data. 
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index a924bbefc8..f8835e5de0 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -282,17 +282,6 @@ void bli_pba_acquire_v #endif -void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - - siz_t bli_pba_pool_size ( pba_t* pba, diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index ce19991f55..6431607ec9 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba ) // ----------------------------------------------------------------------------- -pba_t* bli_pba_query( void ); +BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( @@ -144,10 +144,15 @@ void bli_pba_release mem_t* mem ); -void bli_pba_rntm_set_pba +BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm - ); + ) +{ + pba_t* pba = bli_pba_query(); + + bli_rntm_set_pba( pba, rntm ); +} siz_t bli_pba_pool_size ( diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 1da6723c79..5b6ff6a0f0 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -76,24 +76,39 @@ void* bli_sba_acquire // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. 
(There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); + block = bli_malloc_intl( req_size, &r_val ); + } + else + { + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) + { + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); + } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); } #else @@ -123,21 +138,28 @@ void bli_sba_release // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size field from the pool. This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. 
(It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); + if ( pool == NULL ) + { + bli_free_intl( block ); + } + else + { + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); + } } #else diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 84c977289c..fe174202cf 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1189,52 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // -- User-provided information macros -- -// User data query - -BLIS_INLINE void* bli_obj_user_data( obj_t* obj ) -{ - return obj->user_data; -} - -// User data modification +// Function pointer query -BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj ) +BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { - obj->user_data = data; + return obj->pack_fn; } -// Function pointer query - -BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { - return obj->pack; + return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { - return obj->ker; + 
return obj->ker_fn; } -BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { - return obj->ukr; + return obj->ker_params; } // Function pointer modification -BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { - obj->pack = pack; + obj->pack_fn = pack_fn; } -BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { - obj->ker = ker; + obj->pack_params = params; } -BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj ) +BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { - obj->ukr = ukr; + obj->ker_fn = ker_fn; +} + +BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) +{ + obj->ker_params = params; } @@ -1357,6 +1353,18 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) ); } +// Adjust the pointer based on current offsets, zero the offsets, and then +// set the current object as the root. For obj_t's with at least one non-zero +// offset, this effectively makes the obj_t "forget" that it was ever a view +// into a larger matrix. + +BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) +{ + bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); + bli_obj_set_offs( 0, 0, obj ); + bli_obj_set_as_root( obj ); +} + // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) @@ -1482,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { + bool a_root_is_self = ( bli_obj_root( a ) == a ); + bool b_root_is_self = ( bli_obj_root( b ) == b ); + obj_t t = *b; *b = *a; *a = t; + + if ( a_root_is_self ) bli_obj_set_as_root( b ); + if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c2db052e52..c6a67e4d17 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1173,12 +1173,11 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - mdim_t mat, - mem_t* mem, struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); @@ -1189,23 +1188,10 @@ typedef void (*obj_ker_fn_t) struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); -typedef void (*obj_ukr_fn_t) - ( - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - struct cntx_s* restrict cntx - ); - typedef struct obj_s { // Basic fields @@ -1236,13 +1222,11 @@ typedef struct obj_s dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel - // User data pointer - void* user_data; - - // Function pointers - obj_pack_fn_t pack; - obj_ker_fn_t ker; - obj_ukr_fn_t ukr; + // User-customizable fields + obj_pack_fn_t pack_fn; + void* pack_params; + obj_ker_fn_t ker_fn; + void* ker_params; } obj_t; @@ -1257,70 +1241,68 @@ typedef struct obj_s #define BLIS_OBJECT_INITIALIZER \ { \ - .root = NULL, \ + .root = NULL, \ \ - .off = { 0, 0 }, \ - .dim = { 0, 0 }, \ - .diag_off = 0, \ + .off = { 0, 0 }, \ + .dim = { 0, 0 }, \ + .diag_off = 0, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. */ \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. 
*/ \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .scalar = { 0.0, 0.0 }, \ + .scalar = { 0.0, 0.0 }, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .user_data = NULL, \ -\ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ + .root = NULL, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. */ \ + .off = { 0, 0 }, \ + .dim = { 1, 1 }, \ + .diag_off = 0, \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. 
*/ \ \ - .scalar = { 0.0, 0.0 }, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .scalar = { 0.0, 0.0 }, \ \ - .user_data = NULL, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } // Define these macros here since they must be updated if contents of @@ -1328,77 +1310,75 @@ typedef struct obj_s BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) { - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - b->dim[0] = a->dim[0]; - b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->ker = a->ker; - b->ukr = a->ukr; + b->root = a->root; + + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; + b->dim[0] = a->dim[0]; + b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; + + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; + + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; + + b->scalar = a->scalar; + + //b->pack_mem = a->pack_mem; + b->m_padded = a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) { - 
b->root = a->root; + b->root = a->root; - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; // Avoid copying m and n since they will be overwritten. - //b->dim[0] = a->dim[0]; - //b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; + //b->dim[0] = a->dim[0]; + //b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; - b->scalar = a->scalar; + b->scalar = a->scalar; // Avoid copying pack_mem entry. // FGVZ: You should probably make sure this is right. - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->ker = a->ker; - b->ukr = a->ukr; + //b->pack_mem = a->pack_mem; + b->m_padded = a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } // Initializors for global scalar constants. 
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 48996f28e7..d37005b285 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -169,7 +169,6 @@ void libblis_test_gemm_ukr_experiment num_t datatype; dim_t m, n, k; - inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ -194,11 +193,6 @@ void libblis_test_gemm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); - // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, - // respectively. - ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); - ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); - // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -237,7 +231,13 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -248,56 +248,26 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); cntl_t* cntl_b = libblis_test_pobj_create ( - BLIS_KR, BLIS_NR, + BLIS_KR, BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. 
Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k, 1, ldap, &ap ); - bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); - - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - - // Repeat the experiment n_repeats times and record results. + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c_save, &c ); @@ -321,16 +291,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. 
libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + // back to the pba. + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index b3916db6a1..48fcb78db7 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -294,59 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - &cntx - ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. 
Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap ); - bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp ); - - // We overwrite the m dimension of ap and n dimension of bp with - // m and n, respectively, so that these objects contain the correct - // logical dimensions. Recall that ldap and ldbp were used only to - // induce bli_obj_create() to allocate sufficient memory for the - // duplication in rare instances where the subconfig uses a gemm - // ukernel that duplicates elements in one of the operands. - bli_obj_set_length( m, &ap ); - bli_obj_set_width( n, &bp ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); } @@ -357,32 +310,45 @@ void libblis_test_gemmtrsm_ukr_experiment // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. 
- bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - - // Create subpartitions from the a and b panels. - bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, - &a1xp, &a11p, &bx1p, &b11p ); - - // Set the uplo field of a11p since the default for packed objects is - // BLIS_DENSE, and the _ukernel() wrapper needs this information to - // know which set of micro-kernels (lower or upper) to choose from. - bli_obj_set_uplo( uploa, &a11p ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + cntl_t* cntl_b = NULL; + + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); - // Re-pack (restore) the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + + cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Create subpartitions from the a and b panels. + bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, + &a1xp, &a11p, &bx1p, &b11p ); + + // Set the uplo field of a11p since the default for packed objects is + // BLIS_DENSE, and the _ukernel() wrapper needs this information to + // know which set of micro-kernels (lower or upper) to choose from. 
+ bli_obj_set_uplo( uploa, &a11p ); time = bli_clock(); @@ -391,6 +357,15 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // On the last pass, we must keep the packed B buffer checked out in order + // to perform the correctness check later. + if ( i < n_repeats - 1 ) + { + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + } } // Estimate the performance of the best experiment repeat. @@ -426,16 +401,11 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + // back to the pba. + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + if ( cntl_b ) + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index bbfd0ac63c..edab9796d2 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -636,7 +636,7 @@ void libblis_test_read_op_info( test_ops_t* ops, int i, p; // Initialize the operation type field. - op->opid = opid; + op->opid = opid; // Read the line for the overall operation switch. libblis_test_read_next_line( buffer, input_stream ); @@ -671,7 +671,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "buffer[p]: %s\n", &buffer[p] ); // Advance until we hit non-whitespace (ie: the next number). 
- for ( ; isspace( buffer[p] ); ++p ) ; + for ( ; isspace( buffer[p] ); ++p ) ; //printf( "buffer[p] after: %s\n", &buffer[p] ); @@ -680,7 +680,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "dim[%d] = %d\n", i, op->dim_spec[i] ); // Advance until we hit whitespace (ie: the space before the next number). - for ( ; !isspace( buffer[p] ); ++p ) ; + for ( ; !isspace( buffer[p] ); ++p ) ; } } @@ -778,11 +778,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) // convert these values into strings, with "unset" being used if the // value returned was -1 (indicating the environment variable was unset). dim_t nt = bli_thread_get_num_threads(); - dim_t jc_nt = bli_thread_get_jc_nt(); - dim_t pc_nt = bli_thread_get_pc_nt(); - dim_t ic_nt = bli_thread_get_ic_nt(); - dim_t jr_nt = bli_thread_get_jr_nt(); - dim_t ir_nt = bli_thread_get_ir_nt(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); if ( nt == -1 ) sprintf( nt_str, "unset" ); else sprintf( nt_str, "%d", ( int ) nt ); @@ -1739,7 +1739,7 @@ void libblis_test_op_driver = ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) ); for ( o = 0; o < n_operands; ++o ) - { + { unsigned int ij; operand_t operand_type = libblis_test_get_operand_type_for_char( o_types[o] ); @@ -2181,7 +2181,7 @@ void libblis_test_op_driver ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype ); // Loop over the requested parameter combinations. - for ( pci = 0; pci < n_param_combos; ++pci ) + for ( pci = 0; pci < n_param_combos; ++pci ) { // Loop over the requested problem sizes. 
for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi ) @@ -2403,7 +2403,7 @@ void libblis_test_build_function_string if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH ) libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n", strlen( funcname_str ), MAX_FUNC_STRING_LENGTH ); - + } @@ -2545,7 +2545,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c dim_t n_trans = n; dim_t rs = 1; // Initialization avoids a compiler warning. dim_t cs = 1; // Initialization avoids a compiler warning. - + // Apply the trans parameter to the dimensions (if needed). bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans ); @@ -2591,12 +2591,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } - -#if 0 -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ) { bool does_inv_diag; - rntm_t rntm; if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; else does_inv_diag = TRUE; @@ -2606,7 +2603,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia ( NULL, // we don't need the small block allocator from the runtime. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). - bli_packm_blk_var1, bmult_id_m, bmult_id_n, does_inv_diag, @@ -2617,20 +2613,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia NULL // no child node needed ); - // Initialize a local-to-BLIS rntm_t. This is simply so we have something - // to pass into bli_l3_packm(). The function doesn't (currently) use the - // runtime object, and even if it did, one with default values would work - // fine here. 
- bli_rntm_init( &rntm ); - // Pack the contents of A to P. - bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); // Return the control tree pointer so the caller can free the cntl_t and its // mem_t entry later on. return cntl; } -#endif void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ) @@ -2975,7 +2964,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg char* the_string; char the_char; - // Begin looping over message to insert variables wherever there are + // Begin looping over message to insert variables wherever there are // format specifiers. for ( c = 0; message[c] != '\0'; ) { diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 786f82b308..cdb3c6dac4 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 6366e5fc3c..b07da91cc8 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -171,7 +171,6 @@ void libblis_test_trsm_ukr_experiment num_t datatype; dim_t m, n; - inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 
'r'; @@ -196,11 +195,6 @@ void libblis_test_trsm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); - // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, - // respectively. - ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); - ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); - // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -238,7 +232,10 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -249,50 +246,9 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - cntx - ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, m, 1, ldap, &ap ); - bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp ); - - // Set up the objects for packing. 
Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. bli_obj_set_diag_offset( 0, &ap ); @@ -302,24 +258,35 @@ void libblis_test_trsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { - // Re-pack the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_copym( &c_save, &c ); + // Transpose B to B^T for packing. 
+ bli_obj_induce_trans( &b ); + + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + time = bli_clock(); libblis_test_trsm_ukr_impl( iface, side, @@ -327,6 +294,10 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); } // Estimate the performance of the best experiment repeat. @@ -339,16 +310,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a );