From 204e52f184488fb65ba302c81e437e127d50f8a0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 14 Aug 2021 22:17:25 -0500 Subject: [PATCH 01/24] Simplify the l3 oapi by removing the rather unnecessary macro layer. --- frame/3/bli_l3_oapi.c | 708 ++++++++++++++++++++++++++------------- frame/3/bli_l3_oapi.h | 281 ++++++++++++---- frame/3/bli_l3_oapi_ba.c | 46 --- frame/3/bli_l3_oapi_ex.c | 46 --- 4 files changed, 692 insertions(+), 389 deletions(-) delete mode 100644 frame/3/bli_l3_oapi_ba.c delete mode 100644 frame/3/bli_l3_oapi_ex.c diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 0de6f65817..827dc4534d 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -33,314 +33,564 @@ */ -// Guard the function definitions so that they are only compiled when -// #included from files that define the object API macros. -#ifdef BLIS_ENABLE_OAPI +#include "blis.h" // // Define object-based interfaces. // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* If the rntm is non-NULL, it may indicate that we should forgo sup - handling altogether. */ \ - bool enable_sup = TRUE; \ - if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \ -\ - if ( enable_sup ) \ - { \ + handling altogether. */ + bool enable_sup = TRUE; + if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); + + if ( enable_sup ) + { /* Execute the small/unpacked oapi handler. 
If it finds that the problem does not fall within the thresholds that define "small", or for some other reason decides not to use the small/unpacked implementation, the function returns with BLIS_FAILURE, which causes execution to - proceed towards the conventional implementation. */ \ - err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \ - if ( result == BLIS_SUCCESS ) \ - { \ - return; \ - } \ - } \ -\ + proceed towards the conventional implementation. */ + err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ); + if ( result == BLIS_SUCCESS ) + { + return; + } + } + /* Only proceed with an induced method if each of the operands have a complex storage datatype. NOTE: Allowing precisions to vary while using 1m, which is what we do here, is unique to gemm; other level-3 operations use 1m only if all storage datatypes are equal (and they ignore the computation precision). If any operands are real, skip the induced method chooser function and proceed directly with native - execution. */ \ - if ( bli_obj_is_complex( c ) && \ - bli_obj_is_complex( a ) && \ - bli_obj_is_complex( b ) ) \ - { \ + execution. */ + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { /* Invoke the operation's "ind" function--its induced method front-end. For complex problems, it calls the highest priority induced method that is available (ie: implemented and enabled), and if none are enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ \ - PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ + the operation's native execution interface.) 
*/ + bli_gemmind( alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_gemmnat( alpha, a, b, beta, c, cntx, rntm ); + } } -GENFRONT( gemm ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_gemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_gemm_ex( alpha, a, b, beta, c, NULL, NULL ); +} + +void bli_gemmt_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* If the rntm is non-NULL, it may indicate that we should forgo sup - handling altogether. */ \ + handling altogether. */ /* - bool enable_sup = TRUE; \ - if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \ - */ \ -\ + bool enable_sup = TRUE; + if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); + */ + /* NOTE: The sup handling for gemmt is disabled here because gemmtsup - is not yet fully implemented. */ \ + is not yet fully implemented. */ /* - if ( enable_sup ) \ - { \ - */ \ + if ( enable_sup ) + { + */ /* Execute the small/unpacked oapi handler. If it finds that the problem does not fall within the thresholds that define "small", or for some other reason decides not to use the small/unpacked implementation, the function returns with BLIS_FAILURE, which causes execution to - proceed towards the conventional implementation. */ \ + proceed towards the conventional implementation. */ /* - err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \ - if ( result == BLIS_SUCCESS ) \ - { \ - return; \ - } \ - } \ - */ \ -\ + err_t result = bli_gemmtsup( alpha, a, b, beta, c, cntx, rntm ); + if ( result == BLIS_SUCCESS ) + { + return; + } + } + */ + /* Only proceed with an induced method if each of the operands have a complex storage datatype.
NOTE: Allowing precisions to vary while using 1m, which is what we do here, is unique to gemm; other level-3 operations use 1m only if all storage datatypes are equal (and they ignore the computation precision). If any operands are real, skip the induced method chooser function and proceed directly with native - execution. */ \ - if ( bli_obj_is_complex( c ) && \ - bli_obj_is_complex( a ) && \ - bli_obj_is_complex( b ) ) \ - { \ + execution. */ + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { /* FIXME: BLIS does not yet support induced methods for gemmt. Thus, - we call the native implementation code path for now. */ \ - /*PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm );*/ \ - PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ + we call the native implementation code path for now. */ + /*bli_gemmtind( alpha, a, b, beta, c, cntx, rntm );*/ + bli_gemmtnat( alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_gemmtnat( alpha, a, b, beta, c, cntx, rntm ); + } } -GENFRONT( gemmt ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_gemmt + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_gemmt_ex( alpha, a, b, beta, c, NULL, NULL ); +} + +void bli_her2k_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* Only proceed with an induced method if each of the operands have a complex storage datatype.
NOTE: Allowing precisions to vary while using 1m, which is what we do here, is unique to gemm; other level-3 operations use 1m only if all storage datatypes are equal (and they ignore the computation precision). If any operands are real, skip the induced method chooser function and proceed directly with native - execution. */ \ - if ( bli_obj_is_complex( c ) && \ - bli_obj_is_complex( a ) && \ - bli_obj_is_complex( b ) ) \ - { \ + execution. */ + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { /* Invoke the operation's "ind" function--its induced method front-end. For complex problems, it calls the highest priority induced method that is available (ie: implemented and enabled), and if none are enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ \ - PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \ - } \ + the operation's native execution interface.) */ + bli_her2kind( alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_her2knat( alpha, a, b, beta, c, cntx, rntm ); + } } -GENFRONT( her2k ) -GENFRONT( syr2k ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_her2k + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_her2k_ex( alpha, a, b, beta, c, NULL, NULL ); +} + +void bli_syr2k_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + /* Only proceed with an induced method if each of the operands have a + complex storage datatype. 
NOTE: Allowing precisions to vary while + using 1m, which is what we do here, is unique to gemm; other level-3 + operations use 1m only if all storage datatypes are equal (and they + ignore the computation precision). If any operands are real, skip the + induced method chooser function and proceed directly with native + execution. */ + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ + bli_syr2kind( alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_syr2knat( alpha, a, b, beta, c, cntx, rntm ); + } +} + +void bli_syr2k + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_syr2k_ex( alpha, a, b, beta, c, NULL, NULL ); +} + +void bli_hemm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* Only proceed with an induced method if all operands have the same (complex) datatype. If any datatypes differ, skip the induced method chooser function and proceed directly with native execution, which is - where mixed datatype support will be implemented (if at all). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { /* Invoke the operation's "ind" function--its induced method front-end. 
For complex problems, it calls the highest priority induced method that is available (ie: implemented and enabled), and if none are enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ \ - PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx, rntm ); \ - } \ + the operation's native execution interface.) */ + bli_hemmind( side, alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_hemmnat( side, alpha, a, b, beta, c, cntx, rntm ); + } +} + +void bli_hemm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_hemm_ex( side, alpha, a, b, beta, c, NULL, NULL ); } -GENFRONT( hemm ) -GENFRONT( symm ) -GENFRONT( trmm3 ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_symm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* Only proceed with an induced method if all operands have the same (complex) datatype. If any datatypes differ, skip the induced method chooser function and proceed directly with native execution, which is - where mixed datatype support will be implemented (if at all). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { /* Invoke the operation's "ind" function--its induced method front-end. 
For complex problems, it calls the highest priority induced method that is available (ie: implemented and enabled), and if none are enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ \ - PASTEMAC(opname,ind)( alpha, a, beta, c, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \ - } \ + the operation's native execution interface.) */ + bli_symmind( side, alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_symmnat( side, alpha, a, b, beta, c, cntx, rntm ); + } +} + +void bli_symm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_symm_ex( side, alpha, a, b, beta, c, NULL, NULL ); } -GENFRONT( herk ) -GENFRONT( syrk ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ - BLIS_OAPI_EX_PARAMS \ - ) \ -{ \ - bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ -\ +void bli_trmm3_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* Only proceed with an induced method if all operands have the same (complex) datatype. If any datatypes differ, skip the induced method chooser function and proceed directly with native execution, which is - where mixed datatype support will be implemented (if at all). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ - bli_obj_is_complex( b ) ) \ - { \ + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { /* Invoke the operation's "ind" function--its induced method front-end. For complex problems, it calls the highest priority induced method that is available (ie: implemented and enabled), and if none are enabled, it calls native execution. 
(For real problems, it calls - the operation's native execution interface.) */ \ - PASTEMAC(opname,ind)( side, alpha, a, b, cntx, rntm ); \ - } \ - else \ - { \ - PASTEMAC(opname,nat)( side, alpha, a, b, cntx, rntm ); \ - } \ + the operation's native execution interface.) */ + bli_trmm3ind( side, alpha, a, b, beta, c, cntx, rntm ); + } + else + { + bli_trmm3nat( side, alpha, a, b, beta, c, cntx, rntm ); + } } -GENFRONT( trmm ) -GENFRONT( trsm ) +void bli_trmm3 + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bli_trmm3_ex( side, alpha, a, b, beta, c, NULL, NULL ); +} +void bli_herk_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); -#endif + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ + bli_herkind( alpha, a, beta, c, cntx, rntm ); + } + else + { + bli_herknat( alpha, a, beta, c, cntx, rntm ); + } +} + +void bli_herk + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c + ) +{ + bli_herk_ex( alpha, a, beta, c, NULL, NULL ); +} + +void bli_syrk_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + /* Only proceed with an induced method if all operands have the same + (complex) datatype. 
If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ + bli_syrkind( alpha, a, beta, c, cntx, rntm ); + } + else + { + bli_syrknat( alpha, a, beta, c, cntx, rntm ); + } +} + +void bli_syrk + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c + ) +{ + bli_syrk_ex( alpha, a, beta, c, NULL, NULL ); +} + +void bli_trmm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) 
*/ + bli_trmmind( side, alpha, a, b, cntx, rntm ); + } + else + { + bli_trmmnat( side, alpha, a, b, cntx, rntm ); + } +} + +void bli_trmm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b + ) +{ + bli_trmm_ex( side, alpha, a, b, NULL, NULL ); +} + +void bli_trsm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ + bli_trsmind( side, alpha, a, b, cntx, rntm ); + } + else + { + bli_trsmnat( side, alpha, a, b, cntx, rntm ); + } +} + +void bli_trsm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b + ) +{ + bli_trsm_ex( side, alpha, a, b, NULL, NULL ); +} diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index fcbc9dec4d..02845fe309 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -38,72 +38,217 @@ // Prototype object-based interfaces. 
// -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( gemm ) -GENPROT( gemmt ) -GENPROT( her2k ) -GENPROT( syr2k ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( hemm ) -GENPROT( symm ) -GENPROT( trmm3 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( herk ) -GENPROT( syrk ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( trmm ) -GENPROT( trsm ) +BLIS_EXPORT_BLIS void bli_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_gemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_gemmt_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_gemmt + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_her2k_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_her2k + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_syr2k_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_syr2k + ( + obj_t* alpha, + 
obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); +BLIS_EXPORT_BLIS void bli_hemm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); +BLIS_EXPORT_BLIS void bli_hemm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_symm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); +BLIS_EXPORT_BLIS void bli_symm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_trmm3_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); +BLIS_EXPORT_BLIS void bli_trmm3 + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_herk_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_herk + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_syrk_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_syrk + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_BLIS void bli_trmm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_trmm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b + ); + +BLIS_EXPORT_BLIS void bli_trsm_ex + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_trsm + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b + ); diff --git a/frame/3/bli_l3_oapi_ba.c b/frame/3/bli_l3_oapi_ba.c deleted file mode 100644 index d6e3b2f3d5..0000000000 --- a/frame/3/bli_l3_oapi_ba.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - 
- BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Include cpp macros that instantiate the API definition templates as -// omitting expert parameters. -#include "bli_oapi_ba.h" - -// Define the macro protecting the object API definitions. -#define BLIS_ENABLE_OAPI - -// Include the object API definitions here. 
-#include "bli_l3_oapi.c" - diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c deleted file mode 100644 index 76f4fe16ab..0000000000 --- a/frame/3/bli_l3_oapi_ex.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Include cpp macros that instantiate the API definition templates as -// having expert parameters. 
-#include "bli_oapi_ex.h" - -// Define the macro protecting the object API definitions. -#define BLIS_ENABLE_OAPI - -// Include the object API definitions here. -#include "bli_l3_oapi.c" - From 905b266f7314a148b6f8a1811b2d142e9ba999ca Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 15 Aug 2021 13:25:33 -0500 Subject: [PATCH 02/24] Completely cull the framework code for syrk/herk/syr2k/her2k and just call gemmt instead. The induced methods for gemmt are currently missing but I imagine that is easy to fill in. --- config/zen/bli_family_zen.h | 4 +- config/zen2/bli_family_zen2.h | 4 +- frame/3/bli_l3.h | 10 - frame/3/bli_l3_blocksize.c | 12 +- frame/3/bli_l3_blocksize.h | 6 +- frame/3/bli_l3_check.c | 221 ------- frame/3/bli_l3_check.h | 40 -- frame/3/bli_l3_cntl.c | 4 +- frame/3/bli_l3_direct.c | 6 +- frame/3/bli_l3_direct.h | 2 +- frame/3/bli_l3_oapi.c | 150 +++-- frame/3/bli_l3_prune.c | 6 +- frame/3/bli_l3_prune.h | 6 +- frame/3/bli_l3_thrinfo.h | 8 +- frame/3/gemm/bli_gemm_blk_var3.c | 2 +- frame/3/gemm/bli_gemm_cntl.c | 6 +- frame/3/gemmt/bli_gemmt.h | 2 + frame/3/gemmt/bli_gemmt_front.c | 10 +- .../bli_gemmt_l_ker_var2.c} | 16 +- .../bli_gemmt_u_ker_var2.c} | 16 +- .../bli_herk_var.h => gemmt/bli_gemmt_var.h} | 16 +- .../bli_gemmt_x_ker_var2.c} | 4 +- frame/3/her2k/bli_her2k.h | 36 -- frame/3/her2k/bli_her2k_front.c | 174 ------ frame/3/her2k/bli_her2k_front.h | 45 -- frame/3/herk/bli_herk.h | 38 -- frame/3/herk/bli_herk_front.c | 136 ----- frame/3/herk/bli_herk_front.h | 44 -- .../herk/other/bli_herk_l_ker_var2.1looprr.c | 420 ------------- frame/3/herk/other/bli_herk_l_ker_var2.c | 409 ------------- frame/3/herk/other/bli_herk_l_ker_var2rr.c | 555 ----------------- frame/3/herk/other/bli_herk_l_ker_var2sl.c | 556 ----------------- .../herk/other/bli_herk_u_ker_var2.1looprr.c | 420 ------------- frame/3/herk/other/bli_herk_u_ker_var2.c | 409 ------------- frame/3/herk/other/bli_herk_u_ker_var2rr.c | 557 ----------------- 
frame/3/herk/other/bli_herk_u_ker_var2sl.c | 558 ------------------ frame/3/syr2k/bli_syr2k.h | 36 -- frame/3/syr2k/bli_syr2k_front.c | 147 ----- frame/3/syr2k/bli_syr2k_front.h | 45 -- frame/3/syrk/bli_syrk.h | 36 -- frame/3/syrk/bli_syrk_front.c | 131 ---- frame/3/syrk/bli_syrk_front.h | 58 -- frame/base/bli_info.c | 9 +- frame/base/bli_info.h | 1 + frame/base/bli_part.c | 18 +- frame/ind/bli_l3_ind.c | 34 +- frame/ind/bli_l3_ind.h | 4 - frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 101 +--- frame/ind/oapi/bli_l3_ind_oapi.c | 37 +- frame/ind/oapi/bli_l3_ind_oapi.h | 8 - frame/ind/oapi/bli_l3_nat_oapi.c | 41 +- frame/ind/tapi/bli_l3_ind_tapi.c | 248 -------- frame/ind/tapi/bli_l3_ind_tapi.h | 100 ---- frame/thread/bli_thread.c | 4 +- .../3/{bli_syrk_small.c => bli_gemmt_small.c} | 88 +-- 55 files changed, 213 insertions(+), 5841 deletions(-) rename frame/3/{herk/bli_herk_l_ker_var2.c => gemmt/bli_gemmt_l_ker_var2.c} (97%) rename frame/3/{herk/bli_herk_u_ker_var2.c => gemmt/bli_gemmt_u_ker_var2.c} (97%) rename frame/3/{herk/bli_herk_var.h => gemmt/bli_gemmt_var.h} (90%) rename frame/3/{herk/bli_herk_x_ker_var2.c => gemmt/bli_gemmt_x_ker_var2.c} (97%) delete mode 100644 frame/3/her2k/bli_her2k.h delete mode 100644 frame/3/her2k/bli_her2k_front.c delete mode 100644 frame/3/her2k/bli_her2k_front.h delete mode 100644 frame/3/herk/bli_herk.h delete mode 100644 frame/3/herk/bli_herk_front.c delete mode 100644 frame/3/herk/bli_herk_front.h delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2.c delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2rr.c delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2sl.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2rr.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2sl.c delete mode 100644 frame/3/syr2k/bli_syr2k.h 
delete mode 100644 frame/3/syr2k/bli_syr2k_front.c delete mode 100644 frame/3/syr2k/bli_syr2k_front.h delete mode 100644 frame/3/syrk/bli_syrk.h delete mode 100644 frame/3/syrk/bli_syrk_front.c delete mode 100644 frame/3/syrk/bli_syrk_front.h rename kernels/zen/3/{bli_syrk_small.c => bli_gemmt_small.c} (99%) diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index c82392b60e..d1c4ef828a 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -52,8 +52,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h index a0f5b574d2..d7adddf3c8 100644 --- a/config/zen2/bli_family_zen2.h +++ b/config/zen2/bli_family_zen2.h @@ -51,8 +51,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 740733c3ed..7a35ca9740 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -48,13 +48,7 @@ #include "bli_l3_packm.h" // Prototype object APIs (expert and non-expert). -#include "bli_oapi_ex.h" #include "bli_l3_oapi.h" -#include "bli_xapi_undef.h" - -#include "bli_oapi_ba.h" -#include "bli_l3_oapi.h" -#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). 
#include "bli_tapi_ex.h" @@ -93,11 +87,7 @@ // Operation-specific headers. #include "bli_gemm.h" #include "bli_hemm.h" -#include "bli_herk.h" -#include "bli_her2k.h" #include "bli_symm.h" -#include "bli_syrk.h" -#include "bli_syr2k.h" #include "bli_trmm.h" #include "bli_trmm3.h" #include "bli_trsm.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 58b658d1d8..1986b3b0f6 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -51,8 +51,8 @@ dim_t bli_l3_determine_kc if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_HERK ) - return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_GEMMT ) + return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRMM ) return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRSM ) @@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \ } GENFRONT( gemm_determine_kc, gemm ) -GENFRONT( herk_determine_kc, herk ) +GENFRONT( gemmt_determine_kc, gemmt ) GENFRONT( trmm_determine_kc, trmm ) GENFRONT( trsm_determine_kc, trsm ) @@ -201,7 +201,7 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_blksz_get_def( dt, bsize ); \ b_max = bli_blksz_get_max( dt, bsize ); \ \ - /* Notice that for herk, we do not need to perform any special handling + /* Notice that for gemmt, we do not need to perform any special handling for the default and maximum kc blocksizes vis-a-vis MR or NR. 
*/ \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined @@ -211,8 +211,8 @@ dim_t PASTEMAC0(opname) \ return b_use; \ } -GENFRONT( herk_determine_kc_f, f ) -GENFRONT( herk_determine_kc_b, b ) +GENFRONT( gemmt_determine_kc_f, f ) +GENFRONT( gemmt_determine_kc_b, b ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index c3301ee13a..3ea3c5aa02 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -60,7 +60,7 @@ dim_t PASTEMAC0(opname) \ ); GENPROT( gemm_determine_kc ) -GENPROT( herk_determine_kc ) +GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) @@ -81,8 +81,8 @@ dim_t PASTEMAC0(opname) \ GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) -GENPROT( herk_determine_kc_f ) -GENPROT( herk_determine_kc_b ) +GENPROT( gemmt_determine_kc_f ) +GENPROT( gemmt_determine_kc_b ) GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 945b267fda..88d89bbdc9 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -108,71 +108,6 @@ void bli_hemm_check bli_check_error_code( e_val ); } -void bli_herk_check - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - obj_t ah; - - // Alias A to A^H so we can perform dimension checks. - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); - - // Check basic properties of the operation. - - bli_herk_basic_check( alpha, a, &ah, beta, c, cntx ); - - // Check for real-valued alpha and beta. - - e_val = bli_check_real_valued_object( alpha ); - bli_check_error_code( e_val ); - - e_val = bli_check_real_valued_object( beta ); - bli_check_error_code( e_val ); - - // Check matrix structure. 
- - e_val = bli_check_hermitian_object( c ); - bli_check_error_code( e_val ); -} - -void bli_her2k_check - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - obj_t ah, bh; - - // Alias A and B to A^H and B^H so we can perform dimension checks. - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh ); - - // Check basic properties of the operation. - - bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx ); - - // Check for real-valued beta. - - e_val = bli_check_real_valued_object( beta ); - bli_check_error_code( e_val ); - - // Check matrix structure. - - e_val = bli_check_hermitian_object( c ); - bli_check_error_code( e_val ); -} - void bli_symm_check ( side_t side, @@ -196,58 +131,6 @@ void bli_symm_check bli_check_error_code( e_val ); } -void bli_syrk_check - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - obj_t at; - - // Alias A to A^T so we can perform dimension checks. - bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); - - // Check basic properties of the operation. - - bli_herk_basic_check( alpha, a, &at, beta, c, cntx ); - - // Check matrix structure. - - e_val = bli_check_symmetric_object( c ); - bli_check_error_code( e_val ); -} - -void bli_syr2k_check - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - obj_t at, bt; - - // Alias A and B to A^T and B^T so we can perform dimension checks. - bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); - bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt ); - - // Check basic properties of the operation. - - bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx ); - - // Check matrix structure. 
- - e_val = bli_check_symmetric_object( c ); - bli_check_error_code( e_val ); -} - void bli_trmm_check ( side_t side, @@ -412,110 +295,6 @@ void bli_hemm_basic_check bli_check_error_code( e_val ); } -void bli_herk_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - - // Perform standard checks. - - bli_l3_basic_check( alpha, a, ah, beta, c, cntx ); - - // Check object dimensions. - - e_val = bli_check_level3_dims( a, ah, c ); - bli_check_error_code( e_val ); - - // Check matrix squareness. - - e_val = bli_check_square_object( c ); - bli_check_error_code( e_val ); - - // Check matrix structure. - - e_val = bli_check_general_object( a ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( ah ); - bli_check_error_code( e_val ); - - // Check for consistent datatypes. - - e_val = bli_check_consistent_object_datatypes( c, a ); - bli_check_error_code( e_val ); - - e_val = bli_check_consistent_object_datatypes( c, ah ); - bli_check_error_code( e_val ); -} - -void bli_her2k_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ) -{ - err_t e_val; - - // Perform standard checks. - - bli_l3_basic_check( alpha, a, bh, beta, c, cntx ); - bli_l3_basic_check( alpha, b, ah, beta, c, cntx ); - - // Check object dimensions. - - e_val = bli_check_level3_dims( a, bh, c ); - bli_check_error_code( e_val ); - - e_val = bli_check_level3_dims( b, ah, c ); - bli_check_error_code( e_val ); - - // Check matrix squareness. - - e_val = bli_check_square_object( c ); - bli_check_error_code( e_val ); - - // Check matrix structure. 
- - e_val = bli_check_general_object( a ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( bh ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( b ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( ah ); - bli_check_error_code( e_val ); - - // Check for consistent datatypes. - - e_val = bli_check_consistent_object_datatypes( c, a ); - bli_check_error_code( e_val ); - - e_val = bli_check_consistent_object_datatypes( c, ah ); - bli_check_error_code( e_val ); - - e_val = bli_check_consistent_object_datatypes( c, b ); - bli_check_error_code( e_val ); - - e_val = bli_check_consistent_object_datatypes( c, bh ); - bli_check_error_code( e_val ); -} - void bli_l3_basic_check ( obj_t* alpha, diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index b2216c34bd..9719404a93 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -52,8 +52,6 @@ void PASTEMAC(opname,_check) \ GENPROT( gemm ) GENPROT( gemmt ) -GENPROT( her2k ) -GENPROT( syr2k ) #undef GENPROT @@ -76,22 +74,6 @@ GENPROT( trmm ) GENPROT( trsm ) -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( herk ) -GENPROT( syrk ) - - // ----------------------------------------------------------------------------- void bli_gemm_basic_check @@ -125,28 +107,6 @@ void bli_hemm_basic_check cntx_t* cntx ); -void bli_herk_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - -void bli_her2k_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - void bli_l3_basic_check ( obj_t* alpha, diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index f6bfbedbb9..3cdecfbc26 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -54,7 +54,7 @@ void bli_l3_cntl_create_if if ( cntl_orig 
== NULL ) { if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { *cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b ); @@ -97,7 +97,7 @@ void bli_l3_cntl_free opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { bli_gemm_cntl_free( rntm, cntl_use, thread ); diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 7baf2d6ef5..0d0a719214 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -46,7 +46,7 @@ dir_t bli_l3_direct opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); - else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c ); else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); @@ -68,14 +68,14 @@ dir_t bli_gemm_direct return BLIS_FWD; } -dir_t bli_herk_direct +dir_t bli_gemmt_direct ( obj_t* a, obj_t* b, obj_t* c ) { - // For herk, movement may be forwards (or backwards). + // For gemmt, movement may be forwards (or backwards). return BLIS_FWD; } diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7383c4a9fb..39798407a2 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -53,7 +53,7 @@ dir_t PASTEMAC0(opname) \ ); GENPROT( gemm_direct ) -GENPROT( herk_direct ) +GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 827dc4534d..a540d94e43 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -192,30 +192,36 @@ void bli_her2k_ex rntm_t* rntm ) { - bli_init_once(); + bli_init_once(); - /* Only proceed with an induced method if each of the operands have a - complex storage datatype. 
NOTE: Allowing precisions to vary while - using 1m, which is what we do here, is unique to gemm; other level-3 - operations use 1m only if all storage datatypes are equal (and they - ignore the computation precision). If any operands are real, skip the - induced method chooser function and proceed directly with native - execution. */ - if ( bli_obj_is_complex( c ) && - bli_obj_is_complex( a ) && - bli_obj_is_complex( b ) ) - { - /* Invoke the operation's "ind" function--its induced method front-end. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ - bli_her2kind( alpha, a, b, beta, c, cntx, rntm ); - } - else - { - bli_her2knat( alpha, a, b, beta, c, cntx, rntm ); - } + obj_t ah; + obj_t bh; + obj_t alphah; + + bli_obj_alias_to( alpha, &alphah ); + bli_obj_toggle_conj( &alphah ); + + bli_obj_alias_to( a, &ah ); + bli_obj_induce_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + bli_obj_alias_to( b, &bh ); + bli_obj_induce_trans( &bh ); + bli_obj_toggle_conj( &bh ); + + // Invoke gemmt twice, using beta only the first time. + + bli_gemmt_ex( alpha, a, &bh, beta, c, cntx, rntm ); + bli_gemmt_ex( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); + + // The Hermitian rank-2k product was computed as A*B'+B*A', even for + // the diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-2k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + + bli_setid( &BLIS_ZERO, c ); } void bli_her2k @@ -241,30 +247,20 @@ void bli_syr2k_ex rntm_t* rntm ) { - bli_init_once(); + bli_init_once(); - /* Only proceed with an induced method if each of the operands have a - complex storage datatype. 
NOTE: Allowing precisions to vary while - using 1m, which is what we do here, is unique to gemm; other level-3 - operations use 1m only if all storage datatypes are equal (and they - ignore the computation precision). If any operands are real, skip the - induced method chooser function and proceed directly with native - execution. */ - if ( bli_obj_is_complex( c ) && - bli_obj_is_complex( a ) && - bli_obj_is_complex( b ) ) - { - /* Invoke the operation's "ind" function--its induced method front-end. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) */ - bli_syr2kind( alpha, a, b, beta, c, cntx, rntm ); - } - else - { - bli_syr2knat( alpha, a, b, beta, c, cntx, rntm ); - } + obj_t at; + obj_t bt; + + bli_obj_alias_to( b, &bt ); + bli_obj_induce_trans( &bt ); + bli_obj_alias_to( a, &at ); + bli_obj_induce_trans( &at ); + + // Invoke gemmt twice, using beta only the first time. + + bli_gemmt_ex( alpha, a, &bt, beta, c, cntx, rntm ); + bli_gemmt_ex( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); } void bli_syr2k @@ -433,26 +429,24 @@ void bli_herk_ex rntm_t* rntm ) { - bli_init_once(); + bli_init_once(); - /* Only proceed with an induced method if all operands have the same - (complex) datatype. If any datatypes differ, skip the induced method - chooser function and proceed directly with native execution, which is - where mixed datatype support will be implemented (if at all). */ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && - bli_obj_is_complex( c ) ) - { - /* Invoke the operation's "ind" function--its induced method front-end. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) 
*/ - bli_herkind( alpha, a, beta, c, cntx, rntm ); - } - else - { - bli_herknat( alpha, a, beta, c, cntx, rntm ); - } + obj_t ah; + + bli_obj_alias_to( a, &ah ); + bli_obj_induce_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + bli_gemmt_ex( alpha, a, &ah, beta, c, cntx, rntm ); + + // The Hermitian rank-k product was computed as A*A', even for the + // diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + + bli_setid( &BLIS_ZERO, c ); } void bli_herk @@ -476,26 +470,14 @@ void bli_syrk_ex rntm_t* rntm ) { - bli_init_once(); + bli_init_once(); - /* Only proceed with an induced method if all operands have the same - (complex) datatype. If any datatypes differ, skip the induced method - chooser function and proceed directly with native execution, which is - where mixed datatype support will be implemented (if at all). */ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && - bli_obj_is_complex( c ) ) - { - /* Invoke the operation's "ind" function--its induced method front-end. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. (For real problems, it calls - the operation's native execution interface.) 
*/ - bli_syrkind( alpha, a, beta, c, cntx, rntm ); - } - else - { - bli_syrknat( alpha, a, beta, c, cntx, rntm ); - } + obj_t at; + + bli_obj_alias_to( a, &at ); + bli_obj_induce_trans( &at ); + + bli_gemmt_ex( alpha, a, &at, beta, c, cntx, rntm ); } void bli_syrk diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index fa008fd15e..6ca8244cbb 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -47,7 +47,7 @@ void bli_l3_prune_unref_mparts_m opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. - else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); } @@ -68,7 +68,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ - else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ } @@ -152,7 +152,7 @@ void PASTEMAC(opname,_prune_unref_mparts_k) \ for the k dimension. 
*/ \ } -GENFRONT( herk ) +GENFRONT( gemmt ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 340ecd4dbf..ad8f07dc43 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -64,9 +64,9 @@ GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) -GENPROT( herk, m ) -GENPROT( herk, n ) -GENPROT( herk, k ) +GENPROT( gemmt, m ) +GENPROT( gemmt, n ) +GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 4726e10421..37a3909fd6 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -44,12 +44,12 @@ #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) -// herk +// gemmt -// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to +// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. -#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) +#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 94f0af4098..7883dfd6de 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -93,7 +93,7 @@ void bli_gemm_blk_var3 // can simply overwrite the internal beta scalar with BLIS_ONE once // it has been used in the first iteration. However... 
- // Unlike variant 3 of gemm and herk, which reset the internal scalar + // Unlike variant 3 of gemm and gemmt, which reset the internal scalar // on C at the end of the first iteration so that subsequent iterations // do not erroneously apply beta more than once, it is important that // this behavior not be applied to trmm. That is because the order of diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index d7cd0a92ce..27678e0bf8 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -63,7 +63,7 @@ cntl_t* bli_gemmbp_cntl_create // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2; + else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; @@ -167,8 +167,8 @@ cntl_t* bli_gemmpb_cntl_create { void_fp macro_kernel_p = bli_gemm_ker_var1; - // Change the macro-kernel if the operation family is herk or trmm. - //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + // Change the macro-kernel if the operation family is gemmt or trmm. + //if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2; //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. 
diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h index ed522ee135..32ab3865e7 100644 --- a/frame/3/gemmt/bli_gemmt.h +++ b/frame/3/gemmt/bli_gemmt.h @@ -34,3 +34,5 @@ #include "bli_gemmt_front.h" +#include "bli_gemmt_var.h" + diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index d652618cb0..6bf32943d2 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -53,6 +53,14 @@ void bli_gemmt_front obj_t b_local; obj_t c_local; +#if 0 +#ifdef BLIS_ENABLE_SMALL_MATRIX + gint_t status = bli_gemmt_small( alpha, &a_local, &at_local, beta, &c_local, + cntx, cntl ); + if ( status == BLIS_SUCCESS ) return; +#endif +#endif + // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemmt_check( alpha, a, b, beta, c, cntx ); @@ -120,7 +128,7 @@ void bli_gemmt_front bli_l3_thread_decorator ( bli_gemm_int, - BLIS_HERK, // operation family id (gemmt uses 'herk' family) + BLIS_GEMMT, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_l_ker_var2.c rename to frame/3/gemmt/bli_gemmt_l_ker_var2.c index 81df2840fe..5ede0711c3 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); -void bli_herk_l_ker_var2 +void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, @@ -362,11 +362,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -467,11 +467,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -554,5 +554,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_u_ker_var2.c rename to frame/3/gemmt/bli_gemmt_u_ker_var2.c index 82de8d44a3..cb9539f798 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); -void bli_herk_u_ker_var2 +void bli_gemmt_u_ker_var2 ( obj_t* a, obj_t* b, @@ -362,11 +362,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -493,11 +493,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -557,5 +557,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/gemmt/bli_gemmt_var.h similarity index 90% rename from frame/3/herk/bli_herk_var.h rename to frame/3/gemmt/bli_gemmt_var.h index 00b85fc5c6..60c68c9f59 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -52,16 +52,10 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -//GENPROT( herk_blk_var1 ) -//GENPROT( herk_blk_var2 ) -//GENPROT( herk_blk_var3 ) +GENPROT( gemmt_x_ker_var2 ) -GENPROT( herk_x_ker_var2 ) - -GENPROT( herk_l_ker_var2 ) -GENPROT( herk_u_ker_var2 ) -//GENPROT( herk_packa ) -//GENPROT( herk_packb ) +GENPROT( gemmt_l_ker_var2 ) +GENPROT( gemmt_u_ker_var2 ) // @@ -91,6 +85,6 @@ void PASTEMAC(ch,varname) \ thrinfo_t* thread \ ); -INSERT_GENTPROT_BASIC0( herk_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( herk_u_ker_var2 ) +INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) +INSERT_GENTPROT_BASIC0( 
gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_x_ker_var2.c rename to frame/3/gemmt/bli_gemmt_x_ker_var2.c index b6769d7192..6d24ea4969 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -37,10 +37,10 @@ static gemm_var_oft vars[2] = { - bli_herk_l_ker_var2, bli_herk_u_ker_var2, + bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; -void bli_herk_x_ker_var2 +void bli_gemmt_x_ker_var2 ( obj_t* a, obj_t* ah, diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h deleted file mode 100644 index 02975c2b51..0000000000 --- a/frame/3/her2k/bli_her2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_her2k_front.h" - diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c deleted file mode 100644 index 096ea463bc..0000000000 --- a/frame/3/her2k/bli_her2k_front.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t alpha_conj; - obj_t c_local; - obj_t a_local; - obj_t bh_local; - obj_t b_local; - obj_t ah_local; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_her2k_check( alpha, a, b, beta, c, cntx ); - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For her2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bh_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_toggle_conj( &bh_local ); - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // Initialize a conjugated copy of alpha. 
- bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), - BLIS_CONJUGATE, - alpha, - &alpha_conj ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_swap( &a_local, &bh_local ); - bli_obj_swap( &b_local, &ah_local ); - - bli_obj_induce_trans( &a_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_induce_trans( &b_local ); - bli_obj_induce_trans( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HER2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // A sort of hack for communicating the desired pach schemas for A and B - // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and - // bli_l3_cntl_create_if()). This allows us to access the schemas from - // the control tree, which hopefully reduces some confusion, particularly - // in bli_packm_init(). - pack_t schema_a = bli_cntx_schema_a_block( cntx ); - pack_t schema_b = bli_cntx_schema_b_panel( cntx ); - - bli_obj_set_pack_schema( schema_a, &a_local ); - bli_obj_set_pack_schema( schema_b, &bh_local ); - bli_obj_set_pack_schema( schema_a, &b_local ); - bli_obj_set_pack_schema( schema_b, &ah_local ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. 
- bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bh_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-2k product was computed as A*B'+B*A', even for - // the diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-2k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. - bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h deleted file mode 100644 index 0efdb86c2d..0000000000 --- a/frame/3/her2k/bli_her2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h deleted file mode 100644 index c437289688..0000000000 --- a/frame/3/herk/bli_herk.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_herk_front.h" - -#include "bli_herk_var.h" - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c deleted file mode 100644 index a88d23e90a..0000000000 --- a/frame/3/herk/bli_herk_front.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t ah_local; - obj_t c_local; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_herk_check( alpha, a, beta, c, cntx ); - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For herk, the right-hand "B" operand is simply A'. - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_toggle_conj( &a_local ); - bli_obj_toggle_conj( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HERK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // A sort of hack for communicating the desired pach schemas for A and B - // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and - // bli_l3_cntl_create_if()). This allows us to access the schemas from - // the control tree, which hopefully reduces some confusion, particularly - // in bli_packm_init(). - pack_t schema_a = bli_cntx_schema_a_block( cntx ); - pack_t schema_b = bli_cntx_schema_b_panel( cntx ); - - bli_obj_set_pack_schema( schema_a, &a_local ); - bli_obj_set_pack_schema( schema_b, &ah_local ); - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-k product was computed as A*A', even for the - // diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. 
- bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h deleted file mode 100644 index 44778a450a..0000000000 --- a/frame/3/herk/bli_herk_front.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c deleted file mode 100644 index 8a99a2e241..0000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); - - -void bli_herk_l_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - 
obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. 
*/ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. */ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/herk/other/bli_herk_l_ker_var2.c deleted file mode 100644 index 22439f5b2e..0000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2.c +++ /dev/null @@ -1,409 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); - - -void bli_herk_l_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = 
bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c deleted file mode 100644 index c78a36b297..0000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2rr.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_l_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = 
bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and - 1st loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c deleted file mode 100644 index 17e0b0d0e0..0000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2sl.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_l_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. 
- buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. 
That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd - loop and slab partitioning in the 1st loop for the remaining - triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c deleted file mode 100644 index 31d8fab62f..0000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); - - -void bli_herk_u_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - 
obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. 
*/ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. */ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/herk/other/bli_herk_u_ker_var2.c deleted file mode 100644 index 1aa3ce12df..0000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2.c +++ /dev/null @@ -1,409 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); - - -void bli_herk_u_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = 
bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c deleted file mode 100644 index 085ef63083..0000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2rr.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_u_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = 
bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. 
*/ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial triangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c deleted file mode 100644 index abc6e51884..0000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2sl.c +++ /dev/null @@ -1,558 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_u_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c 
= bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. 
For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). 
*/ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd loop - and slab partitioning in the 1st loop for the initial triangular region - of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st loops - loop for the remaining triangular region of C. */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl ) - diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h deleted file mode 100644 index 680e6e3997..0000000000 --- a/frame/3/syr2k/bli_syr2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "bli_syr2k_front.h" - diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c deleted file mode 100644 index c1532b92d7..0000000000 --- a/frame/3/syr2k/bli_syr2k_front.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t c_local; - obj_t a_local; - obj_t bt_local; - obj_t b_local; - obj_t at_local; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_syr2k_check( alpha, a, b, beta, c, cntx ); - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syr2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bt_local ); - bli_obj_induce_trans( &bt_local ); - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYR2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // A sort of hack for communicating the desired pach schemas for A and B - // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and - // bli_l3_cntl_create_if()). 
This allows us to access the schemas from - // the control tree, which hopefully reduces some confusion, particularly - // in bli_packm_init(). - pack_t schema_a = bli_cntx_schema_a_block( cntx ); - pack_t schema_b = bli_cntx_schema_b_panel( cntx ); - - bli_obj_set_pack_schema( schema_a, &a_local ); - bli_obj_set_pack_schema( schema_b, &bt_local ); - bli_obj_set_pack_schema( schema_a, &b_local ); - bli_obj_set_pack_schema( schema_b, &at_local ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bt_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h deleted file mode 100644 index 767bb6ee11..0000000000 --- a/frame/3/syr2k/bli_syr2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h deleted file mode 100644 index 4936fe431e..0000000000 --- a/frame/3/syrk/bli_syrk.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_syrk_front.h" - diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c deleted file mode 100644 index 14c5d4a3da..0000000000 --- a/frame/3/syrk/bli_syrk_front.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t at_local; - obj_t c_local; - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syrk, the right-hand "B" operand is simply A^T. - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - -#if 0 -#ifdef BLIS_ENABLE_SMALL_MATRIX - gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, - cntx, cntl ); - if ( status == BLIS_SUCCESS ) return; -#endif -#endif - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_syrk_check( alpha, a, beta, c, cntx ); - - // If alpha is zero, scale by beta and return. 
- if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYRK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // A sort of hack for communicating the desired pach schemas for A and B - // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and - // bli_l3_cntl_create_if()). This allows us to access the schemas from - // the control tree, which hopefully reduces some confusion, particularly - // in bli_packm_init(). - pack_t schema_a = bli_cntx_schema_a_block( cntx ); - pack_t schema_b = bli_cntx_schema_b_panel( cntx ); - - bli_obj_set_pack_schema( schema_a, &a_local ); - bli_obj_set_pack_schema( schema_b, &at_local ); - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &at_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h deleted file mode 100644 index bf8d26a52c..0000000000 --- a/frame/3/syrk/bli_syrk_front.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_syrk_small - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fa7901583f..8a3dcd30a6 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -180,12 +180,13 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- BLIS implementation query (level-3) -------------------------------------- char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } +char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } -char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HERK, dt ); } -char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HER2K, dt ); } +char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } -char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYRK, dt ); } -char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYR2K, dt ); } +char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); 
} char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index d900ca4f51..99c7d000db 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -91,6 +91,7 @@ BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index da7643eb67..95587e4a71 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -266,7 +266,7 @@ void bli_acquire_mpart_mdim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -274,10 +274,10 @@ void bli_acquire_mpart_mdim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. 
(Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -489,7 +489,7 @@ void bli_acquire_mpart_ndim // diagonal), and the subpartition does not intersect the root matrix's // diagonal, then we might need to modify some of the subpartition's // properties, depending on its structure type. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -497,10 +497,10 @@ void bli_acquire_mpart_ndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -742,7 +742,7 @@ void bli_acquire_mpart_mndim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. 
- if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && req_part != BLIS_SUBPART00 && req_part != BLIS_SUBPART11 && req_part != BLIS_SUBPART22 ) @@ -762,10 +762,10 @@ void bli_acquire_mpart_mndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index 8496981c3a..81b4ea6f60 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -37,21 +37,22 @@ static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = { - /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ -/* 3mh */ { bli_gemm3mh, NULL, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh, - bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL }, -/* 3m1 */ { bli_gemm3m1, NULL, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1, - bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 }, -/* 4mh */ { bli_gemm4mh, NULL, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh, - bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL }, + /* gemm gemmt hemm herk her2k symm + syrk syr2k trmm3 trmm trsm */ +/* 3mh */ { bli_gemm3mh, NULL, bli_hemm3mh, NULL, NULL, bli_symm3mh, + NULL, NULL, bli_trmm33mh, NULL, NULL }, +/* 3m1 */ { bli_gemm3m1, NULL, bli_hemm3m1, NULL, NULL, bli_symm3m1, + NULL, NULL, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 }, +/* 4mh */ 
{ bli_gemm4mh, NULL, bli_hemm4mh, NULL, NULL, bli_symm4mh, + NULL, NULL, bli_trmm34mh, NULL, NULL }, /* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, -/* 4m1 */ { bli_gemm4m1, NULL, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1, - bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, -/* 1m */ { bli_gemm1m, NULL, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m, - bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m }, -/* nat */ { bli_gemmnat, bli_gemmtnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat, - bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, +/* 4m1 */ { bli_gemm4m1, NULL, bli_hemm4m1, NULL, NULL, bli_symm4m1, + NULL, NULL, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, +/* 1m */ { bli_gemm1m, NULL, bli_hemm1m, NULL, NULL, bli_symm1m, + NULL, NULL, bli_trmm31m, bli_trmm1m, bli_trsm1m }, +/* nat */ { bli_gemmnat, bli_gemmtnat, bli_hemmnat, NULL, NULL, bli_symmnat, + NULL, NULL, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, }; // @@ -65,7 +66,8 @@ static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = static BLIS_THREAD_LOCAL bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { - /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ + /* gemm gemmt hemm herk her2k symm + syrk syr2k trmm3 trmm trsm */ /* c z */ /* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, @@ -102,11 +104,7 @@ bool PASTEMAC(opname,ind_has_avail)( num_t dt ) GENFUNC( gemm, BLIS_GEMM ) GENFUNC( gemmt, BLIS_GEMMT ) GENFUNC( hemm, BLIS_HEMM ) -GENFUNC( herk, BLIS_HERK ) -GENFUNC( her2k, BLIS_HER2K ) GENFUNC( symm, BLIS_SYMM ) -GENFUNC( syrk, BLIS_SYRK ) -GENFUNC( syr2k, BLIS_SYR2K ) GENFUNC( trmm3, BLIS_TRMM3 ) GENFUNC( trmm, BLIS_TRMM ) GENFUNC( trsm, BLIS_TRSM ) diff --git a/frame/ind/bli_l3_ind.h b/frame/ind/bli_l3_ind.h 
index 0c2554ae2a..6842aaf49c 100644 --- a/frame/ind/bli_l3_ind.h +++ b/frame/ind/bli_l3_ind.h @@ -47,11 +47,7 @@ void_fp PASTEMAC(opname,ind_get_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) -GENPROT( herk ) -GENPROT( her2k ) GENPROT( symm ) -GENPROT( syrk ) -GENPROT( syr2k ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index e5658a3948..3089ecfa7f 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -35,7 +35,7 @@ #include "blis.h" -// -- gemm/her2k/syr2k --------------------------------------------------------- +// -- gemm --------------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, cname, imeth, nstage ) \ @@ -125,22 +125,6 @@ GENFRONT( gemm, gemm, 4mb, 1 ) GENFRONT( gemm, gemm, 4m1, 1 ) GENFRONT( gemm, gemm, 1m, 1 ) -// her2k -GENFRONT( her2k, gemm, 3mh, 3 ) -GENFRONT( her2k, gemm, 3m1, 1 ) -GENFRONT( her2k, gemm, 4mh, 4 ) -//GENFRONT( her2k, gemm, 4mb, 1 ) // Not implemented. -GENFRONT( her2k, gemm, 4m1, 1 ) -GENFRONT( her2k, gemm, 1m, 1 ) - -// syr2k -GENFRONT( syr2k, gemm, 3mh, 3 ) -GENFRONT( syr2k, gemm, 3m1, 1 ) -GENFRONT( syr2k, gemm, 4mh, 4 ) -//GENFRONT( syr2k, gemm, 4mb, 1 ) // Not implemented. 
-GENFRONT( syr2k, gemm, 4m1, 1 ) -GENFRONT( syr2k, gemm, 1m, 1 ) - // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -235,89 +219,6 @@ GENFRONT( trmm3, gemm, 4m1, 1 ) GENFRONT( trmm3, gemm, 1m, 1 ) -// -- herk/syrk ---------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, cname, imeth, nstage ) \ -\ -void PASTEMAC(opname,imeth) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - ind_t ind = PASTEMAC0(imeth); \ - num_t dt = bli_obj_dt( c ); \ - obj_t* beta_use = beta; \ -\ - dim_t i; \ -\ - /* If the objects are in the real domain, execute the native - implementation. */ \ - if ( bli_obj_is_real( c ) ) \ - { \ - PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \ - return; \ - } \ -\ - /* Query a context for the current induced method. This context is - managed and cached by the gks and should not be freed by the caller. - Note that the datatype argument is needed because it will be passed - in when bli_gks_query_ind_cntx() eventually calls the induced method's - _cntx_init() function. */ \ - cntx = bli_gks_query_ind_cntx( ind, dt ); \ -\ - /* 3mh and 4mh change the context for each stage, and so in order to - remain thread-safe, we must make a local copy of the context for - those induced methods. */ \ - cntx_t cntx_l; \ - if ( ind == BLIS_3MH || ind == BLIS_4MH ) { cntx_l = *cntx; cntx = &cntx_l; } \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Some induced methods execute in multiple "stages". */ \ - for ( i = 0; i < nstage; ++i ) \ - { \ - /* Prepare the context for the ith stage of computation. 
*/ \ - bli_cntx_ind_stage( ind, i, cntx ); \ -\ - /* For multi-stage methods, use BLIS_ONE as beta after the first - stage. */ \ - if ( i > 0 ) beta_use = &BLIS_ONE; \ -\ - /* Invoke the operation's front end and request the default control - tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx, rntm, NULL ); \ - } \ -} - -// herk -GENFRONT( herk, gemm, 3mh, 3 ) -GENFRONT( herk, gemm, 3m1, 1 ) -GENFRONT( herk, gemm, 4mh, 4 ) -//GENFRONT( herk, gemm, 4mb, 1 ) // Not implemented. -GENFRONT( herk, gemm, 4m1, 1 ) -GENFRONT( herk, gemm, 1m, 1 ) - -// syrk -GENFRONT( syrk, gemm, 3mh, 3 ) -GENFRONT( syrk, gemm, 3m1, 1 ) -GENFRONT( syrk, gemm, 4mh, 4 ) -//GENFRONT( syrk, gemm, 4mb, 1 ) // Not implemented. -GENFRONT( syrk, gemm, 4m1, 1 ) -GENFRONT( syrk, gemm, 1m, 1 ) - - // -- trmm --------------------------------------------------------------------- #undef GENFRONT diff --git a/frame/ind/oapi/bli_l3_ind_oapi.c b/frame/ind/oapi/bli_l3_ind_oapi.c index 931153a2d1..3b0d8d69a8 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.c +++ b/frame/ind/oapi/bli_l3_ind_oapi.c @@ -36,7 +36,7 @@ #include "blis.h" -// -- gemm/her2k/syr2k --------------------------------------------------------- +// -- gemm --------------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, imeth ) \ @@ -68,8 +68,6 @@ void PASTEMAC(opname,imeth) \ GENFRONT( gemm, ind ) GENFRONT( gemmt, ind ) -GENFRONT( her2k, ind ) -GENFRONT( syr2k, ind ) // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -108,39 +106,6 @@ GENFRONT( symm, ind ) GENFRONT( trmm3, ind ) -// -- herk/syrk ---------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, imeth ) \ -\ -void PASTEMAC(opname,imeth) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - num_t dt = bli_obj_dt( c ); \ - PASTECH(opname,_oft) func = 
PASTEMAC(opname,ind_get_avail)( dt ); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - func( alpha, a, beta, c, cntx, rntm ); \ -} - -GENFRONT( herk, ind ) -GENFRONT( syrk, ind ) - - // -- trmm/trsm ---------------------------------------------------------------- #undef GENFRONT diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 6d469d9c72..6ae66a2370 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -44,11 +44,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(symm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syrk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syr2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void 
PASTEMAC(trmm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(trsm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); @@ -69,11 +65,7 @@ GENPROT( 1m ) \ BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(symm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syrk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syr2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); GENPROT_NO2OP( 3mh ) diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 9e59303eed..1f8d81bf85 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -41,7 +41,7 @@ // of executing one iteration of a for loop, plus the overhead of calling a // function that does nothing (ie: the _cntx_init_stage() function). 
-// -- gemm/her2k/syr2k/gemmt --------------------------------------------------- +// -- gemm --------------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, cname, imeth ) \ @@ -81,8 +81,6 @@ void PASTEMAC(opname,imeth) \ GENFRONT( gemm, gemm, nat ) #endif GENFRONT( gemmt, gemm, nat ) -GENFRONT( her2k, gemm, nat ) -GENFRONT( syr2k, gemm, nat ) // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -125,43 +123,6 @@ GENFRONT( symm, gemm, nat ) GENFRONT( trmm3, gemm, nat ) -// -- herk/syrk ---------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, cname, imeth ) \ -\ -void PASTEMAC(opname,imeth) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Obtain a valid (native) context from the gks if necessary. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Invoke the operation's front end. 
*/ \ - PASTEMAC(opname,_front) \ - ( \ - alpha, a, beta, c, cntx, rntm, NULL \ - ); \ -} - -GENFRONT( herk, gemm, nat ) -GENFRONT( syrk, gemm, nat ) - - // -- trmm --------------------------------------------------------------------- #undef GENFRONT diff --git a/frame/ind/tapi/bli_l3_ind_tapi.c b/frame/ind/tapi/bli_l3_ind_tapi.c index 9ca7746bc0..02458e285f 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.c +++ b/frame/ind/tapi/bli_l3_ind_tapi.c @@ -165,131 +165,6 @@ INSERT_GENTFUNC_BASIC0( hemm4m1 ) INSERT_GENTFUNC_BASIC0( hemm1m ) -// -- herk --------------------------------------------------------------------- - -#undef GENTFUNCR -#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - const num_t dt_r = PASTEMAC(chr,type); \ - const num_t dt = PASTEMAC(ch,type); \ -\ - obj_t alphao, ao, betao, co; \ -\ - dim_t m_a, n_a; \ -\ - bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ -\ - bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \ - bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ -\ - bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ -\ - bli_obj_set_uplo( uploc, &co ); \ - bli_obj_set_conjtrans( transa, &ao ); \ -\ - bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ -\ - PASTEMAC0(opname) \ - ( \ - &alphao, \ - &ao, \ - &betao, \ - &co, \ - cntx, \ - rntm \ - ); \ -} - -INSERT_GENTFUNCR_BASIC0( herk3mh ) -INSERT_GENTFUNCR_BASIC0( herk3m1 ) -INSERT_GENTFUNCR_BASIC0( herk4mh ) -INSERT_GENTFUNCR_BASIC0( herk4m1 ) -INSERT_GENTFUNCR_BASIC0( herk1m ) - - -// -- her2k -------------------------------------------------------------------- - -#undef GENTFUNCR 
-#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - const num_t dt_r = PASTEMAC(chr,type); \ - const num_t dt = PASTEMAC(ch,type); \ -\ - obj_t alphao, ao, bo, betao, co; \ -\ - dim_t m_a, n_a; \ - dim_t m_b, n_b; \ -\ - bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ - bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ -\ - bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ - bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ -\ - bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ -\ - bli_obj_set_uplo( uploc, &co ); \ - bli_obj_set_conjtrans( transa, &ao ); \ - bli_obj_set_conjtrans( transb, &bo ); \ -\ - bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ -\ - PASTEMAC0(opname) \ - ( \ - &alphao, \ - &ao, \ - &bo, \ - &betao, \ - &co, \ - cntx, \ - rntm \ - ); \ -} - -INSERT_GENTFUNCR_BASIC0( her2k3mh ) -INSERT_GENTFUNCR_BASIC0( her2k3m1 ) -INSERT_GENTFUNCR_BASIC0( her2k4mh ) -INSERT_GENTFUNCR_BASIC0( her2k4m1 ) -INSERT_GENTFUNCR_BASIC0( her2k1m ) - - // -- symm --------------------------------------------------------------------- #undef GENTFUNC @@ -357,129 +232,6 @@ INSERT_GENTFUNC_BASIC0( symm4m1 ) INSERT_GENTFUNC_BASIC0( symm1m ) -// -- syrk --------------------------------------------------------------------- - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t 
cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - const num_t dt = PASTEMAC(ch,type); \ -\ - obj_t alphao, ao, betao, co; \ -\ - dim_t m_a, n_a; \ -\ - bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ -\ - bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ - bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ -\ - bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ -\ - bli_obj_set_uplo( uploc, &co ); \ - bli_obj_set_conjtrans( transa, &ao ); \ -\ - bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ -\ - PASTEMAC0(opname) \ - ( \ - &alphao, \ - &ao, \ - &betao, \ - &co, \ - cntx, \ - rntm \ - ); \ -} - -INSERT_GENTFUNC_BASIC0( syrk3mh ) -INSERT_GENTFUNC_BASIC0( syrk3m1 ) -INSERT_GENTFUNC_BASIC0( syrk4mh ) -INSERT_GENTFUNC_BASIC0( syrk4m1 ) -INSERT_GENTFUNC_BASIC0( syrk1m ) - - -// -- syr2k -------------------------------------------------------------------- - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - const num_t dt = PASTEMAC(ch,type); \ -\ - obj_t alphao, ao, bo, betao, co; \ -\ - dim_t m_a, n_a; \ - dim_t m_b, n_b; \ -\ - bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ - bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ -\ - bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ - bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ -\ - bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, 
&bo ); \ - bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ -\ - bli_obj_set_uplo( uploc, &co ); \ - bli_obj_set_conjtrans( transa, &ao ); \ - bli_obj_set_conjtrans( transb, &bo ); \ -\ - bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ -\ - PASTEMAC0(opname) \ - ( \ - &alphao, \ - &ao, \ - &bo, \ - &betao, \ - &co, \ - cntx, \ - rntm \ - ); \ -} - -INSERT_GENTFUNC_BASIC0( syr2k3mh ) -INSERT_GENTFUNC_BASIC0( syr2k3m1 ) -INSERT_GENTFUNC_BASIC0( syr2k4mh ) -INSERT_GENTFUNC_BASIC0( syr2k4m1 ) -INSERT_GENTFUNC_BASIC0( syr2k1m ) - - // -- trmm3 -------------------------------------------------------------------- #undef GENTFUNC diff --git a/frame/ind/tapi/bli_l3_ind_tapi.h b/frame/ind/tapi/bli_l3_ind_tapi.h index 49ff6a8739..ecf8c729b3 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.h +++ b/frame/ind/tapi/bli_l3_ind_tapi.h @@ -87,56 +87,6 @@ INSERT_GENTPROT_BASIC0( hemm4m1 ) INSERT_GENTPROT_BASIC0( hemm1m ) -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntmx \ - ); - -INSERT_GENTPROTR_BASIC0( her2k3mh ) -INSERT_GENTPROTR_BASIC0( her2k3m1 ) -INSERT_GENTPROTR_BASIC0( her2k4mh ) -INSERT_GENTPROTR_BASIC0( her2k4m1 ) -INSERT_GENTPROTR_BASIC0( her2k1m ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntmx \ - ); - -INSERT_GENTPROTR_BASIC0( herk3mh ) -INSERT_GENTPROTR_BASIC0( herk3m1 ) -INSERT_GENTPROTR_BASIC0( herk4mh ) -INSERT_GENTPROTR_BASIC0( herk4m1 ) 
-INSERT_GENTPROTR_BASIC0( herk1m ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -164,56 +114,6 @@ INSERT_GENTPROT_BASIC0( symm4m1 ) INSERT_GENTPROT_BASIC0( symm1m ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( syr2k3mh ) -INSERT_GENTPROT_BASIC0( syr2k3m1 ) -INSERT_GENTPROT_BASIC0( syr2k4mh ) -INSERT_GENTPROT_BASIC0( syr2k4m1 ) -INSERT_GENTPROT_BASIC0( syr2k1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( syrk3mh ) -INSERT_GENTPROT_BASIC0( syrk3m1 ) -INSERT_GENTPROT_BASIC0( syrk4mh ) -INSERT_GENTPROT_BASIC0( syrk4m1 ) -INSERT_GENTPROT_BASIC0( syrk1m ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 9ebd47de1d..6dc4f9141c 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -678,7 +678,7 @@ siz_t bli_thread_range_mdim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). 
if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } @@ -737,7 +737,7 @@ siz_t bli_thread_range_ndim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_gemmt_small.c similarity index 99% rename from kernels/zen/3/bli_syrk_small.c rename to kernels/zen/3/bli_gemmt_small.c index 23d47298c6..f2fd88de7b 100644 --- a/kernels/zen/3/bli_syrk_small.c +++ b/kernels/zen/3/bli_gemmt_small.c @@ -52,9 +52,9 @@ static float C_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); static double D_C_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); -#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. -#define AT_MR 4 // The kernel dimension of the A transpose SYRK kernel.(AT_MR * NR). -static err_t bli_ssyrk_small +#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. +#define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel.(AT_MR * NR). 
+static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -65,7 +65,7 @@ static err_t bli_ssyrk_small cntl_t* cntl ); -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -76,7 +76,7 @@ static err_t bli_dsyrk_small cntl_t* cntl ); -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -87,7 +87,7 @@ static err_t bli_ssyrk_small_atbn cntl_t* cntl ); -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -98,11 +98,11 @@ static err_t bli_dsyrk_small_atbn cntl_t* cntl ); /* -* The bli_syrk_small function will use the +* The bli_gemmt_small function will use the * custom MRxNR kernels, to perform the computation. * The custom kernels are used if the [M * N] < 240 * 240 */ -err_t bli_syrk_small +err_t bli_gemmt_small ( obj_t* alpha, obj_t* a, @@ -113,20 +113,20 @@ err_t bli_syrk_small cntl_t* cntl ) { - // FGVZ: This code was originally in bli_syrk_front(). However, it really - // fits more naturally here within the bli_syrk_small() function. This + // FGVZ: This code was originally in bli_gemmt_front(). However, it really + // fits more naturally here within the bli_gemmt_small() function. This // becomes a bit more obvious now that the code is here, as it contains - // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_SYRK, which are specific + // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific // to this implementation. if ( bli_obj_has_trans( a ) ) { // Continue with small implementation. 
; } - else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) || - ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) ) + else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) || + ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ) { // Continue with small implementation. ; @@ -162,11 +162,11 @@ err_t bli_syrk_small { if (dt == BLIS_FLOAT) { - return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } else if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } } @@ -175,19 +175,19 @@ err_t bli_syrk_small if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl); } if (dt == BLIS_FLOAT) { - return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl); } return BLIS_NOT_YET_IMPLEMENTED; }; -static err_t bli_ssyrk_small +static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -240,7 +240,7 @@ static err_t bli_ssyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -1584,7 +1584,7 @@ static err_t bli_ssyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy in case of beta = 0 @@ -1673,7 +1673,7 @@ static err_t bli_ssyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); ymm0 = 
_mm256_loadu_ps((C + _i*rsc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); @@ -1703,11 +1703,11 @@ static err_t bli_ssyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 8; _l++; } @@ -1729,8 +1729,8 @@ static err_t bli_ssyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -1747,7 +1747,7 @@ static err_t bli_ssyrk_small } } } - + return BLIS_SUCCESS; } else @@ -1756,7 +1756,7 @@ static err_t bli_ssyrk_small }; -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -1810,7 +1810,7 @@ static err_t bli_dsyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -3154,7 +3154,7 @@ static err_t bli_dsyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy for beta = 0 @@ -3195,7 +3195,7 @@ static err_t bli_dsyrk_small { ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 4; _l++; } @@ -3243,7 +3243,7 @@ static err_t bli_dsyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC), 
ymm0); @@ -3273,7 +3273,7 @@ static err_t bli_dsyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3299,8 +3299,8 @@ static err_t bli_dsyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3317,7 +3317,7 @@ static err_t bli_dsyrk_small } } } - + return BLIS_SUCCESS; } else @@ -3326,7 +3326,7 @@ static err_t bli_dsyrk_small }; -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3364,7 +3364,7 @@ static err_t bli_ssyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. + // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3715,7 +3715,7 @@ static err_t bli_ssyrk_small_atbn } } } - + //copy/compute sryk values back to C if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C { @@ -3774,7 +3774,7 @@ static err_t bli_ssyrk_small_atbn return BLIS_NONCONFORMAL_DIMENSIONS; } -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3812,7 +3812,7 @@ static err_t bli_dsyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. 
+ // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3968,7 +3968,7 @@ static err_t bli_dsyrk_small_atbn result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; - + tC += ldc; ymm6 = _mm256_hadd_pd(ymm6, ymm6); _mm256_storeu_pd(scratch, ymm6); @@ -4199,7 +4199,7 @@ static err_t bli_dsyrk_small_atbn } } } - + return BLIS_SUCCESS; } else From eba07771bf786433cb4a052df353f9255135eef1 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 4 Sep 2021 11:52:19 -0500 Subject: [PATCH 03/24] Implement user-defined packing microkernels. --- frame/1m/packm/bli_packm_blk_var1.c | 971 +++++++++++++--------------- frame/base/bli_obj.c | 9 +- frame/include/bli_obj_macro_defs.h | 10 + frame/include/bli_type_defs.h | 26 +- 4 files changed, 497 insertions(+), 519 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 87f8df4f7d..ff4ac9e3a0 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -35,35 +35,6 @@ #include "blis.h" -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T) - ( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool invdiag, - bool revifup, - bool reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void_fp packm_ker, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); - static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { @@ -107,7 +78,7 @@ void bli_packm_blk_var1 obj_t* p, cntx_t* cntx, cntl_t* cntl, - thrinfo_t* t + thrinfo_t* thread ) { #ifdef BLIS_ENABLE_GEMM_MD @@ -115,38 +86,41 @@ void bli_packm_blk_var1 // datatypes differ. 
if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) ) { - bli_packm_blk_var1_md( c, p, cntx, cntl, t ); + bli_packm_blk_var1_md( c, p, cntx, cntl, thread ); return; } #endif - num_t dt_p = bli_obj_dt( p ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool invdiag = bli_obj_has_inverted_diag( p ); - bool revifup = bli_obj_is_pack_rev_if_upper( p ); - bool reviflo = bli_obj_is_pack_rev_if_lower( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); + num_t dt_p = bli_obj_dt( p ); + dim_t dt_size = bli_dt_size( dt_p ); + + struc_t strucc = bli_obj_struc( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); + diag_t diagc = bli_obj_diag( c ); + uplo_t uploc = bli_obj_uplo( c ); + trans_t transc = bli_obj_conjtrans_status( c ); + pack_t schema = bli_obj_pack_schema( p ); + bool invdiag = bli_obj_has_inverted_diag( p ); + bool revifup = bli_obj_is_pack_rev_if_upper( p ); + bool reviflo = bli_obj_is_pack_rev_if_lower( p ); + + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_max_p = bli_obj_padded_length( p ); + dim_t n_max_p = bli_obj_padded_width( p ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + dim_t m_off_c = bli_obj_row_off( p ); + dim_t n_off_c = bli_obj_col_off( p ); + + void* 
buf_p = bli_obj_buffer_at_off( p ); + inc_t rs_p = bli_obj_row_stride( p ); + inc_t cs_p = bli_obj_col_stride( p ); + inc_t is_p = bli_obj_imag_stride( p ); + dim_t pd_p = bli_obj_panel_dim( p ); + inc_t ps_p = bli_obj_panel_stride( p ); obj_t kappa; void* buf_kappa; @@ -154,8 +128,6 @@ void bli_packm_blk_var1 func_t* packm_kers; void_fp packm_ker; - FUNCPTR_T f; - // Treatment of kappa (ie: packing during scaling) depends on // whether we are executing an induced method. @@ -181,7 +153,7 @@ void bli_packm_blk_var1 // harder, so we avoid the need altogether with the code below.) if ( bli_obj_scalar_has_nonzero_imag( p ) ) { - //printf( "applying non-zero imag kappa\n" ); + //printf( "applying non-zero imag kappa\n_p" ); // Detach the scalar. bli_obj_scalar_detach( p, &kappa ); @@ -244,516 +216,487 @@ void bli_packm_blk_var1 // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_get_dt( dt_p, packm_kers ); - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_p]; - - // Invoke the function. 
- f( strucc, - diagoffc, - diagc, - uploc, - transc, - schema, - invdiag, - revifup, - reviflo, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - packm_ker, - cntx, - t ); -} + packm_ker_vft packm_ker_cast = packm_ker; + obj_pack_ukr_fn_t pack_ker_user = bli_obj_pack_ukr_fn( c ); + + char* restrict kappa_cast = buf_kappa; + char* restrict c_cast = buf_c; + char* restrict p_cast = buf_p; + char* restrict c_begin; + char* restrict p_begin; + + dim_t iter_dim; + dim_t n_iter; + dim_t it, ic, ip; + dim_t ic0, ip0; + doff_t ic_inc, ip_inc; + doff_t diagoffc_i; + doff_t diagoffc_inc; + dim_t panel_len_full; + dim_t panel_len_i; + dim_t panel_len_max; + dim_t panel_len_max_i; + dim_t panel_dim_i; + dim_t panel_dim_max; + dim_t panel_off_i; + dim_t panel_len_off; + dim_t panel_dim_off; + dim_t panel_dim_off_i; + inc_t vs_c; + inc_t ldc; + inc_t ldp, p_inc; + dim_t* m_panel_full; + dim_t* n_panel_full; + dim_t* m_panel_use; + dim_t* n_panel_use; + dim_t* m_panel_max; + dim_t* n_panel_max; + conj_t conjc; + bool row_stored; + bool col_stored; + inc_t is_p_use; + dim_t ss_num; + dim_t ss_den; + + char* restrict c_use; + char* restrict p_use; + doff_t diagoffp_i; -#undef GENTFUNCR -#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \ -\ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* 
restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - dim_t panel_off_i; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ - inc_t is_p_use; \ - dim_t ss_num; \ - dim_t ss_den; \ -\ - ctype* restrict c_use; \ - ctype* restrict p_use; \ - doff_t diagoffp_i; \ -\ -\ /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ \ - if ( bli_is_zeros( uploc ) && \ - bli_is_triangular( strucc ) ) return; \ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ + to pack it. */ + if ( bli_is_zeros( uploc ) && + bli_is_triangular( strucc ) ) return; + + /* Extract the conjugation bit from the transposition argument. */ + conjc = bli_extract_conj( transc ); + /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_uplo( &uploc ); \ - bli_toggle_trans( &transc ); \ - } \ -\ + express the remaining parameters and code. */ + if ( bli_does_trans( transc ) ) + { + bli_swap_incs( &rs_c, &cs_c ); + bli_negate_diag_offset( &diagoffc ); + bli_toggle_uplo( &uploc ); + bli_toggle_trans( &transc ); + } + /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ + mismatch in "row" and "column" semantics. */ + row_stored = bli_is_col_packed( schema ); + col_stored = bli_is_row_packed( schema ); + /* If the row storage flag indicates row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t )panel_dim_max; \ - ldp = rs_p; \ - m_panel_full = &m; \ - n_panel_full = &panel_dim_i; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim_max; \ - ldp = cs_p; \ - m_panel_full = &panel_dim_i; \ - n_panel_full = &n; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ + we are packing to row panels. */ + if ( row_stored ) + { + /* Prepare to pack to row-stored column panels. */ + iter_dim = n_p; + panel_len_full = m_p; + panel_len_max = m_max_p; + panel_dim_max = pd_p; + panel_len_off = m_off_c; + panel_dim_off = n_off_c; + ldc = rs_c; + vs_c = cs_c; + diagoffc_inc = -( doff_t )panel_dim_max; + ldp = rs_p; + m_panel_full = &m_p; + n_panel_full = &panel_dim_i; + m_panel_use = &panel_len_i; + n_panel_use = &panel_dim_i; + m_panel_max = &panel_len_max_i; + n_panel_max = &panel_dim_max; + } + else /* if ( col_stored ) */ + { + /* Prepare to pack to column-stored row panels. 
*/ + iter_dim = m_p; + panel_len_full = n_p; + panel_len_max = n_max_p; + panel_dim_max = pd_p; + panel_len_off = n_off_c; + panel_dim_off = m_off_c; + ldc = cs_c; + vs_c = rs_c; + diagoffc_inc = ( doff_t )panel_dim_max; + ldp = cs_p; + m_panel_full = &panel_dim_i; + n_panel_full = &n_p; + m_panel_use = &panel_dim_i; + n_panel_use = &panel_len_i; + m_panel_max = &panel_dim_max; + n_panel_max = &panel_len_max_i; + } + /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale by 3/2, and in the cases of real-only, imag-only, or summed-only, we need to scale by 1/2. In both cases, we are compensating for the fact that pointer arithmetic occurs in terms of complex elements rather than real - elements. */ \ - if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \ - else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ - else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ - else { ss_num = 1; ss_den = 1; } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ + elements. */ + if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } + else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } + else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } + else { ss_num = 1; ss_den = 1; } + + /* Compute the total number of iterations we'll need. */ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); + /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. 
*/ \ - if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ - ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ - { \ - ic0 = (n_iter - 1) * panel_dim_max; \ - ic_inc = -panel_dim_max; \ - ip0 = n_iter - 1; \ - ip_inc = -1; \ - } \ - else \ - { \ - ic0 = 0; \ - ic_inc = panel_dim_max; \ - ip0 = 0; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ + based on whether reverse iteration was requested. */ + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || + ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) + { + ic0 = (n_iter - 1) * panel_dim_max; + ic_inc = -panel_dim_max; + ip0 = n_iter - 1; + ip_inc = -1; + } + else + { + ic0 = 0; + ic_inc = panel_dim_max; + ip0 = 0; + ip_inc = 1; + } + + p_begin = p_cast; + /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - dim_t it_start, it_end, it_inc; \ -\ + packm thrinfo_t node. */ + const dim_t nt = bli_thread_n_way( thread ); + const dim_t tid = bli_thread_work_id( thread ); + + dim_t it_start, it_end, it_inc; + /* Determine the thread range and increment using the current thread's packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested - at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - /* Iterate over every logical micropanel in the source matrix. 
*/ \ - for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ -\ - if ( bli_is_triangular( strucc ) && \ - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ - { \ + at configure-time. */ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + + /* Iterate over every logical micropanel in the source matrix. */ + for ( ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) + { + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + + diagoffc_i = diagoffc + (ip )*diagoffc_inc; + c_begin = c_cast + (ic )*vs_c*dt_size; + + p_inc = ps_p; + + if ( pack_ker_user ) + { + /* This case executes if the user has specified a custom packing microkernel */ + + panel_dim_off_i = panel_dim_off + ic; + + c_use = c_begin; + p_use = p_begin; + + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + pack_ker_user( panel_dim_i, + panel_dim_max, + panel_dim_off_i, + panel_len_full, + panel_len_max, + panel_len_off, + kappa_cast, + c_use, vs_c, ldc, + p_use, ldp, + bli_obj_user_data( c ), + cntx ); + } + } + else if ( bli_is_triangular( strucc ) && + bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) + { /* This case executes if the panel belongs to a triangular matrix AND is completely unstored (ie: zero). If the panel is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ \ -\ - continue; \ - } \ - else if ( bli_is_triangular( strucc ) && \ - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ - { \ + increment p_begin.) 
*/ + + continue; + } + else if ( bli_is_triangular( strucc ) && + bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) + { /* This case executes if the panel belongs to a triangular matrix AND is diagonal-intersecting. Notice that we cannot bury the following conditional logic into packm_struc_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ \ -\ + panel_len_max_i so we can properly increment p_inc. */ + /* Sanity check. Diagonals should not intersect the short end of a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( col_stored && diagoffc_i < 0 ) || \ - ( row_stored && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ - panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \ - panel_len_max ); \ - diagoffp_i = diagoffc_i; \ - } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ - { \ - panel_off_i = bli_abs( diagoffc_i ); \ - panel_len_i = panel_len_full - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp_i = 0; \ - } \ -\ - c_use = c_begin + (panel_off_i )*ldc; \ - p_use = p_begin; \ -\ + blocksizes was somehow violated. 
*/ + if ( ( col_stored && diagoffc_i < 0 ) || + ( row_stored && diagoffc_i > 0 ) ) + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + if ( ( row_stored && bli_is_upper( uploc ) ) || + ( col_stored && bli_is_lower( uploc ) ) ) + { + panel_off_i = 0; + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; + panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, + panel_len_max ); + diagoffp_i = diagoffc_i; + } + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || + ( col_stored && bli_is_upper( uploc ) ) ) */ + { + panel_off_i = bli_abs( diagoffc_i ); + panel_len_i = panel_len_full - panel_off_i; + panel_len_max_i = panel_len_max - panel_off_i; + diagoffp_i = 0; + } + + c_use = c_begin + (panel_off_i )*ldc*dt_size; + p_use = p_begin; + /* We need to re-compute the imaginary stride as a function of panel_len_max_i since triangular packed matrices have panels of varying lengths. NOTE: This imaginary stride value is - only referenced by the packm kernels for induced methods. */ \ - is_p_use = ldp * panel_len_max_i; \ -\ - /* We nudge the imaginary stride up by one if it is odd. */ \ - is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ -\ + only referenced by the packm kernels for induced methods. */ + is_p_use = ldp * panel_len_max_i; + + /* We nudge the imaginary stride up by one if it is odd. */ + is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); + /* NOTE: We MUST use round-robin partitioning when packing micropanels of a triangular matrix. Hermitian/symmetric and general packing may use slab or round-robin, depending - on which was selected at configure-time. */ \ - if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ + on which was selected at configure-time. 
*/ + if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast( strucc, + diagoffp_i, + diagc, + uploc, + conjc, + schema, + invdiag, + *m_panel_use, + *n_panel_use, + *m_panel_max, + *n_panel_max, + kappa_cast, + c_use, rs_c, cs_c, + p_use, rs_p, cs_p, + is_p_use, + cntx ); + } + /* NOTE: This value is usually LESS than ps_p because triangular matrices usually have several micro-panels that are shorter - than a "full" micro-panel. */ \ - p_inc = ( is_p_use * ss_num ) / ss_den; \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ + than a "full" micro-panel. */ + p_inc = ( is_p_use * ss_num ) / ss_den; + } + else if ( bli_is_herm_or_symm( strucc ) ) + { /* This case executes if the panel belongs to a Hermitian or symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ + diagonal-intersecting panels. */ + + c_use = c_begin; + p_use = p_begin; + + panel_len_i = panel_len_full; + panel_len_max_i = panel_len_max; + + is_p_use = is_p; + /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffc_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - p_inc = ps_p; \ - } \ - else \ - { \ + or round-robin partitioning was requested at configure-time. 
*/ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast( strucc, + diagoffc_i, + diagc, + uploc, + conjc, + schema, + invdiag, + *m_panel_use, + *n_panel_use, + *m_panel_max, + *n_panel_max, + kappa_cast, + c_use, rs_c, cs_c, + p_use, rs_p, cs_p, + is_p_use, + cntx ); + } + } + else + { /* This case executes if the panel is general, or, if the panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ + (ie: zero) nor diagonal-intersecting. */ + + c_use = c_begin; + p_use = p_begin; + + panel_len_i = panel_len_full; + panel_len_max_i = panel_len_max; + + is_p_use = is_p; + /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( BLIS_GENERAL, \ - 0, \ - diagc, \ - BLIS_DENSE, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - /* NOTE: This value is equivalent to ps_p. */ \ - p_inc = ps_p; \ - } \ -\ - p_begin += p_inc; \ -\ - } \ -} + or round-robin partitioning was requested at configure-time. 
*/ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast( BLIS_GENERAL, + 0, + diagc, + BLIS_DENSE, + conjc, + schema, + invdiag, + *m_panel_use, + *n_panel_use, + *m_panel_max, + *n_panel_max, + kappa_cast, + c_use, rs_c, cs_c, + p_use, rs_p, cs_p, + is_p_use, + cntx ); + } + } -INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) + p_begin += p_inc*dt_size; + } +} /* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ +if ( row_stored ) +PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m_p, n_p, + c_cast, rs_c, cs_c, "%4.1f", "" ); +if ( col_stored ) +PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m_p, n_p, + c_cast, rs_c, cs_c, "%4.1f", "" ); */ /* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ +if ( row_stored ) +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, + p_use, rs_p, cs_p, "%5.2f", "" ); +else +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, + p_use, rs_p, cs_p, "%5.2f", "" ); +*/ + /* -if ( col_stored ) { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 
1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ -else { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ +if ( col_stored ) { + if ( bli_thread_work_id( thread ) == 0 ) + { + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); + fflush( stdout ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); + fflush( stdout ); + } +bli_thread_barrier( thread 
); + if ( bli_thread_work_id( thread ) == 1 ) + { + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); + fflush( stdout ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); + fflush( stdout ); + } +bli_thread_barrier( thread ); +} +else { + if ( bli_thread_work_id( thread ) == 0 ) + { + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); + fflush( stdout ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); + fflush( stdout ); + } +bli_thread_barrier( thread ); + if ( bli_thread_work_id( thread ) == 1 ) + { + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); + fflush( stdout ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); + fflush( stdout ); + } +bli_thread_barrier( thread ); +} */ /* - if ( bli_is_4mi_packed( schema ) ) { \ - printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ - if ( col_stored ) { \ - if ( 0 ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_p_use, 
rs_p, cs_p, "%4.1f", "" ); \ - } \ - if ( row_stored ) { \ - if ( 0 ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ - } \ - } \ + if ( bli_is_4mi_packed( schema ) ) { + printf( "packm_var2: is_p_use = %lu\n_p", is_p_use ); + if ( col_stored ) { + if ( 0 ) + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); + } + if ( row_stored ) { + if ( 0 ) + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); + } + } */ /* - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); */ /* - if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \ - (( 
ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - inc_t is_b = rs_p * *m_panel_max; \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ - } \ + if ( row_stored ) { + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, + (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); + inc_t is_b = rs_p * *m_panel_max; + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); + } */ /* - if ( col_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ + if ( col_stored ) { + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, + (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); + 
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, + ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); + } */ diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 43e5101b5f..b3245c9611 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -118,6 +118,11 @@ void bli_obj_create_without_buffer bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); + bli_obj_set_pack_fn( NULL, obj ); + bli_obj_set_pack_ukr_fn( NULL, obj ); + bli_obj_set_ker_fn( NULL, obj ); + bli_obj_set_ukr_fn( NULL, obj ); + // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); s = bli_obj_internal_scalar_buffer( obj ); @@ -356,7 +361,7 @@ void bli_obj_free buf_a = bli_obj_buffer_at_off( a ); - bli_zzsets( 0.0, 0.0, value ); + bli_zzsets( 0.0, 0.0, value ); if ( bli_obj_is_float( a ) ) { @@ -500,7 +505,7 @@ void bli_adjust_strides // Set the column stride to indicate that this is a column vector // stored in column-major order. This is done for legacy reasons, // because we at one time we had to satisify the error checking - // in the underlying BLAS library, which expects the leading + // in the underlying BLAS library, which expects the leading // dimension to be set to at least m, even if it will never be // used for indexing since it is a vector and thus only has one // column of data. 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 84c977289c..d4829854d2 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1210,6 +1210,11 @@ BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) return obj->pack; } +BLIS_INLINE obj_pack_ukr_fn_t bli_obj_pack_ukr_fn( obj_t* obj ) +{ + return obj->pack_ukr; +} + BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker; @@ -1227,6 +1232,11 @@ BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) obj->pack = pack; } +BLIS_INLINE void bli_obj_set_pack_ukr_fn( obj_pack_ukr_fn_t pack_ukr, obj_t* obj ) +{ + obj->pack_ukr = pack_ukr; +} + BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) { obj->ker = ker; diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index f03fc72acd..021cbdf942 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1249,6 +1249,21 @@ typedef void (*obj_pack_fn_t) struct thrinfo_s* thread ); +typedef void (*obj_pack_ukr_fn_t) + ( + dim_t m, \ + dim_t m_max, \ + dim_t m_off, \ + dim_t n, \ + dim_t n_max, \ + dim_t n_off, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp, \ + void* params, \ + struct cntx_s* cntx \ + ); + typedef void (*obj_ker_fn_t) ( struct obj_s* a, @@ -1307,9 +1322,10 @@ typedef struct obj_s void* user_data; // Function pointers - obj_pack_fn_t pack; - obj_ker_fn_t ker; - obj_ukr_fn_t ukr; + obj_pack_fn_t pack; + obj_pack_ukr_fn_t pack_ukr; + obj_ker_fn_t ker; + obj_ukr_fn_t ukr; } obj_t; @@ -1352,6 +1368,7 @@ typedef struct obj_s .user_data = NULL, \ \ .pack = NULL, \ + .pack_ukr = NULL, \ .ker = NULL, \ .ukr = NULL \ } @@ -1386,6 +1403,7 @@ typedef struct obj_s .user_data = NULL, \ \ .pack = NULL, \ + .pack_ukr = NULL, \ .ker = NULL, \ .ukr = NULL \ } @@ -1425,6 +1443,7 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) 
b->user_data = a->user_data; b->pack = a->pack; + b->pack_ukr = a->pack_ukr; b->ker = a->ker; b->ukr = a->ukr; } @@ -1464,6 +1483,7 @@ BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) b->user_data = a->user_data; b->pack = a->pack; + b->pack_ukr = a->pack_ukr; b->ker = a->ker; b->ukr = a->ukr; } From f57edcc4e914ff0e20db738c97c96fbc9d34ae86 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 10 Sep 2021 12:34:50 -0500 Subject: [PATCH 04/24] Fix induced method handling for gemmt/syr2k/her2k/syrk/herk. --- frame/3/bli_l3_oapi.c | 286 ++++++++++++++++------------ frame/3/bli_l3_oapi.h | 91 +++++---- frame/ind/bli_l3_ind.c | 24 +-- frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 10 +- frame/ind/oapi/bli_l3_ind_oapi.c | 2 +- frame/ind/oapi/bli_l3_ind_oapi.h | 1 + frame/ind/oapi/bli_l3_nat_oapi.c | 2 +- frame/ind/tapi/bli_l3_ind_tapi.c | 65 +++++++ frame/ind/tapi/bli_l3_ind_tapi.h | 26 +++ 9 files changed, 328 insertions(+), 179 deletions(-) diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index a540d94e43..0a18ddc372 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -147,21 +147,15 @@ void bli_gemmt_ex } */ - /* Only proceed with an induced method if each of the operands have a - complex storage datatype. NOTE: Allowing precisions to vary while - using 1m, which is what we do here, is unique to gemm; other level-3 - operations use 1m only if all storage datatypes are equal (and they - ignore the computation precision). If any operands are real, skip the - induced method chooser function and proceed directly with native - execution. */ - if ( bli_obj_is_complex( c ) && - bli_obj_is_complex( a ) && - bli_obj_is_complex( b ) ) + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). 
*/ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) { - /* FIXME: BLIS does not yet support induced methods for gemmt. Thus, - we call the native implementation code path for now. */ - /*PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm );*/ - bli_gemmtnat( alpha, a, b, beta, c, cntx, rntm ); + bli_gemmtind( alpha, a, b, beta, c, cntx, rntm ); } else { @@ -181,49 +175,59 @@ void bli_gemmt bli_gemmt_ex( alpha, a, b, beta, c, NULL, NULL ); } -void bli_her2k_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - bli_init_once(); - - obj_t ah; - obj_t bh; - obj_t alphah; - - bli_obj_alias_to( alpha, &alphah ); - bli_obj_toggle_conj( &alphah ); - - bli_obj_alias_to( a, &ah ); - bli_obj_induce_trans( &ah ); - bli_obj_toggle_conj( &ah ); - - bli_obj_alias_to( b, &bh ); - bli_obj_induce_trans( &bh ); - bli_obj_toggle_conj( &bh ); - - // Invoke gemmt twice, using beta only the first time. - - bli_gemmt_ex( alpha, a, &bh, beta, c, cntx, rntm ); - bli_gemmt_ex( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); - - // The Hermitian rank-2k product was computed as A*B'+B*A', even for - // the diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-2k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. 
- - bli_setid( &BLIS_ZERO, c ); +#undef GENTFUNC +#define GENTFUNC(opname,ind) \ +void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + obj_t ah; \ + obj_t bh; \ + obj_t alphah; \ +\ + bli_obj_alias_to( alpha, &alphah ); \ + bli_obj_toggle_conj( &alphah ); \ +\ + bli_obj_alias_to( a, &ah ); \ + bli_obj_induce_trans( &ah ); \ + bli_obj_toggle_conj( &ah ); \ +\ + bli_obj_alias_to( b, &bh ); \ + bli_obj_induce_trans( &bh ); \ + bli_obj_toggle_conj( &bh ); \ +\ + /* Invoke gemmt twice, using beta only the first time. */ \ +\ + PASTEMAC(gemmt,ind)( alpha, a, &bh, beta, c, cntx, rntm ); \ + PASTEMAC(gemmt,ind)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); \ +\ + /* The Hermitian rank-2k product was computed as A*B'+B*A', even for \ + * the diagonal elements. Mathematically, the imaginary components of \ + * diagonal elements of a Hermitian rank-2k product should always be \ + * zero. However, in practice, they sometimes accumulate meaningless \ + * non-zero values. To prevent this, we explicitly set those values \ + * to zero before returning. */ \ + \ + bli_setid( &BLIS_ZERO, c ); \ } +GENTFUNC(her2k,_ex); +GENTFUNC(her2k,3mh); +GENTFUNC(her2k,3m1); +GENTFUNC(her2k,4mh); +GENTFUNC(her2k,4m1); +GENTFUNC(her2k,1m); +GENTFUNC(her2k,nat); + void bli_her2k ( obj_t* alpha, @@ -236,33 +240,43 @@ void bli_her2k bli_her2k_ex( alpha, a, b, beta, c, NULL, NULL ); } -void bli_syr2k_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - bli_init_once(); - - obj_t at; - obj_t bt; - - bli_obj_alias_to( b, &bt ); - bli_obj_induce_trans( &bt ); - bli_obj_alias_to( a, &at ); - bli_obj_induce_trans( &at ); - - // Invoke gemmt twice, using beta only the first time. 
- - bli_gemmt_ex( alpha, a, &bt, beta, c, cntx, rntm ); - bli_gemmt_ex( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); +#undef GENTFUNC +#define GENTFUNC(opname,ind) \ +void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + obj_t at; \ + obj_t bt; \ +\ + bli_obj_alias_to( b, &bt ); \ + bli_obj_induce_trans( &bt ); \ + bli_obj_alias_to( a, &at ); \ + bli_obj_induce_trans( &at ); \ +\ + /* Invoke gemmt twice, using beta only the first time. */ \ +\ + PASTEMAC(gemmt,ind)( alpha, a, &bt, beta, c, cntx, rntm ); \ + PASTEMAC(gemmt,ind)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); \ } +GENTFUNC(syr2k,_ex); +GENTFUNC(syr2k,3mh); +GENTFUNC(syr2k,3m1); +GENTFUNC(syr2k,4mh); +GENTFUNC(syr2k,4m1); +GENTFUNC(syr2k,1m); +GENTFUNC(syr2k,nat); + void bli_syr2k ( obj_t* alpha, @@ -419,36 +433,46 @@ void bli_trmm3 bli_trmm3_ex( side, alpha, a, b, beta, c, NULL, NULL ); } -void bli_herk_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - bli_init_once(); - - obj_t ah; - - bli_obj_alias_to( a, &ah ); - bli_obj_induce_trans( &ah ); - bli_obj_toggle_conj( &ah ); - - bli_gemmt_ex( alpha, a, &ah, beta, c, cntx, rntm ); - - // The Hermitian rank-k product was computed as A*A', even for the - // diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. 
- - bli_setid( &BLIS_ZERO, c ); +#undef GENTFUNC +#define GENTFUNC(opname,ind) \ +void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + obj_t ah; \ +\ + bli_obj_alias_to( a, &ah ); \ + bli_obj_induce_trans( &ah ); \ + bli_obj_toggle_conj( &ah ); \ +\ + PASTEMAC(gemmt,ind)( alpha, a, &ah, beta, c, cntx, rntm ); \ +\ + /* The Hermitian rank-k product was computed as A*A', even for the \ + * diagonal elements. Mathematically, the imaginary components of \ + * diagonal elements of a Hermitian rank-k product should always be \ + * zero. However, in practice, they sometimes accumulate meaningless \ + * non-zero values. To prevent this, we explicitly set those values \ + * to zero before returning. */ \ +\ + bli_setid( &BLIS_ZERO, c ); \ } +GENTFUNC(herk,_ex); +GENTFUNC(herk,3mh); +GENTFUNC(herk,3m1); +GENTFUNC(herk,4mh); +GENTFUNC(herk,4m1); +GENTFUNC(herk,1m); +GENTFUNC(herk,nat); + void bli_herk ( obj_t* alpha, @@ -460,26 +484,36 @@ void bli_herk bli_herk_ex( alpha, a, beta, c, NULL, NULL ); } -void bli_syrk_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - bli_init_once(); - - obj_t at; - - bli_obj_alias_to( a, &at ); - bli_obj_induce_trans( &at ); - - bli_gemmt_ex( alpha, a, &at, beta, c, cntx, rntm ); +#undef GENTFUNC +#define GENTFUNC(opname,ind) \ +void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + obj_t at; \ +\ + bli_obj_alias_to( a, &at ); \ + bli_obj_induce_trans( &at ); \ +\ + PASTEMAC(gemmt,ind)( alpha, a, &at, beta, c, cntx, rntm ); \ } +GENTFUNC(syrk,_ex); +GENTFUNC(syrk,3mh); +GENTFUNC(syrk,3m1); +GENTFUNC(syrk,4mh); +GENTFUNC(syrk,4m1); +GENTFUNC(syrk,1m); +GENTFUNC(syrk,nat); + void bli_syrk ( obj_t* alpha, diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index 
02845fe309..e3d5f91f99 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -78,17 +78,35 @@ BLIS_EXPORT_BLIS void bli_gemmt obj_t* c ); -BLIS_EXPORT_BLIS void bli_her2k_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm +#undef GENTDEF +#define GENTDEF(opname,ind) \ +BLIS_EXPORT_BLIS void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); +GENTDEF(her2k,_ex); +GENTDEF(her2k,3mh); +GENTDEF(her2k,3m1); +GENTDEF(her2k,4mh); +GENTDEF(her2k,4m1); +GENTDEF(her2k,1m); +GENTDEF(her2k,nat); + +GENTDEF(syr2k,_ex); +GENTDEF(syr2k,3mh); +GENTDEF(syr2k,3m1); +GENTDEF(syr2k,4mh); +GENTDEF(syr2k,4m1); +GENTDEF(syr2k,1m); +GENTDEF(syr2k,nat); + BLIS_EXPORT_BLIS void bli_her2k ( obj_t* alpha, @@ -98,17 +116,6 @@ BLIS_EXPORT_BLIS void bli_her2k obj_t* c ); -BLIS_EXPORT_BLIS void bli_syr2k_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - BLIS_EXPORT_BLIS void bli_syr2k ( obj_t* alpha, @@ -181,16 +188,34 @@ BLIS_EXPORT_BLIS void bli_trmm3 obj_t* c ); -BLIS_EXPORT_BLIS void bli_herk_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm +#undef GENTDEF +#define GENTDEF(opname,ind) \ +BLIS_EXPORT_BLIS void PASTEMAC(opname,ind) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); +GENTDEF(herk,_ex); +GENTDEF(herk,3mh); +GENTDEF(herk,3m1); +GENTDEF(herk,4mh); +GENTDEF(herk,4m1); +GENTDEF(herk,1m); +GENTDEF(herk,nat); + +GENTDEF(syrk,_ex); +GENTDEF(syrk,3mh); +GENTDEF(syrk,3m1); +GENTDEF(syrk,4mh); +GENTDEF(syrk,4m1); +GENTDEF(syrk,1m); +GENTDEF(syrk,nat); + BLIS_EXPORT_BLIS void bli_herk ( obj_t* alpha, @@ -199,16 +224,6 @@ BLIS_EXPORT_BLIS void bli_herk obj_t* c ); -BLIS_EXPORT_BLIS void bli_syrk_ex - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - 
rntm_t* rntm - ); - BLIS_EXPORT_BLIS void bli_syrk ( obj_t* alpha, diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index 81b4ea6f60..6f83190174 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -39,20 +39,20 @@ static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = { /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ -/* 3mh */ { bli_gemm3mh, NULL, bli_hemm3mh, NULL, NULL, bli_symm3mh, - NULL, NULL, bli_trmm33mh, NULL, NULL }, -/* 3m1 */ { bli_gemm3m1, NULL, bli_hemm3m1, NULL, NULL, bli_symm3m1, - NULL, NULL, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 }, -/* 4mh */ { bli_gemm4mh, NULL, bli_hemm4mh, NULL, NULL, bli_symm4mh, - NULL, NULL, bli_trmm34mh, NULL, NULL }, +/* 3mh */ { bli_gemm3mh, bli_gemmt3mh, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh, + bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL }, +/* 3m1 */ { bli_gemm3m1, bli_gemmt3m1, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1, + bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 }, +/* 4mh */ { bli_gemm4mh, bli_gemmt4mh, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh, + bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL }, /* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, -/* 4m1 */ { bli_gemm4m1, NULL, bli_hemm4m1, NULL, NULL, bli_symm4m1, - NULL, NULL, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, -/* 1m */ { bli_gemm1m, NULL, bli_hemm1m, NULL, NULL, bli_symm1m, - NULL, NULL, bli_trmm31m, bli_trmm1m, bli_trsm1m }, -/* nat */ { bli_gemmnat, bli_gemmtnat, bli_hemmnat, NULL, NULL, bli_symmnat, - NULL, NULL, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, +/* 4m1 */ { bli_gemm4m1, bli_gemmt4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1, + bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, +/* 1m */ { bli_gemm1m, bli_gemmt1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m, + bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m }, +/* nat */ { 
bli_gemmnat, bli_gemmtnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat, + bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, }; // diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 3089ecfa7f..4db771b7fa 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -35,7 +35,7 @@ #include "blis.h" -// -- gemm --------------------------------------------------------------------- +// -- gemm/gemmt --------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, cname, imeth, nstage ) \ @@ -125,6 +125,14 @@ GENFRONT( gemm, gemm, 4mb, 1 ) GENFRONT( gemm, gemm, 4m1, 1 ) GENFRONT( gemm, gemm, 1m, 1 ) +// gemmt +GENFRONT( gemmt, gemmt, 3mh, 3 ) +GENFRONT( gemmt, gemmt, 3m1, 1 ) +GENFRONT( gemmt, gemmt, 4mh, 4 ) +//GENFRONT( gemmt, gemmt, 4mb, 1 ) // Not implemented. +GENFRONT( gemmt, gemmt, 4m1, 1 ) +GENFRONT( gemmt, gemmt, 1m, 1 ) + // -- hemm/symm/trmm3 ---------------------------------------------------------- diff --git a/frame/ind/oapi/bli_l3_ind_oapi.c b/frame/ind/oapi/bli_l3_ind_oapi.c index 3b0d8d69a8..ab4cb4ac3e 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.c +++ b/frame/ind/oapi/bli_l3_ind_oapi.c @@ -36,7 +36,7 @@ #include "blis.h" -// -- gemm --------------------------------------------------------------------- +// -- gemm/gemmt --------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, imeth ) \ diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 6ae66a2370..722970d7c3 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -64,6 +64,7 @@ GENPROT( 1m ) #define GENPROT_NO2OP( imeth ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ +BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(symm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 1f8d81bf85..6c9ed02e9e 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -41,7 +41,7 @@ // of executing one iteration of a for loop, plus the overhead of calling a // function that does nothing (ie: the _cntx_init_stage() function). -// -- gemm --------------------------------------------------------------------- +// -- gemm/gemmt --------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, cname, imeth ) \ diff --git a/frame/ind/tapi/bli_l3_ind_tapi.c b/frame/ind/tapi/bli_l3_ind_tapi.c index 02458e285f..21ba50c554 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.c +++ b/frame/ind/tapi/bli_l3_ind_tapi.c @@ -98,6 +98,71 @@ INSERT_GENTFUNC_BASIC0( gemm4m1 ) INSERT_GENTFUNC_BASIC0( gemm1m ) +// -- gemmt -------------------------------------------------------------------- + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, &m_a, 
&n_a ); \ + bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, &co ); \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ +\ + PASTEMAC0(opname) \ + ( \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNC_BASIC0( gemmt3mh ) +INSERT_GENTFUNC_BASIC0( gemmt3m1 ) +INSERT_GENTFUNC_BASIC0( gemmt4mh ) +INSERT_GENTFUNC_BASIC0( gemmt4m1 ) +INSERT_GENTFUNC_BASIC0( gemmt1m ) + + // -- hemm --------------------------------------------------------------------- #undef GENTFUNC diff --git a/frame/ind/tapi/bli_l3_ind_tapi.h b/frame/ind/tapi/bli_l3_ind_tapi.h index ecf8c729b3..ac824fabfe 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.h +++ b/frame/ind/tapi/bli_l3_ind_tapi.h @@ -60,6 +60,32 @@ INSERT_GENTPROT_BASIC0( gemm4m1 ) INSERT_GENTPROT_BASIC0( gemm1m ) +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ); + +INSERT_GENTPROT_BASIC0( gemmt3mh ) +INSERT_GENTPROT_BASIC0( gemmt3m1 ) +INSERT_GENTPROT_BASIC0( gemmt4mh ) +INSERT_GENTPROT_BASIC0( gemmt4m1 ) +INSERT_GENTPROT_BASIC0( gemmt1m ) + + #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ From 123677d75d934716eb89e7b66bd9bb716e70a50e Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 13 Sep 
2021 11:42:50 -0500 Subject: [PATCH 05/24] Major update of l3 packing code: 1. Removed bli_packm_blk_var1_md and merged with bli_packm_blk_var1. 2. Simplified packing code under the assumption that the matrices are passed as A and B^T (i.e., that k is always the row dimension). This primarily affects bli_packm_blk_var1 and the bli_packm_struc_cxk functions. To compensate, explicit transposition is done as appropriate in bli_(gemm|trsm)_pack[ab]. CAVEAT: explicit transpositions of B and Bp are also needed in the gemm and gemmtrsm ukr tests. 3. Explicitly remove any offsets in the user-provided obj_t's. Thus the offsets represent only the partitioning by the framework. --- frame/1m/bli_l1m_ft_ker.h | 12 +- frame/1m/packm/bli_packm_blk_var1.c | 434 ++++++-------------- frame/1m/packm/bli_packm_blk_var1_md.c | 344 ---------------- frame/1m/packm/bli_packm_blk_var1_md.h | 67 --- frame/1m/packm/bli_packm_init.c | 124 +----- frame/1m/packm/bli_packm_md.h | 1 - frame/1m/packm/bli_packm_struc_cxk.c | 351 +++++----------- frame/1m/packm/bli_packm_struc_cxk.h | 72 +--- frame/1m/packm/bli_packm_struc_cxk_1er.c | 343 +++++----------- frame/1m/packm/bli_packm_struc_cxk_1er.h | 72 +--- frame/1m/packm/bli_packm_struc_cxk_3mis.c | 477 +++++++--------------- frame/1m/packm/bli_packm_struc_cxk_3mis.h | 72 +--- frame/1m/packm/bli_packm_struc_cxk_4mi.c | 427 +++++++------------ frame/1m/packm/bli_packm_struc_cxk_4mi.h | 72 +--- frame/1m/packm/bli_packm_struc_cxk_md.c | 51 +-- frame/1m/packm/bli_packm_struc_cxk_md.h | 17 +- frame/1m/packm/bli_packm_struc_cxk_rih.c | 320 +++++---------- frame/1m/packm/bli_packm_struc_cxk_rih.h | 72 +--- frame/3/gemm/bli_gemm_cntl.c | 4 +- frame/3/gemm/bli_gemm_front.c | 4 + frame/3/gemm/bli_gemm_md.c | 6 +- frame/3/gemm/bli_gemm_packab.c | 33 +- frame/3/hemm/bli_hemm_front.c | 4 + frame/3/symm/bli_symm_front.c | 4 + frame/3/trmm/bli_trmm_front.c | 4 + frame/3/trmm3/bli_trmm3_front.c | 4 + frame/3/trsm/bli_trsm_blk_var1.c | 2 +- frame/3/trsm/bli_trsm_cntl.c 
| 2 +- frame/3/trsm/bli_trsm_front.c | 4 + frame/3/trsm/bli_trsm_packab.c | 33 +- frame/include/bli_obj_macro_defs.h | 8 + frame/include/bli_type_defs.h | 1 - testsuite/src/test_gemm_ukr.c | 13 +- testsuite/src/test_gemmtrsm_ukr.c | 19 +- 34 files changed, 912 insertions(+), 2561 deletions(-) delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.c delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.h diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index e8ebdec0d8..1fc8bdf571 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -56,13 +56,13 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index ff4ac9e3a0..a2442360af 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -71,6 +71,7 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = NULL, bli_zpackm_struc_cxk_1er, } }, }; +static packm_ker_vft GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( @@ -81,256 +82,101 @@ void bli_packm_blk_var1 thrinfo_t* thread ) { -#ifdef BLIS_ENABLE_GEMM_MD - // Call a different packm implementation when the storage and target - // datatypes differ. 
- if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) ) - { - bli_packm_blk_var1_md( c, p, cntx, cntl, thread ); - return; - } -#endif - - num_t dt_p = bli_obj_dt( p ); - dim_t dt_size = bli_dt_size( dt_p ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool invdiag = bli_obj_has_inverted_diag( p ); - bool revifup = bli_obj_is_pack_rev_if_upper( p ); - bool reviflo = bli_obj_is_pack_rev_if_lower( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - dim_t m_off_c = bli_obj_row_off( p ); - dim_t n_off_c = bli_obj_col_off( p ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); - - obj_t kappa; - void* buf_kappa; - - func_t* packm_kers; - void_fp packm_ker; - - - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. - if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. 
- buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - //printf( "applying non-zero imag kappa\n_p" ); - - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. 
- kappa_p = &BLIS_ONE; - } + num_t dt_c = bli_obj_dt( c ); + dim_t dt_c_size = bli_dt_size( dt_c ); + + num_t dt_p = bli_obj_dt( p ); + dim_t dt_p_size = bli_dt_size( dt_p ); + + struc_t strucc = bli_obj_struc( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); + diag_t diagc = bli_obj_diag( c ); + uplo_t uploc = bli_obj_uplo( c ); + trans_t transc = bli_obj_conjtrans_status( c ); + pack_t schema = bli_obj_pack_schema( p ); + bool invdiag = bli_obj_has_inverted_diag( p ); + bool revifup = bli_obj_is_pack_rev_if_upper( p ); + bool reviflo = bli_obj_is_pack_rev_if_lower( p ); + + dim_t iter_dim = bli_obj_length( p ); + dim_t panel_len_full = bli_obj_width( p ); + dim_t panel_len_max = bli_obj_padded_width( p ); + + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t incc = bli_obj_row_stride( c ); + inc_t ldc = bli_obj_col_stride( c ); + dim_t panel_dim_off = bli_obj_row_off( c ); + dim_t panel_len_off = bli_obj_col_off( c ); + + char* p_cast = bli_obj_buffer_at_off( p ); + inc_t ldp = bli_obj_col_stride( p ); + inc_t is_p = bli_obj_imag_stride( p ); + dim_t panel_dim_max = bli_obj_panel_dim( p ); + inc_t ps_p = bli_obj_panel_stride( p ); + + doff_t diagoffc_inc = ( doff_t )panel_dim_max; - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); - } + /* If C is zeros and part of a triangular matrix, then we don't need + to pack it. */ + if ( bli_is_zeros( uploc ) && + bli_is_triangular( strucc ) ) return; + char* kappa_cast; + + // The value for kappa we use will depend on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing induced complex domain algorithms in terms of + // real domain micro-kernels. (In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) 
+ if ( bli_obj_scalar_has_nonzero_imag( p ) && + !bli_is_nat_packed( schema ) ) + { + //printf( "applying non-zero imag kappa\n_p" ); + obj_t kappa; -#if 0 - if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; - else if ( bli_is_3mi_packed( schema ) || - bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers; - else if ( bli_is_ro_packed( schema ) || - bli_is_io_packed( schema ) || - bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; - else packm_kers = packm_struc_cxk_kers; -#else - // The original idea here was to read the packm_ukr from the context - // if it is non-NULL. The problem is, it requires that we be able to - // assume that the packm_ukr field is initialized to NULL, which it - // currently is not. + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); - //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - //if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) ) - { - // If the packm structure-aware kernel func_t in the context is - // NULL (which is the default value after the context is created), - // we use the default lookup table to determine the right func_t - // for the current schema. - const dim_t i = bli_pack_schema_index( schema ); - - packm_kers = &packm_struc_cxk_kers[ i ]; + kappa_cast = bli_obj_buffer_for_1x1( dt_p, &kappa ); } -#if 0 - else // cntx's packm func_t overrides + // This branch is also for native execution, where we assume that + // the micro-kernel will always apply the alpha scalar of the + // higher-level operation. Thus, we use BLIS_ONE for kappa so + // that the underlying packm implementation does not perform + // any scaling during packing. + else { - // If the packm structure-aware kernel func_t in the context is - // non-NULL (ie: assumed to be valid), we use that instead. 
- //packm_kers = bli_cntx_packm_ukrs( cntx ); - packm_kers = cntx_packm_kers; + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_cast = bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); } -#endif -#endif + + // If the packm structure-aware kernel func_t in the context is + // NULL (which is the default value after the context is created), + // we use the default lookup table to determine the right func_t + // for the current schema. + func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; // Query the datatype-specific function pointer from the func_t object. - packm_ker = bli_func_get_dt( dt_p, packm_kers ); - - packm_ker_vft packm_ker_cast = packm_ker; - obj_pack_ukr_fn_t pack_ker_user = bli_obj_pack_ukr_fn( c ); - - char* restrict kappa_cast = buf_kappa; - char* restrict c_cast = buf_c; - char* restrict p_cast = buf_p; - char* restrict c_begin; - char* restrict p_begin; - - dim_t iter_dim; - dim_t n_iter; - dim_t it, ic, ip; - dim_t ic0, ip0; - doff_t ic_inc, ip_inc; - doff_t diagoffc_i; - doff_t diagoffc_inc; - dim_t panel_len_full; - dim_t panel_len_i; - dim_t panel_len_max; - dim_t panel_len_max_i; - dim_t panel_dim_i; - dim_t panel_dim_max; - dim_t panel_off_i; - dim_t panel_len_off; - dim_t panel_dim_off; - dim_t panel_dim_off_i; - inc_t vs_c; - inc_t ldc; - inc_t ldp, p_inc; - dim_t* m_panel_full; - dim_t* n_panel_full; - dim_t* m_panel_use; - dim_t* n_panel_use; - dim_t* m_panel_max; - dim_t* n_panel_max; - conj_t conjc; - bool row_stored; - bool col_stored; - inc_t is_p_use; - dim_t ss_num; - dim_t ss_den; - - char* restrict c_use; - char* restrict p_use; - doff_t diagoffp_i; + packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers ); + // For mixed-precision gemm, select the proper kernel (only dense panels). 
+ if ( dt_c != dt_p ) + { + packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; + } - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ - if ( bli_is_zeros( uploc ) && - bli_is_triangular( strucc ) ) return; + // Query the user-provided packing kernel from the obj_t. + obj_pack_ukr_fn_t pack_ker_user = bli_obj_pack_ukr_fn( c ); /* Extract the conjugation bit from the transposition argument. */ - conjc = bli_extract_conj( transc ); - - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ - if ( bli_does_trans( transc ) ) - { - bli_swap_incs( &rs_c, &cs_c ); - bli_negate_diag_offset( &diagoffc ); - bli_toggle_uplo( &uploc ); - bli_toggle_trans( &transc ); - } - - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ - row_stored = bli_is_col_packed( schema ); - col_stored = bli_is_row_packed( schema ); - - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ - if ( row_stored ) - { - /* Prepare to pack to row-stored column panels. */ - iter_dim = n_p; - panel_len_full = m_p; - panel_len_max = m_max_p; - panel_dim_max = pd_p; - panel_len_off = m_off_c; - panel_dim_off = n_off_c; - ldc = rs_c; - vs_c = cs_c; - diagoffc_inc = -( doff_t )panel_dim_max; - ldp = rs_p; - m_panel_full = &m_p; - n_panel_full = &panel_dim_i; - m_panel_use = &panel_len_i; - n_panel_use = &panel_dim_i; - m_panel_max = &panel_len_max_i; - n_panel_max = &panel_dim_max; - } - else /* if ( col_stored ) */ - { - /* Prepare to pack to column-stored row panels. 
*/ - iter_dim = m_p; - panel_len_full = n_p; - panel_len_max = n_max_p; - panel_dim_max = pd_p; - panel_len_off = n_off_c; - panel_dim_off = m_off_c; - ldc = cs_c; - vs_c = rs_c; - diagoffc_inc = ( doff_t )panel_dim_max; - ldp = cs_p; - m_panel_full = &panel_dim_i; - n_panel_full = &n_p; - m_panel_use = &panel_dim_i; - n_panel_use = &panel_len_i; - m_panel_max = &panel_dim_max; - n_panel_max = &panel_len_max_i; - } + conj_t conjc = bli_extract_conj( transc ); /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale by 3/2, and in the @@ -338,16 +184,22 @@ void bli_packm_blk_var1 1/2. In both cases, we are compensating for the fact that pointer arithmetic occurs in terms of complex elements rather than real elements. */ + dim_t ss_num; + dim_t ss_den; + if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } else { ss_num = 1; ss_den = 1; } /* Compute the total number of iterations we'll need. */ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); + dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); /* Set the initial values and increments for indices related to C and P based on whether reverse iteration was requested. */ + dim_t ic0, ip0; + doff_t ic_inc, ip_inc; + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) { @@ -364,40 +216,36 @@ void bli_packm_blk_var1 ip_inc = 1; } - p_begin = p_cast; - /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ const dim_t nt = bli_thread_n_way( thread ); const dim_t tid = bli_thread_work_id( thread ); - dim_t it_start, it_end, it_inc; - /* Determine the thread range and increment using the current thread's packm thrinfo_t node. 
NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. */ + dim_t it_start, it_end, it_inc; bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + char* p_begin = p_cast; + /* Iterate over every logical micropanel in the source matrix. */ - for ( ic = ic0, ip = ip0, it = 0; it < n_iter; - ic += ic_inc, ip += ip_inc, it += 1 ) + for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) { - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); - diagoffc_i = diagoffc + (ip )*diagoffc_inc; - c_begin = c_cast + (ic )*vs_c*dt_size; + doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; + char* c_begin = c_cast + (ic )*incc*dt_c_size; - p_inc = ps_p; + inc_t p_inc = ps_p; if ( pack_ker_user ) { /* This case executes if the user has specified a custom packing microkernel */ - panel_dim_off_i = panel_dim_off + ic; - - c_use = c_begin; - p_use = p_begin; + dim_t panel_dim_off_i = panel_dim_off + ic; /* The definition of bli_packm_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ @@ -410,14 +258,14 @@ void bli_packm_blk_var1 panel_len_max, panel_len_off, kappa_cast, - c_use, vs_c, ldc, - p_use, ldp, + c_begin, incc, ldc, + p_begin, ldp, bli_obj_user_data( c ), cntx ); } } else if ( bli_is_triangular( strucc ) && - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) + bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) { /* This case executes if the panel belongs to a triangular matrix AND is completely unstored (ie: zero). 
If the panel @@ -427,7 +275,7 @@ void bli_packm_blk_var1 continue; } else if ( bli_is_triangular( strucc ) && - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) + bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) { /* This case executes if the panel belongs to a triangular matrix AND is diagonal-intersecting. Notice that we @@ -439,12 +287,15 @@ void bli_packm_blk_var1 a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. */ - if ( ( col_stored && diagoffc_i < 0 ) || - ( row_stored && diagoffc_i > 0 ) ) + if ( diagoffc_i < 0 ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); - if ( ( row_stored && bli_is_upper( uploc ) ) || - ( col_stored && bli_is_lower( uploc ) ) ) + dim_t panel_off_i; + dim_t panel_len_i; + dim_t panel_len_max_i; + doff_t diagoffp_i; + + if ( bli_is_lower( uploc ) ) { panel_off_i = 0; panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; @@ -452,8 +303,7 @@ void bli_packm_blk_var1 panel_len_max ); diagoffp_i = diagoffc_i; } - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || - ( col_stored && bli_is_upper( uploc ) ) ) */ + else /* if ( bli_is_upper( uploc ) ) */ { panel_off_i = bli_abs( diagoffc_i ); panel_len_i = panel_len_full - panel_off_i; @@ -461,14 +311,14 @@ void bli_packm_blk_var1 diagoffp_i = 0; } - c_use = c_begin + (panel_off_i )*ldc*dt_size; - p_use = p_begin; + char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; + char* p_use = p_begin; /* We need to re-compute the imaginary stride as a function of panel_len_max_i since triangular packed matrices have panels of varying lengths. NOTE: This imaginary stride value is only referenced by the packm kernels for induced methods. */ - is_p_use = ldp * panel_len_max_i; + inc_t is_p_use = ldp * panel_len_max_i; /* We nudge the imaginary stride up by one if it is odd. */ is_p_use += ( bli_is_odd( is_p_use ) ? 
1 : 0 ); @@ -486,13 +336,13 @@ void bli_packm_blk_var1 conjc, schema, invdiag, - *m_panel_use, - *n_panel_use, - *m_panel_max, - *n_panel_max, + panel_dim_i, + panel_len_i, + panel_dim_max, + panel_len_max, kappa_cast, - c_use, rs_c, cs_c, - p_use, rs_p, cs_p, + c_use, incc, ldc, + p_use, ldp, is_p_use, cntx ); } @@ -508,14 +358,6 @@ void bli_packm_blk_var1 symmetric matrix, which includes stored, unstored, and diagonal-intersecting panels. */ - c_use = c_begin; - p_use = p_begin; - - panel_len_i = panel_len_full; - panel_len_max_i = panel_len_max; - - is_p_use = is_p; - /* The definition of bli_packm_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) @@ -527,14 +369,13 @@ void bli_packm_blk_var1 conjc, schema, invdiag, - *m_panel_use, - *n_panel_use, - *m_panel_max, - *n_panel_max, + panel_dim_i, + panel_len_full, + panel_dim_max, + panel_len_max, kappa_cast, - c_use, rs_c, cs_c, - p_use, rs_p, cs_p, - is_p_use, + c_begin, incc, ldc, + p_begin, ldp, is_p, cntx ); } } @@ -544,14 +385,6 @@ void bli_packm_blk_var1 panel is part of a triangular matrix and is neither unstored (ie: zero) nor diagonal-intersecting. */ - c_use = c_begin; - p_use = p_begin; - - panel_len_i = panel_len_full; - panel_len_max_i = panel_len_max; - - is_p_use = is_p; - /* The definition of bli_packm_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. 
*/ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) @@ -563,19 +396,18 @@ void bli_packm_blk_var1 conjc, schema, invdiag, - *m_panel_use, - *n_panel_use, - *m_panel_max, - *n_panel_max, + panel_dim_i, + panel_len_full, + panel_dim_max, + panel_len_max, kappa_cast, - c_use, rs_c, cs_c, - p_use, rs_p, cs_p, - is_p_use, + c_begin, incc, ldc, + p_begin, ldp, is_p, cntx ); } } - p_begin += p_inc*dt_size; + p_begin += p_inc*dt_p_size; } } diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c deleted file mode 100644 index a7c694e4fc..0000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_GEMM_MD - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - trans_t transc, - pack_t schema, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md); - - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ) -{ - num_t dt_c = bli_obj_dt( c ); - num_t dt_p = bli_obj_dt( p ); - - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); - - obj_t kappa; - void* buf_kappa; - - FUNCPTR_T f; - - - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. 
- if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); - } - - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_c][dt_p]; - - // Invoke the function. 
- f( - transc, - schema, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - cntx, - t ); -} - - -#undef GENTFUNC2 -#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - ctype_p* restrict kappa_cast = kappa; \ - ctype_c* restrict c_cast = c; \ - ctype_p* restrict p_cast = p; \ - ctype_c* restrict c_begin; \ - ctype_p* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - doff_t ic_inc, ip_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t p_inc; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ -\ - ctype_c* restrict c_use; \ - ctype_p* restrict p_use; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - ( void )col_stored; \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - vs_c = cs_c; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - vs_c = rs_c; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - { \ - ic_inc = panel_dim_max; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - /* Suppress unused variable warnings when slab partitioning is enabled, - since the slab-based definition of bli_packm_my_iter() does not - actually use tid or nt. */ \ - ( void )nt; ( void )tid; \ -\ - dim_t it_start, it_end, it_inc; \ -\ - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. 
*/ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - for ( ic = 0, ip = 0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - c_begin = c_cast + (ic )*vs_c; \ -\ - { \ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - PASTEMAC2(chc,chp,packm_struc_cxk_md) \ - ( \ - conjc, \ - schema, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p, \ - cntx \ - ); \ - } \ -\ - p_inc = ps_p; \ - } \ -\ -/* -if ( row_stored ) \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ - p_begin += p_inc; \ -\ - } \ -} - -INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md ) -INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md ) - -#endif diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h deleted file mode 100644 index e6bf151d07..0000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ); - - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( packm_blk_var1_md ) -INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md ) - diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 57c1175bfe..832cdbc7e2 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -212,7 +212,6 @@ siz_t bli_packm_init_pack num_t dt_tar = bli_obj_target_dt( a ); num_t dt_scalar = bli_obj_scalar_dt( a ); - trans_t transa = bli_obj_onlytrans_status( a ); dim_t m_a = bli_obj_length( a ); dim_t n_a = bli_obj_width( a ); dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); @@ -242,24 +241,9 @@ siz_t bli_packm_init_pack // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); - // Update the dimension fields to explicitly reflect a transposition, - // if needed. - // Then, clear the conjugation and transposition fields from the object - // since matrix packing in BLIS is deemed to take care of all conjugation - // and transposition necessary. - // Then, we adjust the properties of P when A needs a transposition. - // We negate the diagonal offset, and if A is upper- or lower-stored, - // we either toggle the uplo of P. - // Finally, if we mark P as dense since we assume that all matrices, - // regardless of structure, will be densified. 
- bli_obj_set_dims_with_trans( transa, m_a, n_a, p ); - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p ); - if ( bli_does_trans( transa ) ) - { - bli_obj_negate_diag_offset( p ); - if ( bli_obj_is_upper_or_lower( a ) ) - bli_obj_toggle_uplo( p ); - } + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); // If we are packing micropanels, mark P as dense. Otherwise, we are // probably being called in the context of a level-2 operation, in @@ -358,8 +342,7 @@ siz_t bli_packm_init_pack // Compute the size of the packed buffer. size_p = cs_p * n_p_pad * elem_size_p; } - else if ( bli_is_row_packed( schema ) && - bli_is_panel_packed( schema ) ) + else if ( bli_is_panel_packed( schema ) ) { dim_t m_panel; dim_t ps_p, ps_p_orig; @@ -413,11 +396,6 @@ siz_t bli_packm_init_pack bli_is_io_packed( schema ) || bli_is_rpi_packed( schema ) ) { - // The division by 2 below assumes that ps_p is an even number. - // However, it is possible that, at this point, ps_p is an odd. - // If it is indeed odd, we nudge it higher. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - // Despite the fact that the packed micropanels will contain // real elements, the panel stride that we store in the obj_t // (which is passed into the macro-kernel) needs to be in units @@ -451,100 +429,6 @@ siz_t bli_packm_init_pack // Compute the size of the packed buffer. size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; } - else if ( bli_is_col_packed( schema ) && - bli_is_panel_packed( schema ) ) - { - dim_t n_panel; - dim_t ps_p, ps_p_orig; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the n dimension. - n_panel = bmult_n_def; - - // The "row stride" of a column-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. 
Thus, this is equal to the - // packing (storage) blocksize multiple (which may be equal to the - // default (logical) blocksize multiple. - rs_p = bmult_n_pack; - - // The "column stride" of a column-micropanel packed object is - // interpreted as the column stride WITHIN a micropanel. Thus, it is - // unit. - cs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded length computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the bottom edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the n - // dimension of the matrix is not a whole multiple of NR. - ps_p = m_p_pad * rs_p; - - // As a general rule, we don't want micropanel strides to be odd. This - // is primarily motivated by our desire to support interleaved 3m - // micropanels, in which case we have to scale the panel stride - // by 3/2. That division by 2 means the numerator (prior to being - // scaled by 3) must be even. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Preserve this early panel stride value for use later, if needed. - ps_p_orig = ps_p; - - // Here, we adjust the panel stride, if necessary. Remember: ps_p is - // always interpreted as being in units of the datatype of the object - // which is not necessarily how the micropanels will be stored. For - // interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi, - // we halve ps_p. Why? Because the macro-kernel indexes in units of - // the complex datatype. So these changes "trick" it into indexing - // the correct amount. 
- if ( bli_is_3mi_packed( schema ) ) - { - ps_p = ( ps_p * 3 ) / 2; - } - else if ( bli_is_3ms_packed( schema ) || - bli_is_ro_packed( schema ) || - bli_is_io_packed( schema ) || - bli_is_rpi_packed( schema ) ) - { - // The division by 2 below assumes that ps_p is an even number. - // However, it is possible that, at this point, ps_p is an odd. - // If it is indeed odd, we nudge it higher. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Despite the fact that the packed micropanels will contain - // real elements, the panel stride that we store in the obj_t - // (which is passed into the macro-kernel) needs to be in units - // of complex elements, since the macro-kernel will index through - // micropanels via complex pointer arithmetic for trmm/trsm. - // Since the indexing "increment" will be twice as large as each - // actual stored element, we divide the panel_stride by 2. - ps_p = ps_p / 2; - } - - // Set the imaginary stride (in units of fundamental elements) for - // 3m and 4m (separated or interleaved). We use ps_p_orig since - // that variable tracks the number of real part elements contained - // within each micropanel of the source matrix. Therefore, this - // is the number of real elements that must be traversed before - // reaching the imaginary part (3mi/4mi) of the packed micropanel, - // or the real part of the next micropanel (3ms). - if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig; - else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig; - else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel ); - else is_p = 1; - - // Store the strides and panel dimension in P. - bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( n_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_p, p ); - bli_obj_set_panel_width( n_panel, p ); - - // Compute the size of the packed buffer. 
- size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p; - } else { // NOTE: When implementing block storage, we only need to implement diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_md.h index bb9d6d6135..1b7e44496f 100644 --- a/frame/1m/packm/bli_packm_md.h +++ b/frame/1m/packm/bli_packm_md.h @@ -32,6 +32,5 @@ */ -#include "bli_packm_blk_var1_md.h" #include "bli_packm_struc_cxk_md.h" diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index a3b2d66e63..d954353c77 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -46,51 +46,17 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -117,25 +83,22 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk) \ ( \ - strucc, \ - diagoffc, \ - uploc, \ - conjc, \ - schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -144,131 +107,24 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_tri_cxk) \ ( \ - strucc, \ - diagoffc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. - This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. 
*/ \ -\ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_col_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ - else if ( bli_is_row_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ } INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) @@ -283,41 +139,28 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ doff_t diagoffc_abs; \ dim_t i, j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -325,10 +168,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -350,7 +193,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -370,14 +213,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -393,8 +234,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -402,16 +243,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -486,8 +326,8 @@ void PASTEMAC(ch,varname) \ transc, \ p11_m, \ p11_n, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, \ + c11, incc, ldc, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -503,7 +343,7 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC(ch,seti0s)( *pi11 ); \ \ - pi11 += rs_p + cs_p; \ + pi11 += 1 + ldp; \ } \ } \ \ @@ -519,7 +359,7 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ kappa, \ - p11, rs_p, cs_p, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -539,25 +379,20 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ @@ -584,11 +419,11 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ kappa, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -599,10 +434,10 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ ( \ - diagoffp, \ - m_panel, \ - n_panel, \ - p, rs_p, cs_p, \ + diagoffc, \ + panel_dim, \ + panel_len, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -621,23 +456,53 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ } \ \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. 
*/ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t i = panel_dim; \ + dim_t j = panel_len; \ + dim_t m_br = panel_dim_max - i; \ + dim_t n_br = panel_len_max - j; \ + ctype* p_br = p + (i ) + (j )*ldp; \ +\ + PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + one, \ + p_br, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ } INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index 08afb19bde..c4de5cf2e7 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -32,63 +32,6 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_struc_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) - - - #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ @@ -101,21 +44,18 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROT_BASIC0( packm_struc_cxk ) +INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c index a66ba5ff6b..54134056f0 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.c +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -46,51 +46,17 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -117,25 +83,22 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk_1er) \ ( \ - strucc, \ - diagoffc, \ - uploc, \ - conjc, \ - schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -144,126 +107,24 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_tri_cxk_1er) \ ( \ - strucc, \ - diagoffc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. - This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. 
*/ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = m_panel; \ - dim_t offn = 0; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = 0; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ -*/ \ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this micro-panel is an edge case in both panel dimension and - length, then it must be a bottom-right corner case, which - typically only happens for micro-panels being packed for trsm. - (It also happens for trmm if kr > 1.) Here, we set the part of - the diagonal that extends into the zero-padded region to - identity. This prevents NaNs and Infs from creeping into the - computation. If this code does execute for trmm, it is okay, - because those 1.0's that extend into the bottom-right region - end up getting muliplied by the 0.0's in the zero-padded region - of the other matrix. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t offm = m_panel; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn_diag) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - one, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_1r_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ - \ - if ( bli_is_1e_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ } INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) @@ -278,41 +139,28 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ doff_t diagoffc_abs; \ dim_t j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -320,10 +168,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -345,7 +193,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -366,14 +214,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -389,8 +235,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -398,16 +244,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -478,8 +323,8 @@ void PASTEMAC(ch,varname) \ conjc, \ panel_dim, \ kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, ldp \ + c11, incc, ldc, \ + p11, 1, ldp, ldp \ ); \ \ /* If we are packing a micro-panel with Hermitian structure, @@ -495,8 +340,8 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ { \ ctype_r* restrict c11_r = ( ctype_r* )c11; \ - const dim_t rs_c2 = 2*rs_c; \ - const dim_t cs_c2 = 2*cs_c; \ + const dim_t incc2 = 2*incc; \ + const dim_t ldc2 = 2*ldc; \ \ PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ ( \ @@ -504,8 +349,8 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - c11_r, rs_c2, cs_c2, \ - p11, rs_p, cs_p, ldp \ + c11_r, 
incc2, ldc2, \ + p11, 1, ldp, ldp \ ); \ } \ } \ @@ -523,30 +368,25 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ - doff_t diagoffp_abs = bli_abs( diagoffp ); \ - ctype* p11 = p + (diagoffp_abs )*ldp; \ + doff_t diagoffc_abs = bli_abs( diagoffc ); \ + ctype* p11 = p + (diagoffc_abs )*ldp; \ \ \ /* Pack the panel. */ \ @@ -579,7 +419,7 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -594,7 +434,7 @@ void PASTEMAC(ch,varname) \ 0, \ panel_dim, \ panel_dim, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -610,11 +450,11 @@ void PASTEMAC(ch,varname) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ uplo_t uplop = uploc; \ - doff_t diagoffp11_0 = 0; \ + doff_t diagoffc11_0 = 0; \ dim_t p11_0_dim = panel_dim - 1; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \ \ /* Note that this macro works a little differently than the setm operation. Here, we pass in the dimensions of only p11, rather @@ -622,20 +462,51 @@ void PASTEMAC(ch,varname) \ "shrunken" dimensions of p11, corresponding to the toggling and shrinking of the diagonal above. 
The macro will do the right thing, incrementing the pointer to p11 by the appropriate - leading dimension (cs_p or rs_p), and setting only the lower + leading dimension (ldp or rs_p), and setting only the lower or upper triangle to zero. */ \ PASTEMAC(ch,set1ms_mxn_uplo) \ ( \ schema, \ - diagoffp11_0, \ + diagoffc11_0, \ uplop, \ p11_0_dim, \ p11_0_dim, \ zero, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ } \ +\ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting muliplied by the 0.0's in the zero-padded region + of the other matrix. 
*/ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = panel_dim; \ + dim_t offn = panel_len; \ + dim_t m_edge = panel_dim_max - panel_dim; \ + dim_t n_edge = panel_len_max - panel_len; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, 1, ldp, ldp \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h index 6e62d8f69e..677b600138 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.h +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -32,63 +32,6 @@ */ -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) - - - #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ @@ -101,21 +44,18 @@ 
void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_3mis.c b/frame/1m/packm/bli_packm_struc_cxk_3mis.c index 95908c8e7b..97c96cd209 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_3mis.c +++ b/frame/1m/packm/bli_packm_struc_cxk_3mis.c @@ -46,51 +46,17 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -116,25 +82,22 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk_3mis) \ ( \ - strucc, \ - diagoffc, \ - uploc, \ - conjc, \ - schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - is_p, ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -143,191 +106,24 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_tri_cxk_3mis) \ ( \ - strucc, \ - diagoffc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - is_p, ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. 
- This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. */ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (i )*rs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p, \ - cntx, \ - NULL \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ -*/ \ -\ -/* - if ( n_panel != n_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (j )*cs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ 
- BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. */ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ - ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one_r, \ - p_br_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - zero_r, \ - p_br_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ } INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3mis, packm_cxk_3mis ) @@ -342,42 +138,29 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t 
panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ doff_t diagoffc_abs; \ dim_t i, j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -385,10 +168,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) 
*/ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -409,7 +192,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype_r* restrict p_r = ( ctype_r* )p; \ \ @@ -434,14 +217,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. */ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -457,8 +238,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p_r + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -466,16 +247,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p_r; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - 
-diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -535,8 +315,8 @@ void PASTEMAC(ch,varname) \ { \ dim_t p11_m = panel_dim; \ dim_t p11_n = panel_dim; \ - inc_t rs_c11 = 2*rs_c; \ - inc_t cs_c11 = 2*cs_c; \ + inc_t incc11 = 2*incc; \ + inc_t ldc11 = 2*ldc; \ dim_t j2 = diagoffc_abs; \ ctype* c11 = ( ctype* )c + (j2 )*ldc; \ ctype_r* p11 = ( ctype_r* )p_r + (j2 )*ldp; \ @@ -559,8 +339,8 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ alpha_r, \ - c11_r, rs_c11, cs_c11, \ - p11_r, rs_p, cs_p, \ + c11_r, incc11, ldc11, \ + p11_r, 1, ldp, \ cntx, \ NULL \ ); \ @@ -576,8 +356,8 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ alpha_i, \ - c11_i, rs_c11, cs_c11, \ - p11_i, rs_p, cs_p, \ + c11_i, incc11, ldc11, \ + p11_i, 1, ldp, \ cntx, \ NULL \ ); \ @@ -589,7 +369,7 @@ void PASTEMAC(ch,varname) \ { \ for ( i = 0; i < p11_m; ++i ) \ { \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i ) + (i )*ldp; \ \ PASTEMAC(chr,set0s)( *pi11_i ); \ } \ @@ -607,7 +387,7 @@ void PASTEMAC(ch,varname) \ &kappa_r, \ &kappa_i, \ p11_r, \ - p11_i, rs_p, cs_p \ + p11_i, 1, ldp \ ); \ } \ else \ @@ -620,7 +400,7 @@ void PASTEMAC(ch,varname) \ &kappa_r, \ &kappa_i, \ p11_r, \ - p11_i, rs_p, cs_p \ + p11_i, 1, ldp \ ); \ } \ \ @@ -632,9 +412,9 @@ void PASTEMAC(ch,varname) \ for ( j = 0; j < p11_n; ++j ) \ for ( i = 0; i < p11_m; ++i ) \ { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (j )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (j )*cs_p; \ - ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (j )*cs_p; \ + ctype_r* pi11_r = p11_r + (i ) + (j )*ldp; \ + ctype_r* pi11_i = p11_i + (i ) + (j )*ldp; \ + ctype_r* pi11_rpi = p11_rpi + (i ) + (j )*ldp; \ \ PASTEMAC(chr,add3s) \ ( \ @@ -660,25 +440,20 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3mis, packm_cxk_3mis ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ 
diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ @@ -703,18 +478,18 @@ void PASTEMAC(ch,varname) \ ctype_r* p_i = ( ctype_r* )p + is_p; \ ctype_r* p_rpi = ( ctype_r* )p + 2*is_p; \ \ - dim_t j = bli_abs( diagoffp ); \ + dim_t j = bli_abs( diagoffc ); \ ctype_r* p11_r = p_r + (j )*ldp; \ ctype_r* p11_i = p_i + (j )*ldp; \ ctype_r* p11_rpi = p_rpi + (j )*ldp; \ \ - dim_t p11_m = m_panel; \ - dim_t p11_n = n_panel; \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_len; \ \ dim_t min_p11_m_n; \ \ - if ( diagoffp < 0 ) p11_m -= j; \ - else if ( diagoffp > 0 ) p11_n -= j; \ + if ( diagoffc < 0 ) p11_m -= j; \ + else if ( diagoffc > 0 ) p11_n -= j; \ \ min_p11_m_n = bli_min( p11_m, p11_n ); \ \ @@ -730,22 +505,22 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ &kappa_r, \ - p_r, rs_p, cs_p, \ + p_r, 1, ldp, \ cntx, \ NULL \ ); \ PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ &kappa_i, \ - p_i, rs_p, cs_p, \ + p_i, 1, ldp, \ cntx, \ NULL \ ); \ @@ -755,9 +530,9 @@ void PASTEMAC(ch,varname) \ and p11_i. 
*/ \ for ( i = 0; i < min_p11_m_n; ++i ) \ { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_r = p11_r + (i ) + (i )*ldp; \ + ctype_r* pi11_i = p11_i + (i ) + (i )*ldp; \ + ctype_r* pi11_rpi = p11_rpi + (i ) + (i )*ldp; \ \ PASTEMAC(chr,add3s)( *pi11_r, *pi11_i, *pi11_rpi ); \ } \ @@ -773,8 +548,8 @@ void PASTEMAC(ch,varname) \ \ for ( i = 0; i < min_p11_m_n; ++i ) \ { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_r = p11_r + (i ) + (i )*ldp; \ + ctype_r* pi11_i = p11_i + (i ) + (i )*ldp; \ \ PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ } \ @@ -793,49 +568,93 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_r, rs_p, cs_p, \ + p_r, 1, ldp, \ cntx, \ NULL \ ); \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_i, rs_p, cs_p, \ + p_i, 1, ldp, \ cntx, \ NULL \ ); \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_rpi, rs_p, cs_p, \ + p_rpi, 1, ldp, \ cntx, \ NULL \ ); \ } \ } \ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. 
+ NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = panel_dim; \ + dim_t j = panel_len; \ + dim_t m_br = panel_dim_max - i; \ + dim_t n_br = panel_len_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i ) + (j )*ldp; \ + ctype_r* p_br_i = ( ctype_r* )p + is_p + (i ) + (j )*ldp; \ +\ + PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, 1, ldp, \ + cntx, \ + NULL \ + ); \ + PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3mis, packm_cxk_3mis ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_3mis.h b/frame/1m/packm/bli_packm_struc_cxk_3mis.h index 01c8510a43..9744ea66b4 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_3mis.h +++ b/frame/1m/packm/bli_packm_struc_cxk_3mis.h @@ -32,63 +32,6 @@ */ -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_3mis ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) 
\ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_3mis ) - - - #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ @@ -101,21 +44,18 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_3mis ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_3mis ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_3mis ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_4mi.c b/frame/1m/packm/bli_packm_struc_cxk_4mi.c index 62c2d5086d..2095f3341b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_4mi.c +++ b/frame/1m/packm/bli_packm_struc_cxk_4mi.c @@ -46,51 +46,17 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* 
restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -116,25 +82,22 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk_4mi) \ ( \ - strucc, \ - diagoffc, \ - uploc, \ - conjc, \ - schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - is_p, ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -143,161 +106,24 @@ void PASTEMAC(ch,varname) \ matrices. 
*/ \ PASTEMAC(ch,packm_tri_cxk_4mi) \ ( \ - strucc, \ - diagoffc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - is_p, ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. - This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. 
*/ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. 
Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. */ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ - ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one_r, \ - p_br_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - zero_r, \ - p_br_i, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ } INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4mi, packm_cxk_4mi ) @@ -312,42 +138,29 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ doff_t diagoffc_abs; \ dim_t i, j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -355,10 +168,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -379,7 +192,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype_r* restrict p_r = ( ctype_r* )p; \ \ @@ -404,14 +217,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -427,8 +238,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p_r + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -436,16 +247,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p_r; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -505,8 +315,8 @@ void PASTEMAC(ch,varname) \ { \ dim_t p11_m = panel_dim; \ dim_t p11_n = panel_dim; \ - inc_t rs_c11 = 2*rs_c; \ - inc_t cs_c11 = 2*cs_c; \ + inc_t incc11 = 2*incc; \ + inc_t ldc11 = 2*ldc; \ dim_t j2 = diagoffc_abs; \ ctype* c11 = ( ctype* )c + (j2 )*ldc; \ ctype_r* p11 = ( ctype_r* )p_r + (j2 )*ldp; \ @@ -529,8 +339,8 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ alpha_r, \ - c11_r, rs_c11, cs_c11, \ - p11_r, rs_p, cs_p, \ + c11_r, incc11, ldc11, \ + p11_r, 1, ldp, \ cntx, \ NULL \ ); \ @@ -546,8 +356,8 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ alpha_i, \ - c11_i, rs_c11, cs_c11, \ - p11_i, rs_p, cs_p, \ + c11_i, incc11, ldc11, \ + 
p11_i, 1, ldp, \ cntx, \ NULL \ ); \ @@ -559,7 +369,7 @@ void PASTEMAC(ch,varname) \ { \ for ( i = 0; i < p11_m; ++i ) \ { \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i ) + (i )*ldp; \ \ PASTEMAC(chr,set0s)( *pi11_i ); \ } \ @@ -577,7 +387,7 @@ void PASTEMAC(ch,varname) \ &kappa_r, \ &kappa_i, \ p11_r, \ - p11_i, rs_p, cs_p \ + p11_i, 1, ldp \ ); \ } \ else \ @@ -590,14 +400,14 @@ void PASTEMAC(ch,varname) \ &kappa_r, \ &kappa_i, \ p11_r, \ - p11_i, rs_p, cs_p \ + p11_i, 1, ldp \ ); \ } \ /* - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ - p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ - p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", panel_dim_max, panel_len_max, \ + p_r + 0*is_p, rs_p, ldp, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", panel_dim_max, panel_len_max, \ + p_r + 1*is_p, rs_p, ldp, "%4.1f", "" ); \ */ \ } \ } \ @@ -615,25 +425,20 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4mi, packm_cxk_4mi ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ @@ -657,7 +462,7 @@ void PASTEMAC(ch,varname) \ ctype_r* p_r = ( ctype_r* )p; \ ctype_r* p_i = ( ctype_r* )p + is_p; \ \ - dim_t j = bli_abs( diagoffp 
); \ + dim_t j = bli_abs( diagoffc ); \ ctype_r* p11_r = p_r + (j )*ldp; \ ctype_r* p11_i = p_i + (j )*ldp; \ \ @@ -671,22 +476,22 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ &kappa_r, \ - p_r, rs_p, cs_p, \ + p_r, 1, ldp, \ cntx, \ NULL \ ); \ PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ &kappa_i, \ - p_i, rs_p, cs_p, \ + p_i, 1, ldp, \ cntx, \ NULL \ ); \ @@ -700,8 +505,8 @@ void PASTEMAC(ch,varname) \ \ for ( i = 0; i < panel_dim; ++i ) \ { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_r = p11_r + (i ) + (i )*ldp; \ + ctype_r* pi11_i = p11_i + (i ) + (i )*ldp; \ \ PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ } \ @@ -721,36 +526,80 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_r, rs_p, cs_p, \ + p_r, 1, ldp, \ cntx, \ NULL \ ); \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_i, rs_p, cs_p, \ + p_i, 1, ldp, \ cntx, \ NULL \ ); \ } \ } \ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. 
+ NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = panel_dim; \ + dim_t j = panel_len; \ + dim_t m_br = panel_dim_max - i; \ + dim_t n_br = panel_len_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i ) + (j )*ldp; \ + ctype_r* p_br_i = ( ctype_r* )p + is_p + (i ) + (j )*ldp; \ +\ + PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, 1, ldp, \ + cntx, \ + NULL \ + ); \ + PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_4mi, packm_cxk_4mi ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_4mi.h b/frame/1m/packm/bli_packm_struc_cxk_4mi.h index 5abfb585fd..5e0b234525 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_4mi.h +++ b/frame/1m/packm/bli_packm_struc_cxk_4mi.h @@ -32,63 +32,6 @@ */ -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_4mi ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ 
-void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_4mi ) - - - #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ @@ -101,21 +44,18 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_4mi ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_4mi ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_4mi ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 52a1f9817f..6ded844188 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -41,53 +41,24 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, 
inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ if ( bli_is_nat_packed( schema ) ) \ { \ /* Sanity check: Make sure that kappa is 1.0. 
Mixed-datatype alpha diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h index 72ca67937f..5c6dc321cf 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.h +++ b/frame/1m/packm/bli_packm_struc_cxk_md.h @@ -37,15 +37,20 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.c b/frame/1m/packm/bli_packm_struc_cxk_rih.c index 59b34ede8a..e7dd56ce1b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_rih.c +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.c @@ -46,51 +46,17 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -117,25 +83,22 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk_rih) \ ( \ - strucc, \ - diagoffc, \ - uploc, \ - conjc, \ - schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -144,117 +107,33 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_tri_cxk_rih) \ ( \ - strucc, \ - diagoffc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. 
- This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. */ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ -\ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. */ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - /* We don't need this case if we aren't supporting trsm. - Why? 
Because trmm's packm control tree node should be - using k dimension multiples of 1 (kr == 1), which means - there will never be zero padding at the far end of a - micro-panel. */ \ - } \ - } \ -\ \ /* { \ if ( bli_is_col_packed( schema ) ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: bp copied", m_panel_max, n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: bp copied", panel_dim_max, panel_len_max, \ + ( ctype_r* )p, rs_p, ldp, "%4.1f", "" ); \ else if ( bli_is_row_packed( schema ) ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: ap copied", m_panel_max, n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: ap copied", panel_dim_max, panel_len_max, \ + ( ctype_r* )p, rs_p, ldp, "%4.1f", "" ); \ } \ */ \ \ @@ -273,42 +152,29 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ - bool row_stored; \ - bool col_stored; \ doff_t diagoffc_abs; \ dim_t j; \ \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -316,10 +182,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -341,7 +207,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype_r* restrict p_r = ( ctype_r* )p; \ \ @@ -363,14 +229,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -386,8 +250,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p_r + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -395,16 +259,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p_r; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -476,8 +339,8 @@ void PASTEMAC(ch,varname) \ conjc, \ panel_dim, \ kappa, \ - c11, rs_c, cs_c, \ - p11_r, rs_p, cs_p \ + c11, incc, ldc, \ + p11_r, 1, ldp \ ); \ \ /* If we are packing a micro-panel with Hermitian structure, @@ -498,16 +361,16 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - c11, rs_c, cs_c, \ - p11_r, rs_p, cs_p \ + c11, incc, ldc, \ + p11_r, 1, ldp \ ); \ } \ \ /* - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ - p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ - p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" 
); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", panel_dim_max, panel_len_max, \ + p_r + 0*is_p, rs_p, ldp, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", panel_dim_max, panel_len_max, \ + p_r + 1*is_p, rs_p, ldp, "%4.1f", "" ); \ */ \ } \ } \ @@ -525,25 +388,20 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ @@ -567,7 +425,7 @@ void PASTEMAC(ch,varname) \ { \ ctype_r* p_r = ( ctype_r* )p; \ \ - dim_t j = bli_abs( diagoffp ); \ + dim_t j = bli_abs( diagoffc ); \ ctype_r* p11_r = p_r + (j )*ldp; \ \ /* If the diagonal of c is implicitly unit, explicitly set the @@ -580,7 +438,7 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - p11_r, rs_p, cs_p \ + p11_r, 1, ldp \ ); \ } \ \ @@ -602,23 +460,41 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero_r, \ - p_r, rs_p, cs_p, \ + p_r, 1, ldp, \ cntx, \ NULL \ ); \ } \ } \ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a 
bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + /* We don't need this case if we aren't supporting trsm. + Why? Because trmm's packm control tree node should be + using k dimension multiples of 1 (kr == 1), which means + there will never be zero padding at the far end of a + micro-panel. */ \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_rih, packm_cxk_rih ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.h b/frame/1m/packm/bli_packm_struc_cxk_rih.h index 0af4d33e82..deb2fdf5e2 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_rih.h +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.h @@ -32,63 +32,6 @@ */ -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_rih ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t 
panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_rih ) - - - #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ @@ -101,21 +44,18 @@ void PASTEMAC(ch,varname) \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_rih ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_rih ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_rih ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 27678e0bf8..dbefdcc408 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -121,8 +121,8 @@ cntl_t* bli_gemmbp_cntl_create rntm, bli_gemm_packb, // pack the right-hand operand packb_fp, - BLIS_KR, BLIS_NR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? @@ -194,8 +194,8 @@ cntl_t* bli_gemmpb_cntl_create ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, - BLIS_KR, BLIS_MR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 3a46c4ecfc..5371e4df41 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -91,6 +91,10 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + #ifdef BLIS_ENABLE_GEMM_MD cntx_t cntx_local; diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index 0f82b15f3e..a0671582dc 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -183,9 +183,9 @@ mddm_t bli_gemm_md_ccr { bli_obj_swap( a, b ); - bli_obj_induce_trans( a ); - bli_obj_induce_trans( b ); - bli_obj_induce_trans( c ); + bli_obj_toggle_trans( a ); + bli_obj_toggle_trans( b ); + bli_obj_toggle_trans( c ); return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx ); } diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c index a15192994e..6176b2f911 100644 --- a/frame/3/gemm/bli_gemm_packab.c +++ b/frame/3/gemm/bli_gemm_packab.c @@ -45,12 +45,19 @@ void bli_gemm_packa thrinfo_t* thread ) { - obj_t a_pack; + obj_t a_local, a_pack; + + bli_obj_alias_to( a, &a_local ); + if ( bli_obj_has_trans( a ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } // Pack matrix A according to the control tree node. bli_l3_packm ( - a, + &a_local, &a_pack, cntx, rntm, @@ -86,25 +93,39 @@ void bli_gemm_packb thrinfo_t* thread ) { - obj_t b_pack; + obj_t bt_local, bt_pack; + + // We always pass B^T to bli_l3_packm. + bli_obj_alias_to( b, &bt_local ); + if ( bli_obj_has_trans( b ) ) + { + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); + } + else + { + bli_obj_induce_trans( &bt_local ); + } // Pack matrix B according to the control tree node. bli_l3_packm ( - b, - &b_pack, + &bt_local, + &bt_pack, cntx, rntm, cntl, thread ); + // Transpose packed object back to B. 
+ bli_obj_induce_trans( &bt_pack ); + // Proceed with execution using packed matrix B. bli_gemm_int ( &BLIS_ONE, a, - &b_pack, + &bt_pack, &BLIS_ONE, c, cntx, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index d1746eb4eb..c0bdc41c68 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -69,6 +69,10 @@ void bli_hemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + #ifdef BLIS_DISABLE_HEMM_RIGHT // NOTE: This case casts right-side hemm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 61238fb158..71dabfb153 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -69,6 +69,10 @@ void bli_symm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + #ifdef BLIS_DISABLE_SYMM_RIGHT // NOTE: This case casts right-side symm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 63fc8053f9..71a6b7b909 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -68,6 +68,10 @@ void bli_trmm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. 
This diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index ba7d3a91ff..079ab18aa6 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -69,6 +69,10 @@ void bli_trmm3_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 578c37c329..d6957bc853 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -58,7 +58,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. - const dim_t kc = bli_obj_width( a ); + const dim_t kc = bli_obj_width_after_trans( a ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, 0, kc, a, &a11 ); diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 4a7a4de8fd..138503cb92 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -173,8 +173,8 @@ cntl_t* bli_trsm_l_cntl_create rntm, bli_trsm_packb, packb_fp, - BLIS_MR, BLIS_NR, + BLIS_MR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 77c177d8a5..c60e33f678 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -75,6 +75,10 @@ void bli_trsm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + bli_obj_remove_offs( &a_local ); + bli_obj_remove_offs( &b_local ); + bli_obj_remove_offs( &c_local ); + // We do not explicitly implement the cases where A is transposed. 
// However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/3/trsm/bli_trsm_packab.c index 841230d80d..6f61058853 100644 --- a/frame/3/trsm/bli_trsm_packab.c +++ b/frame/3/trsm/bli_trsm_packab.c @@ -45,12 +45,19 @@ void bli_trsm_packa thrinfo_t* thread ) { - obj_t a_pack; + obj_t a_local, a_pack; + + bli_obj_alias_to( a, &a_local ); + if ( bli_obj_has_trans( a ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } // Pack matrix A according to the control tree node. bli_l3_packm ( - a, + &a_local, &a_pack, cntx, rntm, @@ -86,25 +93,39 @@ void bli_trsm_packb thrinfo_t* thread ) { - obj_t b_pack; + obj_t bt_local, bt_pack; + + // We always pass B^T to bli_l3_packm. + bli_obj_alias_to( b, &bt_local ); + if ( bli_obj_has_trans( b ) ) + { + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); + } + else + { + bli_obj_induce_trans( &bt_local ); + } // Pack matrix B according to the control tree node. bli_l3_packm ( - b, - &b_pack, + &bt_local, + &bt_pack, cntx, rntm, cntl, thread ); + // Transpose packed object back to B. + bli_obj_induce_trans( &bt_pack ); + // Proceed with execution using packed matrix B. bli_trsm_int ( &BLIS_ONE, a, - &b_pack, + &bt_pack, &BLIS_ONE, c, cntx, diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index d4829854d2..43c6581846 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1367,6 +1367,14 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) ); } +// Adjust pointer based on offsets and then zero them + +BLIS_INLINE void bli_obj_remove_offs( obj_t* obj ) +{ + bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); + bli_obj_set_offs( 0, 0, obj ); +} + // Make a full alias (shallow copy). 
BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 021cbdf942..0d2b25f7f0 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1240,7 +1240,6 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - mdim_t mat, mem_t* mem, struct obj_s* a, struct obj_s* ap, diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 48996f28e7..c5a35d678d 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -273,7 +273,10 @@ void libblis_test_gemm_ukr_experiment // about coaxing bli_obj_create() in allocating enough space for our // purposes. bli_obj_create( datatype, ldap, k, 1, ldap, &ap ); - bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp ); + bli_obj_create( datatype, ldbp, k, 1, ldbp, &bp ); + + // Transpose B to B^T for packing + bli_obj_induce_trans( &b ); // Set up the objects for packing. Calling packm_init_pack() does everything // except checkout a memory pool block and save its address to the obj_t's. @@ -289,7 +292,7 @@ void libblis_test_gemm_ukr_experiment BLIS_MR, BLIS_KR, &a, &ap, cntx ); bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); + BLIS_NR, BLIS_KR, &b, &bp, cntx ); bli_obj_set_buffer( buf_ap, &ap ); bli_obj_set_buffer( buf_bp, &bp ); @@ -297,7 +300,11 @@ void libblis_test_gemm_ukr_experiment bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - // Repeat the experiment n_repeats times and record results. + // Transpose B^T back to B and Bp^T back to Bp + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c_save, &c ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index b3916db6a1..f1e170bc78 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -319,7 +319,7 @@ void libblis_test_gemmtrsm_ukr_experiment // about coaxing bli_obj_create() in allocating enough space for our // purposes. bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap ); - bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp ); + bli_obj_create( datatype, ldbp, k+m, 1, ldbp, &bp ); // We overwrite the m dimension of ap and n dimension of bp with // m and n, respectively, so that these objects contain the correct @@ -328,7 +328,10 @@ void libblis_test_gemmtrsm_ukr_experiment // duplication in rare instances where the subconfig uses a gemm // ukernel that duplicates elements in one of the operands. bli_obj_set_length( m, &ap ); - bli_obj_set_width( n, &bp ); + bli_obj_set_length( n, &bp ); + + // Transpose B to B^T for packing + bli_obj_induce_trans( &b ); // Set up the objects for packing. Calling packm_init_pack() does everything // except checkout a memory pool block and save its address to the obj_t's. @@ -344,7 +347,7 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_MR, BLIS_KR, &a, &ap, cntx ); bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); + BLIS_NR, BLIS_KR, &b, &bp, cntx ); bli_obj_set_buffer( buf_ap, &ap ); bli_obj_set_buffer( buf_bp, &bp ); @@ -361,6 +364,10 @@ void libblis_test_gemmtrsm_ukr_experiment bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + // Transpose B^T back to B and Bp^T back to Bp + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + // Create subpartitions from the a and b panels. 
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &a1xp, &a11p, &bx1p, &b11p ); @@ -375,14 +382,18 @@ bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); // Re-pack (restore) the contents of b to bp. //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); time = bli_clock(); From 2eba2821ba948af77a672b2f7da5e823eae5f6cb Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 13 Sep 2021 16:26:55 -0500 Subject: [PATCH 06/24] Moved packed object initialization and pack buffer acquisition to bli_packm_blk_var1. gemm/gemmtrsm/trsm ukr tests required an overhaul to make use of the modified packing kernel. 
--- frame/1m/bli_l1m_oft_var.h | 1 + frame/1m/packm/bli_packm.h | 2 + .../packm/bli_packm_alloc.c} | 67 +--- .../packm/bli_packm_alloc.h} | 17 +- frame/1m/packm/bli_packm_blk_var1.c | 177 ++--------- frame/1m/packm/bli_packm_init.c | 200 +++--------- frame/1m/packm/bli_packm_init.h | 19 +- frame/1m/packm/bli_packm_int.c | 12 +- frame/1m/packm/bli_packm_int.h | 1 + ...li_packm_unb_var1.h => bli_packm_scalar.c} | 66 ++-- .../bli_packm_scalar.h} | 26 +- frame/1m/packm/bli_packm_unb_var1.c | 297 ------------------ frame/1m/packm/bli_packm_var.h | 77 +---- frame/1m/unpackm/bli_unpackm.h | 2 - frame/1m/unpackm/bli_unpackm_unb_var1.c | 131 -------- frame/3/bli_l3.h | 1 - frame/3/gemm/bli_gemm_packab.c | 4 +- frame/3/trsm/bli_trsm_packab.c | 4 +- frame/base/bli_sba.c | 80 +++-- testsuite/src/test_gemm_ukr.c | 65 +--- testsuite/src/test_gemmtrsm_ukr.c | 120 +++---- testsuite/src/test_libblis.c | 40 +-- testsuite/src/test_libblis.h | 2 +- testsuite/src/test_trsm_ukr.c | 92 ++---- 24 files changed, 282 insertions(+), 1221 deletions(-) rename frame/{3/bli_l3_packm.c => 1m/packm/bli_packm_alloc.c} (77%) rename frame/{3/bli_l3_packm.h => 1m/packm/bli_packm_alloc.h} (88%) rename frame/1m/packm/{bli_packm_unb_var1.h => bli_packm_scalar.c} (52%) rename frame/1m/{unpackm/bli_unpackm_unb_var1.h => packm/bli_packm_scalar.h} (76%) delete mode 100644 frame/1m/packm/bli_packm_unb_var1.c delete mode 100644 frame/1m/unpackm/bli_unpackm_unb_var1.c diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 15e9dae6f5..0b60d4e2f6 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ + rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 85f7011655..abe61445cd 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -33,10 +33,12 @@ */ +#include "bli_packm_alloc.h" 
#include "bli_packm_cntl.h" #include "bli_packm_check.h" #include "bli_packm_init.h" #include "bli_packm_int.h" +#include "bli_packm_scalar.h" #include "bli_packm_part.h" diff --git a/frame/3/bli_l3_packm.c b/frame/1m/packm/bli_packm_alloc.c similarity index 77% rename from frame/3/bli_l3_packm.c rename to frame/1m/packm/bli_packm_alloc.c index 48f55c3602..81a7a95504 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -35,45 +35,19 @@ #include "blis.h" -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) +void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) { - packbuf_t pack_buf_type; - mem_t* cntl_mem_p; - siz_t size_needed; - - // FGVZ: Not sure why we need this barrier, but we do. - bli_thread_barrier( thread ); - - // Every thread initializes x_pack and determines the size of memory - // block needed (which gets embedded into the otherwise "blank" mem_t - // entry in the control tree node). - size_needed - = - bli_packm_init - ( - x, - x_pack, - cntx, - cntl - ); - - // If zero was returned, no memory needs to be allocated and so we can - // return early. - if ( size_needed == 0 ) return; - // Query the pack buffer type from the control tree node. - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); // Query the address of the mem_t entry within the control tree node. 
- cntl_mem_p = bli_cntl_pack_mem( cntl ); + mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); // Check the mem_t field in the control tree. If it is unallocated, then // we need to acquire a block from the memory broker and broadcast it to @@ -163,25 +137,6 @@ void bli_l3_packm } } - - // Update the buffer address in x_pack to point to the buffer associated - // with the mem_t entry acquired from the memory broker (now cached in - // the control tree node). - void* buf = bli_mem_buffer( cntl_mem_p ); - bli_obj_set_buffer( buf, x_pack ); - - - // Pack the contents of object x to object x_pack. - bli_packm_int - ( - x, - x_pack, - cntx, - cntl, - thread - ); - - // Barrier so that packing is done before computation. - bli_thread_barrier( thread ); + return bli_mem_buffer( cntl_mem_p ); } diff --git a/frame/3/bli_l3_packm.h b/frame/1m/packm/bli_packm_alloc.h similarity index 88% rename from frame/3/bli_l3_packm.h rename to frame/1m/packm/bli_packm_alloc.h index 696dabf593..b433be350a 100644 --- a/frame/3/bli_l3_packm.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -5,7 +5,6 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,13 +32,11 @@ */ -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); +BLIS_EXPORT_BLIS void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index a2442360af..308e1acbf5 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -78,10 +78,25 @@ void bli_packm_blk_var1 obj_t* c, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { + // Every thread initializes p and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + // Return early if no packing is required. + if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) + return; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_packm_int_check( c, p, cntx ); + num_t dt_c = bli_obj_dt( c ); dim_t dt_c_size = bli_dt_size( dt_c ); @@ -108,7 +123,7 @@ void bli_packm_blk_var1 dim_t panel_dim_off = bli_obj_row_off( c ); dim_t panel_len_off = bli_obj_col_off( c ); - char* p_cast = bli_obj_buffer_at_off( p ); + char* p_cast = bli_obj_buffer( p ); inc_t ldp = bli_obj_col_stride( p ); inc_t is_p = bli_obj_imag_stride( p ); dim_t panel_dim_max = bli_obj_panel_dim( p ); @@ -116,47 +131,14 @@ void bli_packm_blk_var1 doff_t diagoffc_inc = ( doff_t )panel_dim_max; + obj_t kappa_local; + char* kappa_cast = bli_packm_scalar( &kappa_local, p ); + /* If C is zeros and part of a triangular matrix, then we don't need to pack it. 
*/ if ( bli_is_zeros( uploc ) && bli_is_triangular( strucc ) ) return; - char* kappa_cast; - - // The value for kappa we use will depends on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) && - !bli_is_nat_packed( schema ) ) - { - //printf( "applying non-zero imag kappa\n_p" ); - obj_t kappa; - - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_cast = bli_obj_buffer_for_1x1( dt_p, &kappa ); - } - // This branch is also for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. 
- kappa_cast = bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); - } - // If the packm structure-aware kernel func_t in the context is // NULL (which is the default value after the context is created), // we use the default lookup table to determine the right func_t @@ -411,124 +393,3 @@ void bli_packm_blk_var1 } } - - -/* -if ( row_stored ) -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m_p, n_p, - c_cast, rs_c, cs_c, "%4.1f", "" ); -if ( col_stored ) -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m_p, n_p, - c_cast, rs_c, cs_c, "%4.1f", "" ); -*/ -/* -if ( row_stored ) -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, - p_use, rs_p, cs_p, "%5.2f", "" ); -else -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, - p_use, rs_p, cs_p, "%5.2f", "" ); -*/ - -/* -if ( col_stored ) { - if ( bli_thread_work_id( thread ) == 0 ) - { - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); - fflush( stdout ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); - fflush( stdout ); - } -bli_thread_barrier( thread ); - if ( bli_thread_work_id( thread ) == 1 ) - { - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); - fflush( stdout ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); - fflush( stdout ); - } -bli_thread_barrier( thread ); -} -else { - if ( bli_thread_work_id( thread ) == 0 ) - { - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); - 
fflush( stdout ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); - fflush( stdout ); - } -bli_thread_barrier( thread ); - if ( bli_thread_work_id( thread ) == 1 ) - { - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n_p", bli_thread_work_id( thread ), c_use, p_use ); - fflush( stdout ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); - fflush( stdout ); - } -bli_thread_barrier( thread ); -} -*/ -/* - if ( bli_is_4mi_packed( schema ) ) { - printf( "packm_var2: is_p_use = %lu\n_p", is_p_use ); - if ( col_stored ) { - if ( 0 ) - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); - } - if ( row_stored ) { - if ( 0 ) - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); - } - } -*/ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); -*/ -/* - if ( row_stored ) { - PASTEMAC(chr,fprintm)( 
stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); - inc_t is_b = rs_p * *m_panel_max; - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); - } -*/ -/* - if ( col_stored ) { - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, - ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); - } -*/ diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 832cdbc7e2..acc2fd944c 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -35,12 +35,14 @@ #include "blis.h" -siz_t bli_packm_init +bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, - cntl_t* cntl + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -51,115 +53,30 @@ siz_t bli_packm_init // suitable block of memory from the memory allocator (if such a block // of memory has not already been allocated previously). - bszid_t bmult_id_m; - bszid_t bmult_id_n; - bool does_invert_diag; - bool rev_iter_if_upper; - bool rev_iter_if_lower; - pack_t schema; - //packbuf_t pack_buf_type; - siz_t size_needed; - // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_packm_init_check( a, p, cntx ); - // Extract various fields from the control tree. - bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); - bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); - does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); - rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); - rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - schema = bli_cntl_packm_params_pack_schema( cntl ); - //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); - -#if 0 - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can alias the object and return. - // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - bli_obj_alias_to( a, p ); - return 0; - } - - // Now we check if the object has already been packed to the desired - // schema (as encoded in the control tree). If so, we can alias and - // return 0. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( a ) == pack_schema ) - { - bli_obj_alias_to( a, p ); - return 0; - } -#endif + // We begin by copying the fields of A. 
+ bli_obj_alias_to( a, p ); // If the object is marked as being filled with zeros, then we can skip // the packm operation entirely and alias. if ( bli_obj_is_zeros( a ) ) - { - bli_obj_alias_to( a, p ); - return 0; - } + return false; -#if 0 - pack_t schema; - - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - { - // We now ignore the pack_schema field in the control tree and - // extract the schema from the context, depending on whether we are - // preparing to pack a block of A or panel of B. For A and B, we must - // obtain the schema from the context since the induced methods reuse - // the same control trees used by native execution, and those induced - // methods specify the schema used by the current execution phase - // within the context (whereas the control tree does not change). - - if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) - { - schema = bli_cntx_schema_a_block( cntx ); - } - else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) - { - schema = bli_cntx_schema_b_panel( cntx ); - } - else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) - { - schema = bli_cntl_packm_params_pack_schema( cntl ); - } - } - else // ( bli_cntx_method( cntx ) == BLIS_NAT ) - { - // For native execution, we obtain the schema from the control tree - // node. (Notice that it doesn't matter if the pack_buf_type is for - // A or B.) - schema = bli_cntl_packm_params_pack_schema( cntl ); - } - // This is no longer needed now that we branch between native and - // non-native cases above. -#if 0 - if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) - { - // If we get a request to pack C for some reason, it is likely - // not part of an induced method, and so it would be safe (and - // necessary) to read the pack schema from the control tree. - schema = bli_cntl_packm_params_pack_schema( cntl ); - } -#endif -#endif - - // Prepare a few other variables based on properties of the control - // tree. + // Extract various fields from the control tree. 
+ bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + bool does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); + bool rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + bool rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + num_t dt_tar = bli_obj_target_dt( a ); + num_t dt_scalar = bli_obj_scalar_dt( a ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); invdiag_t invert_diag; packord_t pack_ord_if_up; @@ -174,62 +91,6 @@ siz_t bli_packm_init if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; - // Initialize object p for the final packed matrix. - size_needed - = - bli_packm_init_pack - ( - invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - bmult_id_m, - bmult_id_n, - a, - p, - cntx - ); - - // Return the size needed for memory allocation of the packed buffer. 
- return size_needed; -} - - -siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx - ) -{ - bli_init_once(); - - num_t dt_tar = bli_obj_target_dt( a ); - num_t dt_scalar = bli_obj_scalar_dt( a ); - dim_t m_a = bli_obj_length( a ); - dim_t n_a = bli_obj_width( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); - - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - - - // We begin by copying the fields of A. - bli_obj_alias_to( a, p ); - // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we // change the datatype of P to reflect the target_dt. @@ -277,10 +138,10 @@ siz_t bli_packm_init_pack // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. - m_p = bli_obj_length( p ); - n_p = bli_obj_width( p ); - m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); - n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); // Save the padded dimensions into the packed object. It is important // to save these dimensions since they represent the actual dimensions @@ -293,7 +154,10 @@ siz_t bli_packm_init_pack // from the memory allocator. // Extract the element size for the packed object. 
- elem_size_p = bli_obj_elem_size( p ); + siz_t elem_size_p = bli_obj_elem_size( p ); + + inc_t rs_p, cs_p, is_p; + siz_t size_p; // Set the row and column strides of p based on the pack schema. if ( bli_is_row_packed( schema ) && @@ -438,9 +302,15 @@ siz_t bli_packm_init_pack // The other two combinations coincide with that of packed row-panel // and packed column- panel storage. - size_p = 0; + return false; } - return size_p; + if ( size_p == 0 ) + return false; + + void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + bli_obj_set_buffer( buffer, p ); + + return true; } diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 9365a131ef..152c6f15cd 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,24 +32,13 @@ */ -siz_t bli_packm_init +BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 6dc9ec85af..7d3a5ede5e 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,6 +39,7 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) @@ -47,10 +48,6 @@ void bli_packm_int packm_var_oft f; - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_packm_int_check( a, p, cntx ); - // Sanity check; A should never have a zero dimension. If we must support // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( a ) ) bli_abort(); @@ -93,14 +90,21 @@ void bli_packm_int // Extract the function pointer from the current control tree node. 
f = bli_cntl_packm_params_var_func( cntl ); + // FGVZ: Not sure why we need this barrier, but we do. + bli_thread_barrier( thread ); + // Invoke the variant with kappa_use. f ( a, p, cntx, + rntm, cntl, thread ); + + // Barrier so that packing is done before computation. + bli_thread_barrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 573a299d67..16a5c2c34d 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,6 +37,7 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_scalar.c similarity index 52% rename from frame/1m/packm/bli_packm_unb_var1.h rename to frame/1m/packm/bli_packm_scalar.c index 8960c8661a..a225427b1a 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_scalar.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,35 +33,44 @@ */ -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); +#include "blis.h" +void* bli_packm_scalar( obj_t* kappa, obj_t* p ) +{ + num_t dt_p = bli_obj_dt( p ); + pack_t schema = bli_obj_pack_schema( p ); -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); + // The value for kappa we use will depends on whether the scalar + // attached to A has a nonzero imaginary component. 
If it does, + // then we will apply the scalar during packing to facilitate + // implementing induced complex domain algorithms in terms of + // real domain micro-kernels. (In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) + if ( bli_obj_scalar_has_nonzero_imag( p ) && + !bli_is_nat_packed( schema ) ) + { + //printf( "applying non-zero imag kappa\n_p" ); -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) + // Detach the scalar. + bli_obj_scalar_detach( p, kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + return bli_obj_buffer_for_1x1( dt_p, kappa ); + } + // This branch is also for native execution, where we assume that + // the micro-kernel will always apply the alpha scalar of the + // higher-level operation. Thus, we use BLIS_ONE for kappa so + // that the underlying packm implementation does not perform + // any scaling during packing. + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. 
+ return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); + } +} diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/packm/bli_packm_scalar.h similarity index 76% rename from frame/1m/unpackm/bli_unpackm_unb_var1.h rename to frame/1m/packm/bli_packm_scalar.h index 5119aaa7ff..3745accf9d 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ b/frame/1m/packm/bli_packm_scalar.h @@ -32,29 +32,5 @@ */ -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_unb_var1 ) +BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c deleted file mode 100644 index 6e72b3e9d0..0000000000 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); - - -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_cp = bli_obj_dt( c ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_kappa; - - FUNCPTR_T f; - - - // This variant 
assumes that the computational kernel will always apply - // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE - // for kappa so that the underlying packm implementation does not scale - // during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_cp]; - - if( bli_thread_am_ochief( thread ) ) { - // Invoke the function. - f - ( - strucc, - diagoffc, - diagc, - uploc, - transc, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - cntx - ); - } -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - /* We begin by packing the region indicated by the parameters. If - matrix c is dense (either because the structure is general or - because the structure has already been "densified"), this ends - up being the only action we take. Note that if kappa is unit, - the data is simply copied (rather than scaled by one). */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - diagc, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ -\ - /* If uploc is upper or lower, then the structure of c is necessarily - non-dense (ie: Hermitian, symmetric, or triangular, where part of the - matrix is unstored). In these cases, we want to fill in the unstored - part of the matrix. How this is done depends on the structure of c. 
*/ \ - if ( bli_is_upper_or_lower( uploc ) ) \ - { \ - /* The Hermitian and symmetric cases are almost identical, so we - handle them in one conditional block. */ \ - if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \ - { \ - /* First we must reflect the region referenced to the opposite - side of the diagonal. */ \ - c_cast = c_cast + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_trans( &transc ); \ - if ( bli_is_upper( uploc ) ) diagoffc += 1; \ - else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \ -\ - /* If c is Hermitian, we need to apply a conjugation when - copying the region opposite the diagonal. */ \ - if ( bli_is_hermitian( strucc ) ) \ - transc = bli_trans_toggled_conj( transc ); \ -\ - /* Copy the data from the region opposite the diagonal of c - (as specified by the original value of diagoffc). Notice - that we use a diag parameter of non-unit since we can - assume nothing about the neighboring off-diagonal. */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - else /* if ( bli_is_triangular( strucc ) ) */ \ - { \ - doff_t diagoffp = diagoffc; \ - uplo_t uplop = uploc; \ -\ - /* For this step we need the uplo and diagonal offset of p, which - we can derive from the parameters given. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_negate_diag_offset( &diagoffp ); \ - bli_toggle_uplo( &uplop ); \ - } \ -\ - /* For triangular matrices, we wish to reference the region - strictly opposite the diagonal of C. This amounts to - toggling uploc and then shifting the diagonal offset to - shrink the stored region (by one diagonal). */ \ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ -\ - /* Set the region opposite the diagonal of p to zero. 
*/ \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - m, \ - n, \ - zero, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - eithe region exists, we set them to zero. This simplifies the - register level micro kernel in that it does not need to support - different register blockings for the edge cases. */ \ - if ( m != m_max ) \ - { \ - ctype* p_edge = p_cast + (m )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max - m, \ - n_max, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n != n_max ) \ - { \ - ctype* p_edge = p_cast + (n )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max, \ - n_max - n, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_unb_var1 ) - diff --git a/frame/1m/packm/bli_packm_var.h b/frame/1m/packm/bli_packm_var.h index 723e6fdb4a..cd9e14186b 100644 --- a/frame/1m/packm/bli_packm_var.h +++ b/frame/1m/packm/bli_packm_var.h @@ -37,74 +37,13 @@ // Prototype object-based interfaces. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* c, \ - obj_t* p, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* t \ +BLIS_EXPORT_BLIS void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* t ); -GENPROT( packm_unb_var1 ) -GENPROT( packm_blk_var1 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_blk_var1 ) - diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index b32d02d9ba..5e45428410 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -36,8 +36,6 @@ #include "bli_unpackm_check.h" #include "bli_unpackm_int.h" -#include "bli_unpackm_unb_var1.h" - #include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c deleted file mode 100644 index c1033c2cb9..0000000000 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( - doff_t diagoffp, - uplo_t uplop, - trans_t transp, - dim_t m, - dim_t n, - void* p, inc_t rs_p, inc_t cs_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); - - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_pc = bli_obj_dt( p ); - - doff_t diagoffp = bli_obj_diag_offset( p ); - uplo_t uplop = bli_obj_uplo( p ); - trans_t transc = bli_obj_onlytrans_status( c ); - - dim_t m_c = bli_obj_length( c ); - dim_t n_c = bli_obj_width( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_pc]; - - // Invoke the function. 
- f( diagoffp, - uplop, - transc, - m_c, - n_c, - buf_p, rs_p, cs_p, - buf_c, rs_c, cs_c, - cntx - ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* p_cast = p; \ - ctype* c_cast = c; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffp,\ - BLIS_NONUNIT_DIAG, \ - uplop, \ - transp, \ - m, \ - n, \ - p_cast, rs_p, cs_p, \ - c_cast, rs_c, cs_c, \ - cntx, \ - NULL \ - ); \ -} - -INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 ) - diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 7a35ca9740..ff92328565 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -45,7 +45,6 @@ #include "bli_l3_blocksize.h" #include "bli_l3_direct.h" #include "bli_l3_prune.h" -#include "bli_l3_packm.h" // Prototype object APIs (expert and non-expert). #include "bli_l3_oapi.h" diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c index 6176b2f911..39034ba286 100644 --- a/frame/3/gemm/bli_gemm_packab.c +++ b/frame/3/gemm/bli_gemm_packab.c @@ -55,7 +55,7 @@ void bli_gemm_packa } // Pack matrix A according to the control tree node. - bli_l3_packm + bli_packm_int ( &a_local, &a_pack, @@ -107,7 +107,7 @@ void bli_gemm_packb } // Pack matrix B according to the control tree node. - bli_l3_packm + bli_packm_int ( &bt_local, &bt_pack, diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/3/trsm/bli_trsm_packab.c index 6f61058853..79f4e2046c 100644 --- a/frame/3/trsm/bli_trsm_packab.c +++ b/frame/3/trsm/bli_trsm_packab.c @@ -55,7 +55,7 @@ void bli_trsm_packa } // Pack matrix A according to the control tree node. - bli_l3_packm + bli_packm_int ( &a_local, &a_pack, @@ -107,7 +107,7 @@ void bli_trsm_packb } // Pack matrix B according to the control tree node. 
- bli_l3_packm + bli_packm_int ( &bt_local, &bt_pack, diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 1da6723c79..3f0ba49764 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -76,24 +76,31 @@ void* bli_sba_acquire // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) - { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); - } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); + if ( pool == NULL ) + { + block = bli_malloc_intl( req_size, &r_val ); + } + else + { + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) + { + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); + } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); + } } #else @@ -123,21 +130,28 @@ void bli_sba_release // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size field from the pool. 
This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); + if ( pool == NULL ) + { + bli_free_intl( block ); + } + else + { + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) 
+ bli_pool_checkin_block( &pblk, pool ); + } } #else diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index c5a35d678d..08328f6773 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -237,7 +237,13 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + + // Transpose B to B^T for packing + bli_obj_induce_trans( &b ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -248,57 +254,20 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); cntl_t* cntl_b = libblis_test_pobj_create ( - BLIS_KR, BLIS_NR, + BLIS_KR, BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k, 1, ldap, &ap ); - bli_obj_create( datatype, ldbp, k, 1, ldbp, &bp ); - - // Transpose B to B^T for packing - bli_obj_induce_trans( &b ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. 
- // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_NR, BLIS_KR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); - - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); // Transpose B^T back to B and Bp^T back to Bp bli_obj_induce_trans( &b ); @@ -328,16 +297,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. 
bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index f1e170bc78..b5287f6b50 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -294,62 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - &cntx - ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap ); - bli_obj_create( datatype, ldbp, k+m, 1, ldbp, &bp ); - - // We overwrite the m dimension of ap and n dimension of bp with - // m and n, respectively, so that these objects contain the correct - // logical dimensions. 
Recall that ldap and ldbp were used only to - // induce bli_obj_create() to allocate sufficient memory for the - // duplication in rare instances where the subconfig uses a gemm - // ukernel that duplicates elements in one of the operands. - bli_obj_set_length( m, &ap ); - bli_obj_set_length( n, &bp ); - - // Transpose B to B^T for packing - bli_obj_induce_trans( &b ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_NR, BLIS_KR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); } @@ -360,23 +310,6 @@ void libblis_test_gemmtrsm_ukr_experiment // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - - // Transpose B^T back to B and Bp^T back to Bp - bli_obj_induce_trans( &b ); - bli_obj_induce_trans( &bp ); - - // Create subpartitions from the a and b panels. 
- bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, - &a1xp, &a11p, &bx1p, &b11p ); - - // Set the uplo field of a11p since the default for packed objects is - // BLIS_DENSE, and the _ukernel() wrapper needs this information to - // know which set of micro-kernels (lower or upper) to choose from. - bli_obj_set_uplo( uploa, &a11p ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); @@ -387,14 +320,34 @@ bli_printm( "ap", &ap, "%5.2f", "" ); { bli_copym( &c11_save, &c11 ); - // Re-pack (restore) the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + // Transpose B to B^T for packing bli_obj_induce_trans( &b ); - bli_obj_induce_trans( &bp ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp bli_obj_induce_trans( &b ); bli_obj_induce_trans( &bp ); + // Create subpartitions from the a and b panels. + bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, + &a1xp, &a11p, &bx1p, &b11p ); + + // Set the uplo field of a11p since the default for packed objects is + // BLIS_DENSE, and the _ukernel() wrapper needs this information to + // know which set of micro-kernels (lower or upper) to choose from. + bli_obj_set_uplo( uploa, &a11p ); + time = bli_clock(); libblis_test_gemmtrsm_ukr_impl( iface, side, &alpha, @@ -402,6 +355,10 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); } // Estimate the performance of the best experiment repeat. 
@@ -437,16 +394,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index f771290f0e..c06e73ecb0 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -667,7 +667,7 @@ void libblis_test_read_op_info( test_ops_t* ops, int i, p; // Initialize the operation type field. - op->opid = opid; + op->opid = opid; // Read the line for the overall operation switch. libblis_test_read_next_line( buffer, input_stream ); @@ -702,7 +702,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "buffer[p]: %s\n", &buffer[p] ); // Advance until we hit non-whitespace (ie: the next number). - for ( ; isspace( buffer[p] ); ++p ) ; + for ( ; isspace( buffer[p] ); ++p ) ; //printf( "buffer[p] after: %s\n", &buffer[p] ); @@ -711,7 +711,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "dim[%d] = %d\n", i, op->dim_spec[i] ); // Advance until we hit whitespace (ie: the space before the next number). - for ( ; !isspace( buffer[p] ); ++p ) ; + for ( ; !isspace( buffer[p] ); ++p ) ; } } @@ -809,11 +809,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) // convert these values into strings, with "unset" being used if the // value returned was -1 (indicating the environment variable was unset). 
dim_t nt = bli_thread_get_num_threads(); - dim_t jc_nt = bli_thread_get_jc_nt(); - dim_t pc_nt = bli_thread_get_pc_nt(); - dim_t ic_nt = bli_thread_get_ic_nt(); - dim_t jr_nt = bli_thread_get_jr_nt(); - dim_t ir_nt = bli_thread_get_ir_nt(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); if ( nt == -1 ) sprintf( nt_str, "unset" ); else sprintf( nt_str, "%d", ( int ) nt ); @@ -1775,7 +1775,7 @@ void libblis_test_op_driver = ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) ); for ( o = 0; o < n_operands; ++o ) - { + { unsigned int ij; operand_t operand_type = libblis_test_get_operand_type_for_char( o_types[o] ); @@ -2217,7 +2217,7 @@ void libblis_test_op_driver ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype ); // Loop over the requested parameter combinations. - for ( pci = 0; pci < n_param_combos; ++pci ) + for ( pci = 0; pci < n_param_combos; ++pci ) { // Loop over the requested problem sizes. for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi ) @@ -2435,7 +2435,7 @@ void libblis_test_build_function_string if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH ) libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n", strlen( funcname_str ), MAX_FUNC_STRING_LENGTH ); - + } @@ -2577,7 +2577,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c dim_t n_trans = n; dim_t rs = 1; // Initialization avoids a compiler warning. dim_t cs = 1; // Initialization avoids a compiler warning. - + // Apply the trans parameter to the dimensions (if needed). 
bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans ); @@ -2623,12 +2623,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } - -#if 0 -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ) { bool does_inv_diag; - rntm_t rntm; if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; else does_inv_diag = TRUE; @@ -2649,20 +2646,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia NULL // no child node needed ); - // Initialize a local-to-BLIS rntm_t. This is simply so we have something - // to pass into bli_l3_packm(). The function doesn't (currently) use the - // runtime object, and even if it did, one with default values would work - // fine here. - bli_rntm_init( &rntm ); - // Pack the contents of A to P. - bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_int( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); // Return the control tree pointer so the caller can free the cntl_t and its // mem_t entry later on. return cntl; } -#endif void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ) @@ -3007,7 +2997,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg char* the_string; char the_char; - // Begin looping over message to insert variables wherever there are + // Begin looping over message to insert variables wherever there are // format specifiers. 
for ( c = 0; message[c] != '\0'; ) { diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 786f82b308..cdb3c6dac4 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 6366e5fc3c..7a478dfe90 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -238,7 +238,10 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -249,50 +252,9 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - cntx - ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. 
- // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, m, 1, ldap, &ap ); - bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. bli_obj_set_diag_offset( 0, &ap ); @@ -302,24 +264,35 @@ void libblis_test_trsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. 
- bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { - // Re-pack the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_copym( &c_save, &c ); + // Transpose B to B^T for packing + bli_obj_induce_trans( &b ); + + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + time = bli_clock(); libblis_test_trsm_ukr_impl( iface, side, @@ -327,6 +300,10 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); } // Estimate the performance of the best experiment repeat. @@ -339,16 +316,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. 
bli_obj_free( &a ); From 05c9ee54396d38448fb5abf30f8dd9929dedf432 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 13 Sep 2021 21:44:24 -0500 Subject: [PATCH 07/24] Merge bli_gemm_int and bli_trsm_int into one happy function. bli_gemm_packab and bli_trsm_packab came along for the ride. --- frame/3/bli_l3.h | 2 + frame/3/bli_l3_check.c | 2 +- frame/3/{gemm/bli_gemm_int.c => bli_l3_int.c} | 39 +++-- frame/3/{trsm/bli_trsm_int.h => bli_l3_int.h} | 2 +- frame/3/bli_l3_oft_var.h | 19 +-- .../bli_trsm_packab.c => bli_l3_packab.c} | 8 +- .../{gemm/bli_gemm_int.h => bli_l3_packab.h} | 15 +- frame/3/gemm/bli_gemm.h | 1 - frame/3/gemm/bli_gemm_blk_var1.c | 2 +- frame/3/gemm/bli_gemm_blk_var2.c | 2 +- frame/3/gemm/bli_gemm_blk_var3.c | 2 +- frame/3/gemm/bli_gemm_cntl.c | 4 +- frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_md.c | 4 +- frame/3/gemm/bli_gemm_packab.c | 137 ----------------- frame/3/gemm/bli_gemm_var.h | 3 - frame/3/gemmt/bli_gemmt_front.c | 2 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 6 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 8 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trsm/bli_trsm.h | 2 - frame/3/trsm/bli_trsm_blk_var1.c | 4 +- frame/3/trsm/bli_trsm_blk_var2.c | 2 +- frame/3/trsm/bli_trsm_blk_var3.c | 2 +- frame/3/trsm/bli_trsm_cntl.c | 10 +- frame/3/trsm/bli_trsm_front.c | 2 +- frame/3/trsm/bli_trsm_int.c | 142 ------------------ frame/3/trsm/bli_trsm_var.h | 2 - frame/3/trsm/bli_trsm_xx_ker_var2.c | 8 +- 32 files changed, 83 insertions(+), 359 deletions(-) rename frame/3/{gemm/bli_gemm_int.c => bli_l3_int.c} (77%) rename frame/3/{trsm/bli_trsm_int.h => bli_l3_int.h} (99%) rename frame/3/{trsm/bli_trsm_packab.c => bli_l3_packab.c} (98%) rename frame/3/{gemm/bli_gemm_int.h => bli_l3_packab.h} (90%) delete mode 100644 frame/3/gemm/bli_gemm_packab.c delete mode 100644 frame/3/trsm/bli_trsm_int.c diff --git a/frame/3/bli_l3.h
b/frame/3/bli_l3.h index ff92328565..8429f837f4 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -35,6 +35,8 @@ #include "bli_l3_cntl.h" #include "bli_l3_check.h" +#include "bli_l3_int.h" +#include "bli_l3_packab.h" // Define function types. //#include "bli_l3_ft_ex.h" diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 88d89bbdc9..ccc166de2f 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -53,7 +53,7 @@ void bli_gemm_check // Check object structure. // NOTE: Can't perform these checks as long as bli_gemm_check() is called - // from bli_gemm_int(), which is in the execution path for structured + // from bli_l3_int(), which is in the execution path for structured // level-3 operations such as hemm. //e_val = bli_check_general_object( a ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/bli_l3_int.c similarity index 77% rename from frame/3/gemm/bli_gemm_int.c rename to frame/3/bli_l3_int.c index f665bda172..8935c0a38e 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/bli_l3_int.c @@ -35,7 +35,7 @@ #include "blis.h" -void bli_gemm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, @@ -48,10 +48,12 @@ void bli_gemm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_var_oft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // Return early if the current control tree node is NULL. + if ( bli_cntl_is_null( cntl ) ) return; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -92,25 +94,40 @@ void bli_gemm_int bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // If we are about to call a leaf-level implementation, and matrix C + // still needs a transposition, then we must induce one by swapping the + // strides and dimensions. Note that this transposition would normally + // be handled explicitly in the packing of C, but if C is not being + // packed, this is our last chance to handle the transposition. 
+ if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + { + bli_obj_induce_trans( &c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); + } + // If alpha is non-unit, typecast and apply it to the scalar attached - // to B. - if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) + // to B, unless it happens to be triangular. + if ( bli_obj_root_is_triangular( b ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) + bli_obj_scalar_apply_scalar( alpha, &a_local ); + } + else // if ( bli_obj_root_is_triangular( b ) ) + { + if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) - { bli_obj_scalar_apply_scalar( beta, &c_local ); - } // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); + l3_var_oft f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 4m1b method implementation. 
{ diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/bli_l3_int.h similarity index 99% rename from frame/3/trsm/bli_trsm_int.h rename to frame/3/bli_l3_int.h index aabb2a8aa6..d76b0ac3e2 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/bli_l3_int.h @@ -32,7 +32,7 @@ */ -void bli_trsm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index 1456f8eff3..ea10d80904 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ thrinfo_t* thread \ ); -GENTDEF( gemm ) - - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( trsm ) +GENTDEF( l3 ) diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/3/bli_l3_packab.c similarity index 98% rename from frame/3/trsm/bli_trsm_packab.c rename to frame/3/bli_l3_packab.c index 79f4e2046c..d2736c04ac 100644 --- a/frame/3/trsm/bli_trsm_packab.c +++ b/frame/3/bli_l3_packab.c @@ -34,7 +34,7 @@ #include "blis.h" -void bli_trsm_packa +void bli_l3_packa ( obj_t* a, obj_t* b, @@ -66,7 +66,7 @@ void bli_trsm_packa ); // Proceed with execution using packed matrix A. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a_pack, @@ -82,7 +82,7 @@ void bli_trsm_packa // ----------------------------------------------------------------------------- -void bli_trsm_packb +void bli_l3_packb ( obj_t* a, obj_t* b, @@ -121,7 +121,7 @@ void bli_trsm_packb bli_obj_induce_trans( &bt_pack ); // Proceed with execution using packed matrix B. 
- bli_trsm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/bli_l3_packab.h similarity index 90% rename from frame/3/gemm/bli_gemm_int.h rename to frame/3/bli_l3_packab.h index 2bbe5480a6..380ca72123 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/bli_l3_packab.h @@ -32,12 +32,21 @@ */ -void bli_gemm_int +void bli_l3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_l3_packb ( - obj_t* alpha, obj_t* a, obj_t* b, - obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index a6f8b4e1e0..ddd88e1633 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -34,7 +34,6 @@ #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" -#include "bli_gemm_int.h" #include "bli_gemm_var.h" diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 3b7634338e..de077e5adc 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var1 i, b_alg, c, &c1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index d89a710534..53943e47cd 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var2 i, b_alg, c, &c1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 7883dfd6de..28029777de 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -71,7 +71,7 @@ void bli_gemm_blk_var3 i, b_alg, b, &b1 ); // Perform gemm subproblem. 
- bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index dbefdcc408..f389a3bcc7 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -93,7 +93,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_gemm_packa, // pack the left-hand operand + bli_l3_packa, // pack the left-hand operand packa_fp, BLIS_MR, BLIS_KR, @@ -119,7 +119,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_gemm_packb, // pack the right-hand operand + bli_l3_packb, // pack the right-hand operand packb_fp, BLIS_NR, BLIS_KR, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 5371e4df41..af0745c6c9 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -276,7 +276,7 @@ void bli_gemm_front // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index a0671582dc..a5b5754924 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -698,7 +698,7 @@ void bli_gemm_md_front // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, @@ -839,7 +839,7 @@ void bli_gemm_md_zgemm // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c deleted file mode 100644 index 39034ba286..0000000000 --- a/frame/3/gemm/bli_gemm_packab.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_packa - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_local, a_pack; - - bli_obj_alias_to( a, &a_local ); - if ( bli_obj_has_trans( a ) ) - { - bli_obj_induce_trans( &a_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); - } - - // Pack matrix A according to the control tree node. 
- bli_packm_int - ( - &a_local, - &a_pack, - cntx, - rntm, - cntl, - thread - ); - - // Proceed with execution using packed matrix A. - bli_gemm_int - ( - &BLIS_ONE, - &a_pack, - b, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} - -// ----------------------------------------------------------------------------- - -void bli_gemm_packb - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t bt_local, bt_pack; - - // We always pass B^T to bli_l3_packm. - bli_obj_alias_to( b, &bt_local ); - if ( bli_obj_has_trans( b ) ) - { - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); - } - else - { - bli_obj_induce_trans( &bt_local ); - } - - // Pack matrix B according to the control tree node. - bli_packm_int - ( - &bt_local, - &bt_pack, - cntx, - rntm, - cntl, - thread - ); - - // Transpose packed object back to B. - bli_obj_induce_trans( &bt_pack ); - - // Proceed with execution using packed matrix B. - bli_gemm_int - ( - &BLIS_ONE, - a, - &bt_pack, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} - diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index b08271e9b9..7a0de8933c 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -55,11 +55,8 @@ void PASTEMAC0(opname) \ GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) -GENPROT( gemm_packa ) -GENPROT( gemm_packb ) GENPROT( gemm_ker_var1 ) - GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 6bf32943d2..aa73f861fb 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -127,7 +127,7 @@ void bli_gemmt_front // Invoke the internal back-end via the thread handler. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMMT, // operation family id alpha, &a_local, diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 6d24ea4969..3a1d681c3b 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2] = +static l3_var_oft vars[2] = { bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; @@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2 thrinfo_t* thread ) { - dim_t uplo; - gemm_var_oft f; + dim_t uplo; + l3_var_oft f; // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c ) ) uplo = 0; diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c0bdc41c68..030a94939a 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -168,7 +168,7 @@ void bli_hemm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 71dabfb153..ff83fdd842 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -167,7 +167,7 @@ void bli_symm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 71a6b7b909..dc6665c69e 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -186,7 +186,7 @@ void bli_trmm_front // Invoke the internal back-end. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index b9c176d973..898cfe2423 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - gemm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 079ab18aa6..553ad73f0f 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -178,7 +178,7 @@ void bli_trmm3_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 00b604de6e..964422d017 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -34,7 +34,5 @@ #include "bli_trsm_cntl.h" #include "bli_trsm_front.h" -#include "bli_trsm_int.h" - #include "bli_trsm_var.h" diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index d6957bc853..30bf6921cd 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -96,7 +96,7 @@ void bli_trsm_blk_var1 #endif // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11_1, @@ -169,7 +169,7 @@ void bli_trsm_blk_var1 // Perform gemm subproblem. (Note that we use the same backend // function as before, since we're calling the same macrokernel.) 
- bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11, diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 23fd3ed4ca..2e1923dbf0 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -77,7 +77,7 @@ void bli_trsm_blk_var2 i, b_alg, c, &c1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index a68cc853b5..43fc25f16d 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -71,7 +71,7 @@ void bli_trsm_blk_var3 i, b_alg, b, &b1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 138503cb92..39a5ce3f46 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -95,7 +95,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. + bli_l3_packa, // trsm operation's packm function for A. packa_fp, BLIS_MR, BLIS_MR, @@ -133,7 +133,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. + bli_l3_packa, // trsm operation's packm function for A. 
packa_fp, BLIS_MR, BLIS_MR, @@ -171,7 +171,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, + bli_l3_packb, packb_fp, BLIS_NR, BLIS_MR, @@ -244,7 +244,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, + bli_l3_packa, packa_fp, BLIS_NR, BLIS_MR, @@ -270,7 +270,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, + bli_l3_packb, packb_fp, BLIS_MR, BLIS_MR, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index c60e33f678..cc56332a7f 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -160,7 +160,7 @@ void bli_trsm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_trsm_int, + bli_l3_int, BLIS_TRSM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c deleted file mode 100644 index 53a22c3556..0000000000 --- a/frame/3/trsm/bli_trsm_int.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trsm_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_local; - obj_t b_local; - obj_t c_local; - trsm_var_oft f; - - // Return early if the current control tree node is NULL. - if ( bli_cntl_is_null( cntl ) ) return; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); - - // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) return; - - // If A or B has a zero dimension, scale C by beta and return early. - if ( bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_barrier( thread ); - return; - } - - // Alias A and B in case we need to update attached scalars. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - - // Alias C in case we need to induce a transposition. 
- bli_obj_alias_to( c, &c_local ); - - // If we are about to call a leaf-level implementation, and matrix C - // still needs a transposition, then we must induce one by swapping the - // strides and dimensions. Note that this transposition would normally - // be handled explicitly in the packing of C, but if C is not being - // packed, this is our last chance to handle the transposition. - if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) - { - bli_obj_induce_trans( &c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); - } - - // If beta is non-unit, apply it to the scalar attached to C. - if ( !bli_obj_equals( beta, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( beta, &c_local ); - } - - // Set two bools: one based on the implied side parameter (the structure - // of the root object) and one based on the uplo field of the triangular - // matrix's root object (whether that is matrix A or matrix B). - if ( bli_obj_root_is_triangular( a ) ) - { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to B (the non-triangular matrix). - if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &b_local ); - } - } - else // if ( bli_obj_root_is_triangular( b ) ) - { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to A (the non-triangular matrix). - if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &a_local ); - } - } - - // FGVZ->TMS: Is this barrier still needed? - bli_thread_barrier( thread ); - - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - - // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); - - // Invoke the variant. 
- f - ( - &a_local, - &b_local, - &c_local, - cntx, - rntm, - cntl, - thread - ); -} - diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index de7c65936f..8322a8b5b6 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -55,8 +55,6 @@ void PASTEMAC0(opname) \ GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) -GENPROT( trsm_packa ) -GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index e30e6d7510..c30a5828a3 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static trsm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - trsm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular From 3f32e87f042f25deef3e8d65ddd1da0a8ccf052b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 14 Sep 2021 21:38:50 -0500 Subject: [PATCH 08/24] Enable 1m in the "fast" testsuite. 
--- testsuite/input.general.fast | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/input.general.fast b/testsuite/input.general.fast index 02b30b897d..79b49f1b69 100644 --- a/testsuite/input.general.fast +++ b/testsuite/input.general.fast @@ -36,7 +36,7 @@ sdcz # Datatype(s) to test: 0 # 4mh ('1' = enable; '0' = disable) 0 # 4m1b ('1' = enable; '0' = disable) 0 # 4m1a ('1' = enable; '0' = disable) -0 # 1m ('1' = enable; '0' = disable) +1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Simulate application-level threading: # '1' = disable / use one testsuite thread; From 57f116cf0a49e8455591e8d3b4a9c33f5b9fbfe9 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 14 Sep 2021 21:39:24 -0500 Subject: [PATCH 09/24] The packing kernel (variant) can now be overridden by the user. --- frame/1m/bli_l1m_ft_ker.h | 6 +- frame/1m/packm/bli_packm.h | 6 +- frame/1m/packm/bli_packm_blk_var1.c | 196 +++++------- .../{bli_packm_var.h => bli_packm_blk_var1.h} | 24 ++ frame/1m/packm/bli_packm_cntl.c | 2 - frame/1m/packm/bli_packm_cntl.h | 7 - frame/1m/packm/bli_packm_init.c | 296 ++++++------------ frame/1m/packm/bli_packm_int.c | 45 +-- frame/1m/packm/bli_packm_md.h | 36 --- frame/1m/packm/bli_packm_struc_cxk.c | 22 +- frame/1m/packm/bli_packm_struc_cxk.h | 3 +- frame/1m/packm/bli_packm_struc_cxk_1er.c | 38 ++- frame/1m/packm/bli_packm_struc_cxk_1er.h | 6 +- frame/1m/packm/bli_packm_struc_cxk_3mis.c | 37 ++- frame/1m/packm/bli_packm_struc_cxk_3mis.h | 6 +- frame/1m/packm/bli_packm_struc_cxk_4mi.c | 37 ++- frame/1m/packm/bli_packm_struc_cxk_4mi.h | 6 +- frame/1m/packm/bli_packm_struc_cxk_md.c | 6 +- frame/1m/packm/bli_packm_struc_cxk_md.h | 6 +- frame/1m/packm/bli_packm_struc_cxk_rih.c | 37 ++- frame/1m/packm/bli_packm_struc_cxk_rih.h | 6 +- frame/3/bli_l3_int.c | 30 +- frame/3/gemm/bli_gemm_cntl.c | 7 - frame/3/trsm/bli_trsm_cntl.c | 13 - frame/base/bli_obj.c | 4 +- frame/include/bli_obj_macro_defs.h | 42 +-- 
frame/include/bli_type_defs.h | 259 +++++++-------- testsuite/src/test_libblis.c | 3 +- 28 files changed, 502 insertions(+), 684 deletions(-) rename frame/1m/packm/{bli_packm_var.h => bli_packm_blk_var1.h} (72%) delete mode 100644 frame/1m/packm/bli_packm_md.h diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 1fc8bdf571..66367bf0e5 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -50,7 +50,6 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -60,11 +59,14 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTDEF( packm ) diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index abe61445cd..1534747be7 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -42,8 +42,6 @@ #include "bli_packm_part.h" -#include "bli_packm_var.h" - #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4mi.h" #include "bli_packm_struc_cxk_3mis.h" @@ -58,6 +56,8 @@ // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD -#include "bli_packm_md.h" +#include "bli_packm_struc_cxk_md.h" #endif +#include "bli_packm_blk_var1.h" + diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 308e1acbf5..c71d24a64e 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -40,38 +40,38 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { /* float (0) scomplex (1) double (2) dcomplex (3) */ // 0000 row/col panels - { { bli_spackm_struc_cxk, bli_cpackm_struc_cxk, - bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } }, + { { bli_spackm_struc_cxk, bli_cpackm_struc_cxk, + bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } }, // 0001 row/col panels: 4m interleaved - { { NULL, bli_cpackm_struc_cxk_4mi, - NULL, bli_zpackm_struc_cxk_4mi, } }, + { { NULL, bli_cpackm_struc_cxk_4mi, + NULL, bli_zpackm_struc_cxk_4mi, } }, // 0010 row/col panels: 3m interleaved - { { NULL, bli_cpackm_struc_cxk_3mis, - NULL, bli_zpackm_struc_cxk_3mis, } }, + { { NULL, bli_cpackm_struc_cxk_3mis, + NULL, bli_zpackm_struc_cxk_3mis, } }, // 0011 row/col panels: 4m separated (NOT IMPLEMENTED) - { { NULL, NULL, - NULL, NULL, } }, + { { NULL, NULL, + NULL, NULL, } }, // 0100 row/col panels: 3m separated - { { NULL, bli_cpackm_struc_cxk_3mis, - NULL, bli_zpackm_struc_cxk_3mis, } }, + { { NULL, bli_cpackm_struc_cxk_3mis, + NULL, bli_zpackm_struc_cxk_3mis, } }, // 0101 row/col panels: real only - { { NULL, bli_cpackm_struc_cxk_rih, - NULL, bli_zpackm_struc_cxk_rih, } }, + { { NULL, bli_cpackm_struc_cxk_rih, + NULL, bli_zpackm_struc_cxk_rih, } }, // 0110 row/col panels: imaginary only - { { NULL, bli_cpackm_struc_cxk_rih, - NULL, bli_zpackm_struc_cxk_rih, } }, + { { NULL, bli_cpackm_struc_cxk_rih, + NULL, bli_zpackm_struc_cxk_rih, } }, // 0111 row/col panels: real+imaginary only - { { NULL, bli_cpackm_struc_cxk_rih, - NULL, bli_zpackm_struc_cxk_rih, } }, + { { NULL, bli_cpackm_struc_cxk_rih, + NULL, bli_zpackm_struc_cxk_rih, } }, 
// 1000 row/col panels: 1m-expanded (1e) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, // 1001 row/col panels: 1m-reordered (1r) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, }; -static packm_ker_vft GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); +static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( @@ -83,35 +83,33 @@ void bli_packm_blk_var1 thrinfo_t* thread ) { + // Extract various fields from the control tree. + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl ); + bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + // Every thread initializes p and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t - // entry in the control tree node). - // Update the buffer address in p to point to the buffer associated - // with the mem_t entry acquired from the memory broker (now cached in - // the control tree node). - // Return early if no packing is required. + // entry in the control tree node). Return early if no packing is required. if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) - return; + return; // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_packm_int_check( c, p, cntx ); num_t dt_c = bli_obj_dt( c ); - dim_t dt_c_size = bli_dt_size( dt_c ); + dim_t dt_c_size = bli_dt_size( dt_c ); num_t dt_p = bli_obj_dt( p ); - dim_t dt_p_size = bli_dt_size( dt_p ); + dim_t dt_p_size = bli_dt_size( dt_p ); struc_t strucc = bli_obj_struc( c ); doff_t diagoffc = bli_obj_diag_offset( c ); diag_t diagc = bli_obj_diag( c ); uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool invdiag = bli_obj_has_inverted_diag( p ); - bool revifup = bli_obj_is_pack_rev_if_upper( p ); - bool reviflo = bli_obj_is_pack_rev_if_lower( p ); + conj_t conjc = bli_obj_conj_status( c ); dim_t iter_dim = bli_obj_length( p ); dim_t panel_len_full = bli_obj_width( p ); @@ -131,13 +129,8 @@ void bli_packm_blk_var1 doff_t diagoffc_inc = ( doff_t )panel_dim_max; - obj_t kappa_local; - char* kappa_cast = bli_packm_scalar( &kappa_local, p ); - - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ - if ( bli_is_zeros( uploc ) && - bli_is_triangular( strucc ) ) return; + obj_t kappa_local; + char* kappa_cast = bli_packm_scalar( &kappa_local, p ); // If the packm structure-aware kernel func_t in the context is // NULL (which is the default value after the context is created), @@ -148,17 +141,20 @@ void bli_packm_blk_var1 // Query the datatype-specific function pointer from the func_t object. packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers ); - // For mixed-precision gemm, select the proper kernel (only dense panels). - if ( dt_c != dt_p ) - { - packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; - } + // For mixed-precision gemm, select the proper kernel (only dense panels). + if ( dt_c != dt_p ) + { + packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; + } - // Query the user-provided packing kernel from the obj_t. 
- obj_pack_ukr_fn_t pack_ker_user = bli_obj_pack_ukr_fn( c ); + // Query the user-provided packing kernel from the obj_t. If provided, + // this overrides the kernel determined above. + packm_blk_var1_params_t* params = bli_obj_pack_params( c ); - /* Extract the conjugation bit from the transposition argument. */ - conj_t conjc = bli_extract_conj( transc ); + if ( params ) + { + packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; + } /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale by 3/2, and in the @@ -216,38 +212,16 @@ void bli_packm_blk_var1 for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; ic += ic_inc, ip += ip_inc, it += 1 ) { - dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); - - doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; - char* c_begin = c_cast + (ic )*incc*dt_c_size; - - inc_t p_inc = ps_p; + dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_off_i = panel_dim_off + ic; - if ( pack_ker_user ) - { - /* This case executes if the user has specified a custom packing microkernel */ + doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; + char* c_begin = c_cast + (ic )*incc*dt_c_size; - dim_t panel_dim_off_i = panel_dim_off + ic; + inc_t p_inc = ps_p; - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. 
*/ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) - { - pack_ker_user( panel_dim_i, - panel_dim_max, - panel_dim_off_i, - panel_len_full, - panel_len_max, - panel_len_off, - kappa_cast, - c_begin, incc, ldc, - p_begin, ldp, - bli_obj_user_data( c ), - cntx ); - } - } - else if ( bli_is_triangular( strucc ) && - bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) + if ( bli_is_triangular( strucc ) && + bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) { /* This case executes if the panel belongs to a triangular matrix AND is completely unstored (ie: zero). If the panel @@ -272,10 +246,9 @@ void bli_packm_blk_var1 if ( diagoffc_i < 0 ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); - dim_t panel_off_i; - dim_t panel_len_i; - dim_t panel_len_max_i; - doff_t diagoffp_i; + dim_t panel_off_i; + dim_t panel_len_i; + dim_t panel_len_max_i; if ( bli_is_lower( uploc ) ) { @@ -283,18 +256,18 @@ void bli_packm_blk_var1 panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, panel_len_max ); - diagoffp_i = diagoffc_i; } else /* if ( bli_is_upper( uploc ) ) */ { panel_off_i = bli_abs( diagoffc_i ); panel_len_i = panel_len_full - panel_off_i; panel_len_max_i = panel_len_max - panel_off_i; - diagoffp_i = 0; } - char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; - char* p_use = p_begin; + dim_t panel_len_off_i = panel_off_i + panel_len_off; + + char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; + char* p_use = p_begin; /* We need to re-compute the imaginary stride as a function of panel_len_max_i since triangular packed matrices have panels @@ -312,7 +285,6 @@ void bli_packm_blk_var1 if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) { packm_ker_cast( strucc, - diagoffp_i, diagc, uploc, conjc, @@ -322,11 +294,14 @@ void bli_packm_blk_var1 panel_len_i, panel_dim_max, panel_len_max, + panel_dim_off_i, + panel_len_off_i, kappa_cast, c_use, 
incc, ldc, p_use, ldp, - is_p_use, - cntx ); + is_p_use, + cntx, + params ); } /* NOTE: This value is usually LESS than ps_p because triangular @@ -334,18 +309,17 @@ void bli_packm_blk_var1 than a "full" micro-panel. */ p_inc = ( is_p_use * ss_num ) / ss_den; } - else if ( bli_is_herm_or_symm( strucc ) ) + else { - /* This case executes if the panel belongs to a Hermitian or - symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. */ + /* This case executes if the panel is either dense, or belongs + to a Hermitian or symmetric matrix, which includes stored, + unstored, and diagonal-intersecting panels. */ /* The definition of bli_packm_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) { packm_ker_cast( strucc, - diagoffc_i, diagc, uploc, conjc, @@ -355,37 +329,13 @@ void bli_packm_blk_var1 panel_len_full, panel_dim_max, panel_len_max, + panel_dim_off_i, + panel_len_off, kappa_cast, c_begin, incc, ldc, p_begin, ldp, is_p, - cntx ); - } - } - else - { - /* This case executes if the panel is general, or, if the - panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ - - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. 
*/ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) - { - packm_ker_cast( BLIS_GENERAL, - 0, - diagc, - BLIS_DENSE, - conjc, - schema, - invdiag, - panel_dim_i, - panel_len_full, - panel_dim_max, - panel_len_max, - kappa_cast, - c_begin, incc, ldc, - p_begin, ldp, is_p, - cntx ); + cntx, + params ); } } diff --git a/frame/1m/packm/bli_packm_var.h b/frame/1m/packm/bli_packm_blk_var1.h similarity index 72% rename from frame/1m/packm/bli_packm_var.h rename to frame/1m/packm/bli_packm_blk_var1.h index cd9e14186b..bb2c2f0f52 100644 --- a/frame/1m/packm/bli_packm_var.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -37,6 +37,30 @@ // Prototype object-based interfaces. // +typedef struct +{ + // Type of C Type of P + packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; +} packm_blk_var1_params_t; + +BLIS_INLINE void bli_packm_blk_var1_init_params + ( + packm_blk_var1_params_t* params + ) +{ + #undef GENTFUNC2 + #define GENTFUNC2(ctypec,ctypep,chc,chp,name) \ + params->ukr_fn[ PASTEMAC(chc,type) ][ PASTEMAC(chp,type) ] = \ + ( packm_ker_vft )PASTEMAC2(chc,chp,name); + INSERT_GENTFUNC2_MIXDP0(packm_struc_cxk_md) + + #undef GENTFUNC + #define GENTFUNC(ctype,ch,name) \ + params->ukr_fn[ PASTEMAC(ch,type) ][ PASTEMAC(ch,type) ] = \ + ( packm_ker_vft )PASTEMAC(ch,name); + INSERT_GENTFUNC_BASIC0(packm_struc_cxk) +} + BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index fc6ba8052c..4a61c92d03 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -39,7 +39,6 @@ cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, @@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node // Initialize the packm_params_t struct. 
params->size = sizeof( packm_params_t ); - params->var_func = packm_var_func; params->bmid_m = bmid_m; params->bmid_n = bmid_n; params->does_invert_diag = does_invert_diag; diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 17aa196e8d..14bfe1ce85 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -36,7 +36,6 @@ struct packm_params_s { uint64_t size; // size field must be present and come first. - packm_var_oft var_func; bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; @@ -47,11 +46,6 @@ struct packm_params_s }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func; -} - BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; @@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index acc2fd944c..299be85d38 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -37,7 +37,7 @@ bool bli_packm_init ( - obj_t* a, + obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, @@ -55,41 +55,25 @@ bool bli_packm_init // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_packm_init_check( a, p, cntx ); + bli_packm_init_check( c, p, cntx ); // We begin by copying the fields of A. - bli_obj_alias_to( a, p ); + bli_obj_alias_to( c, p ); // If the object is marked as being filled with zeros, then we can skip // the packm operation entirely and alias. - if ( bli_obj_is_zeros( a ) ) + if ( bli_obj_is_zeros( c ) ) return false; // Extract various fields from the control tree. 
- bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); - bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); - bool does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); - bool rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); - bool rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); - num_t dt_tar = bli_obj_target_dt( a ); - num_t dt_scalar = bli_obj_scalar_dt( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); - - invdiag_t invert_diag; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - - if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; - - if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; - - if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; + bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + num_t dt_tar = bli_obj_target_dt( c ); + num_t dt_scalar = bli_obj_scalar_dt( c ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we @@ -102,35 +86,19 @@ bool bli_packm_init // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); + // Store the pack schema to the object. 
+ bli_obj_set_pack_schema( schema, p ); + // Clear the conjugation field from the object since matrix packing - // in BLIS is deemed to take care of all conjugation necessary. + // in BLIS is deemed to take care of all conjugation necessary. bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); - // If we are packing micropanels, mark P as dense. Otherwise, we are - // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of P (inherited - // from A) with BLIS_DENSE because that information may be needed by - // the level-2 operation's unblocked variant to decide whether to - // execute a "lower" or "upper" branch of code. - if ( bli_is_panel_packed( schema ) ) - { - bli_obj_set_uplo( BLIS_DENSE, p ); - } + // Micropanels are always packed here, so mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); - // Set the invert diagonal field. - bli_obj_set_invert_diag( invert_diag, p ); - - // Set the pack status of P to the pack schema prescribed in the control - // tree node. - bli_obj_set_pack_schema( schema, p ); - - // Set the packing order bits. - bli_obj_set_pack_order_if_upper( pack_ord_if_up, p ); - bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. @@ -156,161 +124,95 @@ bool bli_packm_init // Extract the element size for the packed object. siz_t elem_size_p = bli_obj_elem_size( p ); - inc_t rs_p, cs_p, is_p; - siz_t size_p; - - // Set the row and column strides of p based on the pack schema. - if ( bli_is_row_packed( schema ) && - !bli_is_panel_packed( schema ) ) - { - // For regular row storage, the padded width of our matrix - // should be used for the row stride, with the column stride set - // to one.
By using the WIDTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the right edge of - // the matrix. - rs_p = n_p_pad; - cs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc rows begin at - // aligned addresses. - rs_p = bli_align_dim_to_size( rs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer. - size_p = m_p_pad * rs_p * elem_size_p; - } - else if ( bli_is_col_packed( schema ) && - !bli_is_panel_packed( schema ) ) + // The panel dimension (for each datatype) should be equal to the + // default (logical) blocksize multiple in the m dimension. + dim_t m_panel = bmult_m_def; + + // The "column stride" of a row-micropanel packed object is interpreted + // as the column stride WITHIN a micropanel. Thus, this is equal to the + // packing (storage) blocksize multiple (which may be equal to the + // default (logical) blocksize multiple). + inc_t cs_p = bmult_m_pack; + + // The "row stride" of a row-micropanel packed object is interpreted + // as the row stride WITHIN a micropanel. Thus, it is unit. + inc_t rs_p = 1; + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + inc_t ps_p = cs_p * n_p_pad; + + // As a general rule, we don't want micropanel strides to be odd. This + // is primarily motivated by our desire to support interleaved 3m + // micropanels, in which case we have to scale the panel stride + // by 3/2.
That division by 2 means the numerator (prior to being + // scaled by 3) must be even. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + + // Set the imaginary stride (in units of fundamental elements) for + // 3m and 4m (separated or interleaved). We use the unadjusted ps_p + // since it tracks the number of real part elements contained + // within each micropanel of the source matrix. Therefore, this + // is the number of real elements that must be traversed before + // reaching the imaginary part (3mi/4mi) of the packed micropanel, + // or the real part of the next micropanel (3ms). + inc_t is_p; + if ( bli_is_3mi_packed( schema ) ) is_p = ps_p; + else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p; + else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p * ( m_p_pad / m_panel ); + else is_p = 1; + + // Here, we adjust the panel stride, if necessary. Remember: ps_p is + // always interpreted as being in units of the datatype of the object + // which is not necessarily how the micropanels will be stored. For + // interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi, + // we halve ps_p. Why? Because the macro-kernel indexes in units of + // the complex datatype. So these changes "trick" it into indexing + // the correct amount. + if ( bli_is_3mi_packed( schema ) ) { - // For regular column storage, the padded length of our matrix - // should be used for the column stride, with the row stride set - // to one. By using the LENGTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the bottom edge of - // the matrix. - cs_p = m_p_pad; - rs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc columns begin at - // aligned addresses. - cs_p = bli_align_dim_to_size( cs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer.
- size_p = cs_p * n_p_pad * elem_size_p; + ps_p = ( ps_p * 3 ) / 2; } - else if ( bli_is_panel_packed( schema ) ) + else if ( bli_is_3ms_packed( schema ) || + bli_is_ro_packed( schema ) || + bli_is_io_packed( schema ) || + bli_is_rpi_packed( schema ) ) { - dim_t m_panel; - dim_t ps_p, ps_p_orig; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the m dimension. - m_panel = bmult_m_def; - - // The "column stride" of a row-micropanel packed object is interpreted - // as the column stride WITHIN a micropanel. Thus, this is equal to the - // packing (storage) blocksize multiple, which may be equal to the - // default (logical) blocksize multiple). - cs_p = bmult_m_pack; - - // The "row stride" of a row-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. Thus, it is unit. - rs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded width computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the right edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the m - // dimension of the matrix is not a whole multiple of MR. - ps_p = cs_p * n_p_pad; - - // As a general rule, we don't want micropanel strides to be odd. This - // is primarily motivated by our desire to support interleaved 3m - // micropanels, in which case we have to scale the panel stride - // by 3/2. That division by 2 means the numerator (prior to being - // scaled by 3) must be even. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Preserve this early panel stride value for use later, if needed. - ps_p_orig = ps_p; - - // Here, we adjust the panel stride, if necessary. 
Remember: ps_p is - // always interpreted as being in units of the datatype of the object - // which is not necessarily how the micropanels will be stored. For - // interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi, - // we halve ps_p. Why? Because the macro-kernel indexes in units of - // the complex datatype. So these changes "trick" it into indexing - // the correct amount. - if ( bli_is_3mi_packed( schema ) ) - { - ps_p = ( ps_p * 3 ) / 2; - } - else if ( bli_is_3ms_packed( schema ) || - bli_is_ro_packed( schema ) || - bli_is_io_packed( schema ) || - bli_is_rpi_packed( schema ) ) - { - // Despite the fact that the packed micropanels will contain - // real elements, the panel stride that we store in the obj_t - // (which is passed into the macro-kernel) needs to be in units - // of complex elements, since the macro-kernel will index through - // micropanels via complex pointer arithmetic for trmm/trsm. - // Since the indexing "increment" will be twice as large as each - // actual stored element, we divide the panel_stride by 2. - ps_p = ps_p / 2; - } - - // Set the imaginary stride (in units of fundamental elements) for - // 3m and 4m (separated or interleaved). We use ps_p_orig since - // that variable tracks the number of real part elements contained - // within each micropanel of the source matrix. Therefore, this - // is the number of real elements that must be traversed before - // reaching the imaginary part (3mi/4mi) of the packed micropanel, - // or the real part of the next micropanel (3ms). - if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig; - else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig; - else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel ); - else is_p = 1; - - // Store the strides and panel dimension in P. 
- bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( m_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_panel, p ); - bli_obj_set_panel_width( n_p, p ); - - // Compute the size of the packed buffer. - size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; + // Despite the fact that the packed micropanels will contain + // real elements, the panel stride that we store in the obj_t + // (which is passed into the macro-kernel) needs to be in units + // of complex elements, since the macro-kernel will index through + // micropanels via complex pointer arithmetic for trmm/trsm. + // Since the indexing "increment" will be twice as large as each + // actual stored element, we divide the panel_stride by 2. + ps_p = ps_p / 2; } - else - { - // NOTE: When implementing block storage, we only need to implement - // the following two cases: - // - row-stored blocks in row-major order - // - column-stored blocks in column-major order - // The other two combinations coincide with that of packed row-panel - // and packed column- panel storage. - return false; - } + // Store the strides and panel dimension in P. + bli_obj_set_strides( rs_p, cs_p, p ); + bli_obj_set_imag_stride( is_p, p ); + bli_obj_set_panel_dim( m_panel, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( m_panel, p ); + bli_obj_set_panel_width( n_p, p ); - if ( size_p == 0 ) - return false; + // Compute the size of the packed buffer. + siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; + + if ( size_p == 0 ) + return false; + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). 
void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); - bli_obj_set_buffer( buffer, p ); + bli_obj_set_buffer( buffer, p ); - return true; + return true; } diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 7d3a5ede5e..1db6765a15 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -46,49 +46,8 @@ void bli_packm_int { bli_init_once(); - packm_var_oft f; - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( a ) ) bli_abort(); - - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can return, since by now aliasing has already - // taken place in packm_init(). - // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - return; - } - - // At this point, we can be assured that cntl is not NULL. Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can return, as above. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. 
- if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) ) - { - return; - } - - // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely. - if ( bli_obj_is_zeros( a ) ) - { - return; - } - - // Extract the function pointer from the current control tree node. - f = bli_cntl_packm_params_var_func( cntl ); + // Extract the function pointer from the object. + packm_var_oft f = bli_obj_pack_fn( a ); // FGVZ: Not sure why we need this barrier, but we do. bli_thread_barrier( thread ); diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_md.h deleted file mode 100644 index 1b7e44496f..0000000000 --- a/frame/1m/packm/bli_packm_md.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_packm_struc_cxk_md.h" - diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index d954353c77..9ca12a9da0 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -40,7 +40,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -50,6 +49,8 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ @@ -84,7 +85,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -94,6 +94,8 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ @@ -108,7 +110,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -118,6 +119,8 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ @@ -138,7 +141,6 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -148,6 +150,8 @@ void 
PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ @@ -155,8 +159,9 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t i, j; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t i, j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ @@ -379,7 +384,6 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -389,6 +393,8 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ @@ -396,6 +402,8 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. 
*/ \ PASTEMAC(ch,kername) \ ( \ diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index c4de5cf2e7..973a02612b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -38,7 +38,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,6 +47,8 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c index 54134056f0..c2cbfa2190 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.c +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -40,7 +40,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -50,11 +49,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ /* Handle micro-panel packing based on the structure of the matrix @@ -74,7 +76,7 @@ void PASTEMAC(ch,varname) \ kappa, \ c, incc, ldc, \ p, ldp, \ - cntx \ + cntx \ ); \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ @@ -84,7 +86,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_1er) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -94,11 +95,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -108,7 +112,6 @@ void 
PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_1er) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -118,11 +121,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ } @@ -138,7 +144,6 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -148,15 +153,19 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t j; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. 
*/ \ @@ -368,7 +377,6 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -378,13 +386,17 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ doff_t diagoffc_abs = bli_abs( diagoffc ); \ ctype* p11 = p + (diagoffc_abs )*ldp; \ \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h index 677b600138..a953e93673 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.h +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -38,7 +38,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,11 +47,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_3mis.c b/frame/1m/packm/bli_packm_struc_cxk_3mis.c index 97c96cd209..16e7de70ea 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_3mis.c +++ b/frame/1m/packm/bli_packm_struc_cxk_3mis.c @@ -40,7 +40,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -50,11 +49,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* 
restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ /* Handle micro-panel packing based on the structure of the matrix @@ -83,7 +85,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_3mis) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -93,11 +94,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -107,7 +111,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_3mis) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -117,11 +120,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ } @@ -137,7 +143,6 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3mis, packm_cxk_3mis ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -147,15 +152,19 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t i, j; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t i, j; \ \ \ /* Handle the case where the micro-panel does NOT intersect the @@ -440,7 +449,6 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3mis, packm_cxk_3mis ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -450,13 +458,18 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, 
\ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. */ \ PASTEMAC(ch,kername) \ ( \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_3mis.h b/frame/1m/packm/bli_packm_struc_cxk_3mis.h index 9744ea66b4..3dcbf177e5 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_3mis.h +++ b/frame/1m/packm/bli_packm_struc_cxk_3mis.h @@ -38,7 +38,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,11 +47,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_3mis ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_4mi.c b/frame/1m/packm/bli_packm_struc_cxk_4mi.c index 2095f3341b..d711961091 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_4mi.c +++ b/frame/1m/packm/bli_packm_struc_cxk_4mi.c @@ -40,7 +40,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -50,11 +49,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ /* Handle micro-panel packing based on the structure of the matrix @@ -83,7 +85,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_4mi) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, 
\ conjc, \ @@ -93,11 +94,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -107,7 +111,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_4mi) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -117,11 +120,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ } @@ -137,7 +143,6 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4mi, packm_cxk_4mi ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -147,15 +152,19 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t i, j; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t i, j; \ \ \ /* Handle the case where the micro-panel does NOT intersect the @@ -425,7 +434,6 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4mi, packm_cxk_4mi ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -435,13 +443,18 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. 
*/ \ PASTEMAC(ch,kername) \ ( \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_4mi.h b/frame/1m/packm/bli_packm_struc_cxk_4mi.h index 5e0b234525..f290473d09 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_4mi.h +++ b/frame/1m/packm/bli_packm_struc_cxk_4mi.h @@ -38,7 +38,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,11 +47,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_4mi ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 6ded844188..8c64fedede 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -42,7 +42,6 @@ void PASTEMAC2(chc,chp,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -52,11 +51,14 @@ void PASTEMAC2(chc,chp,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ ctype_c* restrict c, inc_t incc, inc_t ldc, \ ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ if ( bli_is_nat_packed( schema ) ) \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h index 5c6dc321cf..f493838b3a 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.h +++ b/frame/1m/packm/bli_packm_struc_cxk_md.h @@ -38,7 +38,6 @@ void PASTEMAC2(chc,chp,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,11 +47,14 @@ void PASTEMAC2(chc,chp,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t 
panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ ctype_c* restrict c, inc_t incc, inc_t ldc, \ ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.c b/frame/1m/packm/bli_packm_struc_cxk_rih.c index e7dd56ce1b..726fa16a6d 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_rih.c +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.c @@ -40,7 +40,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -50,11 +49,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ /* Handle micro-panel packing based on the structure of the matrix @@ -84,7 +86,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_rih) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -94,11 +95,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -108,7 +112,6 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_rih) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ @@ -118,11 +121,14 @@ void PASTEMAC(ch,varname) \ panel_len, \ panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ c, incc, ldc, \ p, ldp, \ is_p, \ - cntx \ + cntx, \ + params \ ); \ } \ \ @@ -151,7 +157,6 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_rih, packm_cxk_rih ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t 
conjc, \ @@ -161,15 +166,19 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t j; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t j; \ \ \ /* Handle the case where the micro-panel does NOT intersect the @@ -388,7 +397,6 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -398,13 +406,18 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. 
*/ \ PASTEMAC(ch,kername) \ ( \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.h b/frame/1m/packm/bli_packm_struc_cxk_rih.h index deb2fdf5e2..3329d25e13 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_rih.h +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.h @@ -38,7 +38,6 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -48,11 +47,14 @@ void PASTEMAC(ch,varname) \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_rih ) diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index 8935c0a38e..0ae5b03aa7 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -94,6 +94,34 @@ void bli_l3_int bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Ensure that a valid packing function is set on A and B + if ( !bli_obj_pack_fn( &a_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local ); + + if ( !bli_obj_pack_fn( &b_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local ); + + // If we are using the default packing function, ensure that the + // packing parameters are set. If using a custom packing function, + // it's up to the user to make sure there are valid parameters. 
+ /* + packm_blk_var1_params_t params_a, params_b; + + if ( bli_obj_pack_fn( &a_local ) == bli_packm_blk_var1 && + !bli_obj_pack_params( &a_local ) ) + { + bli_packm_blk_var1_init_params( &params_a ); + bli_obj_set_pack_params( &params_a, &a_local ); + } + + if ( bli_obj_pack_fn( &b_local ) == bli_packm_blk_var1 && + !bli_obj_pack_params( &b_local ) ) + { + bli_packm_blk_var1_init_params( &params_b ); + bli_obj_set_pack_params( &params_b, &b_local ); + } + */ + // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally @@ -115,7 +143,7 @@ void bli_l3_int else // if ( bli_obj_root_is_triangular( b ) ) { if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index f389a3bcc7..72d78efe16 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create ) { void_fp macro_kernel_fp; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. @@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - // Create two nodes for the macro-kernel. 
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( @@ -94,7 +89,6 @@ cntl_t* bli_gemmbp_cntl_create ( rntm, bli_l3_packa, // pack the left-hand operand - packa_fp, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -120,7 +114,6 @@ cntl_t* bli_gemmbp_cntl_create ( rntm, bli_l3_packb, // pack the right-hand operand - packb_fp, BLIS_NR, BLIS_KR, FALSE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 39a5ce3f46..422f8040c9 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create ) { void_fp macro_kernel_p; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointer to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. macro_kernel_p = bli_trsm_xx_ker_var2; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // @@ -96,7 +91,6 @@ cntl_t* bli_trsm_l_cntl_create ( rntm, bli_l3_packa, // trsm operation's packm function for A. - packa_fp, BLIS_MR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -134,7 +128,6 @@ cntl_t* bli_trsm_l_cntl_create ( rntm, bli_l3_packa, // trsm operation's packm function for A. - packa_fp, BLIS_MR, BLIS_MR, #ifdef BLIS_ENABLE_TRSM_PREINVERSION @@ -172,7 +165,6 @@ cntl_t* bli_trsm_l_cntl_create ( rntm, bli_l3_packb, - packb_fp, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create // NOTE: trsm macrokernels are presently disabled for right-side execution. void_fp macro_kernel_p = bli_trsm_xx_ker_var2; - void_fp packa_fp = bli_packm_blk_var1; - void_fp packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. 
@@ -245,7 +234,6 @@ cntl_t* bli_trsm_r_cntl_create ( rntm, bli_l3_packa, - packa_fp, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -271,7 +259,6 @@ cntl_t* bli_trsm_r_cntl_create ( rntm, bli_l3_packb, - packb_fp, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index b3245c9611..2824540896 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -119,9 +119,9 @@ void bli_obj_create_without_buffer bli_obj_set_diag_offset( 0, obj ); bli_obj_set_pack_fn( NULL, obj ); - bli_obj_set_pack_ukr_fn( NULL, obj ); + bli_obj_set_pack_params( NULL, obj ); bli_obj_set_ker_fn( NULL, obj ); - bli_obj_set_ukr_fn( NULL, obj ); + bli_obj_set_ker_params( NULL, obj ); // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 43c6581846..e229dab877 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1189,62 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // -- User-provided information macros -- -// User data query - -BLIS_INLINE void* bli_obj_user_data( obj_t* obj ) -{ - return obj->user_data; -} - -// User data modification - -BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj ) -{ - obj->user_data = data; -} - // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { - return obj->pack; + return obj->pack_fn; } -BLIS_INLINE obj_pack_ukr_fn_t bli_obj_pack_ukr_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { - return obj->pack_ukr; + return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { - return obj->ker; + return obj->ker_fn; } -BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { - return obj->ukr; + return obj->ker_params; } // Function pointer modification -BLIS_INLINE void 
bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { - obj->pack = pack; + obj->pack_fn = pack_fn; } -BLIS_INLINE void bli_obj_set_pack_ukr_fn( obj_pack_ukr_fn_t pack_ukr, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { - obj->pack_ukr = pack_ukr; + obj->pack_params = params; } -BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) +BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { - obj->ker = ker; + obj->ker_fn = ker_fn; } -BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj ) +BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { - obj->ukr = ukr; + obj->ker_params = params; } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 0d2b25f7f0..5c637f7837 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1240,29 +1240,14 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - mem_t* mem, struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); -typedef void (*obj_pack_ukr_fn_t) - ( - dim_t m, \ - dim_t m_max, \ - dim_t m_off, \ - dim_t n, \ - dim_t n_max, \ - dim_t n_off, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t ldp, \ - void* params, \ - struct cntx_s* cntx \ - ); - typedef void (*obj_ker_fn_t) ( struct obj_s* a, @@ -1270,23 +1255,10 @@ typedef void (*obj_ker_fn_t) struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); -typedef void (*obj_ukr_fn_t) - ( - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - struct cntx_s* restrict cntx - ); - typedef struct obj_s { 
// Basic fields @@ -1317,14 +1289,11 @@ typedef struct obj_s dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel - // User data pointer - void* user_data; - - // Function pointers - obj_pack_fn_t pack; - obj_pack_ukr_fn_t pack_ukr; - obj_ker_fn_t ker; - obj_ukr_fn_t ukr; + // User-customizable fields + obj_pack_fn_t pack_fn; + void* pack_params; + obj_ker_fn_t ker_fn; + void* ker_params; } obj_t; @@ -1339,72 +1308,68 @@ typedef struct obj_s #define BLIS_OBJECT_INITIALIZER \ { \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 0, 0 }, \ - .diag_off = 0, \ + .root = NULL, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. */ \ + .off = { 0, 0 }, \ + .dim = { 0, 0 }, \ + .diag_off = 0, \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. */ \ \ - .scalar = { 0.0, 0.0 }, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .scalar = { 0.0, 0.0 }, \ \ - .user_data = NULL, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .pack = NULL, \ - .pack_ukr = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ + .root = NULL, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. 
*/ \ + .off = { 0, 0 }, \ + .dim = { 1, 1 }, \ + .diag_off = 0, \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. */ \ \ - .scalar = { 0.0, 0.0 }, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .scalar = { 0.0, 0.0 }, \ \ - .user_data = NULL, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .pack = NULL, \ - .pack_ukr = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } // Define these macros here since they must be updated if contents of @@ -1412,79 +1377,75 @@ typedef struct obj_s BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) { - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - b->dim[0] = a->dim[0]; - b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->pack_ukr = a->pack_ukr; - b->ker = a->ker; - b->ukr = a->ukr; + b->root = a->root; + + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; + b->dim[0] = a->dim[0]; + b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; + + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; + + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; + + b->scalar = a->scalar; + + //b->pack_mem = a->pack_mem; + b->m_padded = 
a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) { - b->root = a->root; + b->root = a->root; - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; // Avoid copying m and n since they will be overwritten. - //b->dim[0] = a->dim[0]; - //b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; + //b->dim[0] = a->dim[0]; + //b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; - b->scalar = a->scalar; + b->scalar = a->scalar; // Avoid copying pack_mem entry. // FGVZ: You should probably make sure this is right. - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->pack_ukr = a->pack_ukr; - b->ker = a->ker; - b->ukr = a->ukr; + //b->pack_mem = a->pack_mem; + b->m_padded = a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } // Initializors for global scalar constants. 
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index c06e73ecb0..b837227fd1 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -2635,7 +2635,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia ( NULL, // we don't need the small block allocator from the runtime. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). - bli_packm_blk_var1, bmult_id_m, bmult_id_n, does_inv_diag, @@ -2647,7 +2646,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia ); // Pack the contents of A to P. - bli_packm_int( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); // Return the control tree pointer so the caller can free the cntl_t and its // mem_t entry later on. From dfc1267cbb30d6752a9a299ed6cbfa0b3cde2ba8 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 28 Sep 2021 18:57:52 -0500 Subject: [PATCH 10/24] Fix race condition in packing allocation. All threads need to make a copy of the master thread's local mem_t struct, but the master thread may exit before this is complete. Fixed by putting the thread barrier in the right place. Also, fixed a problem with the max panel length and work partitioning for triangular matrices. --- frame/1m/packm/bli_packm_alloc.c | 93 ++++++++--------------------- frame/1m/packm/bli_packm_blk_var1.c | 24 ++++---- 2 files changed, 39 insertions(+), 78 deletions(-) diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index 81a7a95504..df6750d7ac 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -49,22 +49,29 @@ void* bli_packm_alloc // Query the address of the mem_t entry within the control tree node. mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); - // Check the mem_t field in the control tree. 
If it is unallocated, then - // we need to acquire a block from the memory broker and broadcast it to - // all threads in the chief's thread group. - if ( bli_mem_is_unalloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; + mem_t* local_mem_p; + mem_t local_mem_s; + + siz_t cntl_mem_size = 0; + if ( bli_mem_is_alloc( cntl_mem_p ) ) + cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { if ( bli_thread_am_ochief( thread ) ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_packm(): acquiring mem pool block\n" ); - #endif - - // The chief thread acquires a block from the memory broker - // and saves the associated mem_t entry to local_mem_s. + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. + if ( bli_mem_is_alloc( cntl_mem_p ) ) + { + bli_pba_release + ( + rntm, + cntl_mem_p + ); + } bli_pba_acquire_m ( rntm, @@ -78,63 +85,13 @@ void* bli_packm_alloc // all threads. local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - // Save the contents of the chief thread's local mem_t entry to the - // mem_t field in this thread's control tree node. + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. *cntl_mem_p = *local_mem_p; - } - else // ( bli_mem_is_alloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; - // If the mem_t entry in the control tree does NOT contain a NULL - // buffer, then a block has already been acquired from the memory - // broker and cached in the control tree. - - // As a sanity check, we should make sure that the mem_t object isn't - // associated with a block that is too small compared to the size of - // the packed matrix buffer that is needed, according to the return - // value from packm_init(). 
- siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - - if ( cntl_mem_size < size_needed ) - { - if ( bli_thread_am_ochief( thread ) ) - { - // The chief thread releases the existing block associated with - // the mem_t entry in the control tree, and then re-acquires a - // new block, saving the associated mem_t entry to local_mem_s. - bli_pba_release - ( - rntm, - cntl_mem_p - ); - bli_pba_acquire_m - ( - rntm, - size_needed, - pack_buf_type, - &local_mem_s - ); - } - - // Broadcast the address of the chief thread's local mem_t entry to - // all threads. - local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - - // Save the chief thread's local mem_t entry to the mem_t field in - // this thread's control tree node. - *cntl_mem_p = *local_mem_p; - } - else - { - // If the mem_t entry is already allocated and sufficiently large, - // then we use it as-is. No action is needed, because all threads - // will already have the cached values in their local control - // trees' mem_t entries, currently pointed to by cntl_mem_p. - - bli_thread_barrier( thread ); - } + // Barrier so that the master thread doesn't return from the function + // before we are done reading. + bli_thread_barrier( thread ); } return bli_mem_buffer( cntl_mem_p ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index c71d24a64e..5a87de5bcb 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -151,7 +151,7 @@ void bli_packm_blk_var1 // this overrides the kernel determined above. packm_blk_var1_params_t* params = bli_obj_pack_params( c ); - if ( params ) + if ( params && params->ukr_fn[ dt_c ][ dt_p ] ) { packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; } @@ -220,6 +220,16 @@ void bli_packm_blk_var1 inc_t p_inc = ps_p; + /* NOTE: We MUST use round-robin partitioning when packing + micropanels of a triangular matrix. 
Hermitian/symmetric + and general packing may use slab or round-robin, depending + on which was selected at configure-time. */ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ + bool my_iter = bli_is_triangular( strucc ) + ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) + : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); + if ( bli_is_triangular( strucc ) && bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) { @@ -278,11 +288,7 @@ void bli_packm_blk_var1 /* We nudge the imaginary stride up by one if it is odd. */ is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); - /* NOTE: We MUST use round-robin partitioning when packing - micropanels of a triangular matrix. Hermitian/symmetric - and general packing may use slab or round-robin, depending - on which was selected at configure-time. */ - if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) + if ( my_iter ) { packm_ker_cast( strucc, diagc, @@ -293,7 +299,7 @@ void bli_packm_blk_var1 panel_dim_i, panel_len_i, panel_dim_max, - panel_len_max, + panel_len_max_i, panel_dim_off_i, panel_len_off_i, kappa_cast, @@ -315,9 +321,7 @@ void bli_packm_blk_var1 to a Hermitian or symmetric matrix, which includes stored, unstored, and diagonal-intersecting panels. */ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + if ( my_iter ) { packm_ker_cast( strucc, diagc, From 184cb76d9aa6b4df4649143a948715b335105e88 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 29 Sep 2021 09:59:53 -0500 Subject: [PATCH 11/24] `bli_packm_struc_cxk_1er` doesn't handle stored sub-parts of triangular matrices correctly, so pass BLIS_GENERAL in this case. 
--- frame/1m/packm/bli_packm_blk_var1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 5a87de5bcb..3fb8124cb2 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -323,7 +323,7 @@ void bli_packm_blk_var1 if ( my_iter ) { - packm_ker_cast( strucc, + packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, diagc, uploc, conjc, From 6051056e18f8e8f5699f980df722247bc2a916dc Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 29 Sep 2021 10:33:00 -0500 Subject: [PATCH 12/24] Fixes for GEMM-MD. --- frame/1m/packm/bli_packm_init.c | 1 + frame/3/bli_l3_int.c | 3 ++- frame/3/gemm/bli_gemm_md.c | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 299be85d38..2cb2bb4f2b 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -85,6 +85,7 @@ bool bli_packm_init // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); + bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); // Store the pack schema to the object. bli_obj_set_pack_schema( schema, p ); diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index 0ae5b03aa7..6936327fcb 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -127,7 +127,8 @@ void bli_l3_int // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. 
- if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + //if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + if ( bli_obj_has_trans( c ) ) { bli_obj_induce_trans( &c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index a5b5754924..fe181e6174 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -183,9 +183,9 @@ mddm_t bli_gemm_md_ccr { bli_obj_swap( a, b ); - bli_obj_toggle_trans( a ); - bli_obj_toggle_trans( b ); - bli_obj_toggle_trans( c ); + bli_obj_induce_trans( a ); + bli_obj_induce_trans( b ); + bli_obj_induce_trans( c ); return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx ); } From d58d4c0cc5e976c50c2ef873e1fe7a2956c1bca6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 29 Sep 2021 16:42:33 -0500 Subject: [PATCH 13/24] Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs. --- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 19 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 18 +- kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c | 189 ++++++++++-------- 3 files changed, 140 insertions(+), 86 deletions(-) diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 5824d2d550..b48117ce08 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -264,12 +264,20 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. +" fmov s28, #0.0 \n\t" +" fmov w16, s28 \n\t" +" cmp w16, w8 \n\t" +" b.eq BETA_ZERO_C \n\t" +" \n\t" // First half of C is already loaded in this case. 
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +" \n\t" +" BETA_ZERO_C: \n\t" " \n\t" GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) +" \n\t" " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. @@ -278,12 +286,19 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " incb x8 \n\t" " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. +" fmov s28, #0.0 \n\t" +" fmov w16, s28 \n\t" +" cmp w16, w8 \n\t" +" b.eq BETA_ZERO_G \n\t" +" \n\t" GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +" \n\t" +" BETA_ZERO_G: \n\t" " \n\t" GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) " \n\t" " END_WRITE_MEM: \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 8659e8b7ee..94bc08ad97 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -252,12 +252,19 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " 
\n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. +" fmov s28, #0.0 \n\t" +" fmov w16, s28 \n\t" +" cmp w16, w8 \n\t" +" b.eq BETA_ZERO_C \n\t" +" \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +" \n\t" +" BETA_ZERO_C: \n\t" " \n\t" GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " b END_WRITE_MEM \n\t" " \n\t" @@ -267,12 +274,19 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " incb x8 \n\t" " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. 
+" fmov s28, #0.0 \n\t" +" fmov w16, s28 \n\t" +" cmp w16, w8 \n\t" +" b.eq BETA_ZERO_G \n\t" +" \n\t" GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +" \n\t" +" BETA_ZERO_G: \n\t" " \n\t" GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) " \n\t" " END_WRITE_MEM: \n\t" diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c index e502a34ed6..b9db587266 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c @@ -330,53 +330,53 @@ void bli_dgemm_armv7a_int_4x4 double b0, b1, b2, b3; double B0, B1, B2, B3; - double ab00, ab01, ab02, ab03; - double ab10, ab11, ab12, ab13; + double ab00, ab01, ab02, ab03; + double ab10, ab11, ab12, ab13; double ab20, ab21, ab22, ab23; - double ab30, ab31, ab32, ab33; + double ab30, ab31, ab32, ab33; - double* restrict c00, * restrict c01, * restrict c02, * restrict c03; + double* restrict c00, * restrict c01, * restrict c02, * restrict c03; double* restrict c10, * restrict c11, * restrict c12, * restrict c13; double* restrict c20, * restrict c21, * restrict c22, * restrict c23; - double* restrict c30, * restrict c31, * restrict c32, * restrict c33; + double* restrict c30, * restrict c31, * restrict c32, * restrict c33; double* restrict ap = a; - double* restrict bp = b; + double* restrict bp = b; double* restrict Ap = a + 4; - double* restrict Bp = b + 4; + double* restrict Bp = b + 4; - c00 = (c + 0*rs_c + 0*cs_c); - c10 = (c + 1*rs_c + 
0*cs_c); - c20 = (c + 2*rs_c + 0*cs_c); - c30 = (c + 3*rs_c + 0*cs_c); + c00 = (c + 0*rs_c + 0*cs_c); + c10 = (c + 1*rs_c + 0*cs_c); + c20 = (c + 2*rs_c + 0*cs_c); + c30 = (c + 3*rs_c + 0*cs_c); - c01 = (c + 0*rs_c + 1*cs_c); - c11 = (c + 1*rs_c + 1*cs_c); - c21 = (c + 2*rs_c + 1*cs_c); - c31 = (c + 3*rs_c + 1*cs_c); + c01 = (c + 0*rs_c + 1*cs_c); + c11 = (c + 1*rs_c + 1*cs_c); + c21 = (c + 2*rs_c + 1*cs_c); + c31 = (c + 3*rs_c + 1*cs_c); - c02 = (c + 0*rs_c + 2*cs_c); - c12 = (c + 1*rs_c + 2*cs_c); - c22 = (c + 2*rs_c + 2*cs_c); - c32 = (c + 3*rs_c + 2*cs_c); + c02 = (c + 0*rs_c + 2*cs_c); + c12 = (c + 1*rs_c + 2*cs_c); + c22 = (c + 2*rs_c + 2*cs_c); + c32 = (c + 3*rs_c + 2*cs_c); - c03 = (c + 0*rs_c + 3*cs_c); - c13 = (c + 1*rs_c + 3*cs_c); - c23 = (c + 2*rs_c + 3*cs_c); - c33 = (c + 3*rs_c + 3*cs_c); + c03 = (c + 0*rs_c + 3*cs_c); + c13 = (c + 1*rs_c + 3*cs_c); + c23 = (c + 2*rs_c + 3*cs_c); + c33 = (c + 3*rs_c + 3*cs_c); ab00 = 0.0; ab10 = 0.0; ab20 = 0.0; ab30 = 0.0; ab01 = 0.0; ab11 = 0.0; ab21 = 0.0; ab31 = 0.0; ab02 = 0.0; ab12 = 0.0; ab22 = 0.0; ab32 = 0.0; ab03 = 0.0; ab13 = 0.0; ab23 = 0.0; ab33 = 0.0; - A0 = *(Ap + 0); - A1 = *(Ap + 1); - A2 = *(Ap + 2); - A3 = *(Ap + 3); + A0 = *(Ap + 0); + A1 = *(Ap + 1); + A2 = *(Ap + 2); + A3 = *(Ap + 3); - a0 = *(ap + 0); + a0 = *(ap + 0); a1 = *(ap + 1); a2 = *(ap + 2); @@ -389,11 +389,11 @@ void bli_dgemm_armv7a_int_4x4 b1 = *(bp + 1); b2 = *(bp + 2); - double *Aplast = (Ap + 4*(k-k_left)); + double *Aplast = (Ap + 4*(k-k_left)); //for ( i = 0; i < k_iter; ++i ) // Unroll by factor 4. for ( ; Ap != Aplast ; ) // Unroll by factor 4. 
- { + { /* Prefetch */ //__asm__ ("pld\t[%0],#100\n\t" : :"r"(Ap) : ); __builtin_prefetch( ap + 112 ); @@ -452,7 +452,7 @@ void bli_dgemm_armv7a_int_4x4 b2 = *(bp + 10); ab03 += a0 * b3; - a0 = *(ap + 8); + a0 = *(ap + 8); ab13 += a1 * b3; a1 = *(ap + 9); ab23 += a2 * b3; @@ -460,17 +460,17 @@ void bli_dgemm_armv7a_int_4x4 ab33 += a3 * b3; //a3 = *(ap + 11); - ap += 8; - Ap += 8; - bp += 8; - Bp += 8; + ap += 8; + Ap += 8; + bp += 8; + Bp += 8; - } + } - for ( i = 0; i < k_left; ++i ) - { - a0 = *(ap + 0); + for ( i = 0; i < k_left; ++i ) + { + a0 = *(ap + 0); a1 = *(ap + 1); a2 = *(ap + 2); a3 = *(ap + 3); @@ -500,48 +500,73 @@ void bli_dgemm_armv7a_int_4x4 ab23 += a2 * b3; ab33 += a3 * b3; - ap += 4; - bp += 4; - } - - *c00 = *c00 * *beta; - *c10 = *c10 * *beta; - *c20 = *c20 * *beta; - *c30 = *c30 * *beta; - - *c01 = *c01 * *beta; - *c11 = *c11 * *beta; - *c21 = *c21 * *beta; - *c31 = *c31 * *beta; - - *c02 = *c02 * *beta; - *c12 = *c12 * *beta; - *c22 = *c22 * *beta; - *c32 = *c32 * *beta; - - *c03 = *c03 * *beta; - *c13 = *c13 * *beta; - *c23 = *c23 * *beta; - *c33 = *c33 * *beta; - - *c00 += ab00 * *alpha; - *c10 += ab10 * *alpha; - *c20 += ab20 * *alpha; - *c30 += ab30 * *alpha; - - *c01 += ab01 * *alpha; - *c11 += ab11 * *alpha; - *c21 += ab21 * *alpha; - *c31 += ab31 * *alpha; - - *c02 += ab02 * *alpha; - *c12 += ab12 * *alpha; - *c22 += ab22 * *alpha; - *c32 += ab32 * *alpha; - - *c03 += ab03 * *alpha; - *c13 += ab13 * *alpha; - *c23 += ab23 * *alpha; - *c33 += ab33 * *alpha; + ap += 4; + bp += 4; + } + + if ( *beta == 0.0 ) + { + *c00 = ab00 * *alpha; + *c10 = ab10 * *alpha; + *c20 = ab20 * *alpha; + *c30 = ab30 * *alpha; + + *c01 = ab01 * *alpha; + *c11 = ab11 * *alpha; + *c21 = ab21 * *alpha; + *c31 = ab31 * *alpha; + + *c02 = ab02 * *alpha; + *c12 = ab12 * *alpha; + *c22 = ab22 * *alpha; + *c32 = ab32 * *alpha; + + *c03 = ab03 * *alpha; + *c13 = ab13 * *alpha; + *c23 = ab23 * *alpha; + *c33 = ab33 * *alpha; + } + else + { + *c00 = *c00 * *beta; + *c10 
= *c10 * *beta; + *c20 = *c20 * *beta; + *c30 = *c30 * *beta; + + *c01 = *c01 * *beta; + *c11 = *c11 * *beta; + *c21 = *c21 * *beta; + *c31 = *c31 * *beta; + + *c02 = *c02 * *beta; + *c12 = *c12 * *beta; + *c22 = *c22 * *beta; + *c32 = *c32 * *beta; + + *c03 = *c03 * *beta; + *c13 = *c13 * *beta; + *c23 = *c23 * *beta; + *c33 = *c33 * *beta; + + *c00 += ab00 * *alpha; + *c10 += ab10 * *alpha; + *c20 += ab20 * *alpha; + *c30 += ab30 * *alpha; + + *c01 += ab01 * *alpha; + *c11 += ab11 * *alpha; + *c21 += ab21 * *alpha; + *c31 += ab31 * *alpha; + + *c02 += ab02 * *alpha; + *c12 += ab12 * *alpha; + *c22 += ab22 * *alpha; + *c32 += ab32 * *alpha; + + *c03 += ab03 * *alpha; + *c13 += ab13 * *alpha; + *c23 += ab23 * *alpha; + *c33 += ab33 * *alpha; + } } From dc30955a9de033416108ebda0ee1938de349ca06 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 20:40:25 +0000 Subject: [PATCH 14/24] Apply patch from @xrq-phys. --- kernels/armsve/3/armsve_asm_2vx10.h | 7 +++++ .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 29 ++++++++----------- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 29 +++++++------------ 3 files changed, 30 insertions(+), 35 deletions(-) diff --git a/kernels/armsve/3/armsve_asm_2vx10.h b/kernels/armsve/3/armsve_asm_2vx10.h index 8e37585cba..ae89fa1ece 100644 --- a/kernels/armsve/3/armsve_asm_2vx10.h +++ b/kernels/armsve/3/armsve_asm_2vx10.h @@ -130,6 +130,13 @@ SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \ SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR) +#define GEMM_C_FMLA_UKER(C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,PT,Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZSCALE) \ + GEMM_FMLA2(C0FH,C0LH,PT,Z0FH,Z0LH,ZSCALE) \ + GEMM_FMLA2(C1FH,C1LH,PT,Z1FH,Z1LH,ZSCALE) \ + GEMM_FMLA2(C2FH,C2LH,PT,Z2FH,Z2LH,ZSCALE) \ + GEMM_FMLA2(C3FH,C3LH,PT,Z3FH,Z3LH,ZSCALE) \ + GEMM_FMLA2(C4FH,C4LH,PT,Z4FH,Z4LH,ZSCALE) + #define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \ 
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \ GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \ diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index b48117ce08..e5b78a5921 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -264,20 +264,17 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. -" fmov s28, #0.0 \n\t" -" fmov w16, s28 \n\t" -" cmp w16, w8 \n\t" +" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. " b.eq BETA_ZERO_C \n\t" -" \n\t" // First half of C is already loaded in this case. -GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) +GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) -" \n\t" +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. @@ -286,20 +283,18 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " incb x8 \n\t" " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. 
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -" fmov s28, #0.0 \n\t" -" fmov w16, s28 \n\t" -" cmp w16, w8 \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. " b.eq BETA_ZERO_G \n\t" " \n\t" GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_G: \n\t" -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) " \n\t" " END_WRITE_MEM: \n\t" " b END_EXEC \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 94bc08ad97..00b3f20b44 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -252,20 +252,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. 
-" fmov s28, #0.0 \n\t" -" fmov w16, s28 \n\t" -" cmp w16, w8 \n\t" +" fcmp s31, #0.0 \n\t" " b.eq BETA_ZERO_C \n\t" -" \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. @@ -274,20 +270,17 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " incb x8 \n\t" " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. 
-" fmov s28, #0.0 \n\t" -" fmov w16, s28 \n\t" -" cmp w16, w8 \n\t" -" b.eq BETA_ZERO_G \n\t" " \n\t" +" fcmp s31, #0.0 \n\t" +" b.eq BETA_ZERO_G \n\t" GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_G: \n\t" -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) " \n\t" " END_WRITE_MEM: \n\t" " b END_EXEC \n\t" From c4fadf2f5cffbe70ec21ccba5a28f85e366af837 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 16:42:25 -0500 Subject: [PATCH 15/24] Make `bli_pba_rntm_set_pba` inline, instead of forcing export on Windows. 
--- build/libblis-symbols.def | 1 - frame/base/bli_pba.c | 11 ----------- frame/base/bli_pba.h | 9 +++++++-- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 97146a7861..8d29d73b25 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1307,7 +1307,6 @@ bli_pba_init_pools bli_pba_pool_size bli_pba_query bli_pba_release -bli_pba_rntm_set_pba bli_memsys_finalize bli_memsys_init bli_mkherm diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index a924bbefc8..f8835e5de0 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -282,17 +282,6 @@ void bli_pba_acquire_v #endif -void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - - siz_t bli_pba_pool_size ( pba_t* pba, diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index ce19991f55..6f2e4c0b8c 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -144,10 +144,15 @@ void bli_pba_release mem_t* mem ); -void bli_pba_rntm_set_pba +BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm - ); + ) +{ + pba_t* pba = bli_pba_query(); + + bli_rntm_set_pba( pba, rntm ); +} siz_t bli_pba_pool_size ( From c493032bc00fecff280a9e6dde4f541e2a91893b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 18:48:50 -0500 Subject: [PATCH 16/24] Make error checking level a thread-local variable. Previously, this was a global variable. Setting the value was synchronized via a mutex but reading the value was not. Of course, these accesses are almost certainly atomic, but there is still the possibility of one thread attempting to set the value and then reading the value set by another thread. For correct operation under user threading (e.g. pthreads), this should probably be thread-local with no mutex. 
--- frame/base/bli_error.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 1381afef0e..f4d6acf806 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -33,6 +33,7 @@ */ +#include "bli_lang_defs.h" #include "blis.h" // Internal array to hold error strings. @@ -133,11 +134,8 @@ void bli_abort( void ) // ----------------------------------------------------------------------------- -// A mutex to allow synchronous access to bli_err_chk_level. -static bli_pthread_mutex_t err_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; - // Current error checking level. -static errlev_t bli_err_chk_level = BLIS_FULL_ERROR_CHECKING; +static BLIS_THREAD_LOCAL errlev_t bli_err_chk_level = BLIS_FULL_ERROR_CHECKING; errlev_t bli_error_checking_level( void ) { @@ -151,17 +149,7 @@ void bli_error_checking_level_set( errlev_t new_level ) e_val = bli_check_valid_error_level( new_level ); bli_check_error_code( e_val ); - // Acquire the mutex protecting bli_err_chk_level. - bli_pthread_mutex_lock( &err_mutex ); - - // BEGIN CRITICAL SECTION - { - bli_err_chk_level = new_level; - } - // END CRITICAL SECTION - - // Release the mutex protecting bli_err_chk_level. - bli_pthread_mutex_unlock( &err_mutex ); + bli_err_chk_level = new_level; } bool bli_error_checking_is_enabled( void ) From ef64cdf5346a7f9823ffafde9755babf10835c2a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 18:50:44 -0500 Subject: [PATCH 17/24] Fix data race in the testsuite. There was a data race when simulating application-level threading: the `test_done` field of the operation structures was set by all threads on a shared object. The threading driver now copies all of the operation structures to be local to each thread. 
--- testsuite/src/test_libblis.c | 58 ++++++++++++++++++++++++++++++++++-- testsuite/src/test_libblis.h | 2 +- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index b837227fd1..2272ce779b 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -109,7 +109,7 @@ void* libblis_test_thread_entry( void* tdata_void ) thread_data_t* tdata = tdata_void; test_params_t* params = tdata->params; - test_ops_t* ops = tdata->ops; + test_ops_t* ops = &tdata->ops; // Walk through all test modules. libblis_test_all_ops( tdata, params, ops ); @@ -162,13 +162,67 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) for ( signed int id = nt - 1; 0 <= id; id-- ) { tdata[id].params = params; - tdata[id].ops = ops; + tdata[id].ops = *ops; tdata[id].nt = nt; tdata[id].id = id; tdata[id].xc = 0; //tdata[id].mutex = mutex; tdata[id].barrier = barrier; + tdata[id].ops.randv.ops = &tdata[id].ops; + tdata[id].ops.randm.ops = &tdata[id].ops; + tdata[id].ops.addv.ops = &tdata[id].ops; + tdata[id].ops.amaxv.ops = &tdata[id].ops; + tdata[id].ops.axpbyv.ops = &tdata[id].ops; + tdata[id].ops.axpyv.ops = &tdata[id].ops; + tdata[id].ops.copyv.ops = &tdata[id].ops; + tdata[id].ops.dotv.ops = &tdata[id].ops; + tdata[id].ops.dotxv.ops = &tdata[id].ops; + tdata[id].ops.normfv.ops = &tdata[id].ops; + tdata[id].ops.scalv.ops = &tdata[id].ops; + tdata[id].ops.scal2v.ops = &tdata[id].ops; + tdata[id].ops.setv.ops = &tdata[id].ops; + tdata[id].ops.subv.ops = &tdata[id].ops; + tdata[id].ops.xpbyv.ops = &tdata[id].ops; + tdata[id].ops.addm.ops = &tdata[id].ops; + tdata[id].ops.axpym.ops = &tdata[id].ops; + tdata[id].ops.copym.ops = &tdata[id].ops; + tdata[id].ops.normfm.ops = &tdata[id].ops; + tdata[id].ops.scalm.ops = &tdata[id].ops; + tdata[id].ops.scal2m.ops = &tdata[id].ops; + tdata[id].ops.setm.ops = &tdata[id].ops; + tdata[id].ops.subm.ops = &tdata[id].ops; + tdata[id].ops.xpbym.ops = 
&tdata[id].ops; + tdata[id].ops.axpy2v.ops = &tdata[id].ops; + tdata[id].ops.dotaxpyv.ops = &tdata[id].ops; + tdata[id].ops.axpyf.ops = &tdata[id].ops; + tdata[id].ops.dotxf.ops = &tdata[id].ops; + tdata[id].ops.dotxaxpyf.ops = &tdata[id].ops; + tdata[id].ops.gemv.ops = &tdata[id].ops; + tdata[id].ops.ger.ops = &tdata[id].ops; + tdata[id].ops.hemv.ops = &tdata[id].ops; + tdata[id].ops.her.ops = &tdata[id].ops; + tdata[id].ops.her2.ops = &tdata[id].ops; + tdata[id].ops.symv.ops = &tdata[id].ops; + tdata[id].ops.syr.ops = &tdata[id].ops; + tdata[id].ops.syr2.ops = &tdata[id].ops; + tdata[id].ops.trmv.ops = &tdata[id].ops; + tdata[id].ops.trsv.ops = &tdata[id].ops; + tdata[id].ops.gemm_ukr.ops = &tdata[id].ops; + tdata[id].ops.trsm_ukr.ops = &tdata[id].ops; + tdata[id].ops.gemmtrsm_ukr.ops = &tdata[id].ops; + tdata[id].ops.gemm.ops = &tdata[id].ops; + tdata[id].ops.gemmt.ops = &tdata[id].ops; + tdata[id].ops.hemm.ops = &tdata[id].ops; + tdata[id].ops.herk.ops = &tdata[id].ops; + tdata[id].ops.her2k.ops = &tdata[id].ops; + tdata[id].ops.symm.ops = &tdata[id].ops; + tdata[id].ops.syrk.ops = &tdata[id].ops; + tdata[id].ops.syr2k.ops = &tdata[id].ops; + tdata[id].ops.trmm.ops = &tdata[id].ops; + tdata[id].ops.trmm3.ops = &tdata[id].ops; + tdata[id].ops.trsm.ops = &tdata[id].ops; + // Spawn additional threads for ids greater than 1. if ( id != 0 ) bli_pthread_create( &pthread[id], NULL, libblis_test_thread_entry, &tdata[id] ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index cdb3c6dac4..89e558e8f1 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -297,7 +297,7 @@ typedef struct typedef struct thread_data { test_params_t* params; - test_ops_t* ops; + test_ops_t ops; unsigned int nt; unsigned int id; unsigned int xc; From edffd67ac5ff1a269f2fed70ea244e1467b29be3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 18:52:07 -0500 Subject: [PATCH 18/24] Fix problem in the gemmtrsm ukr test driver. 
The packing buffer for B was being released too early. The fix is a bit ugly because the buffer really does need to be released at that point for all but the last repeat, and only later for the last one. --- testsuite/src/test_gemmtrsm_ukr.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index b5287f6b50..ca4b93d072 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -315,6 +315,8 @@ bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif + cntl_t* cntl_b; + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { @@ -323,7 +325,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Transpose B to B^T for packing bli_obj_induce_trans( &b ); - cntl_t* cntl_b = libblis_test_pobj_create + cntl_b = libblis_test_pobj_create ( BLIS_NR, BLIS_MR, @@ -356,9 +358,14 @@ bli_printm( "ap", &ap, "%5.2f", "" ); time_min = bli_clock_min_diff( time_min, time ); - // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + // On the last pass, we must keep the packed B buffer checked out in order + // to perform the correctness check later. + if (i < n_repeats-1) + { + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + } } // Estimate the performance of the best experiment repeat. @@ -397,6 +404,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects.
bli_obj_free( &a_big ); From 69915e9e53954057c1f6378035165a03f688148b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 2 Oct 2021 18:54:19 -0500 Subject: [PATCH 19/24] VSCode keeps inserting headers. STOP IT! --- frame/base/bli_error.c | 1 - 1 file changed, 1 deletion(-) diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index f4d6acf806..809fc5dfc8 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -33,7 +33,6 @@ */ -#include "bli_lang_defs.h" #include "blis.h" // Internal array to hold error strings. From ca933143e5084800e038fe85aca3f73e091369a8 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 3 Oct 2021 10:49:42 -0500 Subject: [PATCH 20/24] Export `bli_pba_query`. --- frame/base/bli_pba.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index 6f2e4c0b8c..6431607ec9 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba ) // ----------------------------------------------------------------------------- -pba_t* bli_pba_query( void ); +BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( From 288b551c0284eff36479fda72b4dbf48f710e4cf Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 4 Oct 2021 11:33:21 -0500 Subject: [PATCH 21/24] Fix symbol visibility problems. 
--- frame/1m/packm/bli_packm_blk_var1.h | 18 ------------------ frame/1m/packm/bli_packm_cntl.c | 2 +- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index bb2c2f0f52..eeb18a2166 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -43,24 +43,6 @@ typedef struct packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; -BLIS_INLINE void bli_packm_blk_var1_init_params - ( - packm_blk_var1_params_t* params - ) -{ - #undef GENTFUNC2 - #define GENTFUNC2(ctypec,ctypep,chc,chp,name) \ - params->ukr_fn[ PASTEMAC(chc,type) ][ PASTEMAC(chp,type) ] = \ - ( packm_ker_vft )PASTEMAC2(chc,chp,name); - INSERT_GENTFUNC2_MIXDP0(packm_struc_cxk_md) - - #undef GENTFUNC - #define GENTFUNC(ctype,ch,name) \ - params->ukr_fn[ PASTEMAC(ch,type) ][ PASTEMAC(ch,type) ] = \ - ( packm_ker_vft )PASTEMAC(ch,name); - INSERT_GENTFUNC_BASIC0(packm_struc_cxk) -} - BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 4a61c92d03..e99ed9cf3d 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -35,7 +35,7 @@ #include "blis.h" -cntl_t* bli_packm_cntl_create_node +BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, From ec7d24e2bfbeda4f82b2b6b576502232f1a6bdc6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 5 Oct 2021 15:23:21 -0500 Subject: [PATCH 22/24] Revert "Fix data race in the testsuite." This reverts commit ef64cdf5346a7f9823ffafde9755babf10835c2a. 
--- testsuite/src/test_libblis.c | 58 ++---------------------------------- testsuite/src/test_libblis.h | 2 +- 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 2272ce779b..b837227fd1 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -109,7 +109,7 @@ void* libblis_test_thread_entry( void* tdata_void ) thread_data_t* tdata = tdata_void; test_params_t* params = tdata->params; - test_ops_t* ops = &tdata->ops; + test_ops_t* ops = tdata->ops; // Walk through all test modules. libblis_test_all_ops( tdata, params, ops ); @@ -162,67 +162,13 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) for ( signed int id = nt - 1; 0 <= id; id-- ) { tdata[id].params = params; - tdata[id].ops = *ops; + tdata[id].ops = ops; tdata[id].nt = nt; tdata[id].id = id; tdata[id].xc = 0; //tdata[id].mutex = mutex; tdata[id].barrier = barrier; - tdata[id].ops.randv.ops = &tdata[id].ops; - tdata[id].ops.randm.ops = &tdata[id].ops; - tdata[id].ops.addv.ops = &tdata[id].ops; - tdata[id].ops.amaxv.ops = &tdata[id].ops; - tdata[id].ops.axpbyv.ops = &tdata[id].ops; - tdata[id].ops.axpyv.ops = &tdata[id].ops; - tdata[id].ops.copyv.ops = &tdata[id].ops; - tdata[id].ops.dotv.ops = &tdata[id].ops; - tdata[id].ops.dotxv.ops = &tdata[id].ops; - tdata[id].ops.normfv.ops = &tdata[id].ops; - tdata[id].ops.scalv.ops = &tdata[id].ops; - tdata[id].ops.scal2v.ops = &tdata[id].ops; - tdata[id].ops.setv.ops = &tdata[id].ops; - tdata[id].ops.subv.ops = &tdata[id].ops; - tdata[id].ops.xpbyv.ops = &tdata[id].ops; - tdata[id].ops.addm.ops = &tdata[id].ops; - tdata[id].ops.axpym.ops = &tdata[id].ops; - tdata[id].ops.copym.ops = &tdata[id].ops; - tdata[id].ops.normfm.ops = &tdata[id].ops; - tdata[id].ops.scalm.ops = &tdata[id].ops; - tdata[id].ops.scal2m.ops = &tdata[id].ops; - tdata[id].ops.setm.ops = &tdata[id].ops; - tdata[id].ops.subm.ops = &tdata[id].ops; - tdata[id].ops.xpbym.ops = 
&tdata[id].ops; - tdata[id].ops.axpy2v.ops = &tdata[id].ops; - tdata[id].ops.dotaxpyv.ops = &tdata[id].ops; - tdata[id].ops.axpyf.ops = &tdata[id].ops; - tdata[id].ops.dotxf.ops = &tdata[id].ops; - tdata[id].ops.dotxaxpyf.ops = &tdata[id].ops; - tdata[id].ops.gemv.ops = &tdata[id].ops; - tdata[id].ops.ger.ops = &tdata[id].ops; - tdata[id].ops.hemv.ops = &tdata[id].ops; - tdata[id].ops.her.ops = &tdata[id].ops; - tdata[id].ops.her2.ops = &tdata[id].ops; - tdata[id].ops.symv.ops = &tdata[id].ops; - tdata[id].ops.syr.ops = &tdata[id].ops; - tdata[id].ops.syr2.ops = &tdata[id].ops; - tdata[id].ops.trmv.ops = &tdata[id].ops; - tdata[id].ops.trsv.ops = &tdata[id].ops; - tdata[id].ops.gemm_ukr.ops = &tdata[id].ops; - tdata[id].ops.trsm_ukr.ops = &tdata[id].ops; - tdata[id].ops.gemmtrsm_ukr.ops = &tdata[id].ops; - tdata[id].ops.gemm.ops = &tdata[id].ops; - tdata[id].ops.gemmt.ops = &tdata[id].ops; - tdata[id].ops.hemm.ops = &tdata[id].ops; - tdata[id].ops.herk.ops = &tdata[id].ops; - tdata[id].ops.her2k.ops = &tdata[id].ops; - tdata[id].ops.symm.ops = &tdata[id].ops; - tdata[id].ops.syrk.ops = &tdata[id].ops; - tdata[id].ops.syr2k.ops = &tdata[id].ops; - tdata[id].ops.trmm.ops = &tdata[id].ops; - tdata[id].ops.trmm3.ops = &tdata[id].ops; - tdata[id].ops.trsm.ops = &tdata[id].ops; - // Spawn additional threads for ids greater than 1. if ( id != 0 ) bli_pthread_create( &pthread[id], NULL, libblis_test_thread_entry, &tdata[id] ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 89e558e8f1..cdb3c6dac4 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -297,7 +297,7 @@ typedef struct typedef struct thread_data { test_params_t* params; - test_ops_t ops; + test_ops_t* ops; unsigned int nt; unsigned int id; unsigned int xc; From a099d871814c11f0dd6ff062a0338d5d7aadedfd Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 30 Nov 2021 15:13:22 -0600 Subject: [PATCH 23/24] Handle .root correctly in bli_obj_swap(). 
Details: - Fixed bli_obj_swap() so that it correctly handles the root fields after obj_t contents are swapped. - Renamed bli_obj_remove_offs() to bli_obj_reset_origin() and changed the function to reset the object's root field. - Comment and whitespace updates. --- frame/1m/packm/bli_packm_blk_var1.c | 98 ++++++++++++------------ frame/1m/packm/bli_packm_blk_var1.h | 6 +- frame/1m/packm/bli_packm_init.c | 20 ++--- frame/1m/packm/bli_packm_int.c | 7 +- frame/1m/packm/bli_packm_scalar.c | 6 +- frame/1m/packm/bli_packm_struc_cxk.c | 68 ++++++++-------- frame/1m/packm/bli_packm_struc_cxk_1er.c | 68 ++++++++-------- frame/1m/packm/bli_packm_struc_cxk_md.c | 4 +- frame/3/bli_l3_int.c | 2 +- frame/3/bli_l3_packab.c | 36 ++++----- frame/3/gemm/bli_gemm_front.c | 10 ++- frame/3/gemmt/bli_gemmt_front.c | 18 ++++- frame/3/hemm/bli_hemm_front.c | 10 ++- frame/3/symm/bli_symm_front.c | 10 ++- frame/3/trmm/bli_trmm_front.c | 10 ++- frame/3/trmm3/bli_trmm3_front.c | 10 ++- frame/3/trsm/bli_trsm_blk_var2.c | 2 +- frame/3/trsm/bli_trsm_cntl.c | 2 +- frame/3/trsm/bli_trsm_front.c | 10 ++- frame/base/bli_obj.c | 8 +- frame/base/bli_sba.c | 98 +++++++++++++----------- frame/include/bli_obj_macro_defs.h | 14 +++- testsuite/src/test_gemm_ukr.c | 18 ++--- testsuite/src/test_gemmtrsm_ukr.c | 80 +++++++++---------- testsuite/src/test_trsm_ukr.c | 46 +++++------ 25 files changed, 359 insertions(+), 302 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 0c6fad7e9a..edeeae2b98 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -111,8 +111,6 @@ void bli_packm_blk_var1 obj_t kappa_local; char* kappa_cast = bli_packm_scalar( &kappa_local, p ); - // If the packm structure-aware kernel func_t in the context is - // NULL (which is the default value after the context is created), // we use the default lookup table to determine the right func_t // for the current schema. 
func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; @@ -126,14 +124,16 @@ void bli_packm_blk_var1 packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; } - // Query the user-provided packing kernel from the obj_t. If provided, - // this overrides the kernel determined above. + // Query the address of the packm params field of the obj_t. The user might + // have set this field in order to specify a custom packm kernel. packm_blk_var1_params_t* params = bli_obj_pack_params( c ); if ( params && params->ukr_fn[ dt_c ][ dt_p ] ) - { - packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; - } + { + // Query the user-provided packing kernel from the obj_t. If provided, + // this overrides the kernel determined above. + packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; + } /* Compute the total number of iterations we'll need. */ dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); @@ -159,21 +159,21 @@ void bli_packm_blk_var1 ip_inc = 1; } - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ + // Query the number of threads and thread ids from the current thread's + // packm thrinfo_t node. const dim_t nt = bli_thread_n_way( thread ); const dim_t tid = bli_thread_work_id( thread ); - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. */ + // Determine the thread range and increment using the current thread's + // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + // will depend on whether slab or round-robin partitioning was requested + // at configure-time. dim_t it_start, it_end, it_inc; bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); char* p_begin = p_cast; - /* Iterate over every logical micropanel in the source matrix. 
*/ + // Iterate over every logical micropanel in the source matrix. for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; ic += ic_inc, ip += ip_inc, it += 1 ) { @@ -185,39 +185,39 @@ void bli_packm_blk_var1 inc_t p_inc = ps_p; - /* NOTE: We MUST use round-robin partitioning when packing - micropanels of a triangular matrix. Hermitian/symmetric - and general packing may use slab or round-robin, depending - on which was selected at configure-time. */ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ - bool my_iter = bli_is_triangular( strucc ) - ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) - : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); + // NOTE: We MUST use round-robin partitioning when packing + // micropanels of a triangular matrix. Hermitian/symmetric + // and general packing may use slab or round-robin, depending + // on which was selected at configure-time. + // The definition of bli_packm_my_iter() will depend on whether slab + // or round-robin partitioning was requested at configure-time. + bool my_iter = bli_is_triangular( strucc ) + ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) + : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); if ( bli_is_triangular( strucc ) && bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) { - /* This case executes if the panel belongs to a triangular - matrix AND is completely unstored (ie: zero). If the panel - is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ + // This case executes if the panel belongs to a triangular + // matrix AND is completely unstored (ie: zero). If the panel + // is unstored, we do nothing. (Notice that we don't even + // increment p_begin.) 
continue; } else if ( bli_is_triangular( strucc ) && bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) { - /* This case executes if the panel belongs to a triangular - matrix AND is diagonal-intersecting. Notice that we - cannot bury the following conditional logic into - packm_struc_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ - - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ + // This case executes if the panel belongs to a triangular + // matrix AND is diagonal-intersecting. Notice that we + // cannot bury the following conditional logic into + // packm_struc_cxk() because we need to know the value of + // panel_len_max_i so we can properly increment p_inc. + + // Sanity check. Diagonals should not intersect the short end of + // a micro-panel. If they do, then somehow the constraints on + // cache blocksizes being a whole multiple of the register + // blocksizes was somehow violated. if ( diagoffc_i < 0 ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); @@ -232,7 +232,7 @@ void bli_packm_blk_var1 panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, panel_len_max ); } - else /* if ( bli_is_upper( uploc ) ) */ + else // if ( bli_is_upper( uploc ) ) { panel_off_i = bli_abs( diagoffc_i ); panel_len_i = panel_len_full - panel_off_i; @@ -244,13 +244,13 @@ void bli_packm_blk_var1 char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; char* p_use = p_begin; - /* We need to re-compute the imaginary stride as a function of - panel_len_max_i since triangular packed matrices have panels - of varying lengths. NOTE: This imaginary stride value is - only referenced by the packm kernels for induced methods. 
*/ + // We need to re-compute the imaginary stride as a function of + // panel_len_max_i since triangular packed matrices have panels + // of varying lengths. NOTE: This imaginary stride value is + // only referenced by the packm kernels for induced methods. inc_t is_p_use = ldp * panel_len_max_i; - /* We nudge the imaginary stride up by one if it is odd. */ + // We nudge the imaginary stride up by one if it is odd. is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); if ( my_iter ) @@ -275,16 +275,16 @@ void bli_packm_blk_var1 params ); } - /* NOTE: This value is usually LESS than ps_p because triangular - matrices usually have several micro-panels that are shorter - than a "full" micro-panel. */ + // NOTE: This value is usually LESS than ps_p because triangular + // matrices usually have several micro-panels that are shorter + // than a "full" micro-panel. p_inc = is_p_use; } else { - /* This case executes if the panel is either dense, or belongs - to a Hermitian or symmetric matrix, which includes stored, - unstored, and diagonal-intersecting panels. */ + // This case executes if the panel is either dense, or belongs + // to a Hermitian or symmetric matrix, which includes stored, + // unstored, and diagonal-intersecting panels. if ( my_iter ) { diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index eeb18a2166..9cda5828b5 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -34,7 +34,7 @@ */ // -// Prototype object-based interfaces. +// packm params types. // typedef struct @@ -43,6 +43,10 @@ typedef struct packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; +// +// Prototype object-based interfaces. 
+// + BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index e46864eb72..5a7d716fe6 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -85,7 +85,7 @@ bool bli_packm_init // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); - bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); + bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); // Store the pack schema to the object. bli_obj_set_pack_schema( schema, p ); @@ -94,7 +94,7 @@ bool bli_packm_init // in BLIS is deemed to take care of all conjugation necessary. bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); - // If we are packing micropanels, mark P as dense. + // Since we are packing micropanels, mark P as dense. bli_obj_set_uplo( BLIS_DENSE, p ); // Reset the view offsets to (0,0). @@ -118,9 +118,9 @@ bool bli_packm_init bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. The caller - // will then use that value to acquire an appropriate block of memory - // from the memory allocator. + // total number of bytes needed for the packed buffer. Then we use + // that value to acquire an appropriate block of memory from the + // memory allocator. // Extract the element size for the packed object. siz_t elem_size_p = bli_obj_elem_size( p ); @@ -148,11 +148,10 @@ bool bli_packm_init // dimension of the matrix is not a whole multiple of MR. inc_t ps_p = cs_p * n_p_pad; - // As a general rule, we don't want micropanel strides to be odd. This - // is primarily motivated by our desire to support interleaved 3m - // micropanels, in which case we have to scale the panel stride - // by 3/2. That division by 2 means the numerator (prior to being - // scaled by 3) must be even. + // As a general rule, we don't want micropanel strides to be odd. 
There + // are very few instances where this can happen, but we've seen it happen + // more than zero times (such as for certain small problems), and so we + // check for it here. if ( bli_is_odd( ps_p ) ) ps_p += 1; // Set the imaginary stride (in units of fundamental elements). @@ -173,6 +172,7 @@ bool bli_packm_init // Compute the size of the packed buffer. siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; + // If the requested size is zero, then we don't need to do any allocation. if ( size_p == 0 ) return false; diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 1db6765a15..c9a2bb9db2 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -47,9 +47,10 @@ void bli_packm_int bli_init_once(); // Extract the function pointer from the object. - packm_var_oft f = bli_obj_pack_fn( a ); + packm_var_oft f = bli_obj_pack_fn( a ); - // FGVZ: Not sure why we need this barrier, but we do. + // Barrier so that we know threads are done with previous computation + // with the same packing buffer before starting to pack. bli_thread_barrier( thread ); // Invoke the variant with kappa_use. @@ -58,7 +59,7 @@ void bli_packm_int a, p, cntx, - rntm, + rntm, cntl, thread ); diff --git a/frame/1m/packm/bli_packm_scalar.c b/frame/1m/packm/bli_packm_scalar.c index a225427b1a..f613028c93 100644 --- a/frame/1m/packm/bli_packm_scalar.c +++ b/frame/1m/packm/bli_packm_scalar.c @@ -48,7 +48,7 @@ void* bli_packm_scalar( obj_t* kappa, obj_t* p ) // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) if ( bli_obj_scalar_has_nonzero_imag( p ) && - !bli_is_nat_packed( schema ) ) + !bli_is_nat_packed( schema ) ) { //printf( "applying non-zero imag kappa\n_p" ); @@ -58,7 +58,7 @@ void* bli_packm_scalar( obj_t* kappa, obj_t* p ) // Reset the attached scalar (to 1.0). 
bli_obj_scalar_reset( p ); - return bli_obj_buffer_for_1x1( dt_p, kappa ); + return bli_obj_buffer_for_1x1( dt_p, kappa ); } // This branch is also for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the @@ -70,7 +70,7 @@ void* bli_packm_scalar( obj_t* kappa, obj_t* p ) // If the internal scalar of A has only a real component, then // we will apply it later (in the micro-kernel), and so we will // use BLIS_ONE to indicate no scaling during packing. - return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); + return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); } } diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 9ca12a9da0..2a52c42def 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -84,23 +84,23 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk) \ ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ + strucc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -109,23 +109,23 @@ void PASTEMAC(ch,varname) \ matrices. 
*/ \ PASTEMAC(ch,packm_tri_cxk) \ ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ + strucc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx \ ); \ } \ } diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c index c2cbfa2190..b3be9dff95 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.c +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -85,24 +85,24 @@ void PASTEMAC(ch,varname) \ matrices. */ \ PASTEMAC(ch,packm_herm_cxk_1er) \ ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ + strucc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx, \ - params \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -111,24 +111,24 @@ void PASTEMAC(ch,varname) \ matrices. 
*/ \ PASTEMAC(ch,packm_tri_cxk_1er) \ ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ + strucc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + panel_dim, \ + panel_len, \ + panel_dim_max, \ + panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx, \ - params \ + params \ ); \ } \ } diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 8c64fedede..650b6178c9 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -291,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ @@ -418,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index e193907675..d4b974030c 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -93,7 +93,7 @@ void bli_l3_int bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - // Ensure that a valid packing function is set on A and B + // Ensure that a valid packing function is set on A and B. 
if ( !bli_obj_pack_fn( &a_local ) ) bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local ); diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index d2736c04ac..d911819429 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -47,12 +47,12 @@ void bli_l3_packa { obj_t a_local, a_pack; - bli_obj_alias_to( a, &a_local ); - if ( bli_obj_has_trans( a ) ) - { - bli_obj_induce_trans( &a_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); - } + bli_obj_alias_to( a, &a_local ); + if ( bli_obj_has_trans( a ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } // Pack matrix A according to the control tree node. bli_packm_int @@ -95,16 +95,16 @@ void bli_l3_packb { obj_t bt_local, bt_pack; - // We always pass B^T to bli_l3_packm. - bli_obj_alias_to( b, &bt_local ); - if ( bli_obj_has_trans( b ) ) - { - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); - } - else - { - bli_obj_induce_trans( &bt_local ); - } + // We always pass B^T to bli_l3_packm. + bli_obj_alias_to( b, &bt_local ); + if ( bli_obj_has_trans( b ) ) + { + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); + } + else + { + bli_obj_induce_trans( &bt_local ); + } // Pack matrix B according to the control tree node. bli_packm_int @@ -117,8 +117,8 @@ void bli_l3_packb thread ); - // Transpose packed object back to B. - bli_obj_induce_trans( &bt_pack ); + // Transpose packed object back to B. + bli_obj_induce_trans( &bt_pack ); // Proceed with execution using packed matrix B. 
bli_l3_int diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 38cc851e1f..a9ea21dc43 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,9 +87,13 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index ef067ed49e..a910c8d5cb 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -73,11 +73,14 @@ void bli_gemmt_front bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. 
+ bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel @@ -95,6 +98,13 @@ void bli_gemmt_front // Set the pack schemas within the objects, as appropriate. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); + // Set each alias as the root object. + // NOTE: We MUST wait until we are done potentially swapping the objects + // before setting the root fields! + bli_obj_set_as_root( &a_local ); + bli_obj_set_as_root( &b_local ); + bli_obj_set_as_root( &c_local ); + // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index a0631c3ebf..770742e1a3 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -65,9 +65,13 @@ void bli_hemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); #ifdef BLIS_DISABLE_HEMM_RIGHT // NOTE: This case casts right-side hemm in terms of left side. 
This is diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 06a042da41..ba60bffb21 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -65,9 +65,13 @@ void bli_symm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); #ifdef BLIS_DISABLE_SYMM_RIGHT // NOTE: This case casts right-side symm in terms of left side. This is diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 43ee0f5841..9251dcf799 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -64,9 +64,13 @@ void bli_trmm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. 
Specifically, if A is marked as diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 62f7d65af5..fc082e7d53 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -65,9 +65,13 @@ void bli_trmm3_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 2e1923dbf0..5691c964ad 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -60,7 +60,7 @@ void bli_trsm_blk_var2 bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, - &my_start, &my_end + &my_start, &my_end ); // Partition along the n dimension. 
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 422f8040c9..a8196ebb93 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -200,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + rntm_t* rntm, pack_t schema_a, pack_t schema_b ) diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index ec858ce950..d7b75cec67 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -71,9 +71,13 @@ void bli_trsm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); - bli_obj_remove_offs( &a_local ); - bli_obj_remove_offs( &b_local ); - bli_obj_remove_offs( &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 2824540896..23fbb4cd10 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -118,10 +118,10 @@ void bli_obj_create_without_buffer bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); - bli_obj_set_pack_fn( NULL, obj ); - bli_obj_set_pack_params( NULL, obj ); - bli_obj_set_ker_fn( NULL, obj ); - bli_obj_set_ker_params( NULL, obj ); + bli_obj_set_pack_fn( NULL, obj ); + bli_obj_set_pack_params( NULL, obj ); + bli_obj_set_ker_fn( NULL, obj ); + bli_obj_set_ker_params( NULL, obj ); // Set the internal scalar to 1.0. 
bli_obj_set_scalar_dt( dt, obj ); diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 3f0ba49764..5b6ff6a0f0 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -76,31 +76,39 @@ void* bli_sba_acquire // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - if ( pool == NULL ) - { + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. (There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) + { block = bli_malloc_intl( req_size, &r_val ); - } - else - { - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) - { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); - } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); - } + } + else + { + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. 
+ if ( block_size < req_size ) + { + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); + } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); + } } #else @@ -130,28 +138,28 @@ void bli_sba_release // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - if ( pool == NULL ) - { + if ( pool == NULL ) + { bli_free_intl( block ); - } - else - { - // Query the block_size field from the pool. This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); - } + } + else + { + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. 
+ bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); + } } #else diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index e229dab877..fe174202cf 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1353,12 +1353,16 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) ); } -// Adjust pointer based on offsets and then zero them +// Adjust the pointer based on current offsets, zero the offsets, and then +// set the current object as the root. For obj_t's with at least one non-zero +// offset, this effectively makes the obj_t "forget" that it was ever a view +// into a larger matrix. -BLIS_INLINE void bli_obj_remove_offs( obj_t* obj ) +BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); + bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). @@ -1486,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { + bool a_root_is_self = ( bli_obj_root( a ) == a ); + bool b_root_is_self = ( bli_obj_root( b ) == b ); + obj_t t = *b; *b = *a; *a = t; + + if ( a_root_is_self ) bli_obj_set_as_root( b ); + if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 74b1ece669..d37005b285 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -231,12 +231,12 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; + rntm_t rntm; bli_rntm_init( &rntm ); bli_pba_rntm_set_pba( &rntm ); - // Transpose B to B^T for packing - bli_obj_induce_trans( &b ); + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); // Create pack objects for a and b, and pack them to ap and bp, // respectively. @@ -249,7 +249,7 @@ void libblis_test_gemm_ukr_experiment BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, cntx, - &rntm + &rntm ); cntl_t* cntl_b = libblis_test_pobj_create ( @@ -260,12 +260,12 @@ void libblis_test_gemm_ukr_experiment BLIS_BUFFER_FOR_B_PANEL, &b, &bp, cntx, - &rntm + &rntm ); - // Transpose B^T back to B and Bp^T back to Bp - bli_obj_induce_trans( &b ); - bli_obj_induce_trans( &bp ); + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -292,7 +292,7 @@ void libblis_test_gemm_ukr_experiment libblis_test_check_empty_problem( &c, perf, resid ); // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. + // back to the pba. 
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 9f97359db9..48fcb78db7 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,7 +283,7 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - rntm_t rntm; + rntm_t rntm; bli_rntm_init( &rntm ); bli_pba_rntm_set_pba( &rntm ); @@ -298,7 +298,7 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, cntx, - &rntm + &rntm ); // Set the diagonal offset of ap. @@ -315,40 +315,40 @@ bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - cntl_t* cntl_b = NULL; + cntl_t* cntl_b = NULL; // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); - // Transpose B to B^T for packing - bli_obj_induce_trans( &b ); - - cntl_b = libblis_test_pobj_create - ( - BLIS_NR, - BLIS_MR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - cntx, - &rntm - ); - - // Transpose B^T back to B and Bp^T back to Bp - bli_obj_induce_trans( &b ); - bli_obj_induce_trans( &bp ); - - // Create subpartitions from the a and b panels. - bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, - &a1xp, &a11p, &bx1p, &b11p ); - - // Set the uplo field of a11p since the default for packed objects is - // BLIS_DENSE, and the _ukernel() wrapper needs this information to - // know which set of micro-kernels (lower or upper) to choose from. - bli_obj_set_uplo( uploa, &a11p ); + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + + cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. 
+ bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Create subpartitions from the a and b panels. + bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, + &a1xp, &a11p, &bx1p, &b11p ); + + // Set the uplo field of a11p since the default for packed objects is + // BLIS_DENSE, and the _ukernel() wrapper needs this information to + // know which set of micro-kernels (lower or upper) to choose from. + bli_obj_set_uplo( uploa, &a11p ); time = bli_clock(); @@ -358,14 +358,14 @@ bli_printm( "ap", &ap, "%5.2f", "" ); time_min = bli_clock_min_diff( time_min, time ); - // On the last pass, we must keep the packed B buffer checked out in order - // to perform the correctness check later. - if (i < n_repeats-1) - { - // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - } + // On the last pass, we must keep the packed B buffer checked out in order + // to perform the correctness check later. + if ( i < n_repeats - 1 ) + { + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + } } // Estimate the performance of the best experiment repeat. @@ -402,9 +402,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); //libblis_test_check_empty_problem( &c11, perf, resid ); // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. + // back to the pba. bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - if ( cntl_b ) + if ( cntl_b ) bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. 
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 3d7ab7cb46..b07da91cc8 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -232,7 +232,7 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; + rntm_t rntm; bli_rntm_init( &rntm ); bli_pba_rntm_set_pba( &rntm ); @@ -247,7 +247,7 @@ void libblis_test_trsm_ukr_experiment BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, cntx, - &rntm + &rntm ); // Set the diagonal offset of ap. @@ -268,24 +268,24 @@ bli_printm( "ap", &ap, "%5.2f", "" ); { bli_copym( &c_save, &c ); - // Transpose B to B^T for packing - bli_obj_induce_trans( &b ); - - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_NR, - BLIS_MR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - cntx, - &rntm - ); - - // Transpose B^T back to B and Bp^T back to Bp - bli_obj_induce_trans( &b ); - bli_obj_induce_trans( &bp ); + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); time = bli_clock(); @@ -295,9 +295,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); time_min = bli_clock_min_diff( time_min, time ); - // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); } // Estimate the performance of the best experiment repeat. From 9f5b215d9efee89db1e1ee693ec145662928719a Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Thu, 2 Dec 2021 15:29:36 -0600 Subject: [PATCH 24/24] Removed old calls to bli_obj_set_as_root(). Details: - Removed calls to bli_obj_set_as_root() within the level-3 _front() functions since the setting of the aliases' root fields is now handled by bli_obj_reset_origin(). (The calls to bli_obj_set_as_root() in gemm_front() have already been removed.) --- frame/3/gemmt/bli_gemmt_front.c | 7 ------- frame/3/hemm/bli_hemm_front.c | 7 ------- frame/3/symm/bli_symm_front.c | 7 ------- frame/3/trmm/bli_trmm_front.c | 7 ------- frame/3/trmm3/bli_trmm3_front.c | 7 ------- frame/3/trsm/bli_trsm_front.c | 7 ------- 6 files changed, 42 deletions(-) diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index a910c8d5cb..2a9d91759b 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -98,13 +98,6 @@ void bli_gemmt_front // Set the pack schemas within the objects, as appropriate. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 770742e1a3..9835de9c15 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -137,13 +137,6 @@ void bli_hemm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! 
- bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index ba60bffb21..be94c44c1b 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -136,13 +136,6 @@ void bli_symm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 9251dcf799..1de28958eb 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -155,13 +155,6 @@ void bli_trmm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. 
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index fc082e7d53..3b97539603 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -147,13 +147,6 @@ void bli_trmm3_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index d7b75cec67..7f3d17aeff 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -129,13 +129,6 @@ void bli_trsm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation.