From cda8a83b76d1908376d3f16b1f98dadac21a493a Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sat, 19 Feb 2022 13:32:31 +0000 Subject: [PATCH 1/8] Add level-3 BLAS triangular Sylvester equation solver Force compatibility with [ds]trsyl. Use two floating-point scaling factors (rather than integer scaling factors). This does not eliminate the problem that scalings can be flushed, making any result useless. That problem could be eliminated by replacing the floating-point scale factor with an integer scale factor. --- SRC/CMakeLists.txt | 10 +- SRC/Makefile | 10 +- SRC/dlarmm.f | 99 ++++ SRC/dtrsyl3.f | 1242 ++++++++++++++++++++++++++++++++++++++++++++ SRC/ilaenv.f | 8 + SRC/slarmm.f | 99 ++++ SRC/strsyl3.f | 1241 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 2699 insertions(+), 10 deletions(-) create mode 100644 SRC/dlarmm.f create mode 100644 SRC/dtrsyl3.f create mode 100644 SRC/slarmm.f create mode 100644 SRC/strsyl3.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 79e79f06eb..b5fa0168f5 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -105,8 +105,8 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - slarrv.f slartv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f @@ -141,7 +141,7 @@ set(SLASRC stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f stptrs.f strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strti2.f strtri.f strtrs.f stzrzf.f sstemr.f + strsyl3.f strti2.f strtri.f strtrs.f stzrzf.f sstemr.f slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f stfttr.f stpttf.f stpttr.f strttf.f strttp.f sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f @@ -306,7 +306,7 @@ set(DLASRC dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f + dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f @@ -342,7 +342,7 @@ set(DLASRC dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f dtptrs.f dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f + dtrsyl3.f dtrti2.f dtrtri.f dtrtrs.f dtzrzf.f dstemr.f dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f diff --git a/SRC/Makefile b/SRC/Makefile index b05c81fddd..3f8f6f4695 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,8 +137,8 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ - slarrv.o slartv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ @@ -174,7 +174,7 @@ SLASRC = \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stptrs.o \ strcon.o strevc.o strevc3.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ - strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ + strsyl3.o strti2.o strtri.o strtrs.o stzrzf.o sstemr.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ @@ -340,7 +340,7 @@ DLASRC = \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ - dlargv.o dlarrv.o dlartv.o \ + dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ @@ -376,7 +376,7 @@ DLASRC = \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtptrs.o \ dtrcon.o dtrevc.o dtrevc3.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ - dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ + dtrsyl3.o dtrti2.o dtrtri.o dtrtrs.o dtzrzf.o dstemr.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ diff --git a/SRC/dlarmm.f b/SRC/dlarmm.f new file mode 100644 index 0000000000..c360420092 --- /dev/null +++ b/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/SRC/dtrsyl3.f b/SRC/dtrsyl3.f new file mode 100644 index 0000000000..dd5f2f48f5 --- /dev/null +++ b/SRC/dtrsyl3.f @@ -0,0 +1,1242 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index af28503986..0e3fd38159 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -469,6 +469,14 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = -1 + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN diff --git a/SRC/slarmm.f b/SRC/slarmm.f new file mode 100644 index 0000000000..643dd67487 --- /dev/null +++ b/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/SRC/strsyl3.f b/SRC/strsyl3.f new file mode 100644 index 0000000000..8f9766837a --- /dev/null +++ b/SRC/strsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END From 833cd585b59cfca09da4abf75442be7632404ae6 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Tue, 2 Aug 2022 18:27:14 +0100 Subject: [PATCH 2/8] Test [sd]trsyl3 --- TESTING/EIG/CMakeLists.txt | 2 +- TESTING/EIG/Makefile | 2 +- TESTING/EIG/dchkec.f | 46 ++++-- TESTING/EIG/derrec.f | 41 +++++- TESTING/EIG/dsyl01.f | 288 +++++++++++++++++++++++++++++++++++++ TESTING/EIG/schkec.f | 46 ++++-- TESTING/EIG/serrec.f | 41 +++++- TESTING/EIG/ssyl01.f | 288 +++++++++++++++++++++++++++++++++++++ 8 files changed, 724 insertions(+), 30 deletions(-) create mode 100644 TESTING/EIG/dsyl01.f create mode 100644 TESTING/EIG/ssyl01.f diff --git a/TESTING/EIG/CMakeLists.txt b/TESTING/EIG/CMakeLists.txt index 5993233bdf..a4ba5dfd6e 100644 --- a/TESTING/EIG/CMakeLists.txt +++ b/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f diff --git a/TESTING/EIG/Makefile b/TESTING/EIG/Makefile index e8342cdabe..c3d3774849 100644 --- a/TESTING/EIG/Makefile +++ b/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ diff --git a/TESTING/EIG/dchkec.f b/TESTING/EIG/dchkec.f index fbdf924c8c..c4451a627a 100644 --- a/TESTING/EIG/dchkec.f +++ b/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTGEXC( 2 ), NTREXC( 3 ), - $ NTRSEN( 3 ), NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + $ NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -228,6 +244,12 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/TESTING/EIG/derrec.f b/TESTING/EIG/derrec.f index d5863ad426..f11f488878 100644 --- a/TESTING/EIG/derrec.f +++ b/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ SUBROUTINE DERREC( PATH, NUNIT ) $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE DERREC( PATH, NUNIT ) CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/TESTING/EIG/dsyl01.f b/TESTING/EIG/dsyl01.f new file mode 100644 index 0000000000..782d2cd42f --- /dev/null +++ b/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/TESTING/EIG/schkec.f b/TESTING/EIG/schkec.f index f742c5b36e..59abb24664 100644 --- a/TESTING/EIG/schkec.f +++ b/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTGEXC( 2 ), NTREXC( 3 ), - $ NTRSEN( 3 ), NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + $ NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -228,6 +244,12 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/TESTING/EIG/serrec.f b/TESTING/EIG/serrec.f index 249f0e6424..9a7ceb3627 100644 --- a/TESTING/EIG/serrec.f +++ b/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ SUBROUTINE SERREC( PATH, NUNIT ) $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE SERREC( PATH, NUNIT ) CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/TESTING/EIG/ssyl01.f b/TESTING/EIG/ssyl01.f new file mode 100644 index 0000000000..22d089dc81 --- /dev/null +++ b/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END From 46275f0d789914379432e82516ecd14002d2b3a1 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sun, 17 Jul 2022 17:49:06 +0100 Subject: [PATCH 3/8] LAPACKE interface of [sd]trsyl3 --- LAPACKE/include/lapack.h | 40 ++++++++++++++ LAPACKE/include/lapacke.h | 26 +++++++++ LAPACKE/src/CMakeLists.txt | 4 ++ LAPACKE/src/Makefile | 6 ++- LAPACKE/src/lapacke_cgesvdq.c | 1 - LAPACKE/src/lapacke_dgesvdq.c | 1 - LAPACKE/src/lapacke_dtrsyl3.c | 66 +++++++++++++++++++++++ LAPACKE/src/lapacke_dtrsyl3_work.c | 86 ++++++++++++++++++++++++++++++ LAPACKE/src/lapacke_sgesvdq.c | 1 - LAPACKE/src/lapacke_strsyl3.c | 66 +++++++++++++++++++++++ LAPACKE/src/lapacke_strsyl3_work.c | 86 ++++++++++++++++++++++++++++++ LAPACKE/src/lapacke_zgesvdq.c | 1 - 12 files changed, 379 insertions(+), 5 deletions(-) create mode 100644 LAPACKE/src/lapacke_dtrsyl3.c create mode 100644 LAPACKE/src/lapacke_dtrsyl3_work.c create mode 100644 LAPACKE/src/lapacke_strsyl3.c create mode 100644 LAPACKE/src/lapacke_strsyl3_work.c diff --git a/LAPACKE/include/lapack.h b/LAPACKE/include/lapack.h index 5b09bcfb45..2a56b78318 100644 --- a/LAPACKE/include/lapack.h +++ b/LAPACKE/include/lapack.h @@ -22002,6 +22002,46 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/LAPACKE/include/lapacke.h b/LAPACKE/include/lapacke.h index f6fbfcc33b..1e58b755fc 100644 --- a/LAPACKE/include/lapacke.h +++ b/LAPACKE/include/lapacke.h @@ -4477,6 +4477,17 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -10174,6 +10185,21 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, diff --git a/LAPACKE/src/CMakeLists.txt b/LAPACKE/src/CMakeLists.txt index 4171a3bd42..f9b50e31ad 100644 --- a/LAPACKE/src/CMakeLists.txt +++ b/LAPACKE/src/CMakeLists.txt @@ -1169,6 +1169,8 @@ lapacke_dtrsna.c lapacke_dtrsna_work.c lapacke_dtrsyl.c lapacke_dtrsyl_work.c +lapacke_dtrsyl3.c +lapacke_dtrsyl3_work.c lapacke_dtrtri.c lapacke_dtrtri_work.c lapacke_dtrtrs.c @@ -1740,6 +1742,8 @@ lapacke_strsna.c lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c +lapacke_strsyl3.c +lapacke_strsyl3_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c diff --git a/LAPACKE/src/Makefile b/LAPACKE/src/Makefile index 2e62d0324a..7bb229dfe5 100644 --- a/LAPACKE/src/Makefile +++ b/LAPACKE/src/Makefile @@ -39,7 +39,7 @@ include $(TOPSRCDIR)/make.inc .SUFFIXES: .c .o .c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< + $(CC) $(CFLAGS) -Wall -I../include -c -o $@ $< OBJ = \ lapacke_ilaver.o \ @@ -1216,6 +1216,8 @@ lapacke_dtrsna.o \ lapacke_dtrsna_work.o \ lapacke_dtrsyl.o \ lapacke_dtrsyl_work.o \ +lapacke_dtrsyl3.o \ +lapacke_dtrsyl3_work.o \ lapacke_dtrtri.o \ lapacke_dtrtri_work.o \ lapacke_dtrtrs.o \ @@ -1782,6 +1784,8 @@ lapacke_strsna.o \ lapacke_strsna_work.o \ lapacke_strsyl.o \ lapacke_strsyl_work.o \ +lapacke_strsyl3.o \ +lapacke_strsyl3_work.o \ lapacke_strtri.o \ lapacke_strtri_work.o \ lapacke_strtrs.o \ diff --git a/LAPACKE/src/lapacke_cgesvdq.c b/LAPACKE/src/lapacke_cgesvdq.c index 8406635e99..05ff8d57f5 100644 --- a/LAPACKE/src/lapacke_cgesvdq.c +++ b/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/LAPACKE/src/lapacke_dgesvdq.c b/LAPACKE/src/lapacke_dgesvdq.c index 4e1b876810..4a0d427b33 100644 --- a/LAPACKE/src/lapacke_dgesvdq.c +++ b/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/LAPACKE/src/lapacke_dtrsyl3.c b/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 0000000000..523235c93a --- /dev/null +++ b/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,66 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, iwork, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if (iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_dtrsyl3_work.c b/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 0000000000..272c35b384 --- /dev/null +++ b/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_sgesvdq.c b/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec6..627d2406cb 100644 --- a/LAPACKE/src/lapacke_sgesvdq.c +++ b/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/LAPACKE/src/lapacke_strsyl3.c b/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 0000000000..6db54f21f5 --- /dev/null +++ b/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,66 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, iwork, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if (iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_strsyl3_work.c b/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 0000000000..3c50e4a451 --- /dev/null +++ b/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_zgesvdq.c b/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47e..1d318e5713 100644 --- a/LAPACKE/src/lapacke_zgesvdq.c +++ b/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; From 079c185c6a08d8c592f50afef9b607cb4cc4d9de Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Fri, 22 Jul 2022 07:16:56 +0100 Subject: [PATCH 4/8] Add ztrsyl3 and corresponding tests The tests of ztrsyl via zget35 are on tiny matrices that fall into the unblocked section of ztrsyl3. Add a new test file that checks ztrsyl(3) for larger matrices and their compatibility: Every problem that is solvable by ztrsyl must be solvable by ztrsyl3. --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/ilaenv.f | 3 +- SRC/ztrsyl3.f | 1142 ++++++++++++++++++++++++++++++++++++ TESTING/EIG/CMakeLists.txt | 4 +- TESTING/EIG/Makefile | 4 +- TESTING/EIG/zchkec.f | 42 +- TESTING/EIG/zerrec.f | 41 +- TESTING/EIG/zsyl01.f | 294 ++++++++++ 9 files changed, 1514 insertions(+), 20 deletions(-) create mode 100644 SRC/ztrsyl3.f create mode 100644 TESTING/EIG/zsyl01.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index b5fa0168f5..16f7ec22d5 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -442,7 +442,7 @@ set(ZLASRC ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f ztprfs.f ztptri.f ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f - ztrsyl.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f + ztrsyl.f ztrsyl3.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f zungrq.f zungtr.f zungtsqr.f zungtsqr_row.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f diff --git a/SRC/Makefile b/SRC/Makefile index 3f8f6f4695..a2a5f284b8 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -479,7 +479,7 @@ ZLASRC = \ ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ ztprfs.o ztptri.o \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ - ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ + ztrsyl.o ztrsyl3.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index 0e3fd38159..395147d95e 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -475,7 +475,8 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), $ 240 ) ELSE - NB = -1 + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) END IF END IF ELSE IF( C2.EQ.'LA' ) THEN diff --git a/SRC/ztrsyl3.f b/SRC/ztrsyl3.f new file mode 100644 index 0000000000..c344e5303a --- /dev/null +++ b/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, DBLE, DIMAG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END diff --git a/TESTING/EIG/CMakeLists.txt b/TESTING/EIG/CMakeLists.txt index a4ba5dfd6e..f8dcfeb9f8 100644 --- a/TESTING/EIG/CMakeLists.txt +++ b/TESTING/EIG/CMakeLists.txt @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,7 +93,7 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) diff --git a/TESTING/EIG/Makefile b/TESTING/EIG/Makefile index c3d3774849..948c34921f 100644 --- a/TESTING/EIG/Makefile +++ b/TESTING/EIG/Makefile @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - zsgt01.o zslect.o \ + zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all diff --git a/TESTING/EIG/zchkec.f b/TESTING/EIG/zchkec.f index 1e1c29e0d0..62a76d3574 100644 --- a/TESTING/EIG/zchkec.f +++ b/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/TESTING/EIG/zerrec.f b/TESTING/EIG/zerrec.f index dc6129da91..e1938f57d1 100644 --- a/TESTING/EIG/zerrec.f +++ b/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ SUBROUTINE ZERREC( PATH, NUNIT ) * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. @@ -141,6 +141,43 @@ SUBROUTINE ZERREC( PATH, NUNIT ) CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/TESTING/EIG/zsyl01.f b/TESTING/EIG/zsyl01.f new file mode 100644 index 0000000000..03a32f8fc8 --- /dev/null +++ b/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. + COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ WA( MAXM ), WB( MAXN ), X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of ZSYL01 +* + END From 05f9e54f0fbc5431a8720080136167c6adcc5507 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sat, 30 Jul 2022 09:25:55 +0100 Subject: [PATCH 5/8] Add ctrsyl3 and corresponding tests --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/ctrsyl3.f | 1142 ++++++++++++++++++++++++++++++++++++ TESTING/EIG/CMakeLists.txt | 2 +- TESTING/EIG/Makefile | 2 +- TESTING/EIG/cchkec.f | 42 +- TESTING/EIG/cerrec.f | 43 +- TESTING/EIG/csyl01.f | 294 ++++++++++ 8 files changed, 1511 insertions(+), 18 deletions(-) create mode 100644 SRC/ctrsyl3.f create mode 100644 TESTING/EIG/csyl01.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 16f7ec22d5..525545924f 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -242,7 +242,7 @@ set(CLASRC ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f ctprfs.f ctptri.f ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f - ctrsyl.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f + ctrsyl.f ctrsyl3.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f cungrq.f cungtr.f cungtsqr.f cungtsqr_row.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f diff --git a/SRC/Makefile b/SRC/Makefile index a2a5f284b8..ccc991c9ae 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -275,7 +275,7 @@ CLASRC = \ ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ ctprfs.o ctptri.o \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ - ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ + ctrsyl.o ctrsyl3.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ diff --git a/SRC/ctrsyl3.f b/SRC/ctrsyl3.f new file mode 100644 index 0000000000..b7a7355885 --- /dev/null +++ b/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLARMM, CLANGE + EXTERNAL SLAMCH, SLARMM, ILAENV, LSAME, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, CSSCAL, CGEMM, CLASCL, CTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, REAL, AIMAG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems +* + IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/TESTING/EIG/CMakeLists.txt b/TESTING/EIG/CMakeLists.txt index f8dcfeb9f8..3c8d9a8b28 100644 --- a/TESTING/EIG/CMakeLists.txt +++ b/TESTING/EIG/CMakeLists.txt @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f diff --git a/TESTING/EIG/Makefile b/TESTING/EIG/Makefile index 948c34921f..e403586638 100644 --- a/TESTING/EIG/Makefile +++ b/TESTING/EIG/Makefile @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ diff --git a/TESTING/EIG/cchkec.f b/TESTING/EIG/cchkec.f index 6727a0954b..c892b0a54a 100644 --- a/TESTING/EIG/cchkec.f +++ b/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/TESTING/EIG/cerrec.f b/TESTING/EIG/cerrec.f index 650ab2b6e6..6e2e1d38a3 100644 --- a/TESTING/EIG/cerrec.f +++ b/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ SUBROUTINE CERREC( PATH, NUNIT ) * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE CERREC( PATH, NUNIT ) CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/TESTING/EIG/csyl01.f b/TESTING/EIG/csyl01.f new file mode 100644 index 0000000000..a3395428c0 --- /dev/null +++ b/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. + COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ WA( MAXM ), WB( MAXN ), X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END From 8f441098cc8e3e250cb5f0833213edb8105a1e13 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Tue, 26 Jul 2022 20:42:23 +0100 Subject: [PATCH 6/8] LAPACKE interface of [cz]trsyl3 --- LAPACKE/include/lapack.h | 38 +++++++++++++ LAPACKE/include/lapacke.h | 13 +++++ LAPACKE/src/CMakeLists.txt | 4 ++ LAPACKE/src/Makefile | 4 ++ LAPACKE/src/lapacke_ctrsyl3.c | 56 +++++++++++++++++++ LAPACKE/src/lapacke_ctrsyl3_work.c | 88 ++++++++++++++++++++++++++++++ LAPACKE/src/lapacke_ztrsyl3.c | 56 +++++++++++++++++++ LAPACKE/src/lapacke_ztrsyl3_work.c | 88 ++++++++++++++++++++++++++++++ 8 files changed, 347 insertions(+) create mode 100644 LAPACKE/src/lapacke_ctrsyl3.c create mode 100644 LAPACKE/src/lapacke_ctrsyl3_work.c create mode 100644 LAPACKE/src/lapacke_ztrsyl3.c create mode 100644 LAPACKE/src/lapacke_ztrsyl3_work.c diff --git a/LAPACKE/include/lapack.h b/LAPACKE/include/lapack.h index 2a56b78318..b5a276f5aa 100644 --- a/LAPACKE/include/lapack.h +++ b/LAPACKE/include/lapack.h @@ -22002,6 +22002,25 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) void LAPACK_dtrsyl3_base( char const* trana, char const* tranb, @@ -22042,6 +22061,25 @@ void LAPACK_strsyl3_base( #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__) #endif +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/LAPACKE/include/lapacke.h b/LAPACKE/include/lapacke.h index 1e58b755fc..bf50eaf9e7 100644 --- a/LAPACKE/include/lapacke.h +++ b/LAPACKE/include/lapacke.h @@ -4487,6 +4487,12 @@ lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ); lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); @@ -10199,6 +10205,13 @@ lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, double* c, lapack_int ldc, double* scale, lapack_int* iwork, lapack_int liwork, double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); diff --git a/LAPACKE/src/CMakeLists.txt b/LAPACKE/src/CMakeLists.txt index f9b50e31ad..d229ffb6e6 100644 --- a/LAPACKE/src/CMakeLists.txt +++ b/LAPACKE/src/CMakeLists.txt @@ -557,6 +557,8 @@ lapacke_ctrsna.c lapacke_ctrsna_work.c lapacke_ctrsyl.c lapacke_ctrsyl_work.c +lapacke_ctrsyl3.c +lapacke_ctrsyl3_work.c lapacke_ctrtri.c lapacke_ctrtri_work.c lapacke_ctrtrs.c @@ -2318,6 +2320,8 @@ lapacke_ztrsna.c lapacke_ztrsna_work.c lapacke_ztrsyl.c lapacke_ztrsyl_work.c +lapacke_ztrsyl3.c +lapacke_ztrsyl3_work.c lapacke_ztrtri.c lapacke_ztrtri_work.c lapacke_ztrtrs.c diff --git a/LAPACKE/src/Makefile b/LAPACKE/src/Makefile index 7bb229dfe5..fdd62eab21 100644 --- a/LAPACKE/src/Makefile +++ b/LAPACKE/src/Makefile @@ -604,6 +604,8 @@ lapacke_ctrsna.o \ lapacke_ctrsna_work.o \ lapacke_ctrsyl.o \ lapacke_ctrsyl_work.o \ +lapacke_ctrsyl3.o \ +lapacke_ctrsyl3_work.o \ lapacke_ctrtri.o \ lapacke_ctrtri_work.o \ lapacke_ctrtrs.o \ @@ -2360,6 +2362,8 @@ lapacke_ztrsna.o \ lapacke_ztrsna_work.o \ lapacke_ztrsyl.o \ lapacke_ztrsyl_work.o \ +lapacke_ztrsyl3.o \ +lapacke_ztrsyl3_work.o \ lapacke_ztrtri.o \ lapacke_ztrtri_work.o \ lapacke_ztrtrs.o \ diff --git a/LAPACKE/src/lapacke_ctrsyl3.c b/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 0000000000..c931aac488 --- /dev/null +++ b/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_ctrsyl3_work.c b/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 0000000000..09c08d92aa --- /dev/null +++ b/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_ztrsyl3.c b/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 0000000000..dbc9bcf9f7 --- /dev/null +++ b/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/LAPACKE/src/lapacke_ztrsyl3_work.c b/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 0000000000..a7ebd5da60 --- /dev/null +++ b/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} From 1c1263ec24115af2026787d84dcf21c585a1c86b Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sun, 21 Aug 2022 16:20:50 +0200 Subject: [PATCH 7/8] Use unblocked code if insufficient workspace is provided --- SRC/ctrsyl3.f | 16 ++++++++-------- SRC/dtrsyl3.f | 13 ++++++------- SRC/strsyl3.f | 7 +++++-- SRC/ztrsyl3.f | 10 +++++----- TESTING/EIG/csyl01.f | 2 +- TESTING/EIG/zsyl01.f | 2 +- 6 files changed, 26 insertions(+), 24 deletions(-) diff --git a/SRC/ctrsyl3.f b/SRC/ctrsyl3.f index b7a7355885..586dc0207f 100644 --- a/SRC/ctrsyl3.f +++ b/SRC/ctrsyl3.f @@ -184,14 +184,14 @@ SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLARMM, CLANGE - EXTERNAL SLAMCH, SLARMM, ILAENV, LSAME, CLANGE + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM * .. * .. External Subroutines .. - EXTERNAL XERBLA, CSSCAL, CGEMM, CLASCL, CTRSYL + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, EXPONENT, REAL, AIMAG, MAX, MIN + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL * .. * .. Executable Statements .. * @@ -237,8 +237,6 @@ SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN - INFO = -16 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CTRSYL3', -INFO ) @@ -249,12 +247,14 @@ SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, * * Quick return if possible * + SCALE = ONE IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * -* Use unblocked code for small problems +* Use unblocked code for small problems or if insufficient +* workspace is provided * - IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, $ C, LDC, SCALE, INFO ) RETURN diff --git a/SRC/dtrsyl3.f b/SRC/dtrsyl3.f index dd5f2f48f5..c44ec38087 100644 --- a/SRC/dtrsyl3.f +++ b/SRC/dtrsyl3.f @@ -215,7 +215,7 @@ SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, EXPONENT, DBLE, MAX, MIN + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN * .. * .. Executable Statements .. * @@ -264,10 +264,6 @@ SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN - INFO = -14 - ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN - INFO = -16 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTRSYL3', -INFO ) @@ -278,12 +274,15 @@ SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, * * Quick return if possible * + SCALE = ONE IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * -* Use unblocked code for small problems +* Use unblocked code for small problems or if insufficient +* workspaces are provided * - IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, $ C, LDC, SCALE, INFO ) RETURN diff --git a/SRC/strsyl3.f b/SRC/strsyl3.f index 8f9766837a..28762c2ed1 100644 --- a/SRC/strsyl3.f +++ b/SRC/strsyl3.f @@ -278,12 +278,15 @@ SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, * * Quick return if possible * + SCALE = ONE IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * -* Use unblocked code for small problems +* Use unblocked code for small problems or if insufficient +* workspaces are provided * - IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, $ C, LDC, SCALE, INFO ) RETURN diff --git a/SRC/ztrsyl3.f b/SRC/ztrsyl3.f index c344e5303a..b5a058da4e 100644 --- a/SRC/ztrsyl3.f +++ b/SRC/ztrsyl3.f @@ -192,7 +192,7 @@ SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL * .. * .. Intrinsic Functions .. - INTRINSIC ABS, EXPONENT, DBLE, DIMAG, MAX, MIN + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN * .. * .. Executable Statements .. * @@ -238,8 +238,6 @@ SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN - INFO = -16 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZTRSYL3', -INFO ) @@ -250,12 +248,14 @@ SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, * * Quick return if possible * + SCALE = ONE IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * -* Use unblocked code for small problems +* Use unblocked code for small problems or if insufficient +* workspace is provided * - IF( NBA.EQ.1 .OR. NBB.EQ.1 ) THEN + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, $ C, LDC, SCALE, INFO ) RETURN diff --git a/TESTING/EIG/csyl01.f b/TESTING/EIG/csyl01.f index a3395428c0..e21f1a7a03 100644 --- a/TESTING/EIG/csyl01.f +++ b/TESTING/EIG/csyl01.f @@ -122,7 +122,7 @@ SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) * .. Local Arrays .. COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ WA( MAXM ), WB( MAXN ), X( MAXM, MAXN ), + $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), $ D( MIN( MAXM, MAXN ) ) REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) diff --git a/TESTING/EIG/zsyl01.f b/TESTING/EIG/zsyl01.f index 03a32f8fc8..1e8619a34c 100644 --- a/TESTING/EIG/zsyl01.f +++ b/TESTING/EIG/zsyl01.f @@ -122,7 +122,7 @@ SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) * .. Local Arrays .. COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ WA( MAXM ), WB( MAXN ), X( MAXM, MAXN ), + $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), $ D( MIN( MAXM, MAXN ) ) DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) From 970a7721d87e316d461c5f5c941b7db73ad63b0e Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sun, 4 Sep 2022 13:23:11 +0200 Subject: [PATCH 8/8] Add BLAS-3 robust triangular solver T*X = B*diag(s1, ..., sN) An extensions of LATRS solving T*x = s*x to many right-hand sides, allowing the usage of BLAS-3. The new algorithm tends to use less aggressive scaling, in particular for larger matrices. --- SRC/CMakeLists.txt | 8 +- SRC/Makefile | 8 +- SRC/clatrs3.f | 666 ++++++++++++++++++++++++++++++++++++++++++ SRC/dlatrs3.f | 656 ++++++++++++++++++++++++++++++++++++++++++ SRC/ilaenv.f | 6 + SRC/slatrs3.f | 656 ++++++++++++++++++++++++++++++++++++++++++ SRC/zlatrs3.f | 667 +++++++++++++++++++++++++++++++++++++++++++ TESTING/LIN/cchktr.f | 52 +++- TESTING/LIN/cerrtr.f | 47 ++- TESTING/LIN/dchktr.f | 56 +++- TESTING/LIN/derrtr.f | 47 ++- TESTING/LIN/schktr.f | 57 +++- TESTING/LIN/serrtr.f | 47 ++- TESTING/LIN/zchktr.f | 58 +++- TESTING/LIN/zerrtr.f | 47 ++- 15 files changed, 3015 insertions(+), 63 deletions(-) create mode 100644 SRC/clatrs3.f create mode 100644 SRC/dlatrs3.f create mode 100644 SRC/slatrs3.f create mode 100644 SRC/zlatrs3.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 525545924f..d324d94116 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -109,7 +109,7 @@ set(SLASRC slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f - slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrs3.f slatrz.f slauu2.f slauum.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f sorgrq.f sorgtr.f sorgtsqr.f sorgtsqr_row.f sorm2l.f sorm2r.f sorm22.f @@ -220,7 +220,7 @@ set(CLASRC clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f - clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrs3.f clatrz.f clauu2.f clauum.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f cposv.f cposvx.f cpotf2.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f @@ -309,7 +309,7 @@ set(DLASRC dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrs3.f dlatrz.f dlauu2.f dlauum.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f dorgrq.f dorgtr.f dorgtsqr.f dorgtsqr_row.f dorm2l.f dorm2r.f dorm22.f @@ -420,7 +420,7 @@ set(ZLASRC zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f zlassq.f90 zlaswp.f zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f - zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlauu2.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrs3.f zlatrz.f zlauu2.f zlauum.f zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f zposv.f zposvx.f zpotf2.f zpotrf.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f diff --git a/SRC/Makefile b/SRC/Makefile index ccc991c9ae..765abf42ac 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -141,7 +141,7 @@ SLASRC = \ slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ - slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ + slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrs3.o slatrz.o \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \ @@ -253,7 +253,7 @@ CLASRC = \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ - clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ + clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrs3.o clatrz.o \ clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ @@ -343,7 +343,7 @@ DLASRC = \ dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ - dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ + dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrs3.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ @@ -457,7 +457,7 @@ ZLASRC = \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ zlassq.o zlaswp.o zlasyf.o zlasyf_rook.o zlasyf_rk.o zlasyf_aa.o \ - zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlauu2.o \ + zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrs3.o zlatrz.o zlauu2.o \ zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ diff --git a/SRC/clatrs3.f b/SRC/clatrs3.f new file mode 100644 index 0000000000..a902f1ed01 --- /dev/null +++ b/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/SRC/dlatrs3.f b/SRC/dlatrs3.f new file mode 100644 index 0000000000..b4a98bc78e --- /dev/null +++ b/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index 395147d95e..a639e0375a 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -486,6 +486,12 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/SRC/slatrs3.f b/SRC/slatrs3.f new file mode 100644 index 0000000000..c3a08e524c --- /dev/null +++ b/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/SRC/zlatrs3.f b/SRC/zlatrs3.f new file mode 100644 index 0000000000..fc1be0517a --- /dev/null +++ b/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/TESTING/LIN/cchktr.f b/TESTING/LIN/cchktr.f index ce1ecf7615..c9af11533e 100644 --- a/TESTING/LIN/cchktr.f +++ b/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -535,6 +536,32 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/TESTING/LIN/cerrtr.f b/TESTING/LIN/cerrtr.f index db65edd881..9ba784f62a 100644 --- a/TESTING/LIN/cerrtr.f +++ b/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ SUBROUTINE CERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ SUBROUTINE CERRTR( PATH, NUNIT ) CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/TESTING/LIN/dchktr.f b/TESTING/LIN/dchktr.f index a4a1150c09..57e87326b0 100644 --- a/TESTING/LIN/dchktr.f +++ b/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/TESTING/LIN/derrtr.f b/TESTING/LIN/derrtr.f index a667f0d2b8..d0580497da 100644 --- a/TESTING/LIN/derrtr.f +++ b/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ SUBROUTINE DERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ SUBROUTINE DERRTR( PATH, NUNIT ) INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/TESTING/LIN/schktr.f b/TESTING/LIN/schktr.f index 66fa0bee7f..5aeb1ce88c 100644 --- a/TESTING/LIN/schktr.f +++ b/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -198,13 +198,13 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/TESTING/LIN/serrtr.f b/TESTING/LIN/serrtr.f index f0d0a0ef21..af1ce0a8e3 100644 --- a/TESTING/LIN/serrtr.f +++ b/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ SUBROUTINE SERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ SUBROUTINE SERRTR( PATH, NUNIT ) INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/TESTING/LIN/zchktr.f b/TESTING/LIN/zchktr.f index 0a6f47b1ea..b09d1f1c5a 100644 --- a/TESTING/LIN/zchktr.f +++ b/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -535,6 +536,32 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/TESTING/LIN/zerrtr.f b/TESTING/LIN/zerrtr.f index 098040ace3..211b921540 100644 --- a/TESTING/LIN/zerrtr.f +++ b/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ SUBROUTINE ZERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ SUBROUTINE ZERRTR( PATH, NUNIT ) CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN