diff options
author | jason <jason@8a072113-8704-0410-8d35-dd094bca7971> | 2008-10-28 01:38:50 +0000 |
---|---|---|
committer | jason <jason@8a072113-8704-0410-8d35-dd094bca7971> | 2008-10-28 01:38:50 +0000 |
commit | baba851215b44ac3b60b9248eb02bcce7eb76247 (patch) | |
tree | 8c0f5c006875532a30d4409f5e94b0f310ff00a7 /SRC/VARIANTS |
Move LAPACK trunk into position.
Diffstat (limited to 'SRC/VARIANTS')
27 files changed, 5330 insertions, 0 deletions
diff --git a/SRC/VARIANTS/Makefile b/SRC/VARIANTS/Makefile new file mode 100644 index 00000000..42446eb5 --- /dev/null +++ b/SRC/VARIANTS/Makefile @@ -0,0 +1,67 @@ +include ../../make.inc + +####################################################################### +# This is the makefile to create a the variants libraries for LAPACK. +# The files are organized as follows: +# CHOLRL -- Right looking block version of the algorithm, calling Level 3 BLAS +# CHOLTOP -- Top looking block version of the algorithm, calling Level 3 BLAS +# LUCR -- Crout Level 3 BLAS version of LU factorization +# LULL -- left-looking Level 3 BLAS version of LU factorization +# QRLL -- left-looking Level 3 BLAS version of QR factorization +# LUREC -- an iterative version of Sivan Toledo's recursive LU algorithm[1]. +# For square matrices, this iterative versions should +# be within a factor of two of the optimum number of memory transfers. +# +# [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with +# Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), +# 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 +####################################################################### + +VARIANTSDIR=LIB + +CHOLRL = cholesky/RL/cpotrf.o cholesky/RL/dpotrf.o cholesky/RL/spotrf.o cholesky/RL/zpotrf.o + +CHOLTOP = cholesky/TOP/cpotrf.o cholesky/TOP/dpotrf.o cholesky/TOP/spotrf.o cholesky/TOP/zpotrf.o + +LUCR = lu/CR/cgetrf.o lu/CR/dgetrf.o lu/CR/sgetrf.o lu/CR/zgetrf.o + +LULL = lu/LL/cgetrf.o lu/LL/dgetrf.o lu/LL/sgetrf.o lu/LL/zgetrf.o + +LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o + +QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o + + +all: cholrl choltop lucr lull lurec qrll + +cholrl: $(CHOLRL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/cholrl.a $(CHOLRL) + $(RANLIB) $(VARIANTSDIR)/cholrl.a + +choltop: $(CHOLTOP) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/choltop.a $(CHOLTOP) + $(RANLIB) $(VARIANTSDIR)/choltop.a + +lucr: $(LUCR) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lucr.a $(LUCR) + $(RANLIB) $(VARIANTSDIR)/lucr.a + +lull: $(LULL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lull.a $(LULL) + $(RANLIB) $(VARIANTSDIR)/lull.a + +lurec: $(LUREC) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lurec.a $(LUREC) + $(RANLIB) $(VARIANTSDIR)/lurec.a + +qrll: $(QRLL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/qrll.a $(QRLL) + $(RANLIB) $(VARIANTSDIR)/qrll.a + + +.f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ + +clean: + rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL) \ + $(VARIANTSDIR)/*.a
\ No newline at end of file diff --git a/SRC/VARIANTS/README b/SRC/VARIANTS/README new file mode 100644 index 00000000..6b4f3258 --- /dev/null +++ b/SRC/VARIANTS/README @@ -0,0 +1,84 @@ + =============== + = README File = + =============== + +This README File is for the LAPACK driver variants. +It is composed of 5 sections: + - Description: contents a quick description of each of the variants. For a more detailed description please refer to LAWN XXX. + - Build + - Testing + - Linking your program + - Support + +Author: Julie LANGOU, May 2008 + +=============== += DESCRIPTION = +=============== + +This directory contains several variants of LAPACK routines in single/double/complex/double complex precision: + - [sdcz]getrf with LU Crout Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/lu/CR + - [sdcz]getrf with LU Left Looking Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/lu/LL + - [sdcz]getrf with Sivan Toledo's recursive LU algorithm [1] - Directory: SRC/VARIANTS/lu/REC + - [sdcz]geqrf with QR Left Looking Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/qr/LL + - [sdcz]potrf with Cholesky Right Looking Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/cholesky/RL + - [sdcz]potrf with Cholesky Top Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/cholesky/TOP + +References:For a more detailed description please refer to + - [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), + 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 + - [2]LAWN XXX + +========= += BUILD = +========= + +These variants are compiled by default in the build process but they are not tested by default. +The build process creates one new library per variants in the four arithmetics (singel/double/comple/double complex). +The libraries are in the SRC/VARIANTS/LIB directory. + +Corresponding libraries created in SRC/VARIANTS/LIB: + - LU Crout : lucr.a + - LU Left Looking : lull.a + - LU Sivan Toledo's recursive : lurec.a + - QR Left Looking : qrll.a + - Cholesky Right Looking : cholrl.a + - Cholesky Top : choltop.a + + +=========== += TESTING = +=========== + +To test these variants you can type 'make variants-testing' +This will rerun the linear methods testings once per variants and append the short name of the variants to the output files. +You should then see the following files in the TESTING directory: +[scdz]test_cholrl.out +[scdz]test_choltop.out +[scdz]test_lucr.out +[scdz]test_lull.out +[scdz]test_lurec.out +[scdz]test_qrll.out + +======================== += LINKING YOUR PROGRAM = +======================== + +You just need to add the variants methods library in your linking sequence before your lapack libary. +Here is a quick example for LU + +Default using LU Right Looking version: + $(FORTRAN) -c myprog.f + $(FORTRAN) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB) + +Using LU Left Looking version: + $(FORTRAN) -c myprog.f + $(FORTRAN) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS/LIB)/lull.a $(LAPACKLIB) $(BLASLIB) + +=========== += SUPPORT = +=========== + +You can use either LAPACK forum or the LAPACK mailing list to get support. +LAPACK forum : http://icl.cs.utk.edu/lapack-forum +LAPACK mailing list : lapack@cs.utk.edu diff --git a/SRC/VARIANTS/cholesky/RL/cpotrf.f b/SRC/VARIANTS/cholesky/RL/cpotrf.f new file mode 100644 index 00000000..a6d194cb --- /dev/null +++ b/SRC/VARIANTS/cholesky/RL/cpotrf.f @@ -0,0 +1,187 @@ + SUBROUTINE CPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRF computes the Cholesky factorization of a real Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the right looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + COMPLEX CONE + PARAMETER ( ONE = 1.0E+0, CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CPOTF2, CHERK, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'CPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL CPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL CPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL CTRSM( 'Left', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + CALL CHERK( 'Upper', 'Conjugate transpose', N-J-JB+1, + $ JB, -ONE, A( J, J+JB ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL CPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL CTRSM( 'Right', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + + CALL CHERK( 'Lower', 'No Transpose', N-J-JB+1, JB, + $ -ONE, A( J+JB, J ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of CPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/RL/dpotrf.f b/SRC/VARIANTS/cholesky/RL/dpotrf.f new file mode 100644 index 00000000..72603304 --- /dev/null +++ b/SRC/VARIANTS/cholesky/RL/dpotrf.f @@ -0,0 +1,186 @@ + SUBROUTINE DPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the right looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DPOTF2, DSYRK, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'DPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL DPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL DPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + CALL DSYRK( 'Upper', 'Transpose', N-J-JB+1, JB, -ONE, + $ A( J, J+JB ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL DPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL DTRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + + CALL DSYRK( 'Lower', 'No Transpose', N-J-JB+1, JB, + $ -ONE, A( J+JB, J ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of DPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/RL/spotrf.f b/SRC/VARIANTS/cholesky/RL/spotrf.f new file mode 100644 index 00000000..3375902a --- /dev/null +++ b/SRC/VARIANTS/cholesky/RL/spotrf.f @@ -0,0 +1,186 @@ + SUBROUTINE SPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the right looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SPOTF2, SSYRK, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'SPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL SPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL SPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + CALL SSYRK( 'Upper', 'Transpose', N-J-JB+1, JB, -ONE, + $ A( J, J+JB ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL SPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL STRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + + CALL SSYRK( 'Lower', 'No Transpose', N-J-JB+1, JB, + $ -ONE, A( J+JB, J ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of SPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/RL/zpotrf.f b/SRC/VARIANTS/cholesky/RL/zpotrf.f new file mode 100644 index 00000000..b2bce7f6 --- /dev/null +++ b/SRC/VARIANTS/cholesky/RL/zpotrf.f @@ -0,0 +1,187 @@ + SUBROUTINE ZPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRF computes the Cholesky factorization of a real Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the right looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + COMPLEX*16 CONE + PARAMETER ( ONE = 1.0D+0, CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZPOTF2, ZHERK, ZTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'ZPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL ZPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL ZPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL ZTRSM( 'Left', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + CALL ZHERK( 'Upper', 'Conjugate transpose', N-J-JB+1, + $ JB, -ONE, A( J, J+JB ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + + CALL ZPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + + IF( INFO.NE.0 ) + $ GO TO 30 + + IF( J+JB.LE.N ) THEN +* +* Updating the trailing submatrix. +* + CALL ZTRSM( 'Right', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + + CALL ZHERK( 'Lower', 'No Transpose', N-J-JB+1, JB, + $ -ONE, A( J+JB, J ), LDA, + $ ONE, A( J+JB, J+JB ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of ZPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/SRC/VARIANTS/cholesky/TOP/cpotrf.f new file mode 100644 index 00000000..54ae1bb9 --- /dev/null +++ b/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -0,0 +1,181 @@ + SUBROUTINE CPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the top-looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + COMPLEX CONE + PARAMETER ( ONE = 1.0E+0, CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CPOTF2, CHERK, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'CPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL CPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL CTRSM( 'Left', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', J-1, JB, CONE, A( 1, 1 ), LDA, + $ A( 1, J ), LDA ) + + CALL CHERK( 'Upper', 'Conjugate Transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL CPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL CTRSM( 'Right', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', JB, J-1, CONE, A( 1, 1 ), LDA, + $ A( J, 1 ), LDA ) + + CALL CHERK( 'Lower', 'No Transpose', JB, J-1, + $ -ONE, A( J, 1 ), LDA, + $ ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL CPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of CPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/SRC/VARIANTS/cholesky/TOP/dpotrf.f new file mode 100644 index 00000000..8dd2c45b --- /dev/null +++ b/SRC/VARIANTS/cholesky/TOP/dpotrf.f @@ -0,0 +1,182 @@ + SUBROUTINE DPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the top-looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DPOTF2, DSYRK, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'DPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL DPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ J-1, JB, ONE, A( 1, 1 ), LDA, + $ A( 1, J ), LDA ) + + CALL DSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, + $ ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL DPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL DTRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ JB, J-1, ONE, A( 1, 1 ), LDA, + $ A( J, 1 ), LDA ) + + CALL DSYRK( 'Lower', 'No Transpose', JB, J-1, + $ -ONE, A( J, 1 ), LDA, + $ ONE, A( J, J ), LDA ) + +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL DPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of DPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/TOP/spotrf.f b/SRC/VARIANTS/cholesky/TOP/spotrf.f new file mode 100644 index 00000000..08b41b48 --- /dev/null +++ b/SRC/VARIANTS/cholesky/TOP/spotrf.f @@ -0,0 +1,181 @@ + SUBROUTINE SPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the top-looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SPOTF2, SSYRK, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'SPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL SPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ J-1, JB, ONE, A( 1, 1 ), LDA, + $ A( 1, J ), LDA ) + + CALL SSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, + $ ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL SPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL STRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ JB, J-1, ONE, A( 1, 1 ), LDA, + $ A( J, 1 ), LDA ) + + CALL SSYRK( 'Lower', 'No Transpose', JB, J-1, + $ -ONE, A( J, 1 ), LDA, + $ ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL SPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of SPOTRF +* + END diff --git a/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/SRC/VARIANTS/cholesky/TOP/zpotrf.f new file mode 100644 index 00000000..4eae3d1b --- /dev/null +++ b/SRC/VARIANTS/cholesky/TOP/zpotrf.f @@ -0,0 +1,181 @@ + SUBROUTINE ZPOTRF ( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the top-looking block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + COMPLEX*16 CONE + PARAMETER ( ONE = 1.0D+0, CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZPOTF2, ZHERK, ZTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'ZPOTRF', UPLO, N, -1, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL ZPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL ZTRSM( 'Left', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', J-1, JB, CONE, A( 1, 1 ), LDA, + $ A( 1, J ), LDA ) + + CALL ZHERK( 'Upper', 'Conjugate Transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL ZPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB + + JB = MIN( NB, N-J+1 ) +* +* Compute the current block. +* + CALL ZTRSM( 'Right', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', JB, J-1, CONE, A( 1, 1 ), LDA, + $ A( J, 1 ), LDA ) + + CALL ZHERK( 'Lower', 'No Transpose', JB, J-1, + $ -ONE, A( J, 1 ), LDA, + $ ONE, A( J, J ), LDA ) +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + CALL ZPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of ZPOTRF +* + END diff --git a/SRC/VARIANTS/lu/CR/cgetrf.f b/SRC/VARIANTS/lu/CR/cgetrf.f new file mode 100644 index 00000000..7d6403e1 --- /dev/null +++ b/SRC/VARIANTS/lu/CR/cgetrf.f @@ -0,0 +1,165 @@ + SUBROUTINE CGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the Crout Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGETF2, CLASWP, CTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, MOD +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'CGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL CGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Update current block. +* + CALL CGEMM( 'No transpose', 'No transpose', + $ M-J+1, JB, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J ), LDA, ONE, + $ A( J, J ), LDA ) + +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL CGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to column 1:J-1 +* + CALL CLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF ( J+JB.LE.N ) THEN +* +* Apply interchanges to column J+JB:N +* + CALL CLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* + CALL CGEMM( 'No transpose', 'No transpose', + $ JB, N-J-JB+1, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J+JB ), LDA, ONE, + $ A( J, J+JB ), LDA ) +* +* Compute block row of U. +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + + 20 CONTINUE + + END IF + RETURN +* +* End of CGETRF +* + END diff --git a/SRC/VARIANTS/lu/CR/dgetrf.f b/SRC/VARIANTS/lu/CR/dgetrf.f new file mode 100644 index 00000000..e1b4121e --- /dev/null +++ b/SRC/VARIANTS/lu/CR/dgetrf.f @@ -0,0 +1,165 @@ + SUBROUTINE DGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the Crout Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, MOD +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'DGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL DGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Update current block. +* + CALL DGEMM( 'No transpose', 'No transpose', + $ M-J+1, JB, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J ), LDA, ONE, + $ A( J, J ), LDA ) + +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to column 1:J-1 +* + CALL DLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF ( J+JB.LE.N ) THEN +* +* Apply interchanges to column J+JB:N +* + CALL DLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* + CALL DGEMM( 'No transpose', 'No transpose', + $ JB, N-J-JB+1, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J+JB ), LDA, ONE, + $ A( J, J+JB ), LDA ) +* +* Compute block row of U. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + + 20 CONTINUE + + END IF + RETURN +* +* End of DGETRF +* + END diff --git a/SRC/VARIANTS/lu/CR/sgetrf.f b/SRC/VARIANTS/lu/CR/sgetrf.f new file mode 100644 index 00000000..238ec119 --- /dev/null +++ b/SRC/VARIANTS/lu/CR/sgetrf.f @@ -0,0 +1,165 @@ + SUBROUTINE SGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the Crout Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, MOD +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'SGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL SGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Update current block. +* + CALL SGEMM( 'No transpose', 'No transpose', + $ M-J+1, JB, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J ), LDA, ONE, + $ A( J, J ), LDA ) + +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to column 1:J-1 +* + CALL SLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF ( J+JB.LE.N ) THEN +* +* Apply interchanges to column J+JB:N +* + CALL SLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* + CALL SGEMM( 'No transpose', 'No transpose', + $ JB, N-J-JB+1, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J+JB ), LDA, ONE, + $ A( J, J+JB ), LDA ) +* +* Compute block row of U. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + + 20 CONTINUE + + END IF + RETURN +* +* End of SGETRF +* + END diff --git a/SRC/VARIANTS/lu/CR/zgetrf.f b/SRC/VARIANTS/lu/CR/zgetrf.f new file mode 100644 index 00000000..2dafefbf --- /dev/null +++ b/SRC/VARIANTS/lu/CR/zgetrf.f @@ -0,0 +1,165 @@ + SUBROUTINE ZGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the Crout Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGETF2, ZLASWP, ZTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, MOD +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'ZGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL ZGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Update current block. +* + CALL ZGEMM( 'No transpose', 'No transpose', + $ M-J+1, JB, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J ), LDA, ONE, + $ A( J, J ), LDA ) + +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL ZGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to column 1:J-1 +* + CALL ZLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF ( J+JB.LE.N ) THEN +* +* Apply interchanges to column J+JB:N +* + CALL ZLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* + CALL ZGEMM( 'No transpose', 'No transpose', + $ JB, N-J-JB+1, J-1, -ONE, + $ A( J, 1 ), LDA, A( 1, J+JB ), LDA, ONE, + $ A( J, J+JB ), LDA ) +* +* Compute block row of U. +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + + 20 CONTINUE + + END IF + RETURN +* +* End of ZGETRF +* + END diff --git a/SRC/VARIANTS/lu/LL/cgetrf.f b/SRC/VARIANTS/lu/LL/cgetrf.f new file mode 100644 index 00000000..189362b0 --- /dev/null +++ b/SRC/VARIANTS/lu/LL/cgetrf.f @@ -0,0 +1,190 @@ + SUBROUTINE CGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = (1.0E+0, 0.0E+0) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, K, NB +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGETF2, CLASWP, CTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'CGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL CGETF2( M, N, A, LDA, IPIV, INFO ) + + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* +* Update before factoring the current panel +* + DO 30 K = 1, J-NB, NB +* +* Apply interchanges to rows K:K+NB-1. +* + CALL CLASWP( JB, A(1, J), LDA, K, K+NB-1, IPIV, 1 ) +* +* Compute block row of U. +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ NB, JB, ONE, A( K, K ), LDA, + $ A( K, J ), LDA ) +* +* Update trailing submatrix. +* + CALL CGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, JB, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, J ), LDA, ONE, + $ A( K+NB, J ), LDA ) + 30 CONTINUE +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL CGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* + 20 CONTINUE + +* +* Apply interchanges to the left-overs +* + DO 40 K = 1, MIN( M, N ), NB + CALL CLASWP( K-1, A( 1, 1 ), LDA, K, + $ MIN (K+NB-1, MIN ( M, N )), IPIV, 1 ) + 40 CONTINUE +* +* Apply update to the M+1:N columns when N > M +* + IF ( N.GT.M ) THEN + + CALL CLASWP( N-M, A(1, M+1), LDA, 1, M, IPIV, 1 ) + + DO 50 K = 1, M, NB + + JB = MIN( M-K+1, NB ) +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-M, ONE, A( K, K ), LDA, + $ A( K, M+1 ), LDA ) + +* + IF ( K+NB.LE.M ) THEN + CALL CGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, N-M, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, M+1 ), LDA, ONE, + $ A( K+NB, M+1 ), LDA ) + END IF + 50 CONTINUE + END IF +* + END IF + RETURN +* +* End of CGETRF +* + END diff --git a/SRC/VARIANTS/lu/LL/dgetrf.f b/SRC/VARIANTS/lu/LL/dgetrf.f new file mode 100644 index 00000000..8231805c --- /dev/null +++ b/SRC/VARIANTS/lu/LL/dgetrf.f @@ -0,0 +1,189 @@ + SUBROUTINE DGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, K, NB +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'DGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL DGETF2( M, N, A, LDA, IPIV, INFO ) + + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Update before factoring the current panel +* + DO 30 K = 1, J-NB, NB +* +* Apply interchanges to rows K:K+NB-1. +* + CALL DLASWP( JB, A(1, J), LDA, K, K+NB-1, IPIV, 1 ) +* +* Compute block row of U. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ NB, JB, ONE, A( K, K ), LDA, + $ A( K, J ), LDA ) +* +* Update trailing submatrix. +* + CALL DGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, JB, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, J ), LDA, ONE, + $ A( K+NB, J ), LDA ) + 30 CONTINUE +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* + 20 CONTINUE + +* +* Apply interchanges to the left-overs +* + DO 40 K = 1, MIN( M, N ), NB + CALL DLASWP( K-1, A( 1, 1 ), LDA, K, + $ MIN (K+NB-1, MIN ( M, N )), IPIV, 1 ) + 40 CONTINUE +* +* Apply update to the M+1:N columns when N > M +* + IF ( N.GT.M ) THEN + + CALL DLASWP( N-M, A(1, M+1), LDA, 1, M, IPIV, 1 ) + + DO 50 K = 1, M, NB + + JB = MIN( M-K+1, NB ) +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-M, ONE, A( K, K ), LDA, + $ A( K, M+1 ), LDA ) + +* + IF ( K+NB.LE.M ) THEN + CALL DGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, N-M, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, M+1 ), LDA, ONE, + $ A( K+NB, M+1 ), LDA ) + END IF + 50 CONTINUE + END IF +* + END IF + RETURN +* +* End of DGETRF +* + END diff --git a/SRC/VARIANTS/lu/LL/sgetrf.f b/SRC/VARIANTS/lu/LL/sgetrf.f new file mode 100644 index 00000000..856c1a7e --- /dev/null +++ b/SRC/VARIANTS/lu/LL/sgetrf.f @@ -0,0 +1,190 @@ + SUBROUTINE SGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, K, NB +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'SGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL SGETF2( M, N, A, LDA, IPIV, INFO ) + + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* +* Update before factoring the current panel +* + DO 30 K = 1, J-NB, NB +* +* Apply interchanges to rows K:K+NB-1. +* + CALL SLASWP( JB, A(1, J), LDA, K, K+NB-1, IPIV, 1 ) +* +* Compute block row of U. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ NB, JB, ONE, A( K, K ), LDA, + $ A( K, J ), LDA ) +* +* Update trailing submatrix. +* + CALL SGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, JB, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, J ), LDA, ONE, + $ A( K+NB, J ), LDA ) + 30 CONTINUE +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* + 20 CONTINUE + +* +* Apply interchanges to the left-overs +* + DO 40 K = 1, MIN( M, N ), NB + CALL SLASWP( K-1, A( 1, 1 ), LDA, K, + $ MIN (K+NB-1, MIN ( M, N )), IPIV, 1 ) + 40 CONTINUE +* +* Apply update to the M+1:N columns when N > M +* + IF ( N.GT.M ) THEN + + CALL SLASWP( N-M, A(1, M+1), LDA, 1, M, IPIV, 1 ) + + DO 50 K = 1, M, NB + + JB = MIN( M-K+1, NB ) +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-M, ONE, A( K, K ), LDA, + $ A( K, M+1 ), LDA ) + +* + IF ( K+NB.LE.M ) THEN + CALL SGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, N-M, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, M+1 ), LDA, ONE, + $ A( K+NB, M+1 ), LDA ) + END IF + 50 CONTINUE + END IF +* + END IF + RETURN +* +* End of SGETRF +* + END diff --git a/SRC/VARIANTS/lu/LL/zgetrf.f b/SRC/VARIANTS/lu/LL/zgetrf.f new file mode 100644 index 00000000..a6f9c0ff --- /dev/null +++ b/SRC/VARIANTS/lu/LL/zgetrf.f @@ -0,0 +1,190 @@ + SUBROUTINE ZGETRF ( M, N, A, LDA, IPIV, INFO) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = (1.0D+0, 0.0D+0) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, K, NB +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGETF2, ZLASWP, ZTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = ILAENV( 1, 'ZGETRF', ' ', M, N, -1, -1 ) + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL ZGETF2( M, N, A, LDA, IPIV, INFO ) + + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* +* Update before factoring the current panel +* + DO 30 K = 1, J-NB, NB +* +* Apply interchanges to rows K:K+NB-1. +* + CALL ZLASWP( JB, A(1, J), LDA, K, K+NB-1, IPIV, 1 ) +* +* Compute block row of U. +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ NB, JB, ONE, A( K, K ), LDA, + $ A( K, J ), LDA ) +* +* Update trailing submatrix. +* + CALL ZGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, JB, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, J ), LDA, ONE, + $ A( K+NB, J ), LDA ) + 30 CONTINUE +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL ZGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* + 20 CONTINUE + +* +* Apply interchanges to the left-overs +* + DO 40 K = 1, MIN( M, N ), NB + CALL ZLASWP( K-1, A( 1, 1 ), LDA, K, + $ MIN (K+NB-1, MIN ( M, N )), IPIV, 1 ) + 40 CONTINUE +* +* Apply update to the M+1:N columns when N > M +* + IF ( N.GT.M ) THEN + + CALL ZLASWP( N-M, A(1, M+1), LDA, 1, M, IPIV, 1 ) + + DO 50 K = 1, M, NB + + JB = MIN( M-K+1, NB ) +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', + $ JB, N-M, ONE, A( K, K ), LDA, + $ A( K, M+1 ), LDA ) + +* + IF ( K+NB.LE.M ) THEN + CALL ZGEMM( 'No transpose', 'No transpose', + $ M-K-NB+1, N-M, NB, -ONE, + $ A( K+NB, K ), LDA, A( K, M+1 ), LDA, ONE, + $ A( K+NB, M+1 ), LDA ) + END IF + 50 CONTINUE + END IF +* + END IF + RETURN +* +* End of ZGETRF +* + END diff --git a/SRC/VARIANTS/lu/REC/cgetrf.f b/SRC/VARIANTS/lu/REC/cgetrf.f new file mode 100644 index 00000000..d8a8a90b --- /dev/null +++ b/SRC/VARIANTS/lu/REC/cgetrf.f @@ -0,0 +1,224 @@ + SUBROUTINE CGETRF( M, N, A, LDA, IPIV, INFO ) + IMPLICIT NONE +* +* -- LAPACK routine (version 3.X) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* May 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This code implements an iterative version of Sivan Toledo's recursive +* LU algorithm[1]. For square matrices, this iterative versions should +* be within a factor of two of the optimum number of memory transfers. +* +* The pattern is as follows, with the large blocks of U being updated +* in one call to DTRSM, and the dotted lines denoting sections that +* have had all pending permutations applied: +* +* 1 2 3 4 5 6 7 8 +* +-+-+---+-------+------ +* | |1| | | +* |.+-+ 2 | | +* | | | | | +* |.|.+-+-+ 4 | +* | | | |1| | +* | | |.+-+ | +* | | | | | | +* |.|.|.|.+-+-+---+ 8 +* | | | | | |1| | +* | | | | |.+-+ 2 | +* | | | | | | | | +* | | | | |.|.+-+-+ +* | | | | | | | |1| +* | | | | | | |.+-+ +* | | | | | | | | | +* |.|.|.|.|.|.|.|.+----- +* | | | | | | | | | +* +* The 1-2-1-4-1-2-1-8-... pattern is the position of the last 1 bit in +* the binary expansion of the current column. Each Schur update is +* applied as soon as the necessary portion of U is available. +* +* [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with +* Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), +* 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, NEGONE + REAL ZERO + PARAMETER ( ONE = (1.0E+0, 0.0E+0) ) + PARAMETER ( NEGONE = (-1.0E+0, 0.0E+0) ) + PARAMETER ( ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + REAL SFMIN, PIVMAG + COMPLEX TMP + INTEGER I, J, JP, NSTEP, NTOPIV, NPIVED, KAHEAD + INTEGER KSTART, IPIVSTART, JPIVSTART, KCOLS +* .. +* .. External Functions .. + REAL SLAMCH + INTEGER ICAMAX + LOGICAL SISNAN + EXTERNAL SLAMCH, ICAMAX, SISNAN +* .. +* .. External Subroutines .. + EXTERNAL CTRSM, CSCAL, XERBLA, CLASWP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, IAND, ABS +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Compute machine safe minimum +* + SFMIN = SLAMCH( 'S' ) +* + NSTEP = MIN( M, N ) + DO J = 1, NSTEP + KAHEAD = IAND( J, -J ) + KSTART = J + 1 - KAHEAD + KCOLS = MIN( KAHEAD, M-J ) +* +* Find pivot. +* + JP = J - 1 + ICAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + +! Permute just this column. + IF (JP .NE. J) THEN + TMP = A( J, J ) + A( J, J ) = A( JP, J ) + A( JP, J ) = TMP + END IF + +! Apply pending permutations to L + NTOPIV = 1 + IPIVSTART = J + JPIVSTART = J - NTOPIV + DO WHILE ( NTOPIV .LT. KAHEAD ) + CALL CLASWP( NTOPIV, A( 1, JPIVSTART ), LDA, IPIVSTART, J, + $ IPIV, 1 ) + IPIVSTART = IPIVSTART - NTOPIV; + NTOPIV = NTOPIV * 2; + JPIVSTART = JPIVSTART - NTOPIV; + END DO + +! Permute U block to match L + CALL CLASWP( KCOLS, A( 1,J+1 ), LDA, KSTART, J, IPIV, 1 ) + +! Factor the current column + PIVMAG = ABS( A( J, J ) ) + IF( PIVMAG.NE.ZERO .AND. .NOT.SISNAN( PIVMAG ) ) THEN + IF( PIVMAG .GE. SFMIN ) THEN + CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) + ELSE + DO I = 1, M-J + A( J+I, J ) = A( J+I, J ) / A( J, J ) + END DO + END IF + ELSE IF( PIVMAG .EQ. ZERO .AND. INFO .EQ. 0 ) THEN + INFO = J + END IF + +! Solve for U block. + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', KAHEAD, + $ KCOLS, ONE, A( KSTART, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA ) +! Schur complement. + CALL CGEMM( 'No transpose', 'No transpose', M-J, + $ KCOLS, KAHEAD, NEGONE, A( J+1, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA, ONE, A( J+1, J+1 ), LDA ) + END DO + +! Handle pivot permutations on the way out of the recursion + NPIVED = IAND( NSTEP, -NSTEP ) + J = NSTEP - NPIVED + DO WHILE ( J .GT. 0 ) + NTOPIV = IAND( J, -J ) + CALL CLASWP( NTOPIV, A( 1, J-NTOPIV+1 ), LDA, J+1, NSTEP, + $ IPIV, 1 ) + J = J - NTOPIV + END DO + +! If short and wide, handle the rest of the columns. + IF ( M .LT. N ) THEN + CALL CLASWP( N-M, A( 1, M+KCOLS+1 ), LDA, 1, M, IPIV, 1 ) + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', M, + $ N-M, ONE, A, LDA, A( 1,M+KCOLS+1 ), LDA ) + END IF + + RETURN +* +* End of CGETRF +* + END diff --git a/SRC/VARIANTS/lu/REC/dgetrf.f b/SRC/VARIANTS/lu/REC/dgetrf.f new file mode 100644 index 00000000..f8c2caf1 --- /dev/null +++ b/SRC/VARIANTS/lu/REC/dgetrf.f @@ -0,0 +1,220 @@ + SUBROUTINE DGETRF( M, N, A, LDA, IPIV, INFO ) + IMPLICIT NONE +* +* -- LAPACK routine (version 3.X) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* May 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This code implements an iterative version of Sivan Toledo's recursive +* LU algorithm[1]. For square matrices, this iterative versions should +* be within a factor of two of the optimum number of memory transfers. +* +* The pattern is as follows, with the large blocks of U being updated +* in one call to DTRSM, and the dotted lines denoting sections that +* have had all pending permutations applied: +* +* 1 2 3 4 5 6 7 8 +* +-+-+---+-------+------ +* | |1| | | +* |.+-+ 2 | | +* | | | | | +* |.|.+-+-+ 4 | +* | | | |1| | +* | | |.+-+ | +* | | | | | | +* |.|.|.|.+-+-+---+ 8 +* | | | | | |1| | +* | | | | |.+-+ 2 | +* | | | | | | | | +* | | | | |.|.+-+-+ +* | | | | | | | |1| +* | | | | | | |.+-+ +* | | | | | | | | | +* |.|.|.|.|.|.|.|.+----- +* | | | | | | | | | +* +* The 1-2-1-4-1-2-1-8-... pattern is the position of the last 1 bit in +* the binary expansion of the current column. Each Schur update is +* applied as soon as the necessary portion of U is available. +* +* [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with +* Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), +* 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO, NEGONE + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) + PARAMETER ( NEGONE = -1.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SFMIN, TMP + INTEGER I, J, JP, NSTEP, NTOPIV, NPIVED, KAHEAD + INTEGER KSTART, IPIVSTART, JPIVSTART, KCOLS +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + INTEGER IDAMAX + LOGICAL DISNAN + EXTERNAL DLAMCH, IDAMAX, DISNAN +* .. +* .. External Subroutines .. + EXTERNAL DTRSM, DSCAL, XERBLA, DLASWP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, IAND +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Compute machine safe minimum +* + SFMIN = DLAMCH( 'S' ) +* + NSTEP = MIN( M, N ) + DO J = 1, NSTEP + KAHEAD = IAND( J, -J ) + KSTART = J + 1 - KAHEAD + KCOLS = MIN( KAHEAD, M-J ) +* +* Find pivot. +* + JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + +! Permute just this column. + IF (JP .NE. J) THEN + TMP = A( J, J ) + A( J, J ) = A( JP, J ) + A( JP, J ) = TMP + END IF + +! Apply pending permutations to L + NTOPIV = 1 + IPIVSTART = J + JPIVSTART = J - NTOPIV + DO WHILE ( NTOPIV .LT. KAHEAD ) + CALL DLASWP( NTOPIV, A( 1, JPIVSTART ), LDA, IPIVSTART, J, + $ IPIV, 1 ) + IPIVSTART = IPIVSTART - NTOPIV; + NTOPIV = NTOPIV * 2; + JPIVSTART = JPIVSTART - NTOPIV; + END DO + +! Permute U block to match L + CALL DLASWP( KCOLS, A( 1,J+1 ), LDA, KSTART, J, IPIV, 1 ) + +! Factor the current column + IF( A( J, J ).NE.ZERO .AND. .NOT.DISNAN( A( J, J ) ) ) THEN + IF( ABS(A( J, J )) .GE. SFMIN ) THEN + CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) + ELSE + DO I = 1, M-J + A( J+I, J ) = A( J+I, J ) / A( J, J ) + END DO + END IF + ELSE IF( A( J,J ) .EQ. ZERO .AND. INFO .EQ. 0 ) THEN + INFO = J + END IF + +! Solve for U block. + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', KAHEAD, + $ KCOLS, ONE, A( KSTART, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA ) +! Schur complement. + CALL DGEMM( 'No transpose', 'No transpose', M-J, + $ KCOLS, KAHEAD, NEGONE, A( J+1, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA, ONE, A( J+1, J+1 ), LDA ) + END DO + +! Handle pivot permutations on the way out of the recursion + NPIVED = IAND( NSTEP, -NSTEP ) + J = NSTEP - NPIVED + DO WHILE ( J .GT. 0 ) + NTOPIV = IAND( J, -J ) + CALL DLASWP( NTOPIV, A( 1, J-NTOPIV+1 ), LDA, J+1, NSTEP, + $ IPIV, 1 ) + J = J - NTOPIV + END DO + +! If short and wide, handle the rest of the columns. + IF ( M .LT. N ) THEN + CALL DLASWP( N-M, A( 1, M+KCOLS+1 ), LDA, 1, M, IPIV, 1 ) + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', M, + $ N-M, ONE, A, LDA, A( 1,M+KCOLS+1 ), LDA ) + END IF + + RETURN +* +* End of DGETRF +* + END diff --git a/SRC/VARIANTS/lu/REC/sgetrf.f b/SRC/VARIANTS/lu/REC/sgetrf.f new file mode 100644 index 00000000..1890f987 --- /dev/null +++ b/SRC/VARIANTS/lu/REC/sgetrf.f @@ -0,0 +1,220 @@ + SUBROUTINE SGETRF( M, N, A, LDA, IPIV, INFO ) + IMPLICIT NONE +* +* -- LAPACK routine (version 3.X) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* May 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This code implements an iterative version of Sivan Toledo's recursive +* LU algorithm[1]. For square matrices, this iterative versions should +* be within a factor of two of the optimum number of memory transfers. +* +* The pattern is as follows, with the large blocks of U being updated +* in one call to STRSM, and the dotted lines denoting sections that +* have had all pending permutations applied: +* +* 1 2 3 4 5 6 7 8 +* +-+-+---+-------+------ +* | |1| | | +* |.+-+ 2 | | +* | | | | | +* |.|.+-+-+ 4 | +* | | | |1| | +* | | |.+-+ | +* | | | | | | +* |.|.|.|.+-+-+---+ 8 +* | | | | | |1| | +* | | | | |.+-+ 2 | +* | | | | | | | | +* | | | | |.|.+-+-+ +* | | | | | | | |1| +* | | | | | | |.+-+ +* | | | | | | | | | +* |.|.|.|.|.|.|.|.+----- +* | | | | | | | | | +* +* The 1-2-1-4-1-2-1-8-... pattern is the position of the last 1 bit in +* the binary expansion of the current column. Each Schur update is +* applied as soon as the necessary portion of U is available. +* +* [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with +* Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), +* 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO, NEGONE + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) + PARAMETER ( NEGONE = -1.0E+0 ) +* .. +* .. Local Scalars .. + REAL SFMIN, TMP + INTEGER I, J, JP, NSTEP, NTOPIV, NPIVED, KAHEAD + INTEGER KSTART, IPIVSTART, JPIVSTART, KCOLS +* .. +* .. External Functions .. + REAL SLAMCH + INTEGER ISAMAX + LOGICAL SISNAN + EXTERNAL SLAMCH, ISAMAX, SISNAN +* .. +* .. External Subroutines .. + EXTERNAL STRSM, SSCAL, XERBLA, SLASWP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, IAND +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Compute machine safe minimum +* + SFMIN = SLAMCH( 'S' ) +* + NSTEP = MIN( M, N ) + DO J = 1, NSTEP + KAHEAD = IAND( J, -J ) + KSTART = J + 1 - KAHEAD + KCOLS = MIN( KAHEAD, M-J ) +* +* Find pivot. +* + JP = J - 1 + ISAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + +! Permute just this column. + IF (JP .NE. J) THEN + TMP = A( J, J ) + A( J, J ) = A( JP, J ) + A( JP, J ) = TMP + END IF + +! Apply pending permutations to L + NTOPIV = 1 + IPIVSTART = J + JPIVSTART = J - NTOPIV + DO WHILE ( NTOPIV .LT. KAHEAD ) + CALL SLASWP( NTOPIV, A( 1, JPIVSTART ), LDA, IPIVSTART, J, + $ IPIV, 1 ) + IPIVSTART = IPIVSTART - NTOPIV; + NTOPIV = NTOPIV * 2; + JPIVSTART = JPIVSTART - NTOPIV; + END DO + +! Permute U block to match L + CALL SLASWP( KCOLS, A( 1,J+1 ), LDA, KSTART, J, IPIV, 1 ) + +! Factor the current column + IF( A( J, J ).NE.ZERO .AND. .NOT.SISNAN( A( J, J ) ) ) THEN + IF( ABS(A( J, J )) .GE. SFMIN ) THEN + CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) + ELSE + DO I = 1, M-J + A( J+I, J ) = A( J+I, J ) / A( J, J ) + END DO + END IF + ELSE IF( A( J,J ) .EQ. ZERO .AND. INFO .EQ. 0 ) THEN + INFO = J + END IF + +! Solve for U block. + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', KAHEAD, + $ KCOLS, ONE, A( KSTART, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA ) +! Schur complement. + CALL SGEMM( 'No transpose', 'No transpose', M-J, + $ KCOLS, KAHEAD, NEGONE, A( J+1, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA, ONE, A( J+1, J+1 ), LDA ) + END DO + +! Handle pivot permutations on the way out of the recursion + NPIVED = IAND( NSTEP, -NSTEP ) + J = NSTEP - NPIVED + DO WHILE ( J .GT. 0 ) + NTOPIV = IAND( J, -J ) + CALL SLASWP( NTOPIV, A( 1, J-NTOPIV+1 ), LDA, J+1, NSTEP, + $ IPIV, 1 ) + J = J - NTOPIV + END DO + +! If short and wide, handle the rest of the columns. + IF ( M .LT. N ) THEN + CALL SLASWP( N-M, A( 1, M+KCOLS+1 ), LDA, 1, M, IPIV, 1 ) + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', M, + $ N-M, ONE, A, LDA, A( 1,M+KCOLS+1 ), LDA ) + END IF + + RETURN +* +* End of SGETRF +* + END diff --git a/SRC/VARIANTS/lu/REC/zgetrf.f b/SRC/VARIANTS/lu/REC/zgetrf.f new file mode 100644 index 00000000..e7b75b00 --- /dev/null +++ b/SRC/VARIANTS/lu/REC/zgetrf.f @@ -0,0 +1,224 @@ + SUBROUTINE ZGETRF( M, N, A, LDA, IPIV, INFO ) + IMPLICIT NONE +* +* -- LAPACK routine (version 3.X) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* May 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This code implements an iterative version of Sivan Toledo's recursive +* LU algorithm[1]. For square matrices, this iterative versions should +* be within a factor of two of the optimum number of memory transfers. +* +* The pattern is as follows, with the large blocks of U being updated +* in one call to DTRSM, and the dotted lines denoting sections that +* have had all pending permutations applied: +* +* 1 2 3 4 5 6 7 8 +* +-+-+---+-------+------ +* | |1| | | +* |.+-+ 2 | | +* | | | | | +* |.|.+-+-+ 4 | +* | | | |1| | +* | | |.+-+ | +* | | | | | | +* |.|.|.|.+-+-+---+ 8 +* | | | | | |1| | +* | | | | |.+-+ 2 | +* | | | | | | | | +* | | | | |.|.+-+-+ +* | | | | | | | |1| +* | | | | | | |.+-+ +* | | | | | | | | | +* |.|.|.|.|.|.|.|.+----- +* | | | | | | | | | +* +* The 1-2-1-4-1-2-1-8-... pattern is the position of the last 1 bit in +* the binary expansion of the current column. Each Schur update is +* applied as soon as the necessary portion of U is available. +* +* [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with +* Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), +* 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, NEGONE + DOUBLE PRECISION ZERO + PARAMETER ( ONE = (1.0D+0, 0.0D+0) ) + PARAMETER ( NEGONE = (-1.0D+0, 0.0D+0) ) + PARAMETER ( ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SFMIN, PIVMAG + COMPLEX*16 TMP + INTEGER I, J, JP, NSTEP, NTOPIV, NPIVED, KAHEAD + INTEGER KSTART, IPIVSTART, JPIVSTART, KCOLS +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + INTEGER IZAMAX + LOGICAL DISNAN + EXTERNAL DLAMCH, IZAMAX, DISNAN +* .. +* .. External Subroutines .. + EXTERNAL ZTRSM, ZSCAL, XERBLA, ZLASWP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, IAND, ABS +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Compute machine safe minimum +* + SFMIN = DLAMCH( 'S' ) +* + NSTEP = MIN( M, N ) + DO J = 1, NSTEP + KAHEAD = IAND( J, -J ) + KSTART = J + 1 - KAHEAD + KCOLS = MIN( KAHEAD, M-J ) +* +* Find pivot. +* + JP = J - 1 + IZAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + +! Permute just this column. + IF (JP .NE. J) THEN + TMP = A( J, J ) + A( J, J ) = A( JP, J ) + A( JP, J ) = TMP + END IF + +! Apply pending permutations to L + NTOPIV = 1 + IPIVSTART = J + JPIVSTART = J - NTOPIV + DO WHILE ( NTOPIV .LT. KAHEAD ) + CALL ZLASWP( NTOPIV, A( 1, JPIVSTART ), LDA, IPIVSTART, J, + $ IPIV, 1 ) + IPIVSTART = IPIVSTART - NTOPIV; + NTOPIV = NTOPIV * 2; + JPIVSTART = JPIVSTART - NTOPIV; + END DO + +! Permute U block to match L + CALL ZLASWP( KCOLS, A( 1,J+1 ), LDA, KSTART, J, IPIV, 1 ) + +! Factor the current column + PIVMAG = ABS( A( J, J ) ) + IF( PIVMAG.NE.ZERO .AND. .NOT.DISNAN( PIVMAG ) ) THEN + IF( PIVMAG .GE. SFMIN ) THEN + CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) + ELSE + DO I = 1, M-J + A( J+I, J ) = A( J+I, J ) / A( J, J ) + END DO + END IF + ELSE IF( PIVMAG .EQ. ZERO .AND. INFO .EQ. 0 ) THEN + INFO = J + END IF + +! Solve for U block. + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', KAHEAD, + $ KCOLS, ONE, A( KSTART, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA ) +! Schur complement. + CALL ZGEMM( 'No transpose', 'No transpose', M-J, + $ KCOLS, KAHEAD, NEGONE, A( J+1, KSTART ), LDA, + $ A( KSTART, J+1 ), LDA, ONE, A( J+1, J+1 ), LDA ) + END DO + +! Handle pivot permutations on the way out of the recursion + NPIVED = IAND( NSTEP, -NSTEP ) + J = NSTEP - NPIVED + DO WHILE ( J .GT. 0 ) + NTOPIV = IAND( J, -J ) + CALL ZLASWP( NTOPIV, A( 1, J-NTOPIV+1 ), LDA, J+1, NSTEP, + $ IPIV, 1 ) + J = J - NTOPIV + END DO + +! If short and wide, handle the rest of the columns. + IF ( M .LT. N ) THEN + CALL ZLASWP( N-M, A( 1, M+KCOLS+1 ), LDA, 1, M, IPIV, 1 ) + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', M, + $ N-M, ONE, A, LDA, A( 1,M+KCOLS+1 ), LDA ) + END IF + + RETURN +* +* End of ZGETRF +* + END diff --git a/SRC/VARIANTS/qr/LL/cgeqrf.f b/SRC/VARIANTS/qr/LL/cgeqrf.f new file mode 100644 index 00000000..413bc90c --- /dev/null +++ b/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -0,0 +1,343 @@ + SUBROUTINE CGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* CGEQRF computes a QR factorization of a real M-by-N matrix A: +* A = Q * R. +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix A. +* On exit, the elements on and above the diagonal of the array +* contain the min(M,N)-by-N upper trapezoidal matrix R (R is +* upper triangular if m >= n); the elements below the diagonal, +* with the array TAU, represent the orthogonal matrix Q as a +* product of min(m,n) elementary reflectors (see Further +* Details). +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* TAU (output) COMPLEX array, dimension (min(M,N)) +* The scalar factors of the elementary reflectors (see Further +* Details). +* +* WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) +* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* +* The dimension of the array WORK. The dimension can be divided into three parts. +* +* 1) The part for the triangular factor T. If the very last T is not bigger +* than any of the rest, then this part is NB x ceiling(K/NB), otherwise, +* NB x (K-NT), where K = min(M,N) and NT is the dimension of the very last T +* +* 2) The part for the very last T when T is bigger than any of the rest T. +* The size of this part is NT x NT, where NT = K - ceiling ((K-NX)/NB) x NB, +* where K = min(M,N), NX is calculated by +* NX = MAX( 0, ILAENV( 3, 'CGEQRF', ' ', M, N, -1, -1 ) ) +* +* 3) The part for dlarfb is of size max((N-M)*K, (N-M)*NB, K*NB, NB*NB) +* +* So LWORK = part1 + part2 + part3 +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* Further Details +* =============== +* +* The matrix Q is represented as a product of elementary reflectors +* +* Q = H(1) H(2) . . . H(k), where k = min(m,n). +* +* Each H(i) has the form +* +* H(i) = I - tau * v * v' +* +* where tau is a real scalar, and v is a real vector with +* v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), +* and tau in TAU(i). +* +* ===================================================================== +* +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IB, IINFO, IWS, J, K, LWKOPT, NB, + $ NBMIN, NX, LBWORK, NT, LLWORK +* .. +* .. External Subroutines .. + EXTERNAL CGEQR2, CLARFB, CLARFT, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. External Functions .. + INTEGER ILAENV + REAL SCEIL + EXTERNAL ILAENV, SCEIL +* .. +* .. Executable Statements .. + + INFO = 0 + NBMIN = 2 + NX = 0 + IWS = N + K = MIN( M, N ) + NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) + + IF( NB.GT.1 .AND. NB.LT.K ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* + NX = MAX( 0, ILAENV( 3, 'CGEQRF', ' ', M, N, -1, -1 ) ) + END IF +* +* Get NT, the size of the very last T, which is the left-over from in-between K-NX and K to K, eg.: +* +* NB=3 2NB=6 K=10 +* | | | +* 1--2--3--4--5--6--7--8--9--10 +* | \________/ +* K-NX=5 NT=4 +* +* So here 4 x 4 is the last T stored in the workspace +* + NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + +* +* optimal workspace = space for dlarfb + space for normal T's + space for the last T +* + LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) + LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + + IF ( NT.GT.NB ) THEN + + LBWORK = K-NT +* +* Optimal workspace for dlarfb = MAX(1,N)*NT +* + LWKOPT = (LBWORK+LLWORK)*NB + WORK( 1 ) = (LWKOPT+NT*NT) + + ELSE + + LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LWKOPT = (LBWORK+LLWORK-NB)*NB + WORK( 1 ) = LWKOPT + + END IF + +* +* Test the input arguments +* + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGEQRF', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( K.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( NB.GT.1 .AND. NB.LT.K ) THEN + + IF( NX.LT.K ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF ( NT.LE.NB ) THEN + IWS = (LBWORK+LLWORK-NB)*NB + ELSE + IWS = (LBWORK+LLWORK)*NB+NT*NT + END IF + + IF( LWORK.LT.IWS ) THEN +* +* Not enough workspace to use optimal NB: reduce NB and +* determine the minimum value of NB. +* + IF ( NT.LE.NB ) THEN + NB = LWORK / (LLWORK+(LBWORK-NB)) + ELSE + NB = (LWORK-NT*NT)/(LBWORK+LLWORK) + END IF + + NBMIN = MAX( 2, ILAENV( 2, 'CGEQRF', ' ', M, N, -1, + $ -1 ) ) + END IF + END IF + END IF +* + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* +* Use blocked code initially +* + DO 10 I = 1, K - NX, NB + IB = MIN( K-I+1, NB ) +* +* Update the current column using old T's +* + DO 20 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:I+IB-1) from the left +* + CALL CLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, IB, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ IB) + +20 CONTINUE +* +* Compute the QR factorization of the current block +* A(I:M,I:I+IB-1) +* + CALL CGEQR2( M-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1), IINFO ) + + IF( I+IB.LE.N ) THEN +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB, + $ A( I, I ), LDA, TAU( I ), + $ WORK(I), LBWORK ) +* + END IF + 10 CONTINUE + ELSE + I = 1 + END IF +* +* Use unblocked code to factor the last or only block. +* + IF( I.LE.K ) THEN + + IF ( I .NE. 1 ) THEN + + DO 30 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:K) from the left +* + CALL CLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, K-I+1, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ K-I+1) +30 CONTINUE + + CALL CGEQR2( M-I+1, K-I+1, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1),IINFO ) + + ELSE +* +* Use unblocked code to factor the last or only block. +* + CALL CGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), + $ WORK,IINFO ) + + END IF + END IF + + +* +* Apply update to the column M+1:N when N > M +* + IF ( M.LT.N .AND. I.NE.1) THEN +* +* Form the last triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + IF ( NT .LE. NB ) THEN + CALL CLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), WORK(I), LBWORK ) + ELSE + CALL CLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+1), NT ) + END IF + +* +* Apply H' to A(1:M,M+1:N) from the left +* + DO 40 J = 1, K-NX, NB + + IB = MIN( K-J+1, NB ) + + CALL CLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, IB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + +40 CONTINUE + + IF ( NT.LE.NB ) THEN + CALL CLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + ELSE + CALL CLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, + $ WORK(LBWORK*NB+1), + $ NT, A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + END IF + + END IF + + WORK( 1 ) = IWS + RETURN +* +* End of CGEQRF +* + END diff --git a/SRC/VARIANTS/qr/LL/dgeqrf.f b/SRC/VARIANTS/qr/LL/dgeqrf.f new file mode 100644 index 00000000..728ea1b8 --- /dev/null +++ b/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -0,0 +1,344 @@ + SUBROUTINE DGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* DGEQRF computes a QR factorization of a real M-by-N matrix A: +* A = Q * R. +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix A. +* On exit, the elements on and above the diagonal of the array +* contain the min(M,N)-by-N upper trapezoidal matrix R (R is +* upper triangular if m >= n); the elements below the diagonal, +* with the array TAU, represent the orthogonal matrix Q as a +* product of min(m,n) elementary reflectors (see Further +* Details). +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* TAU (output) DOUBLE PRECISION array, dimension (min(M,N)) +* The scalar factors of the elementary reflectors (see Further +* Details). +* +* WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* +* The dimension of the array WORK. The dimension can be divided into three parts. +* +* 1) The part for the triangular factor T. If the very last T is not bigger +* than any of the rest, then this part is NB x ceiling(K/NB), otherwise, +* NB x (K-NT), where K = min(M,N) and NT is the dimension of the very last T +* +* 2) The part for the very last T when T is bigger than any of the rest T. +* The size of this part is NT x NT, where NT = K - ceiling ((K-NX)/NB) x NB, +* where K = min(M,N), NX is calculated by +* NX = MAX( 0, ILAENV( 3, 'DGEQRF', ' ', M, N, -1, -1 ) ) +* +* 3) The part for dlarfb is of size max((N-M)*K, (N-M)*NB, K*NB, NB*NB) +* +* So LWORK = part1 + part2 + part3 +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* Further Details +* =============== +* +* The matrix Q is represented as a product of elementary reflectors +* +* Q = H(1) H(2) . . . H(k), where k = min(m,n). +* +* Each H(i) has the form +* +* H(i) = I - tau * v * v' +* +* where tau is a real scalar, and v is a real vector with +* v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), +* and tau in TAU(i). +* +* ===================================================================== +* +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IB, IINFO, IWS, J, K, LWKOPT, NB, + $ NBMIN, NX, LBWORK, NT, LLWORK +* .. +* .. External Subroutines .. + EXTERNAL DGEQR2, DLARFB, DLARFT, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. External Functions .. + INTEGER ILAENV + REAL SCEIL + EXTERNAL ILAENV, SCEIL +* .. +* .. Executable Statements .. + + INFO = 0 + NBMIN = 2 + NX = 0 + IWS = N + K = MIN( M, N ) + NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) + + IF( NB.GT.1 .AND. NB.LT.K ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* + NX = MAX( 0, ILAENV( 3, 'DGEQRF', ' ', M, N, -1, -1 ) ) + END IF +* +* Get NT, the size of the very last T, which is the left-over from in-between K-NX and K to K, eg.: +* +* NB=3 2NB=6 K=10 +* | | | +* 1--2--3--4--5--6--7--8--9--10 +* | \________/ +* K-NX=5 NT=4 +* +* So here 4 x 4 is the last T stored in the workspace +* + NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + +* +* optimal workspace = space for dlarfb + space for normal T's + space for the last T +* + LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) + LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + + IF ( NT.GT.NB ) THEN + + LBWORK = K-NT +* +* Optimal workspace for dlarfb = MAX(1,N)*NT +* + LWKOPT = (LBWORK+LLWORK)*NB + WORK( 1 ) = (LWKOPT+NT*NT) + + ELSE + + LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LWKOPT = (LBWORK+LLWORK-NB)*NB + WORK( 1 ) = LWKOPT + + END IF + +* +* Test the input arguments +* + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGEQRF', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( K.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( NB.GT.1 .AND. NB.LT.K ) THEN + + IF( NX.LT.K ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF ( NT.LE.NB ) THEN + IWS = (LBWORK+LLWORK-NB)*NB + ELSE + IWS = (LBWORK+LLWORK)*NB+NT*NT + END IF + + IF( LWORK.LT.IWS ) THEN +* +* Not enough workspace to use optimal NB: reduce NB and +* determine the minimum value of NB. +* + IF ( NT.LE.NB ) THEN + NB = LWORK / (LLWORK+(LBWORK-NB)) + ELSE + NB = (LWORK-NT*NT)/(LBWORK+LLWORK) + END IF + + NBMIN = MAX( 2, ILAENV( 2, 'DGEQRF', ' ', M, N, -1, + $ -1 ) ) + END IF + END IF + END IF +* + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* +* Use blocked code initially +* + DO 10 I = 1, K - NX, NB + IB = MIN( K-I+1, NB ) +* +* Update the current column using old T's +* + DO 20 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:I+IB-1) from the left +* + CALL DLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, IB, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ IB) + +20 CONTINUE +* +* Compute the QR factorization of the current block +* A(I:M,I:I+IB-1) +* + CALL DGEQR2( M-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1), IINFO ) + + IF( I+IB.LE.N ) THEN +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB, + $ A( I, I ), LDA, TAU( I ), + $ WORK(I), LBWORK ) +* + END IF + 10 CONTINUE + ELSE + I = 1 + END IF +* +* Use unblocked code to factor the last or only block. +* + IF( I.LE.K ) THEN + + IF ( I .NE. 1 ) THEN + + DO 30 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:K) from the left +* + CALL DLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, K-I+1, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ K-I+1) +30 CONTINUE + + CALL DGEQR2( M-I+1, K-I+1, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1),IINFO ) + + ELSE +* +* Use unblocked code to factor the last or only block. +* + CALL DGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), + $ WORK,IINFO ) + + END IF + END IF + + +* +* Apply update to the column M+1:N when N > M +* + IF ( M.LT.N .AND. I.NE.1) THEN +* +* Form the last triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + IF ( NT .LE. NB ) THEN + CALL DLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), WORK(I), LBWORK ) + ELSE + CALL DLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+1), NT ) + END IF + +* +* Apply H' to A(1:M,M+1:N) from the left +* + DO 40 J = 1, K-NX, NB + + IB = MIN( K-J+1, NB ) + + CALL DLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, IB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + +40 CONTINUE + + IF ( NT.LE.NB ) THEN + CALL DLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + ELSE + CALL DLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, + $ WORK(LBWORK*NB+1), + $ NT, A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + END IF + + END IF + + WORK( 1 ) = IWS + RETURN +* +* End of DGEQRF +* + END + diff --git a/SRC/VARIANTS/qr/LL/sceil.f b/SRC/VARIANTS/qr/LL/sceil.f new file mode 100644 index 00000000..70c5c5cf --- /dev/null +++ b/SRC/VARIANTS/qr/LL/sceil.f @@ -0,0 +1,28 @@ + REAL FUNCTION SCEIL( A ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* June 2008 +* +* .. Scalar Arguments ..* + REAL A +* .. +* +* ===================================================================== +* +* .. Intrinsic Functions .. + INTRINSIC INT +* .. +* .. Executable Statements ..* +* + IF (A-INT(A).EQ.0) THEN + SCEIL = A + ELSE IF (A.GT.0) THEN + SCEIL = INT(A)+1; + ELSE + SCEIL = INT(A) + END IF + + RETURN +* + END diff --git a/SRC/VARIANTS/qr/LL/sgeqrf.f b/SRC/VARIANTS/qr/LL/sgeqrf.f new file mode 100644 index 00000000..fef6379b --- /dev/null +++ b/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -0,0 +1,343 @@ + SUBROUTINE SGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* SGEQRF computes a QR factorization of a real M-by-N matrix A: +* A = Q * R. +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix A. +* On exit, the elements on and above the diagonal of the array +* contain the min(M,N)-by-N upper trapezoidal matrix R (R is +* upper triangular if m >= n); the elements below the diagonal, +* with the array TAU, represent the orthogonal matrix Q as a +* product of min(m,n) elementary reflectors (see Further +* Details). +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* TAU (output) REAL array, dimension (min(M,N)) +* The scalar factors of the elementary reflectors (see Further +* Details). +* +* WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) +* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* +* The dimension of the array WORK. The dimension can be divided into three parts. +* +* 1) The part for the triangular factor T. If the very last T is not bigger +* than any of the rest, then this part is NB x ceiling(K/NB), otherwise, +* NB x (K-NT), where K = min(M,N) and NT is the dimension of the very last T +* +* 2) The part for the very last T when T is bigger than any of the rest T. +* The size of this part is NT x NT, where NT = K - ceiling ((K-NX)/NB) x NB, +* where K = min(M,N), NX is calculated by +* NX = MAX( 0, ILAENV( 3, 'SGEQRF', ' ', M, N, -1, -1 ) ) +* +* 3) The part for dlarfb is of size max((N-M)*K, (N-M)*NB, K*NB, NB*NB) +* +* So LWORK = part1 + part2 + part3 +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* Further Details +* =============== +* +* The matrix Q is represented as a product of elementary reflectors +* +* Q = H(1) H(2) . . . H(k), where k = min(m,n). +* +* Each H(i) has the form +* +* H(i) = I - tau * v * v' +* +* where tau is a real scalar, and v is a real vector with +* v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), +* and tau in TAU(i). +* +* ===================================================================== +* +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IB, IINFO, IWS, J, K, LWKOPT, NB, + $ NBMIN, NX, LBWORK, NT, LLWORK +* .. +* .. External Subroutines .. + EXTERNAL SGEQR2, SLARFB, SLARFT, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. External Functions .. + INTEGER ILAENV + REAL SCEIL + EXTERNAL ILAENV, SCEIL +* .. +* .. Executable Statements .. + + INFO = 0 + NBMIN = 2 + NX = 0 + IWS = N + K = MIN( M, N ) + NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) + + IF( NB.GT.1 .AND. NB.LT.K ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* + NX = MAX( 0, ILAENV( 3, 'SGEQRF', ' ', M, N, -1, -1 ) ) + END IF +* +* Get NT, the size of the very last T, which is the left-over from in-between K-NX and K to K, eg.: +* +* NB=3 2NB=6 K=10 +* | | | +* 1--2--3--4--5--6--7--8--9--10 +* | \________/ +* K-NX=5 NT=4 +* +* So here 4 x 4 is the last T stored in the workspace +* + NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + +* +* optimal workspace = space for dlarfb + space for normal T's + space for the last T +* + LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) + LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + + IF ( NT.GT.NB ) THEN + + LBWORK = K-NT +* +* Optimal workspace for dlarfb = MAX(1,N)*NT +* + LWKOPT = (LBWORK+LLWORK)*NB + WORK( 1 ) = (LWKOPT+NT*NT) + + ELSE + + LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LWKOPT = (LBWORK+LLWORK-NB)*NB + WORK( 1 ) = LWKOPT + + END IF + +* +* Test the input arguments +* + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGEQRF', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( K.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( NB.GT.1 .AND. NB.LT.K ) THEN + + IF( NX.LT.K ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF ( NT.LE.NB ) THEN + IWS = (LBWORK+LLWORK-NB)*NB + ELSE + IWS = (LBWORK+LLWORK)*NB+NT*NT + END IF + + IF( LWORK.LT.IWS ) THEN +* +* Not enough workspace to use optimal NB: reduce NB and +* determine the minimum value of NB. +* + IF ( NT.LE.NB ) THEN + NB = LWORK / (LLWORK+(LBWORK-NB)) + ELSE + NB = (LWORK-NT*NT)/(LBWORK+LLWORK) + END IF + + NBMIN = MAX( 2, ILAENV( 2, 'SGEQRF', ' ', M, N, -1, + $ -1 ) ) + END IF + END IF + END IF +* + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* +* Use blocked code initially +* + DO 10 I = 1, K - NX, NB + IB = MIN( K-I+1, NB ) +* +* Update the current column using old T's +* + DO 20 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:I+IB-1) from the left +* + CALL SLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, IB, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ IB) + +20 CONTINUE +* +* Compute the QR factorization of the current block +* A(I:M,I:I+IB-1) +* + CALL SGEQR2( M-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1), IINFO ) + + IF( I+IB.LE.N ) THEN +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB, + $ A( I, I ), LDA, TAU( I ), + $ WORK(I), LBWORK ) +* + END IF + 10 CONTINUE + ELSE + I = 1 + END IF +* +* Use unblocked code to factor the last or only block. +* + IF( I.LE.K ) THEN + + IF ( I .NE. 1 ) THEN + + DO 30 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:K) from the left +* + CALL SLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, K-I+1, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ K-I+1) +30 CONTINUE + + CALL SGEQR2( M-I+1, K-I+1, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1),IINFO ) + + ELSE +* +* Use unblocked code to factor the last or only block. +* + CALL SGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), + $ WORK,IINFO ) + + END IF + END IF + + +* +* Apply update to the column M+1:N when N > M +* + IF ( M.LT.N .AND. I.NE.1) THEN +* +* Form the last triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + IF ( NT .LE. NB ) THEN + CALL SLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), WORK(I), LBWORK ) + ELSE + CALL SLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+1), NT ) + END IF + +* +* Apply H' to A(1:M,M+1:N) from the left +* + DO 40 J = 1, K-NX, NB + + IB = MIN( K-J+1, NB ) + + CALL SLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, IB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + +40 CONTINUE + + IF ( NT.LE.NB ) THEN + CALL SLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + ELSE + CALL SLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, + $ WORK(LBWORK*NB+1), + $ NT, A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + END IF + + END IF + + WORK( 1 ) = IWS + RETURN +* +* End of SGEQRF +* + END diff --git a/SRC/VARIANTS/qr/LL/zgeqrf.f b/SRC/VARIANTS/qr/LL/zgeqrf.f new file mode 100644 index 00000000..cf5e093e --- /dev/null +++ b/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -0,0 +1,343 @@ + SUBROUTINE ZGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* March 2008 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* ZGEQRF computes a QR factorization of a real M-by-N matrix A: +* A = Q * R. +* +* This is the left-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix A. +* On exit, the elements on and above the diagonal of the array +* contain the min(M,N)-by-N upper trapezoidal matrix R (R is +* upper triangular if m >= n); the elements below the diagonal, +* with the array TAU, represent the orthogonal matrix Q as a +* product of min(m,n) elementary reflectors (see Further +* Details). +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* TAU (output) COMPLEX*16 array, dimension (min(M,N)) +* The scalar factors of the elementary reflectors (see Further +* Details). +* +* WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) +* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* +* The dimension of the array WORK. The dimension can be divided into three parts. +* +* 1) The part for the triangular factor T. If the very last T is not bigger +* than any of the rest, then this part is NB x ceiling(K/NB), otherwise, +* NB x (K-NT), where K = min(M,N) and NT is the dimension of the very last T +* +* 2) The part for the very last T when T is bigger than any of the rest T. +* The size of this part is NT x NT, where NT = K - ceiling ((K-NX)/NB) x NB, +* where K = min(M,N), NX is calculated by +* NX = MAX( 0, ILAENV( 3, 'ZGEQRF', ' ', M, N, -1, -1 ) ) +* +* 3) The part for dlarfb is of size max((N-M)*K, (N-M)*NB, K*NB, NB*NB) +* +* So LWORK = part1 + part2 + part3 +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* Further Details +* =============== +* +* The matrix Q is represented as a product of elementary reflectors +* +* Q = H(1) H(2) . . . H(k), where k = min(m,n). +* +* Each H(i) has the form +* +* H(i) = I - tau * v * v' +* +* where tau is a real scalar, and v is a real vector with +* v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), +* and tau in TAU(i). +* +* ===================================================================== +* +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IB, IINFO, IWS, J, K, LWKOPT, NB, + $ NBMIN, NX, LBWORK, NT, LLWORK +* .. +* .. External Subroutines .. + EXTERNAL ZGEQR2, ZLARFB, ZLARFT, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. External Functions .. + INTEGER ILAENV + REAL SCEIL + EXTERNAL ILAENV, SCEIL +* .. +* .. Executable Statements .. + + INFO = 0 + NBMIN = 2 + NX = 0 + IWS = N + K = MIN( M, N ) + NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 ) + + IF( NB.GT.1 .AND. NB.LT.K ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* + NX = MAX( 0, ILAENV( 3, 'ZGEQRF', ' ', M, N, -1, -1 ) ) + END IF +* +* Get NT, the size of the very last T, which is the left-over from in-between K-NX and K to K, eg.: +* +* NB=3 2NB=6 K=10 +* | | | +* 1--2--3--4--5--6--7--8--9--10 +* | \________/ +* K-NX=5 NT=4 +* +* So here 4 x 4 is the last T stored in the workspace +* + NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + +* +* optimal workspace = space for dlarfb + space for normal T's + space for the last T +* + LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) + LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + + IF ( NT.GT.NB ) THEN + + LBWORK = K-NT +* +* Optimal workspace for dlarfb = MAX(1,N)*NT +* + LWKOPT = (LBWORK+LLWORK)*NB + WORK( 1 ) = (LWKOPT+NT*NT) + + ELSE + + LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LWKOPT = (LBWORK+LLWORK-NB)*NB + WORK( 1 ) = LWKOPT + + END IF + +* +* Test the input arguments +* + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGEQRF', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( K.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( NB.GT.1 .AND. NB.LT.K ) THEN + + IF( NX.LT.K ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF ( NT.LE.NB ) THEN + IWS = (LBWORK+LLWORK-NB)*NB + ELSE + IWS = (LBWORK+LLWORK)*NB+NT*NT + END IF + + IF( LWORK.LT.IWS ) THEN +* +* Not enough workspace to use optimal NB: reduce NB and +* determine the minimum value of NB. +* + IF ( NT.LE.NB ) THEN + NB = LWORK / (LLWORK+(LBWORK-NB)) + ELSE + NB = (LWORK-NT*NT)/(LBWORK+LLWORK) + END IF + + NBMIN = MAX( 2, ILAENV( 2, 'ZGEQRF', ' ', M, N, -1, + $ -1 ) ) + END IF + END IF + END IF +* + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* +* Use blocked code initially +* + DO 10 I = 1, K - NX, NB + IB = MIN( K-I+1, NB ) +* +* Update the current column using old T's +* + DO 20 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:I+IB-1) from the left +* + CALL ZLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, IB, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ IB) + +20 CONTINUE +* +* Compute the QR factorization of the current block +* A(I:M,I:I+IB-1) +* + CALL ZGEQR2( M-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1), IINFO ) + + IF( I+IB.LE.N ) THEN +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB, + $ A( I, I ), LDA, TAU( I ), + $ WORK(I), LBWORK ) +* + END IF + 10 CONTINUE + ELSE + I = 1 + END IF +* +* Use unblocked code to factor the last or only block. +* + IF( I.LE.K ) THEN + + IF ( I .NE. 1 ) THEN + + DO 30 J = 1, I - NB, NB +* +* Apply H' to A(J:M,I:K) from the left +* + CALL ZLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, K-I+1, NB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, I ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ K-I+1) +30 CONTINUE + + CALL ZGEQR2( M-I+1, K-I+1, A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+NT*NT+1),IINFO ) + + ELSE +* +* Use unblocked code to factor the last or only block. +* + CALL ZGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), + $ WORK,IINFO ) + + END IF + END IF + + +* +* Apply update to the column M+1:N when N > M +* + IF ( M.LT.N .AND. I.NE.1) THEN +* +* Form the last triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + IF ( NT .LE. NB ) THEN + CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), WORK(I), LBWORK ) + ELSE + CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, K-I+1, + $ A( I, I ), LDA, TAU( I ), + $ WORK(LBWORK*NB+1), NT ) + END IF + +* +* Apply H' to A(1:M,M+1:N) from the left +* + DO 40 J = 1, K-NX, NB + + IB = MIN( K-J+1, NB ) + + CALL ZLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, IB, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + +40 CONTINUE + + IF ( NT.LE.NB ) THEN + CALL ZLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, WORK(J), LBWORK, + $ A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + ELSE + CALL ZLARFB( 'Left', 'Transpose', 'Forward', + $ 'Columnwise', M-J+1, N-M, K-J+1, + $ A( J, J ), LDA, + $ WORK(LBWORK*NB+1), + $ NT, A( J, M+1 ), LDA, WORK(LBWORK*NB+NT*NT+1), + $ N-M) + END IF + + END IF + + WORK( 1 ) = IWS + RETURN +* +* End of ZGEQRF +* + END |