MODULE dgemmCbind ! Matrix Matrix multiplication
USE OMP_LIB , ONLY : OMP_GET_NUM_THREADS,OMP_GET_THREAD_NUM
#ifdef __CBIND__
USE ISO_C_BINDING, ONLY : C_CHAR,C_INT,C_DOUBLE
#endif
IMPLICIT NONE
! original parameters
! INTEGER,parameter,private :: MB=32, NB=1024, NBT=96, KB=32
! new parameters
INTEGER,parameter,private :: MB=32, NB=256, NBT=48, KB=32
LOGICAL,parameter,private :: UseURL = .TRUE.
INTEGER,parameter,private :: MinDim = 128
INTEGER,parameter,private :: dp=SELECTED_REAL_KIND(15,300)
!
! Short description of subroutines dgemmomp and DGEMMurl
! ------------------------------------------------------
!
! Desciption of parameters
!
! MB, NB, NBT & KB are blocking parameters for subroutine DGEMMurl
! (DGEMM unrolled) which uses unrolled loops and blocking
! If UseURL is set to false only the standard level 3 blas routine
! DGEMM is callsed, otherwith GGEMMurl is used if the dimensions
! of the matrices involved exceed the parameter MinDim
!
! dgemmomp is a OpenMP version of DGEMM. If the number of threads
! (e.g. set by OMP_SET_NUM_THREADS) is less than two the serial
! optimized version of DGEMM (DGEMMurl) is called directly.
!
! Other remarks
!
! dgemmomp is far from beeing optimal - please test if it is really
! improving the performance in your sprecific environment.
! On 32 bit systems the communication overhead outperforms the
! potential gain in speed by using more than one thread in many cases
! But even on a outdated Atom single core processor a two threads
! version was about 40 % quicker than the single theread version.
! So you need to test ...
!
! Letzte Aenderung
!
! 08.06.18, MRi: "Final" Fortran 9x version
!
! $Id: dgemmCbind.f90,v 1.2 2018/06/11 18:23:37 mriedl Exp mriedl $
!
INTERFACE
! Explicit interface to BLAS functions and procedures outside
! of this module to ensure propper calling conventions
LOGICAL FUNCTION LSAME(CA,CB)
! explicit interface
! LAPACK auxiliary routine
! returns .TRUE. if CA is the same letter as CB regardless of case.
CHARACTER :: CA,CB
END FUNCTION LSAME
SUBROUTINE XERBLA(SRNAME,INFO)
! explicit interface
! LAPACK auxiliary routine (preliminary version)
! XERBLA is an error handler for the LAPACK routines.
CHARACTER (len=6) :: SRNAME
INTEGER :: INFO
END SUBROUTINE XERBLA
SUBROUTINE DGEMM(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
! explicit interface
! Standrad level 3 BLAS Matrix-Matrix multiplication routine
IMPLICIT NONE
INTEGER, parameter :: dp = SELECTED_REAL_KIND(15,300)
REAL (kind=dp) :: Alpha,Beta
INTEGER :: K,Lda,Ldb,Ldc,M,N
CHARACTER :: TransA,TransB
REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
END SUBROUTINE DGEMM
END INTERFACE
CONTAINS
#ifdef __CBIND__
SUBROUTINE DGEMMomp(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc) &
bind(C,name="dgemmomp" )
IMPLICIT NONE
CHARACTER (kind=C_CHAR) :: TransA,TransB
INTEGER (kind=C_INT) :: M,N,K,Lda,Ldb,Ldc
REAL (kind=C_DOUBLE) :: Alpha,Beta
REAL (kind=C_DOUBLE) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#else
SUBROUTINE DGEMMomp(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
IMPLICIT NONE
CHARACTER :: TransA,TransB
INTEGER :: M,N,K,Lda,Ldb,Ldc
REAL (kind=dp) :: Alpha,Beta
REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#endif
! ------------------------------------------------------------------
! dgemx.f: auxiliary routine in the package pdhseqr.
!
! contributors: Robert Granat
! Bo Kagstrom
! Daniel Kressner
! Meiyue Shao
!
! Department of computing science and hpc2n, umea university
! mathicse anchp, epf Lausanne
!
! Fortran 2003 version by Michael Riedl
! ------------------------------------------------------------------
! Purpose
! =======
!
! dgemx performs one of the matrix-matrix operations
!
! c := alpha*op( a )*op( b ) + beta*c,
!
! where op( x ) is one of
!
! op( x ) = x or op( x ) = x',
!
! alpha and beta are scalars, and a, b and c are matrices, with
! op( a ) an m by k matrix, op( b ) a k by n matrix and c an
! m by n matrix.
! ------------------------------------------------------------------
! local scalars
INTEGER :: info,ncola,nrowa,nrowb,indx,m2,n2
INTEGER :: threads,mchunk,mythread,nchunk
LOGICAL :: nota,notb
! ------------------------------------------------------------------
! parameters
REAL(dp),parameter :: ONE=1.0D+00, ZERO=0.0D+00
! ------------------------------------------------------------------
! set nota and notb as true if a and b respectively are
! not transposed and set nrowa, ncola and nrowb as the number
! of rows and columns of a and the number of rows of b
! respectively.
! ------------------------------------------------------------------
nota = LSAME(TransA,'N')
notb = LSAME(TransB,'N')
IF (nota) THEN
nrowa = M
ncola = K
ELSE
nrowa = K
ncola = M
ENDIF
IF (notb) THEN
nrowb = K
ELSE
nrowb = N
ENDIF
! test the input parameters.
info = 0
IF ((.NOT. nota).AND.(.NOT. LSAME(TransA,'C')) .AND. &
(.NOT. LSAME(TransA,'T'))) &
THEN
info = 1
ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND. &
(.NOT. LSAME(TransB,'T'))) &
THEN
info = 2
ELSEIF (M .LT. 0) THEN
info = 3
ELSEIF (N .LT. 0) THEN
info = 4
ELSEIF (K .LT. 0) THEN
info = 5
ELSEIF (Lda .LT. MAX(1,nrowa)) THEN
info = 8
ELSEIF (Ldb .LT. MAX(1,nrowb)) THEN
info = 10
ELSEIF (Ldc .LT. MAX(1,M)) THEN
info = 13
ENDIF
IF (info .NE. 0) THEN
CALL XERBLA('DGEMMomp ',info)
STOP
ENDIF
IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR. &
(((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE))) &
THEN
! quick return if possible.
RETURN
ENDIF
!$OMP PARALLEL SHARED(threads)
!$OMP MASTER
threads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL
IF ((M .LE. 2*threads) .OR. (N .LE. 2*threads)) THEN
! MRi, 09.06.18
CALL DGEMM(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
RETURN
ENDIF
IF (threads .EQ. 1) THEN
! MRi, 09.06.18
CALL DGEMMurl(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
RETURN
ENDIF
!
mchunk = MAX(1,M/threads)
nchunk = MAX(1,N/threads)
!
! do the real work
!
IF (M .GT. N) THEN
!
!$OMP parallel default( none ), &
!$OMP SHARED(TransA,TransB,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc, &
!$OMP threads,mchunk,nota,notb), PRIVATE(indx,m2,mythread)
!
mythread = OMP_GET_THREAD_NUM()
!
!$OMP DO
!
DO indx = 1,M,mchunk
m2 = MIN(mchunk,M-indx+1)
IF (nota) THEN
IF ((m2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
CALL DGEMMurl(TransA,TransB,m2,N,K,Alpha,A(indx,1),Lda, &
B,Ldb,Beta,C(indx,1),Ldc)
ELSE ! standard routine
CALL DGEMM (TransA,TransB,m2,N,K,Alpha,A(indx,1),Lda, &
B,Ldb,Beta,C(indx,1),Ldc)
ENDIF
ELSE
IF ((m2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
CALL DGEMMurl(TransA,TransB,m2,N,K,Alpha,A(1,indx),Lda, &
B,Ldb,Beta,C(indx,1),Ldc)
ELSE ! standard routine
CALL DGEMM (TransA,TransB,m2,N,K,Alpha,A(1,indx),Lda, &
B,Ldb,Beta,C(indx,1),Ldc)
ENDIF
ENDIF
ENDDO
!$OMP END DO
!$OMP END PARALLEL
ELSE
!$OMP parallel default( none ), &
!$OMP shared(TransA,TransB,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc, &
!$OMP threads,nchunk,nota,notb), private(indx,n2,mythread)
mythread = OMP_GET_THREAD_NUM()
!$OMP DO
!
DO indx = 1,N,nchunk
n2 = MIN(nchunk,N-indx+1)
IF (notb) THEN
IF ((n2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
CALL DGEMMurl(TransA,TransB,M,n2,K,Alpha,A,Lda, &
B(1,indx),Ldb,Beta,C(1,indx),Ldc)
ELSE ! standard routine
CALL DGEMM (TransA,TransB,M,n2,K,Alpha,A,Lda, &
B(1,indx),Ldb,Beta,C(1,indx),Ldc)
ENDIF
ELSE
IF ((n2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
CALL DGEMMurl(TransA,TransB,M,n2,K,Alpha,A,Lda, &
B(indx,1),Ldb,Beta,C(1,indx),Ldc)
ELSE ! standard routine
CALL DGEMM (TransA,TransB,M,n2,K,Alpha,A,Lda, &
B(indx,1),Ldb,Beta,C(1,indx),Ldc)
ENDIF
ENDIF
ENDDO
!
!$OMP END DO
!$OMP END PARALLEL
!
ENDIF
END SUBROUTINE DGEMMomp
SUBROUTINE DGEMMurl(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
IMPLICIT NONE
CHARACTER (len=1) :: TransA,TransB
INTEGER :: M,N,K,Lda,Ldb,Ldc
REAL (kind=dp) :: Alpha,Beta
REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
! ------------------------------------------------------------------
! Version with unrolled loops
! ------------------------------------------------------------------
! Purpose
! =======
!
! DGEMM performs one of the matrix-matrix operations
!
! C := alpha*op( A )*op( B ) + beta*C,
!
! where op( X ) is one of
!
! op( X ) = X or op( X ) = X',
!
! alpha and beta are scalars, and A, B and C are matrices, with
! op(A) an m by k matrix, op(B) a k by n matrix and C an m by n
! matrix.
!
! Parameters
! ==========
!
! TransA - CHARACTER*1.
! On entry, TransA specifies the form of op( A ) to be used
! in the matrix multiplication as follows:
!
! TransA = 'N' or 'n', op( A ) = A.
!
! TransA = 'T' or 't', op( A ) = A'.
!
! TransA = 'C' or 'c', op( A ) = A'.
!
! Unchanged on exit.
!
! TransB - CHARACTER*1.
! On entry, TransB specifies the form of op( B ) to be used
! in the matrix multiplication as follows:
!
! TransB = 'N' or 'n', op( B ) = B.
!
! TransB = 'T' or 't', op( B ) = B'.
!
! TransB = 'C' or 'c', op( B ) = B'.
!
! Unchanged on exit.
!
! M - INTEGER.
! On entry, M specifies the number of rows of the matrix
! op(A) and of the matrix C. M must be at least zero.
! Unchanged on exit.
!
! N - INTEGER.
! On entry, N specifies the number of columns of the matrix
! op( B ) and the number of columns of the matrix C. N must
! be at least zero.
! Unchanged on exit.
!
! K - INTEGER.
! On entry, K specifies the number of columns of the matrix
! op(A) and the number of rows of the matrix op(B). K must
! be at least zero.
! Unchanged on exit.
!
! ALPHA - DOUBLE PRECISION.
! On entry, ALPHA specifies the scalar alpha.
! Unchanged on exit.
!
! A - DOUBLE PRECISION array of DIMENSION (LDA,ka), where ka is
! k when TransA = 'N' or 'n', and is m otherwise.
! Before entry with TransA = 'N' or 'n', the leading m by k
! part of the array A must contain the matrix A, otherwise
! the leading k by m part of the array A must contain the
! matrix A.
! Unchanged on exit.
!
! LDA - INTEGER.
! On entry, LDA specifies the first dimension of A as
! declared in the calling (sub) program. When
! TransA = 'N' or 'n' then LDA must be at least max(1,m),
! otherwise LDA must be at least max(1,k).
! Unchanged on exit.
!
! B - DOUBLE PRECISION array of DIMENSION (LDB,kb), where kb is
! n when TransB = 'N' or 'n', and is k otherwise.
! Before entry with TransB = 'N' or 'n', the leading k by n
! part of the array B must contain the matrix B, otherwise
! the leading n by k part of the array B must contain the
! matrix B.
! Unchanged on exit.
!
! LDB - INTEGER.
! On entry, LDB specifies the first dimension of B as
! declared in the calling (sub) program. When
! TransB = 'N' or 'n' then LDB must be at least max(1,k),
! otherwise LDB must be at least max(1,n).
! Unchanged on exit.
!
! BETA - DOUBLE PRECISION.
! On entry, BETA specifies the scalar beta. When BETA is
! supplied as zero then C need not be set on input.
! Unchanged on exit.
!
! C - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
! Before entry, the leading m by n part of the array C must
! contain the matrix C, except when beta is zero, in which
! case C need not be set on entry.
! On exit, the array C is overwritten by the m by n matrix
! ( alpha*op( A )*op( B ) + beta*C ).
!
! LDC - INTEGER.
! On entry, LDC specifies the first dimension of C as
! declared in the calling (sub) program. LDC must be at
! least max( 1, m ).
! Unchanged on exit.
!
!
! Level 3 Blas routine.
!
! -- Written on 8-February-1989.
! Jack Dongarra, Argonne National Laboratory.
! Iain Duff, AERE Harwell.
! Jeremy Du Croz, Numerical Algorithms Group Ltd.
! Sven Hammarling, Numerical Algorithms Group Ltd.
!
! -- Modified in October-1997.
! Superscalar GEMM-Based Level 3 BLAS (Version 0.1).
! Per Ling, Department of Computing Science,
! Umea University, Sweden.
!
! -- Fortran 90 by Michael Riedl, 2018
! ------------------------------------------------------------------
! local scalars
INTEGER :: i,ii,isec,uisec,j,jj,jsec,ujsec,l,ll,lsec
INTEGER :: ulsec,info,nrowa,nrowb
LOGICAL :: nota,notb
REAL (kind=dp) :: delta
REAL (kind=dp) :: f11,f12,f21,f22,f31,f32,f41,f42
REAL (kind=dp) :: f13,f14,f23,f24,f33,f34,f43,f44
! ------------------------------------------------------------------
REAL (kind=dp),parameter :: ZERO=0.0D+0, ONE=1.0D+0
! ------------------------------------------------------------------
REAL (kind=dp) :: t1(KB,MB),t2(KB,NBT)
! ------------------------------------------------------------------
! executable statements
!
! set nota and notb as true if a and b respectively are
! not transposed and set nrowa and nrowb as the number of rows of
! a and the number of rows of b respectively.
! ------------------------------------------------------------------
! Write (*,*) "dgemm mit ausgerollten Schleifen ... "
nota = LSAME(TransA,'N')
notb = LSAME(TransB,'N')
IF (nota) THEN
nrowa = M
ELSE
nrowa = K
ENDIF
IF (notb) THEN
nrowb = K
ELSE
nrowb = N
ENDIF
!
! test the input parameters.
!
info = 0
IF ((.NOT. nota) .AND. (.NOT. LSAME(TransA,'C')) .AND. &
(.NOT. LSAME(TransA,'T'))) &
THEN
info = 1
ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND. &
(.NOT. LSAME(TransB,'T'))) &
THEN
info = 2
ELSEIF (M .LT. 0) THEN
info = 3
ELSEIF (N .LT. 0) THEN
info = 4
ELSEIF (K .LT. 0) THEN
info = 5
ELSEIF (Lda .LT. MAX(1,nrowa)) THEN
info = 8
ELSEIF (Ldb .LT. MAX(1,nrowb)) THEN
info = 10
ELSEIF (Ldc .LT. MAX(1,M)) THEN
info = 13
ENDIF
IF (info .NE. 0) THEN
CALL XERBLA('DGEMM ',info)
RETURN
ENDIF
!
! quick return if possible.
!
IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR. &
(((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE))) &
THEN
RETURN
ENDIF
!
! and when alpha.eq.zero.
!
IF ((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) THEN
IF (Beta .EQ. ZERO) THEN
uisec = M - MOD(M,4)
DO j=1,N
DO i=1,uisec,4
C(i+0,j) = ZERO
C(i+1,j) = ZERO
C(i+2,j) = ZERO
C(i+3,j) = ZERO
ENDDO
DO i=uisec+1,M
C(i,j) = ZERO
ENDDO
ENDDO
ELSE
uisec=M-MOD(M,4)
DO j=1,N
DO i=1,uisec,4
C(i+0,j) = Beta*C(i+0,j)
C(i+1,j) = Beta*C(i+1,j)
C(i+2,j) = Beta*C(i+2,j)
C(i+3,j) = Beta*C(i+3,j)
ENDDO
DO i=uisec+1,M
C(i,j) = Beta*C(i,j)
ENDDO
ENDDO
ENDIF
RETURN
ENDIF
!
! start the operations.
!
IF (notb) THEN
!
! form c := alpha*a*b + beta*c or c := alpha*a'*b + beta*c.
!
DO jj=1,N,NB
jsec = MIN(NB,N-jj+1)
ujsec = jsec - MOD(jsec,4)
DO ll=1,K,KB
lsec = MIN(KB,K-ll+1)
ulsec = lsec - MOD(lsec,2)
!
! determine if the block of c should be updated with
! beta or not.
!
delta = ONE
IF (ll .EQ. 1) THEN
delta = Beta
ENDIF
!
DO ii=1,M,MB
isec = MIN(MB,M-ii+1)
!
! t1 := alpha*a' or t1 := alpha*a, copy the transpose
! or the non-transpose of a rectangular block of
! alpha*a to t1.
!
uisec = isec - MOD(isec,2)
IF (nota) THEN
DO l=ll,ll+ulsec-1,2
DO i= ii,ii+uisec-1,2
t1(l-ll+1,i-ii+1) = Alpha*A(i,l)
t1(l-ll+2,i-ii+1) = Alpha*A(i,l+1)
t1(l-ll+1,i-ii+2) = Alpha*A(i+1,l)
t1(l-ll+2,i-ii+2) = Alpha*A(i+1,l+1)
ENDDO
IF (uisec .LT. isec) THEN
t1(l-ll+1,isec) = Alpha*A(ii+isec-1,l)
t1(l-ll+2,isec) = Alpha*A(ii+isec-1,l+1)
ENDIF
ENDDO
IF (ulsec .LT. lsec) THEN
DO i=ii,ii+isec-1
t1(lsec,i-ii+1) = Alpha*A(i,ll+lsec-1)
ENDDO
ENDIF
ELSE
DO i=ii,ii+uisec-1,2
DO l=ll,ll+ulsec-1,2
t1(l-ll+1,i-ii+1) = Alpha*A(l,i)
t1(l-ll+1,i-ii+2) = Alpha*A(l,i+1)
t1(l-ll+2,i-ii+1) = Alpha*A(l+1,i)
t1(l-ll+2,i-ii+2) = Alpha*A(l+1,i+1)
ENDDO
IF (ulsec .LT. lsec) THEN
t1(lsec,i-ii+1) = Alpha*A(ll+lsec-1,i)
t1(lsec,i-ii+2) = Alpha*A(ll+lsec-1,i+1)
ENDIF
ENDDO
IF (uisec .LT. isec) THEN
DO l=ll,ll+lsec-1
t1(l-ll+1,isec) = Alpha*A(l,ii+isec-1)
ENDDO
ENDIF
ENDIF
!
! c := t1'*b + beta*c, update a rectangular block
! of c using 4 by 4 unrolling.
!
uisec = isec - MOD(isec,4)
DO j=jj,jj+ujsec-1,4
DO i=ii,ii+uisec-1,4
f11 = delta*C(i,j)
f21 = delta*C(i+1,j)
f12 = delta*C(i,j+1)
f22 = delta*C(i+1,j+1)
f13 = delta*C(i,j+2)
f23 = delta*C(i+1,j+2)
f14 = delta*C(i,j+3)
f24 = delta*C(i+1,j+3)
f31 = delta*C(i+2,j)
f41 = delta*C(i+3,j)
f32 = delta*C(i+2,j+1)
f42 = delta*C(i+3,j+1)
f33 = delta*C(i+2,j+2)
f43 = delta*C(i+3,j+2)
f34 = delta*C(i+2,j+3)
f44 = delta*C(i+3,j+3)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
f21 = f21 + t1(l-ll+1,i-ii+2)*B(l,j)
f12 = f12 + t1(l-ll+1,i-ii+1)*B(l,j+1)
f22 = f22 + t1(l-ll+1,i-ii+2)*B(l,j+1)
f13 = f13 + t1(l-ll+1,i-ii+1)*B(l,j+2)
f23 = f23 + t1(l-ll+1,i-ii+2)*B(l,j+2)
f14 = f14 + t1(l-ll+1,i-ii+1)*B(l,j+3)
f24 = f24 + t1(l-ll+1,i-ii+2)*B(l,j+3)
f31 = f31 + t1(l-ll+1,i-ii+3)*B(l,j)
f41 = f41 + t1(l-ll+1,i-ii+4)*B(l,j)
f32 = f32 + t1(l-ll+1,i-ii+3)*B(l,j+1)
f42 = f42 + t1(l-ll+1,i-ii+4)*B(l,j+1)
f33 = f33 + t1(l-ll+1,i-ii+3)*B(l,j+2)
f43 = f43 + t1(l-ll+1,i-ii+4)*B(l,j+2)
f34 = f34 + t1(l-ll+1,i-ii+3)*B(l,j+3)
f44 = f44 + t1(l-ll+1,i-ii+4)*B(l,j+3)
ENDDO
C(i,j) = f11
C(i+1,j) = f21
C(i,j+1) = f12
C(i+1,j+1) = f22
C(i,j+2) = f13
C(i+1,j+2) = f23
C(i,j+3) = f14
C(i+1,j+3) = f24
C(i+2,j) = f31
C(i+3,j) = f41
C(i+2,j+1) = f32
C(i+3,j+1) = f42
C(i+2,j+2) = f33
C(i+3,j+2) = f43
C(i+2,j+3) = f34
C(i+3,j+3) = f44
ENDDO
IF (uisec .LT. isec) THEN
DO i=ii+uisec,ii+isec-1
f11 = delta*C(i,j)
f12 = delta*C(i,j+1)
f13 = delta*C(i,j+2)
f14 = delta*C(i,j+3)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
f12 = f12 + t1(l-ll+1,i-ii+1)*B(l,j+1)
f13 = f13 + t1(l-ll+1,i-ii+1)*B(l,j+2)
f14 = f14 + t1(l-ll+1,i-ii+1)*B(l,j+3)
ENDDO
C(i,j) = f11
C(i,j+1) = f12
C(i,j+2) = f13
C(i,j+3) = f14
ENDDO
ENDIF
ENDDO
IF (ujsec .LT. jsec) THEN
DO j=jj+ujsec,jj+jsec-1
DO i=ii,ii+uisec-1,4
f11 = delta*C(i,j)
f21 = delta*C(i+1,j)
f31 = delta*C(i+2,j)
f41 = delta*C(i+3,j)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
f21 = f21 + t1(l-ll+1,i-ii+2)*B(l,j)
f31 = f31 + t1(l-ll+1,i-ii+3)*B(l,j)
f41 = f41 + t1(l-ll+1,i-ii+4)*B(l,j)
ENDDO
C(i,j) = f11
C(i+1,j) = f21
C(i+2,j) = f31
C(i+3,j) = f41
ENDDO
DO i=ii+uisec,ii+isec-1
f11 = delta*C(i,j)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
ENDDO
C(i,j) = f11
ENDDO
ENDDO
ENDIF
ENDDO
ENDDO
ENDDO
ELSE
!
! form c := alpha*a*b' + beta*c or c := alpha*a'*b' + beta*c.
!
DO jj=1,N,NBT
jsec = MIN(NBT,N-jj+1)
DO ll=1,K,KB
lsec = MIN(KB,K-ll+1)
!
! determine if the block of c should be updated with
! beta or not.
!
delta = ONE
IF (ll .EQ. 1) THEN
delta = Beta
ENDIF
!
! t2 := alpha*b', copy the transpose of a rectangular
! block of alpha*a to t2.
!
ulsec = lsec - MOD(lsec,2)
ujsec = jsec - MOD(jsec,2)
DO l=ll,ll+ulsec-1,2
DO j=jj,jj+ujsec-1,2
t2(l-ll+1,j-jj+1) = Alpha*B(j,l)
t2(l-ll+2,j-jj+1) = Alpha*B(j,l+1)
t2(l-ll+1,j-jj+2) = Alpha*B(j+1,l)
t2(l-ll+2,j-jj+2) = Alpha*B(j+1,l+1)
ENDDO
IF (ujsec .LT. jsec) THEN
t2(l-ll+1,jsec) = Alpha*B(jj+jsec-1,l)
t2(l-ll+2,jsec) = Alpha*B(jj+jsec-1,l+1)
ENDIF
ENDDO
IF (ulsec .LT. lsec) THEN
DO j=jj,jj+jsec-1
t2(lsec,j-jj+1) = Alpha*B(j,ll+lsec-1)
ENDDO
ENDIF
!
ujsec = jsec - MOD(jsec,4)
DO ii=1,M,MB
isec = MIN(MB,M-ii+1)
!
! t1 := alpha*a' or t1 := alpha*a, copy the transpose
! or the non-transpose of a rectangular block of
! alpha*a to t1.
!
uisec = isec - MOD(isec,2)
IF (nota) THEN
DO l=ll,ll+ulsec-1,2
DO i=ii,ii+uisec-1,2
t1(l-ll+1,i-ii+1) = A(i,l)
t1(l-ll+2,i-ii+1) = A(i,l+1)
t1(l-ll+1,i-ii+2) = A(i+1,l)
t1(l-ll+2,i-ii+2) = A(i+1,l+1)
ENDDO
IF (uisec .LT. isec) THEN
t1(l-ll+1,isec) = A(ii+isec-1,l)
t1(l-ll+2,isec) = A(ii+isec-1,l+1)
ENDIF
ENDDO
IF (ulsec .LT. lsec) THEN
DO i=ii,ii+isec-1
t1(lsec,i-ii+1) = A(i,ll+lsec-1)
ENDDO
ENDIF
ELSE
DO i=ii,ii+uisec-1,2
DO l=ll,ll+ulsec-1,2
t1(l-ll+1,i-ii+1) = A(l,i)
t1(l-ll+1,i-ii+2) = A(l,i+1)
t1(l-ll+2,i-ii+1) = A(l+1,i)
t1(l-ll+2,i-ii+2) = A(l+1,i+1)
ENDDO
IF (ulsec .LT. lsec) THEN
t1(lsec,i-ii+1) = A(ll+lsec-1,i)
t1(lsec,i-ii+2) = A(ll+lsec-1,i+1)
ENDIF
ENDDO
IF (uisec .LT. isec) THEN
DO l=ll,ll+lsec-1
t1(l-ll+1,isec) = A(l,ii+isec-1)
ENDDO
ENDIF
ENDIF
! c := t1'*b + beta*c, update a rectangular block
! of c using 4 by 4 unrolling.
uisec = isec - MOD(isec,4)
DO j=jj,jj+ujsec-1,4
DO i=ii,ii+uisec-1,4
f11 = delta*C(i,j)
f21 = delta*C(i+1,j)
f12 = delta*C(i,j+1)
f22 = delta*C(i+1,j+1)
f13 = delta*C(i,j+2)
f23 = delta*C(i+1,j+2)
f14 = delta*C(i,j+3)
f24 = delta*C(i+1,j+3)
f31 = delta*C(i+2,j)
f41 = delta*C(i+3,j)
f32 = delta*C(i+2,j+1)
f42 = delta*C(i+3,j+1)
f33 = delta*C(i+2,j+2)
f43 = delta*C(i+3,j+2)
f34 = delta*C(i+2,j+3)
f44 = delta*C(i+3,j+3)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
f21 = f21 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+1)
f12 = f12 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+2)
f22 = f22 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+2)
f13 = f13 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+3)
f23 = f23 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+3)
f14 = f14 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+4)
f24 = f24 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+4)
f31 = f31 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+1)
f41 = f41 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+1)
f32 = f32 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+2)
f42 = f42 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+2)
f33 = f33 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+3)
f43 = f43 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+3)
f34 = f34 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+4)
f44 = f44 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+4)
ENDDO
C(i,j) = f11
C(i+1,j) = f21
C(i,j+1) = f12
C(i+1,j+1) = f22
C(i,j+2) = f13
C(i+1,j+2) = f23
C(i,j+3) = f14
C(i+1,j+3) = f24
C(i+2,j) = f31
C(i+3,j) = f41
C(i+2,j+1) = f32
C(i+3,j+1) = f42
C(i+2,j+2) = f33
C(i+3,j+2) = f43
C(i+2,j+3) = f34
C(i+3,j+3) = f44
ENDDO
IF (uisec .LT. isec) THEN
DO i=ii+uisec,ii+isec-1
f11 = delta*C(i,j)
f12 = delta*C(i,j+1)
f13 = delta*C(i,j+2)
f14 = delta*C(i,j+3)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
f12 = f12 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+2)
f13 = f13 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+3)
f14 = f14 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+4)
ENDDO
C(i,j) = f11
C(i,j+1) = f12
C(i,j+2) = f13
C(i,j+3) = f14
ENDDO
ENDIF
ENDDO
IF (ujsec .LT. jsec) THEN
DO j=jj+ujsec,jj+jsec-1
DO i=ii,ii+uisec-1,4
f11 = delta*C(i,j)
f21 = delta*C(i+1,j)
f31 = delta*C(i+2,j)
f41 = delta*C(i+3,j)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
f21 = f21 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+1)
f31 = f31 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+1)
f41 = f41 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+1)
ENDDO
C(i,j) = f11
C(i+1,j) = f21
C(i+2,j) = f31
C(i+3,j) = f41
ENDDO
DO i=ii+uisec,ii+isec-1
f11 = delta*C(i,j)
DO l=ll,ll+lsec-1
f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
ENDDO
C(i,j) = f11
ENDDO
ENDDO
ENDIF
ENDDO
ENDDO
ENDDO
ENDIF
END SUBROUTINE DGEMMurl
#ifdef __CBIND__
SUBROUTINE DGEMMomp2(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc) &
bind(C,name="dgemmomp2" )
IMPLICIT NONE
CHARACTER (kind=C_CHAR) :: TransA,TransB
INTEGER (kind=C_INT) :: M,N,K,Lda,Ldb,Ldc
REAL (kind=C_DOUBLE) :: Alpha,Beta
REAL (kind=C_DOUBLE) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#else
SUBROUTINE DGEMMomp2(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
IMPLICIT NONE
CHARACTER :: TransA,TransB
INTEGER :: M,N,K,Lda,Ldb,Ldc
REAL (kind=dp) :: Alpha,Beta
REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#endif
! ------------------------------------------------------------------
REAL (kind=dp) :: temp
INTEGER :: i,j,l
INTEGER :: info,ncola,nrowa,nrowb,indx,m2,n2
INTEGER :: threads,mchunk,mythread,nchunk
LOGICAL :: nota,notb
! ------------------------------------------------------------------
REAL(dp),parameter :: ONE=1.0D+00, ZERO=0.0D+00
! ------------------------------------------------------------------
! set nota and notb as true if a and b respectively are
! not transposed and set nrowa, ncola and nrowb as the number
! of rows and columns of a and the number of rows of b
! respectively.
! ------------------------------------------------------------------
nota=LSAME(TransA,'N')
notb=LSAME(TransB,'N')
IF (nota) THEN
nrowa=M
ncola=K
ELSE
nrowa=K
ncola=M
ENDIF
IF (notb) THEN
nrowb=K
ELSE
nrowb=N
ENDIF
!
! test the input parameters.
!
info=0
IF ((.NOT. nota) .AND. (.NOT. LSAME(TransA,'C')) .AND. &
(.NOT. LSAME(TransA,'T'))) THEN
info=1
ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND. &
(.NOT. LSAME(TransB,'T'))) THEN
info=2
ELSEIF (M .LT. 0) THEN
info=3
ELSEIF (N .LT. 0) THEN
info=4
ELSEIF (K .LT. 0) THEN
info=5
ELSEIF (LDA .LT. MAX(1,nrowa)) THEN
info=8
ELSEIF (LDB .LT. MAX(1,nrowb)) THEN
info=10
ELSEIF (LDC .LT. MAX(1,M)) THEN
info=13
ENDIF
IF (info .NE. 0) THEN
CALL XERBLA('DGEMM ',info)
RETURN
ENDIF
!
! quick return if possible.
!
IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR. &
(((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE))) &
THEN
RETURN
ENDIF
!
! and if alpha .eq. zero.
!
IF (Alpha .EQ. ZERO) THEN
IF (Beta .EQ. ZERO) THEN
!$OMP PARALLEL DO SHARED (N,M,C) PRIVATE(i,j)
DO j=1,N
DO i=1,M
C(i,j)=ZERO
ENDDO
ENDDO
!$OMP END PARALLEL DO
ELSE
!$OMP PARALLEL DO SHARED (N,M,C) PRIVATE(i,j,Beta)
DO j=1,N
DO i=1,M
C(i,j)=Beta*C(i,j)
ENDDO
ENDDO
!$OMP END PARALLEL DO
ENDIF
RETURN
ENDIF
!
! start the operations.
!
IF (notb) THEN
IF (nota) THEN
!
! form c := alpha*a*b + beta*c.
!
!$OMP PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
DO j=1,N
IF (Beta .EQ. ZERO) THEN
DO i=1,M
C(i,j)=ZERO
ENDDO
ELSEIF (Beta .NE. ONE) THEN
DO i=1,M
C(i,j)=Beta*C(i,j)
ENDDO
ENDIF
DO l=1,K
IF (B(l,j) .NE. ZERO) THEN
temp=Alpha*B(l,j)
DO i=1,M
C(i,j)=C(i,j) + temp*A(i,l)
ENDDO
ENDIF
ENDDO
ENDDO
!$OMP END PARALLEL DO
ELSE
!
! form c := alpha*a'*b + beta*c
!
!$OMP PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
DO j=1,N
DO i=1,M
temp=ZERO
DO l=1,K
temp=temp + A(l,i)*B(l,j)
ENDDO
IF (Beta .EQ. ZERO) THEN
C(i,j)=Alpha*temp
ELSE
C(i,j)=Alpha*temp + Beta*C(i,j)
ENDIF
ENDDO
ENDDO
!$OMP END PARALLEL DO
ENDIF
ELSEIF (nota) THEN
!
! form c := alpha*a*b' + beta*c
!
!$OMP PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
DO j=1,N
IF (Beta .EQ. ZERO) THEN
DO i=1,M
C(i,j)=ZERO
ENDDO
ELSEIF (Beta .NE. ONE) THEN
DO i=1,M
C(i,j)=Beta*C(i,j)
ENDDO
ENDIF
DO l=1,K
IF (B(j,l) .NE. ZERO) THEN
temp=Alpha*B(j,l)
DO i=1,M
C(i,j)=C(i,j) + temp*A(i,l)
ENDDO
ENDIF
ENDDO
ENDDO
!$OMP END PARALLEL DO
ELSE
!
! form c := alpha*a'*b' + beta*c
!
!$OMP PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
DO j=1,N
DO i=1,M
temp=ZERO
DO l=1,K
temp=temp + A(l,i)*B(j,l)
ENDDO
IF (Beta .EQ. ZERO) THEN
C(i,j)=Alpha*temp
ELSE
C(i,j)=Alpha*temp + Beta*C(i,j)
ENDIF
ENDDO
ENDDO
!$OMP END PARALLEL DO
ENDIF
END SUBROUTINE DGEMMomp2
END MODULE dgemmCbind