Numerik / sources / [139f46] /dgemmCbind.f90

[139f46]: dgemmCbind.f90 History

dgemmCbind.f90 1101 lines (1065 with data), 37.4 kB

MODULE dgemmCbind ! Matrix Matrix multiplication

USE OMP_LIB      , ONLY : OMP_GET_NUM_THREADS,OMP_GET_THREAD_NUM
#ifdef __CBIND__
USE ISO_C_BINDING, ONLY : C_CHAR,C_INT,C_DOUBLE
#endif

IMPLICIT NONE

! original parameters
! INTEGER,parameter,private :: MB=32, NB=1024, NBT=96, KB=32 
! new      parameters
INTEGER,parameter,private  :: MB=32, NB=256, NBT=48, KB=32

LOGICAL,parameter,private :: UseURL = .TRUE.
INTEGER,parameter,private :: MinDim = 128
INTEGER,parameter,private :: dp=SELECTED_REAL_KIND(15,300)

!
!  Short description of subroutines dgemmomp and DGEMMurl
!  ------------------------------------------------------
!
!  Desciption of parameters
!
!  MB, NB, NBT & KB are blocking parameters for subroutine DGEMMurl
!  (DGEMM unrolled) which uses unrolled loops and blocking
!  If UseURL is set to false only the standard level 3 blas routine
!  DGEMM is callsed, otherwith GGEMMurl is used if the dimensions
!  of the matrices involved exceed the parameter MinDim
!
!  dgemmomp is a OpenMP version of DGEMM. If the number of threads
!  (e.g. set by OMP_SET_NUM_THREADS) is less than two the serial 
!  optimized version of DGEMM (DGEMMurl) is called directly.
!
!  Other remarks
!
!  dgemmomp is far from beeing optimal - please test if it is really 
!  improving the performance in your sprecific environment. 
!  On 32 bit systems the communication overhead outperforms the 
!  potential gain in speed by using more than one thread in many cases
!  But even on a outdated Atom single core processor a two threads 
!  version was about 40 % quicker than the single theread version.
!  So you need to test ...
!
!  Letzte Aenderung
!
!  08.06.18, MRi: "Final" Fortran 9x version
!
!  $Id: dgemmCbind.f90,v 1.2 2018/06/11 18:23:37 mriedl Exp mriedl $
!

INTERFACE

  ! Explicit interface to BLAS functions and procedures outside 
  ! of this module to ensure propper calling conventions

  LOGICAL FUNCTION LSAME(CA,CB)
      ! explicit interface
      ! LAPACK auxiliary routine
      ! returns .TRUE. if CA is the same letter as CB regardless of case.
      CHARACTER :: CA,CB
  END FUNCTION LSAME

  SUBROUTINE XERBLA(SRNAME,INFO)
      ! explicit interface
      ! LAPACK auxiliary routine (preliminary version)
      ! XERBLA  is an error handler for the LAPACK routines.
      CHARACTER (len=6) :: SRNAME
      INTEGER           :: INFO
  END SUBROUTINE XERBLA

  SUBROUTINE DGEMM(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
      ! explicit interface
      ! Standrad level 3 BLAS Matrix-Matrix multiplication routine 
      IMPLICIT NONE
      INTEGER, parameter :: dp = SELECTED_REAL_KIND(15,300)
      REAL (kind=dp)     :: Alpha,Beta
      INTEGER            :: K,Lda,Ldb,Ldc,M,N
      CHARACTER          :: TransA,TransB
      REAL (kind=dp)     :: A(Lda,*),B(Ldb,*),C(Ldc,*)
  END SUBROUTINE DGEMM

END INTERFACE

CONTAINS

#ifdef __CBIND__
SUBROUTINE DGEMMomp(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)  &
           bind(C,name="dgemmomp" )

           IMPLICIT NONE
           CHARACTER (kind=C_CHAR)   :: TransA,TransB
           INTEGER   (kind=C_INT)    :: M,N,K,Lda,Ldb,Ldc
           REAL      (kind=C_DOUBLE) :: Alpha,Beta
           REAL      (kind=C_DOUBLE) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#else
SUBROUTINE DGEMMomp(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)

           IMPLICIT NONE
           CHARACTER      :: TransA,TransB
           INTEGER        :: M,N,K,Lda,Ldb,Ldc
           REAL (kind=dp) :: Alpha,Beta
           REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#endif

!     ------------------------------------------------------------------
!     dgemx.f: auxiliary routine in the package pdhseqr.
!
!     contributors: Robert Granat
!                   Bo Kagstrom
!                   Daniel Kressner
!                   Meiyue Shao
!     
!     Department of computing science and hpc2n, umea university
!     mathicse anchp, epf Lausanne
!
!     Fortran 2003 version by Michael Riedl
!     ------------------------------------------------------------------
!     Purpose
!     =======
!
!     dgemx performs one of the matrix-matrix operations
!
!     c := alpha*op( a )*op( b ) + beta*c,
!
!     where  op( x ) is one of
!
!     op( x ) = x   or   op( x ) = x',
!
!     alpha and beta are scalars, and a, b and c are matrices, with
!     op( a ) an m by k matrix, op( b ) a k by n matrix and c an
!     m by n matrix.
!     ------------------------------------------------------------------
!     local scalars

      INTEGER         :: info,ncola,nrowa,nrowb,indx,m2,n2
      INTEGER         :: threads,mchunk,mythread,nchunk
      LOGICAL         :: nota,notb
!     ------------------------------------------------------------------
!     parameters

      REAL(dp),parameter :: ONE=1.0D+00, ZERO=0.0D+00
!     ------------------------------------------------------------------
!     set nota and notb as true if a and b respectively are
!     not transposed and set nrowa, ncola and nrowb as the number
!     of rows and columns of a and the number of rows of b
!     respectively.
!     ------------------------------------------------------------------
      nota = LSAME(TransA,'N')
      notb = LSAME(TransB,'N')
      IF (nota) THEN
        nrowa = M
        ncola = K
      ELSE
        nrowa = K
        ncola = M
      ENDIF
      IF (notb) THEN
        nrowb = K
      ELSE
        nrowb = N
      ENDIF

!     test the input parameters.
      info = 0
      IF ((.NOT. nota).AND.(.NOT. LSAME(TransA,'C')) .AND.             &
          (.NOT. LSAME(TransA,'T')))                                   &
      THEN
        info = 1
      ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND.       &
              (.NOT. LSAME(TransB,'T')))                               &
      THEN
        info = 2
      ELSEIF (M .LT. 0) THEN
        info = 3
      ELSEIF (N .LT. 0) THEN
        info = 4
      ELSEIF (K .LT. 0) THEN
        info = 5
      ELSEIF (Lda .LT. MAX(1,nrowa)) THEN
        info = 8
      ELSEIF (Ldb .LT. MAX(1,nrowb)) THEN
        info = 10
      ELSEIF (Ldc .LT. MAX(1,M)) THEN
        info = 13
      ENDIF
      IF (info .NE. 0) THEN
        CALL XERBLA('DGEMMomp ',info)
        STOP
      ENDIF

      IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR.                              &
         (((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE)))  &
      THEN
!       quick return if possible.
        RETURN
      ENDIF

!$OMP PARALLEL SHARED(threads)
!$OMP MASTER
        threads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

      IF ((M .LE. 2*threads) .OR. (N .LE. 2*threads)) THEN
!       MRi, 09.06.18
        CALL DGEMM(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
        RETURN
      ENDIF
      IF (threads .EQ. 1) THEN
!       MRi, 09.06.18
        CALL DGEMMurl(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)
        RETURN
      ENDIF
!
      mchunk = MAX(1,M/threads)
      nchunk = MAX(1,N/threads)
!
!     do the real work
!
      IF (M .GT. N) THEN
!
!$OMP parallel default( none ),                                        &
!$OMP SHARED(TransA,TransB,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc,         &
!$OMP threads,mchunk,nota,notb), PRIVATE(indx,m2,mythread)
!
          mythread = OMP_GET_THREAD_NUM()
!
!$OMP DO
!
        DO indx = 1,M,mchunk
          m2 = MIN(mchunk,M-indx+1)
          IF (nota) THEN
            IF ((m2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
              CALL DGEMMurl(TransA,TransB,m2,N,K,Alpha,A(indx,1),Lda,  &
                            B,Ldb,Beta,C(indx,1),Ldc)
            ELSE ! standard routine
              CALL DGEMM   (TransA,TransB,m2,N,K,Alpha,A(indx,1),Lda,  &
                            B,Ldb,Beta,C(indx,1),Ldc)
            ENDIF
          ELSE
            IF ((m2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
              CALL DGEMMurl(TransA,TransB,m2,N,K,Alpha,A(1,indx),Lda,  &
                            B,Ldb,Beta,C(indx,1),Ldc)
            ELSE ! standard routine
              CALL DGEMM   (TransA,TransB,m2,N,K,Alpha,A(1,indx),Lda,  &
                            B,Ldb,Beta,C(indx,1),Ldc)
            ENDIF
          ENDIF
        ENDDO

!$OMP END DO
!$OMP END PARALLEL

      ELSE

!$OMP parallel default( none ),                                        &
!$OMP shared(TransA,TransB,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc,         &
!$OMP threads,nchunk,nota,notb), private(indx,n2,mythread)              
          mythread = OMP_GET_THREAD_NUM()
!$OMP DO
!
        DO indx = 1,N,nchunk
          n2 = MIN(nchunk,N-indx+1)
          IF (notb) THEN
            IF ((n2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
              CALL DGEMMurl(TransA,TransB,M,n2,K,Alpha,A,Lda,          &
                            B(1,indx),Ldb,Beta,C(1,indx),Ldc)
            ELSE ! standard routine
              CALL DGEMM  (TransA,TransB,M,n2,K,Alpha,A,Lda,           &
                           B(1,indx),Ldb,Beta,C(1,indx),Ldc)
            ENDIF
          ELSE
            IF ((n2 .GT. MinDim) .AND. UseURL) THEN ! unrolled version
              CALL DGEMMurl(TransA,TransB,M,n2,K,Alpha,A,Lda,          &
                            B(indx,1),Ldb,Beta,C(1,indx),Ldc)
            ELSE ! standard routine
              CALL DGEMM   (TransA,TransB,M,n2,K,Alpha,A,Lda,          &
                            B(indx,1),Ldb,Beta,C(1,indx),Ldc)
            ENDIF
          ENDIF
        ENDDO
!
!$OMP END DO
!$OMP END PARALLEL
!
      ENDIF
END SUBROUTINE DGEMMomp

SUBROUTINE DGEMMurl(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)

      IMPLICIT NONE
      CHARACTER (len=1)  :: TransA,TransB
      INTEGER            :: M,N,K,Lda,Ldb,Ldc
      REAL (kind=dp)     :: Alpha,Beta
      REAL (kind=dp)     :: A(Lda,*),B(Ldb,*),C(Ldc,*)
!     ------------------------------------------------------------------
!     Version with unrolled loops
!     ------------------------------------------------------------------
!     Purpose
!     =======
!
!     DGEMM  performs one of the matrix-matrix operations
!
!        C := alpha*op( A )*op( B ) + beta*C,
!
!     where  op( X ) is one of
!
!        op( X ) = X   or   op( X ) = X',
!
!     alpha and beta are scalars, and A, B and C are matrices, with 
!     op(A) an m by k matrix, op(B) a k by n matrix and C an m by n 
!     matrix.
!
!     Parameters
!     ==========
!
!     TransA - CHARACTER*1.
!              On entry, TransA specifies the form of op( A ) to be used
!              in the matrix multiplication as follows:
!
!                 TransA = 'N' or 'n',  op( A ) = A.
!
!                 TransA = 'T' or 't',  op( A ) = A'.
!
!                 TransA = 'C' or 'c',  op( A ) = A'.
!
!              Unchanged on exit.
!
!     TransB - CHARACTER*1.
!              On entry, TransB specifies the form of op( B ) to be used
!              in the matrix multiplication as follows:
!
!                 TransB = 'N' or 'n',  op( B ) = B.
!
!                 TransB = 'T' or 't',  op( B ) = B'.
!
!                 TransB = 'C' or 'c',  op( B ) = B'.
!
!              Unchanged on exit.
!
!     M      - INTEGER.
!              On entry, M specifies the number of rows of the matrix
!              op(A) and of the matrix C. M must be at least zero.
!              Unchanged on exit.
!
!     N      - INTEGER.
!              On entry, N specifies the number of columns of the matrix
!              op( B ) and the number of columns of the matrix C. N must
!              be at least zero.
!              Unchanged on exit.
!
!     K      - INTEGER.
!              On entry, K specifies the number of columns of the matrix
!              op(A) and the number of rows of the matrix op(B). K must
!              be at least zero.
!              Unchanged on exit.
!
!     ALPHA  - DOUBLE PRECISION.
!              On entry, ALPHA specifies the scalar alpha.
!              Unchanged on exit.
!
!     A      - DOUBLE PRECISION array of DIMENSION (LDA,ka), where ka is
!              k when TransA = 'N' or 'n', and is m otherwise.
!              Before entry with TransA = 'N' or 'n', the leading m by k
!              part of the array A must contain the matrix A, otherwise
!              the leading k by m part of the array A must contain the
!              matrix A.
!              Unchanged on exit.
!
!     LDA    - INTEGER.
!              On entry, LDA specifies the first dimension of A as 
!              declared in the calling (sub) program. When 
!              TransA = 'N' or 'n' then LDA must be at least max(1,m),
!              otherwise  LDA must be at least max(1,k).
!              Unchanged on exit.
!
!     B      - DOUBLE PRECISION array of DIMENSION (LDB,kb), where kb is
!              n when TransB = 'N' or 'n', and is k otherwise.
!              Before entry with TransB = 'N' or 'n', the leading k by n
!              part of the array B must contain the matrix B, otherwise
!              the leading n by k part of the array B must contain the
!              matrix B.
!              Unchanged on exit.
!
!     LDB    - INTEGER.
!              On entry, LDB specifies the first dimension of B as 
!              declared in the calling (sub) program. When  
!              TransB = 'N' or 'n' then LDB must be at least  max(1,k),
!              otherwise  LDB must be at least max(1,n).
!              Unchanged on exit.
!
!     BETA   - DOUBLE PRECISION.
!              On entry, BETA specifies the scalar beta. When BETA is
!              supplied as zero then C need not be set on input.
!              Unchanged on exit.
!
!     C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
!              Before entry, the leading m by n part of the array C must
!              contain the matrix C, except when beta is zero, in which
!              case C need not be set on entry.
!              On exit, the array C is overwritten by the m by n matrix
!              ( alpha*op( A )*op( B ) + beta*C ).
!
!     LDC    - INTEGER.
!              On entry, LDC specifies the first dimension of C as 
!              declared in the calling (sub) program. LDC must be at 
!              least max( 1, m ).
!              Unchanged on exit.
!
!
!     Level 3 Blas routine.
!
!     -- Written on 8-February-1989.
!        Jack Dongarra, Argonne National Laboratory.
!        Iain Duff, AERE Harwell.
!        Jeremy Du Croz, Numerical Algorithms Group Ltd.
!        Sven Hammarling, Numerical Algorithms Group Ltd.
!
!     -- Modified in October-1997.
!        Superscalar GEMM-Based Level 3 BLAS (Version 0.1).
!        Per Ling, Department of Computing Science,
!        Umea University, Sweden.
!
!     -- Fortran 90 by Michael Riedl, 2018
!     ------------------------------------------------------------------
!     local scalars
      INTEGER        :: i,ii,isec,uisec,j,jj,jsec,ujsec,l,ll,lsec
      INTEGER        :: ulsec,info,nrowa,nrowb
      LOGICAL        :: nota,notb
      REAL (kind=dp) :: delta
      REAL (kind=dp) :: f11,f12,f21,f22,f31,f32,f41,f42
      REAL (kind=dp) :: f13,f14,f23,f24,f33,f34,f43,f44
!     ------------------------------------------------------------------
      REAL (kind=dp),parameter :: ZERO=0.0D+0, ONE=1.0D+0
!     ------------------------------------------------------------------
      REAL (kind=dp)           :: t1(KB,MB),t2(KB,NBT)
!     ------------------------------------------------------------------
!     executable statements
!
!     set  nota  and  notb  as  true if  a  and  b  respectively are
!     not transposed and set nrowa and nrowb as the number of rows of
!     a  and the number of rows of b respectively.
!     ------------------------------------------------------------------
!     Write (*,*) "dgemm mit ausgerollten Schleifen ... "

      nota = LSAME(TransA,'N')
      notb = LSAME(TransB,'N')
      IF (nota) THEN
        nrowa = M
      ELSE
        nrowa = K
      ENDIF
      IF (notb) THEN
        nrowb = K
      ELSE
        nrowb = N
      ENDIF
!
!     test the input parameters.
!
      info = 0
      IF ((.NOT. nota) .AND. (.NOT. LSAME(TransA,'C')) .AND.           &
          (.NOT. LSAME(TransA,'T')))                                   &
      THEN
        info = 1
      ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND.       &
              (.NOT. LSAME(TransB,'T')))                               &
      THEN
        info = 2
      ELSEIF (M .LT. 0) THEN
        info = 3
      ELSEIF (N .LT. 0) THEN
        info = 4
      ELSEIF (K .LT. 0) THEN
        info = 5
      ELSEIF (Lda .LT. MAX(1,nrowa)) THEN
        info = 8
      ELSEIF (Ldb .LT. MAX(1,nrowb)) THEN
        info = 10
      ELSEIF (Ldc .LT. MAX(1,M)) THEN
        info = 13
      ENDIF
      IF (info .NE. 0) THEN
        CALL XERBLA('DGEMM ',info)
        RETURN
      ENDIF
!
!     quick return if possible.
!
      IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR.                              &
         (((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE)))  &
      THEN
        RETURN
      ENDIF
!
!     and when alpha.eq.zero.
!
      IF ((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) THEN
        IF (Beta .EQ. ZERO) THEN
          uisec = M - MOD(M,4)
          DO j=1,N
            DO i=1,uisec,4
              C(i+0,j) = ZERO
              C(i+1,j) = ZERO
              C(i+2,j) = ZERO
              C(i+3,j) = ZERO
            ENDDO
            DO i=uisec+1,M
              C(i,j) = ZERO
            ENDDO
          ENDDO
        ELSE
          uisec=M-MOD(M,4)
          DO j=1,N
            DO i=1,uisec,4
              C(i+0,j) = Beta*C(i+0,j)
              C(i+1,j) = Beta*C(i+1,j)
              C(i+2,j) = Beta*C(i+2,j)
              C(i+3,j) = Beta*C(i+3,j)
            ENDDO
            DO i=uisec+1,M
              C(i,j) = Beta*C(i,j)
            ENDDO
          ENDDO
        ENDIF
        RETURN
      ENDIF
!
!     start the operations.
!
      IF (notb) THEN
!
!       form  c := alpha*a*b + beta*c or c := alpha*a'*b + beta*c.
!
        DO jj=1,N,NB
          jsec = MIN(NB,N-jj+1)
          ujsec = jsec - MOD(jsec,4)
          DO ll=1,K,KB
            lsec = MIN(KB,K-ll+1)
            ulsec = lsec - MOD(lsec,2)
!
!           determine if the block of c should be updated with
!           beta or not.
!
            delta = ONE
            IF (ll .EQ. 1) THEN
              delta = Beta
            ENDIF
!
            DO ii=1,M,MB
              isec = MIN(MB,M-ii+1)
!
!             t1 := alpha*a' or t1 := alpha*a, copy the transpose
!             or the non-transpose of a rectangular block of
!             alpha*a to t1.
!
              uisec = isec - MOD(isec,2)
              IF (nota) THEN
                DO l=ll,ll+ulsec-1,2
                  DO i= ii,ii+uisec-1,2
                    t1(l-ll+1,i-ii+1) = Alpha*A(i,l)
                    t1(l-ll+2,i-ii+1) = Alpha*A(i,l+1)
                    t1(l-ll+1,i-ii+2) = Alpha*A(i+1,l)
                    t1(l-ll+2,i-ii+2) = Alpha*A(i+1,l+1)
                  ENDDO
                  IF (uisec .LT. isec) THEN
                    t1(l-ll+1,isec) = Alpha*A(ii+isec-1,l)
                    t1(l-ll+2,isec) = Alpha*A(ii+isec-1,l+1)
                  ENDIF
                ENDDO
                IF (ulsec .LT. lsec) THEN
                  DO i=ii,ii+isec-1
                    t1(lsec,i-ii+1) = Alpha*A(i,ll+lsec-1)
                  ENDDO
                ENDIF
              ELSE
                DO i=ii,ii+uisec-1,2
                  DO l=ll,ll+ulsec-1,2
                    t1(l-ll+1,i-ii+1) = Alpha*A(l,i)
                    t1(l-ll+1,i-ii+2) = Alpha*A(l,i+1)
                    t1(l-ll+2,i-ii+1) = Alpha*A(l+1,i)
                    t1(l-ll+2,i-ii+2) = Alpha*A(l+1,i+1)
                  ENDDO
                  IF (ulsec .LT. lsec) THEN
                    t1(lsec,i-ii+1) = Alpha*A(ll+lsec-1,i)
                    t1(lsec,i-ii+2) = Alpha*A(ll+lsec-1,i+1)
                  ENDIF
                ENDDO
                IF (uisec .LT. isec) THEN
                  DO l=ll,ll+lsec-1
                    t1(l-ll+1,isec) = Alpha*A(l,ii+isec-1)
                  ENDDO
                ENDIF
              ENDIF
!
!             c := t1'*b + beta*c, update a rectangular block
!             of c using 4 by 4 unrolling.
!
              uisec = isec - MOD(isec,4)
              DO j=jj,jj+ujsec-1,4
                DO i=ii,ii+uisec-1,4
                  f11 = delta*C(i,j)
                  f21 = delta*C(i+1,j)
                  f12 = delta*C(i,j+1)
                  f22 = delta*C(i+1,j+1)
                  f13 = delta*C(i,j+2)
                  f23 = delta*C(i+1,j+2)
                  f14 = delta*C(i,j+3)
                  f24 = delta*C(i+1,j+3)
                  f31 = delta*C(i+2,j)
                  f41 = delta*C(i+3,j)
                  f32 = delta*C(i+2,j+1)
                  f42 = delta*C(i+3,j+1)
                  f33 = delta*C(i+2,j+2)
                  f43 = delta*C(i+3,j+2)
                  f34 = delta*C(i+2,j+3)
                  f44 = delta*C(i+3,j+3)
                  DO l=ll,ll+lsec-1
                    f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
                    f21 = f21 + t1(l-ll+1,i-ii+2)*B(l,j)
                    f12 = f12 + t1(l-ll+1,i-ii+1)*B(l,j+1)
                    f22 = f22 + t1(l-ll+1,i-ii+2)*B(l,j+1)
                    f13 = f13 + t1(l-ll+1,i-ii+1)*B(l,j+2)
                    f23 = f23 + t1(l-ll+1,i-ii+2)*B(l,j+2)
                    f14 = f14 + t1(l-ll+1,i-ii+1)*B(l,j+3)
                    f24 = f24 + t1(l-ll+1,i-ii+2)*B(l,j+3)
                    f31 = f31 + t1(l-ll+1,i-ii+3)*B(l,j)
                    f41 = f41 + t1(l-ll+1,i-ii+4)*B(l,j)
                    f32 = f32 + t1(l-ll+1,i-ii+3)*B(l,j+1)
                    f42 = f42 + t1(l-ll+1,i-ii+4)*B(l,j+1)
                    f33 = f33 + t1(l-ll+1,i-ii+3)*B(l,j+2)
                    f43 = f43 + t1(l-ll+1,i-ii+4)*B(l,j+2)
                    f34 = f34 + t1(l-ll+1,i-ii+3)*B(l,j+3)
                    f44 = f44 + t1(l-ll+1,i-ii+4)*B(l,j+3)
                  ENDDO
                  C(i,j) = f11
                  C(i+1,j) = f21
                  C(i,j+1) = f12
                  C(i+1,j+1) = f22
                  C(i,j+2) = f13
                  C(i+1,j+2) = f23
                  C(i,j+3) = f14
                  C(i+1,j+3) = f24
                  C(i+2,j) = f31
                  C(i+3,j) = f41
                  C(i+2,j+1) = f32
                  C(i+3,j+1) = f42
                  C(i+2,j+2) = f33
                  C(i+3,j+2) = f43
                  C(i+2,j+3) = f34
                  C(i+3,j+3) = f44
                ENDDO
                IF (uisec .LT. isec) THEN
                  DO i=ii+uisec,ii+isec-1
                    f11 = delta*C(i,j)
                    f12 = delta*C(i,j+1)
                    f13 = delta*C(i,j+2)
                    f14 = delta*C(i,j+3)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
                      f12 = f12 + t1(l-ll+1,i-ii+1)*B(l,j+1)
                      f13 = f13 + t1(l-ll+1,i-ii+1)*B(l,j+2)
                      f14 = f14 + t1(l-ll+1,i-ii+1)*B(l,j+3)
                    ENDDO
                    C(i,j) = f11
                    C(i,j+1) = f12
                    C(i,j+2) = f13
                    C(i,j+3) = f14
                  ENDDO
                ENDIF
              ENDDO
              IF (ujsec .LT. jsec) THEN
                DO j=jj+ujsec,jj+jsec-1
                  DO i=ii,ii+uisec-1,4
                    f11 = delta*C(i,j)
                    f21 = delta*C(i+1,j)
                    f31 = delta*C(i+2,j)
                    f41 = delta*C(i+3,j)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
                      f21 = f21 + t1(l-ll+1,i-ii+2)*B(l,j)
                      f31 = f31 + t1(l-ll+1,i-ii+3)*B(l,j)
                      f41 = f41 + t1(l-ll+1,i-ii+4)*B(l,j)
                    ENDDO
                    C(i,j) = f11
                    C(i+1,j) = f21
                    C(i+2,j) = f31
                    C(i+3,j) = f41
                  ENDDO
                  DO i=ii+uisec,ii+isec-1
                    f11 = delta*C(i,j)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*B(l,j)
                    ENDDO
                    C(i,j) = f11
                  ENDDO
                ENDDO
              ENDIF
            ENDDO
          ENDDO
        ENDDO
      ELSE
!
!       form  c := alpha*a*b' + beta*c or c := alpha*a'*b' + beta*c.
!
        DO jj=1,N,NBT
          jsec = MIN(NBT,N-jj+1)
          DO ll=1,K,KB
            lsec = MIN(KB,K-ll+1)
!
!           determine if the block of c should be updated with
!           beta or not.
!
            delta = ONE
            IF (ll .EQ. 1) THEN
              delta = Beta
            ENDIF
!
!           t2 := alpha*b', copy the transpose of a rectangular
!           block of alpha*a to t2.
!
            ulsec = lsec - MOD(lsec,2)
            ujsec = jsec - MOD(jsec,2)
            DO l=ll,ll+ulsec-1,2
              DO j=jj,jj+ujsec-1,2
                t2(l-ll+1,j-jj+1) = Alpha*B(j,l)
                t2(l-ll+2,j-jj+1) = Alpha*B(j,l+1)
                t2(l-ll+1,j-jj+2) = Alpha*B(j+1,l)
                t2(l-ll+2,j-jj+2) = Alpha*B(j+1,l+1)
              ENDDO
              IF (ujsec .LT. jsec) THEN
                t2(l-ll+1,jsec) = Alpha*B(jj+jsec-1,l)
                t2(l-ll+2,jsec) = Alpha*B(jj+jsec-1,l+1)
              ENDIF
            ENDDO
            IF (ulsec .LT. lsec) THEN
              DO j=jj,jj+jsec-1
                t2(lsec,j-jj+1) = Alpha*B(j,ll+lsec-1)
              ENDDO
            ENDIF
!
            ujsec = jsec - MOD(jsec,4)
            DO ii=1,M,MB
              isec = MIN(MB,M-ii+1)
!
!             t1 := alpha*a' or t1 := alpha*a, copy the transpose
!             or the non-transpose of a rectangular block of
!             alpha*a to t1.
!
              uisec = isec - MOD(isec,2)
              IF (nota) THEN
                DO l=ll,ll+ulsec-1,2
                  DO i=ii,ii+uisec-1,2
                    t1(l-ll+1,i-ii+1) = A(i,l)
                    t1(l-ll+2,i-ii+1) = A(i,l+1)
                    t1(l-ll+1,i-ii+2) = A(i+1,l)
                    t1(l-ll+2,i-ii+2) = A(i+1,l+1)
                  ENDDO
                  IF (uisec .LT. isec) THEN
                    t1(l-ll+1,isec) = A(ii+isec-1,l)
                    t1(l-ll+2,isec) = A(ii+isec-1,l+1)
                  ENDIF
                ENDDO
                IF (ulsec .LT. lsec) THEN
                  DO i=ii,ii+isec-1
                    t1(lsec,i-ii+1) = A(i,ll+lsec-1)
                  ENDDO
                ENDIF
              ELSE
                DO i=ii,ii+uisec-1,2
                  DO l=ll,ll+ulsec-1,2
                    t1(l-ll+1,i-ii+1) = A(l,i)
                    t1(l-ll+1,i-ii+2) = A(l,i+1)
                    t1(l-ll+2,i-ii+1) = A(l+1,i)
                    t1(l-ll+2,i-ii+2) = A(l+1,i+1)
                  ENDDO
                  IF (ulsec .LT. lsec) THEN
                    t1(lsec,i-ii+1) = A(ll+lsec-1,i)
                    t1(lsec,i-ii+2) = A(ll+lsec-1,i+1)
                  ENDIF
                ENDDO
                IF (uisec .LT. isec) THEN
                  DO l=ll,ll+lsec-1
                    t1(l-ll+1,isec) = A(l,ii+isec-1)
                  ENDDO
                ENDIF
              ENDIF

!             c := t1'*b + beta*c, update a rectangular block
!             of c using 4 by 4 unrolling.

              uisec = isec - MOD(isec,4)
              DO j=jj,jj+ujsec-1,4
                DO i=ii,ii+uisec-1,4
                  f11 = delta*C(i,j)
                  f21 = delta*C(i+1,j)
                  f12 = delta*C(i,j+1)
                  f22 = delta*C(i+1,j+1)
                  f13 = delta*C(i,j+2)
                  f23 = delta*C(i+1,j+2)
                  f14 = delta*C(i,j+3)
                  f24 = delta*C(i+1,j+3)
                  f31 = delta*C(i+2,j)
                  f41 = delta*C(i+3,j)
                  f32 = delta*C(i+2,j+1)
                  f42 = delta*C(i+3,j+1)
                  f33 = delta*C(i+2,j+2)
                  f43 = delta*C(i+3,j+2)
                  f34 = delta*C(i+2,j+3)
                  f44 = delta*C(i+3,j+3)
                  DO l=ll,ll+lsec-1
                    f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
                    f21 = f21 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+1)
                    f12 = f12 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+2)
                    f22 = f22 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+2)
                    f13 = f13 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+3)
                    f23 = f23 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+3)
                    f14 = f14 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+4)
                    f24 = f24 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+4)
                    f31 = f31 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+1)
                    f41 = f41 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+1)
                    f32 = f32 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+2)
                    f42 = f42 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+2)
                    f33 = f33 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+3)
                    f43 = f43 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+3)
                    f34 = f34 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+4)
                    f44 = f44 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+4)
                  ENDDO
                  C(i,j) = f11
                  C(i+1,j) = f21
                  C(i,j+1) = f12
                  C(i+1,j+1) = f22
                  C(i,j+2) = f13
                  C(i+1,j+2) = f23
                  C(i,j+3) = f14
                  C(i+1,j+3) = f24
                  C(i+2,j) = f31
                  C(i+3,j) = f41
                  C(i+2,j+1) = f32
                  C(i+3,j+1) = f42
                  C(i+2,j+2) = f33
                  C(i+3,j+2) = f43
                  C(i+2,j+3) = f34
                  C(i+3,j+3) = f44
                ENDDO
                IF (uisec .LT. isec) THEN
                  DO i=ii+uisec,ii+isec-1
                    f11 = delta*C(i,j)
                    f12 = delta*C(i,j+1)
                    f13 = delta*C(i,j+2)
                    f14 = delta*C(i,j+3)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
                      f12 = f12 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+2)
                      f13 = f13 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+3)
                      f14 = f14 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+4)
                    ENDDO
                    C(i,j) = f11
                    C(i,j+1) = f12
                    C(i,j+2) = f13
                    C(i,j+3) = f14
                  ENDDO
                ENDIF
              ENDDO
              IF (ujsec .LT. jsec) THEN
                DO j=jj+ujsec,jj+jsec-1
                  DO i=ii,ii+uisec-1,4
                    f11 = delta*C(i,j)
                    f21 = delta*C(i+1,j)
                    f31 = delta*C(i+2,j)
                    f41 = delta*C(i+3,j)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
                      f21 = f21 + t1(l-ll+1,i-ii+2)*t2(l-ll+1,j-jj+1)
                      f31 = f31 + t1(l-ll+1,i-ii+3)*t2(l-ll+1,j-jj+1)
                      f41 = f41 + t1(l-ll+1,i-ii+4)*t2(l-ll+1,j-jj+1)
                    ENDDO
                    C(i,j) = f11
                    C(i+1,j) = f21
                    C(i+2,j) = f31
                    C(i+3,j) = f41
                  ENDDO
                  DO i=ii+uisec,ii+isec-1
                    f11 = delta*C(i,j)
                    DO l=ll,ll+lsec-1
                      f11 = f11 + t1(l-ll+1,i-ii+1)*t2(l-ll+1,j-jj+1)
                    ENDDO
                    C(i,j) = f11
                  ENDDO
                ENDDO
              ENDIF
            ENDDO
          ENDDO
        ENDDO
      ENDIF
END SUBROUTINE DGEMMurl

#ifdef __CBIND__
SUBROUTINE DGEMMomp2(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc) &
           bind(C,name="dgemmomp2" )

           IMPLICIT NONE
           CHARACTER (kind=C_CHAR)   :: TransA,TransB
           INTEGER   (kind=C_INT)    :: M,N,K,Lda,Ldb,Ldc
           REAL      (kind=C_DOUBLE) :: Alpha,Beta
           REAL      (kind=C_DOUBLE) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#else
SUBROUTINE DGEMMomp2(TransA,TransB,M,N,K,Alpha,A,Lda,B,Ldb,Beta,C,Ldc)

           IMPLICIT NONE
           CHARACTER      :: TransA,TransB
           INTEGER        :: M,N,K,Lda,Ldb,Ldc
           REAL (kind=dp) :: Alpha,Beta
           REAL (kind=dp) :: A(Lda,*),B(Ldb,*),C(Ldc,*)
#endif
!     ------------------------------------------------------------------
      REAL (kind=dp)      :: temp
      INTEGER             :: i,j,l
      INTEGER             :: info,ncola,nrowa,nrowb,indx,m2,n2
      INTEGER             :: threads,mchunk,mythread,nchunk
      LOGICAL             :: nota,notb
!     ------------------------------------------------------------------
      REAL(dp),parameter  :: ONE=1.0D+00, ZERO=0.0D+00
!     ------------------------------------------------------------------
!     set nota and notb as true if a and b respectively are
!     not transposed and set nrowa, ncola and nrowb as the number
!     of rows and columns of a and the number of rows of b
!     respectively.
!     ------------------------------------------------------------------
      nota=LSAME(TransA,'N')
      notb=LSAME(TransB,'N')
      IF (nota) THEN
        nrowa=M
        ncola=K
      ELSE
        nrowa=K
        ncola=M
      ENDIF
      IF (notb) THEN
        nrowb=K
      ELSE
        nrowb=N
      ENDIF
!
!     test the input parameters.
!
      info=0
      IF ((.NOT. nota) .AND. (.NOT. LSAME(TransA,'C')) .AND.           &
          (.NOT. LSAME(TransA,'T'))) THEN
        info=1
      ELSEIF ((.NOT. notb) .AND. (.NOT. LSAME(TransB,'C')) .AND.       &
              (.NOT. LSAME(TransB,'T'))) THEN
        info=2
      ELSEIF (M .LT. 0) THEN
        info=3
      ELSEIF (N .LT. 0) THEN
        info=4
      ELSEIF (K .LT. 0) THEN
        info=5
      ELSEIF (LDA .LT. MAX(1,nrowa)) THEN
        info=8
      ELSEIF (LDB .LT. MAX(1,nrowb)) THEN
        info=10
      ELSEIF (LDC .LT. MAX(1,M)) THEN
        info=13
      ENDIF
      IF (info .NE. 0) THEN
        CALL XERBLA('DGEMM ',info)
        RETURN
      ENDIF
!
!     quick return if possible.
!
      IF ((M .EQ. 0) .OR. (N .EQ. 0) .OR.                              &
          (((Alpha .EQ. ZERO) .OR. (K .EQ. 0)) .AND. (Beta .EQ. ONE))) &
      THEN
        RETURN
      ENDIF
!
!     and if  alpha .eq. zero.
!
      IF (Alpha .EQ. ZERO) THEN
        IF (Beta .EQ. ZERO) THEN
!$OMP     PARALLEL DO SHARED (N,M,C) PRIVATE(i,j)
          DO j=1,N
            DO i=1,M
              C(i,j)=ZERO
            ENDDO
          ENDDO
!$OMP     END PARALLEL DO
        ELSE
!$OMP     PARALLEL DO SHARED (N,M,C) PRIVATE(i,j,Beta)
          DO j=1,N
            DO i=1,M
              C(i,j)=Beta*C(i,j)
            ENDDO
          ENDDO
!$OMP     END PARALLEL DO
        ENDIF
        RETURN
      ENDIF
!
!       start the operations.
!
      IF (notb) THEN
        IF (nota) THEN
!
!           form  c := alpha*a*b + beta*c.
!
!$OMP     PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
          DO j=1,N
            IF (Beta .EQ. ZERO) THEN
              DO i=1,M
                C(i,j)=ZERO
              ENDDO
            ELSEIF (Beta .NE. ONE) THEN
              DO i=1,M
                C(i,j)=Beta*C(i,j)
              ENDDO
            ENDIF
            DO l=1,K
              IF (B(l,j) .NE. ZERO) THEN
                temp=Alpha*B(l,j)
                DO i=1,M
                  C(i,j)=C(i,j) + temp*A(i,l)
                ENDDO
              ENDIF
            ENDDO
          ENDDO
!$OMP     END PARALLEL DO
        ELSE
!
!           form  c := alpha*a'*b + beta*c
!
!$OMP     PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
          DO j=1,N
            DO i=1,M
              temp=ZERO
              DO l=1,K
                temp=temp + A(l,i)*B(l,j)
              ENDDO
              IF (Beta .EQ. ZERO) THEN
                C(i,j)=Alpha*temp
              ELSE
                C(i,j)=Alpha*temp + Beta*C(i,j)
              ENDIF
            ENDDO
          ENDDO
!$OMP     END PARALLEL DO
        ENDIF
      ELSEIF (nota) THEN
!
!       form  c := alpha*a*b' + beta*c
!
!$OMP   PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
        DO j=1,N
          IF (Beta .EQ. ZERO) THEN
            DO i=1,M
              C(i,j)=ZERO
            ENDDO
          ELSEIF (Beta .NE. ONE) THEN
            DO i=1,M
              C(i,j)=Beta*C(i,j)
            ENDDO
          ENDIF
          DO l=1,K
            IF (B(j,l) .NE. ZERO) THEN
              temp=Alpha*B(j,l)
              DO i=1,M
                C(i,j)=C(i,j) + temp*A(i,l)
              ENDDO
            ENDIF
          ENDDO
        ENDDO
!$OMP   END PARALLEL DO
      ELSE
!
!       form  c := alpha*a'*b' + beta*c
!
!$OMP   PARALLEL DO SHARED (N,M,K,A,B,C) PRIVATE(i,j,l,Beta,Alpha,temp)
        DO j=1,N
          DO i=1,M
            temp=ZERO
            DO l=1,K
              temp=temp + A(l,i)*B(j,l)
            ENDDO
            IF (Beta .EQ. ZERO) THEN
              C(i,j)=Alpha*temp
            ELSE
              C(i,j)=Alpha*temp + Beta*C(i,j)
            ENDIF
          ENDDO
        ENDDO
!$OMP   END PARALLEL DO
      ENDIF

END SUBROUTINE DGEMMomp2

END MODULE dgemmCbind