--- a/LibDBlasL3F77.def
+++ b/LibDBlasL3F77.def
@@ -10,7 +10,7 @@
(* $Id: LibDBlasL3F77.def,v 1.1 2017/11/01 10:42:04 mriedl Exp mriedl $ *)
-FROM LibDBlasL1F77 IMPORT INTEGER4,REAL4,DOUBLEPRECISION;
+FROM LibDBlasL1F77 IMPORT INTEGER4,REAL4,DOUBLEPRECISION,DOUBLECOMPLEX;
PROCEDURE sgemm( TA : CHAR;
TB : CHAR;
@@ -25,7 +25,9 @@
ldc : INTEGER);
(*---------------------------------------------------------------*)
- (* Aufruf der Fortran Version von BLAS3 subroutine sgemm *)
+ (* Aufruf der Fortran Version der BLAS3 subroutine sgemm *)
+ (* *)
+ (* Call to the Fortran version of the BLAS3 subroutine sgemm *)
(*---------------------------------------------------------------*)
PROCEDURE dgemm( TA : CHAR;
@@ -41,7 +43,60 @@
ldc : INTEGER);
(*---------------------------------------------------------------*)
- (* Aufruf der Fortran Version von BLAS3 subroutine dgemm *)
+ (* Aufruf der Fortran Version der BLAS3 subroutine dgemm *)
+ (* *)
+ (* Please note that lda,ldb and ldc are meaningless here as they *)
+ (* will be set automatically within the Modula-2 wrapper routine *)
+ (* which also takes care that the caller does not need to worry  *)
+ (* about the column-major memory layout used by Fortran          *)
+ (*---------------------------------------------------------------*)
+
+PROCEDURE dgemmOMP( TA : CHAR;
+ TB : CHAR;
+ M,N,K : INTEGER4;
+ Alpha : DOUBLEPRECISION;
+ VAR A : ARRAY OF ARRAY OF DOUBLEPRECISION;
+ lda : INTEGER;
+ VAR B : ARRAY OF ARRAY OF DOUBLEPRECISION;
+ ldb : INTEGER;
+ Beta : DOUBLEPRECISION;
+ VAR C : ARRAY OF ARRAY OF DOUBLEPRECISION;
+ ldc : INTEGER);
+
+ (*---------------------------------------------------------------*)
+ (* Call to the Fortran version of the BLAS3 subroutine dgemm in  *)
+ (* its OpenMP-parallelised variant                               *)
+ (*                                                               *)
+ (* dgemmOMP is far from being optimal - please test whether it   *)
+ (* really improves the performance in your specific              *)
+ (* environment. On 32 bit systems the communication overhead     *)
+ (* outweighs the potential gain in speed from using more than    *)
+ (* one thread in many cases. But even on an outdated Atom single *)
+ (* core processor a two-thread version was about 40 % quicker    *)
+ (* than the single-thread version. So you need to test ...       *)
+ (* and do not make the physical size of A,B and C much bigger    *)
+ (* than needed - that will also slow things down.                *)
+ (*---------------------------------------------------------------*)
+
+PROCEDURE zgemm( TA : CHAR;
+ TB : CHAR;
+ M,N,K : INTEGER4;
+ Alpha : DOUBLECOMPLEX;
+ VAR A : ARRAY OF ARRAY OF DOUBLECOMPLEX;
+ lda : INTEGER;
+ VAR B : ARRAY OF ARRAY OF DOUBLECOMPLEX;
+ ldb : INTEGER;
+ Beta : DOUBLECOMPLEX;
+ VAR C : ARRAY OF ARRAY OF DOUBLECOMPLEX;
+ ldc : INTEGER);
+
+ (*---------------------------------------------------------------*)
+ (* Call to the Fortran version of the BLAS3 subroutine zgemm     *)
+ (*                                                               *)
+ (* Please note that lda,ldb and ldc are meaningless here as they *)
+ (* will be set automatically within the Modula-2 wrapper routine *)
+ (* which also takes care that the caller does not need to worry  *)
+ (* about the column-major memory layout used by Fortran          *)
(*---------------------------------------------------------------*)
END LibDBlasL3F77.