! d3/d53/cp__fm__basic__linalg_8F_source.html

!--------------------------------------------------------------------------------------------------!

!   CP2K: A general program to perform molecular dynamics simulations                              !

!   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !

!                                                                                                  !

!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !

!--------------------------------------------------------------------------------------------------!


! **************************************************************************************************

!> \brief Basic linear algebra operations for full matrices.

!> \par History

!>      08.2002 split out of qs_blacs [fawzi]

!> \author Fawzi Mohamed

! **************************************************************************************************

MODULE cp_fm_basic_linalg

   USE cp_blacs_env, ONLY: cp_blacs_env_type

   USE cp_fm_struct, ONLY: cp_fm_struct_equivalent

   USE cp_fm_types, ONLY: &

      cp_fm_create, cp_fm_get_diag, cp_fm_get_info, cp_fm_get_submatrix, cp_fm_p_type, &

      cp_fm_release, cp_fm_set_all, cp_fm_set_element, cp_fm_set_submatrix, cp_fm_to_fm, &

      cp_fm_type

   USE cp_log_handling, ONLY: cp_logger_get_default_unit_nr, &

                              cp_to_string

   USE kahan_sum, ONLY: accurate_dot_product, &

                        accurate_sum

   USE kinds, ONLY: dp, &

                    int_8, &

                    sp

   USE machine, ONLY: m_memory

   USE mathlib, ONLY: get_pseudo_inverse_svd, &

                      invert_matrix

   USE message_passing, ONLY: mp_comm_type

#include "../base/base_uses.f90"


   IMPLICIT NONE

   PRIVATE


   LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .true.

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_basic_linalg'


   PUBLIC :: cp_fm_scale, & ! scale a matrix

             cp_fm_scale_and_add, & ! scale and add two matrices

             cp_fm_geadd, & ! general addition

             cp_fm_column_scale, & ! scale columns of a matrix

             cp_fm_row_scale, & ! scale rows of a matrix

             cp_fm_trace, & ! trace of the transpose(A)*B

             cp_fm_contracted_trace, & ! sum_{i,...,k} Tr [A(i,...,k)^T * B(i,...,k)]

             cp_fm_norm, & ! different norms of A

             cp_fm_schur_product, & ! schur product

             cp_fm_transpose, & ! transpose a matrix

             cp_fm_uplo_to_full, & ! symmetrise a triangular matrix

             cp_fm_syrk, & ! rank k update

             cp_fm_triangular_multiply, & ! triangular matrix multiply / solve

             cp_fm_symm, & ! multiply a symmetric with a non-symmetric matrix

             cp_fm_gemm, & ! multiply two matrices

             cp_complex_fm_gemm, & ! multiply two complex matrices, represented by non_complex fm matrices

             cp_fm_invert, & ! computes the inverse and determinant

             cp_fm_frobenius_norm, & ! frobenius norm

             cp_fm_triangular_invert, & ! compute the reciprocal of a triangular matrix

             cp_fm_qr_factorization, & ! compute the QR factorization of a rectangular matrix

             cp_fm_solve, & ! solves the equation  A*B=C A and C are input

             cp_fm_pdgeqpf, & ! compute a QR factorization with column pivoting of a M-by-N distributed matrix

             cp_fm_pdorgqr, & ! generates an M-by-N as first N columns of a product of K elementary reflectors

             cp_fm_potrf, & ! Cholesky decomposition

             cp_fm_potri, & ! Invert triangular matrix

             cp_fm_rot_rows, & ! rotates two rows

             cp_fm_rot_cols, & ! rotates two columns

             cp_fm_cholesky_restore, & ! apply Cholesky decomposition

             cp_fm_gram_schmidt_orthonorm, & ! Gram-Schmidt orthonormalization of columns of a full matrix, &

             cp_fm_det, & ! determinant of a real matrix with correct sign

             cp_fm_matvec ! matrix-vector multiplication (vector replicated)


   REAL(KIND=dp), EXTERNAL :: dlange, pdlange, pdlatra

   REAL(KIND=sp), EXTERNAL :: slange, pslange, pslatra


   ! Generic name cp_fm_trace: calls are resolved by the compiler to one of the
   ! seven specific module procedures listed below, based on the actual arguments.
   ! NOTE(review): the suffixes presumably encode the rank of the A / B / trace
   ! arguments and whether arrays hold cp_fm_type (a) or cp_fm_p_type (p) elements;
   ! confirm against the specific procedures.
   INTERFACE cp_fm_trace

      MODULE PROCEDURE cp_fm_trace_a0b0t0

      MODULE PROCEDURE cp_fm_trace_a1b0t1_a

      MODULE PROCEDURE cp_fm_trace_a1b0t1_p

      MODULE PROCEDURE cp_fm_trace_a1b1t1_aa

      MODULE PROCEDURE cp_fm_trace_a1b1t1_ap

      MODULE PROCEDURE cp_fm_trace_a1b1t1_pa

      MODULE PROCEDURE cp_fm_trace_a1b1t1_pp


   END INTERFACE cp_fm_trace


   ! Generic name cp_fm_contracted_trace: calls are resolved by the compiler to one
   ! of the four specific module procedures listed below.
   ! NOTE(review): the aa/ap/pa/pp suffixes presumably distinguish cp_fm_type from
   ! cp_fm_p_type array arguments for A and B; confirm against the specific procedures.
   INTERFACE cp_fm_contracted_trace

      MODULE PROCEDURE cp_fm_contracted_trace_a2b2t2_aa

      MODULE PROCEDURE cp_fm_contracted_trace_a2b2t2_ap

      MODULE PROCEDURE cp_fm_contracted_trace_a2b2t2_pa

      MODULE PROCEDURE cp_fm_contracted_trace_a2b2t2_pp


   END INTERFACE cp_fm_contracted_trace

CONTAINS


! **************************************************************************************************

!> \brief Computes the determinant (with a correct sign even in parallel environment!) of a real square matrix

!> \author A. Sinyavskiy (andrey.sinyavskiy@chem.uzh.ch)

! **************************************************************************************************


   SUBROUTINE cp_fm_det(matrix_a, det_a)

      ! Determinant of a real square matrix through an LU factorisation.
      !   matrix_a : input matrix; it is copied into a scratch matrix first, so the
      !              factorisation does not overwrite it
      !   det_a    : product of the diagonal of the LU factors, multiplied by
      !              (-1)**p where p is the number of row interchanges

      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a
      REAL(kind=dp), INTENT(OUT)               :: det_a

      REAL(kind=dp)                            :: determinant
      TYPE(cp_fm_type)                         :: matrix_lu
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: a
      INTEGER                                  :: n, i, info, p
      INTEGER, ALLOCATABLE, DIMENSION(:)       :: ipivot
      REAL(kind=dp), DIMENSION(:), POINTER     :: diag

#if defined(__parallel)
      INTEGER                                  :: myprow, npcol, nrow_local, irow_local
      INTEGER, DIMENSION(9)                    :: desca
#endif

      ! Factorise a private copy of the input
      CALL cp_fm_create(matrix=matrix_lu, &
                        matrix_struct=matrix_a%matrix_struct, &
                        name="A_lu"//trim(adjustl(cp_to_string(1)))//"MATRIX")
      CALL cp_fm_to_fm(matrix_a, matrix_lu)

      a => matrix_lu%local_data
      n = matrix_lu%matrix_struct%nrow_global

      ! pdgetrf needs LOCr(M_A) + MB_A pivot entries; n + nrow_block is an upper bound
      ! (same sizing as in cp_fm_lu_decompose). Only the leading entries are read below.
      ALLOCATE (ipivot(n + matrix_lu%matrix_struct%nrow_block))
      ipivot(:) = 0
      p = 0
      ALLOCATE (diag(n))
      diag(:) = 0.0_dp

#if defined(__parallel)
      ! Use LU decomposition
      desca(:) = matrix_lu%matrix_struct%descriptor(:)
      CALL pdgetrf(n, n, a, 1, 1, desca, ipivot, info)
      CALL cp_fm_get_diag(matrix_lu, diag)
      determinant = PRODUCT(diag)

      ! Count the row interchanges among the locally held rows
      myprow = matrix_lu%matrix_struct%context%mepos(1)
      npcol = matrix_lu%matrix_struct%context%num_pe(2)
      nrow_local = matrix_lu%matrix_struct%nrow_locals(myprow)
      DO irow_local = 1, nrow_local
         i = matrix_lu%matrix_struct%row_indices(irow_local)
         IF (ipivot(irow_local) /= i) p = p + 1
      END DO
      CALL matrix_lu%matrix_struct%para_env%sum(p)
      ! very important fix: the sum above runs over all processes, so the count
      ! is divided by the number of process columns
      p = p/npcol
#else
      ! The leading dimension is taken from the local array itself
      CALL dgetrf(n, n, a, SIZE(a, 1), ipivot, info)
      CALL cp_fm_get_diag(matrix_lu, diag)
      determinant = PRODUCT(diag)
      DO i = 1, n
         IF (ipivot(i) /= i) p = p + 1
      END DO
#endif

      DEALLOCATE (ipivot)
      DEALLOCATE (diag)
      CALL cp_fm_release(matrix_lu)

      ! An odd number of interchanges flips the sign
      det_a = determinant*(-2*MOD(p, 2) + 1.0_dp)

   END SUBROUTINE cp_fm_det


! **************************************************************************************************

!> \brief calc A <- alpha*A + beta*B

!>      optimized for alpha == 1.0 (just add beta*B) and beta == 0.0 (just

!>      scale A)

!> \param alpha ...

!> \param matrix_a ...

!> \param beta ...

!> \param matrix_b ...

! **************************************************************************************************


   SUBROUTINE cp_fm_scale_and_add(alpha, matrix_a, beta, matrix_b)

      ! In-place update A <- alpha*A + beta*B on the local data of both matrices.
      !   alpha    : scaling factor for matrix_a (the scaling is skipped when alpha == 1)
      !   matrix_a : matrix that is updated
      !   beta     : optional factor for matrix_b; defaults to 1 when matrix_b is
      !              given and to 0 otherwise; requires matrix_b
      !   matrix_b : optional matrix to add; must share the blacs context and have an
      !              equivalent structure, otherwise the routine aborts

      REAL(kind=dp), INTENT(IN)                          :: alpha
      TYPE(cp_fm_type), INTENT(IN)                       :: matrix_a
      REAL(kind=dp), INTENT(IN), OPTIONAL                :: beta
      TYPE(cp_fm_type), INTENT(IN), OPTIONAL             :: matrix_b

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_scale_and_add'

      INTEGER                                            :: handle, size_a, size_b
      REAL(kind=dp)                                      :: my_beta
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: a, b
      REAL(kind=sp), DIMENSION(:, :), POINTER            :: a_sp, b_sp

      CALL timeset(routinen, handle)

      my_beta = 0.0_dp
      IF (PRESENT(matrix_b)) my_beta = 1.0_dp
      IF (PRESENT(beta)) my_beta = beta
      NULLIFY (a, b)

      IF (PRESENT(beta)) THEN
         cpassert(PRESENT(matrix_b))
      END IF

      ! If B is the very same storage as A, scaling A first and then adding B would
      ! add the already scaled data. Handle that case as a pure scaling by
      ! (alpha + beta), also when beta was omitted and defaults to 1.
      IF (PRESENT(matrix_b)) THEN
         IF (ASSOCIATED(matrix_a%local_data, matrix_b%local_data)) THEN
            cpwarn("Bad use of routine. Call cp_fm_scale instead")
            CALL cp_fm_scale(alpha + my_beta, matrix_a)
            CALL timestop(handle)
            RETURN
         END IF
      END IF

      a => matrix_a%local_data
      a_sp => matrix_a%local_data_sp

      IF (matrix_a%use_sp) THEN
         size_a = SIZE(a_sp, 1)*SIZE(a_sp, 2)
      ELSE
         size_a = SIZE(a, 1)*SIZE(a, 2)
      END IF

      ! A <- alpha*A
      IF (alpha .NE. 1.0_dp) THEN
         IF (matrix_a%use_sp) THEN
            CALL sscal(size_a, REAL(alpha, sp), a_sp, 1)
         ELSE
            CALL dscal(size_a, alpha, a, 1)
         END IF
      END IF

      ! A <- A + beta*B
      IF (my_beta .NE. 0.0_dp) THEN
         IF (matrix_a%matrix_struct%context .NE. matrix_b%matrix_struct%context) &
            cpabort("Matrices must be in the same blacs context")

         IF (cp_fm_struct_equivalent(matrix_a%matrix_struct, &
                                     matrix_b%matrix_struct)) THEN

            b => matrix_b%local_data
            b_sp => matrix_b%local_data_sp
            IF (matrix_b%use_sp) THEN
               size_b = SIZE(b_sp, 1)*SIZE(b_sp, 2)
            ELSE
               size_b = SIZE(b, 1)*SIZE(b, 2)
            END IF
            IF (size_a .NE. size_b) &
               cpabort("Matrices must have same local sizes")

            ! The precision of A selects the BLAS routine; B is converted if needed
            IF (matrix_a%use_sp .AND. matrix_b%use_sp) THEN
               CALL saxpy(size_a, REAL(my_beta, sp), b_sp, 1, a_sp, 1)
            ELSEIF (matrix_a%use_sp .AND. .NOT. matrix_b%use_sp) THEN
               CALL saxpy(size_a, REAL(my_beta, sp), REAL(b, sp), 1, a_sp, 1)
            ELSEIF (.NOT. matrix_a%use_sp .AND. matrix_b%use_sp) THEN
               CALL daxpy(size_a, my_beta, REAL(b_sp, dp), 1, a, 1)
            ELSE
               CALL daxpy(size_a, my_beta, b, 1, a, 1)
            END IF

         ELSE
#ifdef __parallel
            cpabort("to do (pdscal,pdcopy,pdaxpy)")
#else
            cpabort("Matrices must have equivalent structures")
#endif
         END IF

      END IF

      CALL timestop(handle)

   END SUBROUTINE cp_fm_scale_and_add


! **************************************************************************************************

!> \brief interface to BLACS geadd:

!>                matrix_b = beta*matrix_b + alpha*opt(matrix_a)

!>        where opt(matrix_a) can be either:

!>              'N':  matrix_a

!>              'T':  matrix_a^T

!>              'C':  matrix_a^H (Hermitian conjugate)

!>        note that this is a level three routine, use cp_fm_scale_and_add if that

!>        is sufficient for your needs

!> \param alpha  : real scalar

!> \param trans  : 'N' normal, 'T' transposed, 'C' Hermitian conjugate

!> \param matrix_a : input matrix_a

!> \param beta   : real scalar

!> \param matrix_b : input matrix_b, upon output the updated matrix_b

!> \author  Lianheng Tong

! **************************************************************************************************


   SUBROUTINE cp_fm_geadd(alpha, trans, matrix_a, beta, matrix_b)

      ! matrix_b <- beta*matrix_b + alpha*op(matrix_a)
      !   alpha, beta : real scalars
      !   trans       : 'N' uses matrix_a as is; 'T' and 'C' use its transpose
      !                 (identical for real data)
      !   matrix_a    : input matrix, must have the global dimensions of matrix_b
      !   matrix_b    : matrix whose local data is updated in place

      REAL(kind=dp), INTENT(IN) :: alpha, beta
      CHARACTER, INTENT(IN) :: trans
      TYPE(cp_fm_type), INTENT(IN) :: matrix_a, matrix_b

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_geadd'

      INTEGER :: nrow_global, ncol_global, handle
      REAL(kind=dp), DIMENSION(:, :), POINTER :: aa, bb
#if defined(__parallel)
      INTEGER, DIMENSION(9) :: desca, descb
#elif !defined(__MKL)
      INTEGER :: ii, jj
#endif

      CALL timeset(routinen, handle)

      nrow_global = matrix_a%matrix_struct%nrow_global
      ncol_global = matrix_a%matrix_struct%ncol_global
      cpassert(nrow_global .EQ. matrix_b%matrix_struct%nrow_global)
      cpassert(ncol_global .EQ. matrix_b%matrix_struct%ncol_global)

      aa => matrix_a%local_data
      bb => matrix_b%local_data

#if defined(__parallel)
      desca = matrix_a%matrix_struct%descriptor
      descb = matrix_b%matrix_struct%descriptor
      CALL pdgeadd(trans, &
                   nrow_global, &
                   ncol_global, &
                   alpha, &
                   aa, &
                   1, 1, &
                   desca, &
                   beta, &
                   bb, &
                   1, 1, &
                   descb)
#elif defined(__MKL)
      CALL mkl_domatadd('C', trans, 'N', nrow_global, ncol_global, &
                        alpha, aa, nrow_global, beta, bb, nrow_global, bb, nrow_global)
#else
      ! dgeadd is not a standard BLAS function, although it is implemented
      ! in some libraries like OpenBLAS, so not going to use it here
      SELECT CASE (trans)
      CASE ('T', 't', 'C', 'c')
         ! For real matrices the conjugate transpose equals the transpose, so 'C'
         ! must not fall through to the untransposed default.
         DO jj = 1, ncol_global
            DO ii = 1, nrow_global
               bb(ii, jj) = beta*bb(ii, jj) + alpha*aa(jj, ii)
            END DO
         END DO
      CASE DEFAULT
         DO jj = 1, ncol_global
            DO ii = 1, nrow_global
               bb(ii, jj) = beta*bb(ii, jj) + alpha*aa(ii, jj)
            END DO
         END DO
      END SELECT
#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_geadd


! **************************************************************************************************

!> \brief Computes the LU-decomposition of the matrix, and the determinant of the matrix

!>      IMPORTANT : the sign of the determinant is not defined correctly yet ....

!> \param matrix_a ...

!> \param almost_determinant ...

!> \param correct_sign ...

!> \par History

!>      added correct_sign 02.07 (fschiff)

!> \author Joost VandeVondele

!> \note

!>      - matrix_a is overwritten

!>      - the sign of the determinant might be wrong

!>      - SERIOUS WARNING (KNOWN BUG) : the sign of the determinant depends on ipivot

!>      - one should be able to find out if ipivot is an even or an odd permutation...

!>        if you need the correct sign, just add correct_sign==.TRUE. (fschiff)

!>      - Use cp_fm_get_diag instead of n times cp_fm_get_element (A. Bussy)

! **************************************************************************************************

   SUBROUTINE cp_fm_lu_decompose(matrix_a, almost_determinant, correct_sign)
      ! LU-factorises matrix_a in place and returns the product of the diagonal of
      ! the factors.
      !   matrix_a           : matrix whose local data is overwritten by its LU factors
      !   almost_determinant : product of the LU diagonal; in the serial build the
      !                        sign is corrected for row interchanges when
      !                        correct_sign is .TRUE.
      !   correct_sign       : optional, treated as .FALSE. when absent; it has no
      !                        effect in the parallel build
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a
      REAL(kind=dp), INTENT(OUT)               :: almost_determinant
      LOGICAL, INTENT(IN), OPTIONAL            :: correct_sign

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_lu_decompose'

      INTEGER                                  :: handle, i, info, n
      INTEGER, ALLOCATABLE, DIMENSION(:)       :: ipivot
      REAL(kind=dp)                            :: determinant
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: a
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desca
      REAL(kind=dp), DIMENSION(:), POINTER     :: diag
#else
      INTEGER                                  :: lda
      LOGICAL                                  :: my_correct_sign
#endif

      CALL timeset(routinen, handle)

      a => matrix_a%local_data
      n = matrix_a%matrix_struct%nrow_global
      ! Extra nrow_block entries beyond n for the pivot array
      ALLOCATE (ipivot(n + matrix_a%matrix_struct%nrow_block))

#if defined(__parallel)
      mark_used(correct_sign)
      desca(:) = matrix_a%matrix_struct%descriptor(:)
      CALL pdgetrf(n, n, a, 1, 1, desca, ipivot, info)

      ALLOCATE (diag(n))
      diag(:) = 0.0_dp
      CALL cp_fm_get_diag(matrix_a, diag)
      determinant = 1.0_dp
      DO i = 1, n
         determinant = determinant*diag(i)
      END DO
      DEALLOCATE (diag)
#else
      ! An absent OPTIONAL argument must not be referenced: resolve it to a local
      ! flag that defaults to .FALSE.
      my_correct_sign = .FALSE.
      IF (PRESENT(correct_sign)) my_correct_sign = correct_sign

      lda = SIZE(a, 1)
      CALL dgetrf(n, n, a, lda, ipivot, info)
      determinant = 1.0_dp
      DO i = 1, n
         determinant = determinant*a(i, i)
         ! every row interchange flips the sign of the determinant
         IF (my_correct_sign .AND. (ipivot(i) .NE. i)) determinant = -determinant
      END DO
#endif
      ! info is deliberately not checked: a non-zero value here just signals
      ! a zero diagonal element
      DEALLOCATE (ipivot)
      almost_determinant = determinant ! notice that the sign is random
      CALL timestop(handle)
   END SUBROUTINE cp_fm_lu_decompose


! **************************************************************************************************

!> \brief computes matrix_c = beta * matrix_c + alpha * ( matrix_a  ** transa ) * ( matrix_b ** transb )

!> \param transa : 'N' -> normal   'T' -> transpose

!>      alpha,beta :: can be 0.0_dp and 1.0_dp

!> \param transb ...

!> \param m ...

!> \param n ...

!> \param k ...

!> \param alpha ...

!> \param matrix_a : m x k matrix ( ! for transa = 'N')

!> \param matrix_b : k x n matrix ( ! for transb = 'N')

!> \param beta ...

!> \param matrix_c : m x n matrix

!> \param a_first_col ...

!> \param a_first_row ...

!> \param b_first_col : the k x n matrix starts at col b_first_col of matrix_b (avoid usage)

!> \param b_first_row ...

!> \param c_first_col ...

!> \param c_first_row ...

!> \author Matthias Krack

!> \note

!>      matrix_c should have no overlap with matrix_a, matrix_b

! **************************************************************************************************


   SUBROUTINE cp_fm_gemm(transa, transb, m, n, k, alpha, matrix_a, matrix_b, beta, &
                         matrix_c, a_first_col, a_first_row, b_first_col, b_first_row, &
                         c_first_col, c_first_row)

      ! matrix_c <- beta*matrix_c + alpha*op(matrix_a)*op(matrix_b), dispatched to
      ! (p)sgemm or (p)dgemm depending on the precision flags of the three matrices.
      ! The optional *_first_row / *_first_col arguments select the first element of
      ! the respective sub-matrix and default to 1. Mixed precision aborts.

      CHARACTER(LEN=1), INTENT(IN)             :: transa, transb
      INTEGER, INTENT(IN)                      :: m, n, k
      REAL(kind=dp), INTENT(IN)                :: alpha
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a, matrix_b
      REAL(kind=dp), INTENT(IN)                :: beta
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_c
      INTEGER, INTENT(IN), OPTIONAL            :: a_first_col, a_first_row, &
                                                  b_first_col, b_first_row, &
                                                  c_first_col, c_first_row

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_gemm'

      INTEGER                                  :: handle, row_a, col_a, row_b, col_b, &
                                                  row_c, col_c
      LOGICAL                                  :: all_single, all_double
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: a, b, c
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: a_sp, b_sp, c_sp
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desca, descb, descc
#else
      INTEGER                                  :: lda, ldb, ldc
#endif

      CALL timeset(routinen, handle)

      !sample peak memory
      CALL m_memory()

      ! Local storage in both precisions
      a => matrix_a%local_data
      a_sp => matrix_a%local_data_sp
      b => matrix_b%local_data
      b_sp => matrix_b%local_data_sp
      c => matrix_c%local_data
      c_sp => matrix_c%local_data_sp

      ! Sub-matrix origins: start from (1, 1) and override with what the caller gave
      row_a = 1; col_a = 1
      row_b = 1; col_b = 1
      row_c = 1; col_c = 1
      IF (PRESENT(a_first_row)) row_a = a_first_row
      IF (PRESENT(a_first_col)) col_a = a_first_col
      IF (PRESENT(b_first_row)) row_b = b_first_row
      IF (PRESENT(b_first_col)) col_b = b_first_col
      IF (PRESENT(c_first_row)) row_c = c_first_row
      IF (PRESENT(c_first_col)) col_c = c_first_col

      ! Either all three matrices are single precision or none of them is
      all_single = matrix_a%use_sp .AND. matrix_b%use_sp .AND. matrix_c%use_sp
      all_double = .NOT. (matrix_a%use_sp .OR. matrix_b%use_sp .OR. matrix_c%use_sp)

#if defined(__parallel)

      desca(:) = matrix_a%matrix_struct%descriptor(:)
      descb(:) = matrix_b%matrix_struct%descriptor(:)
      descc(:) = matrix_c%matrix_struct%descriptor(:)

      IF (all_single) THEN
         CALL psgemm(transa, transb, m, n, k, REAL(alpha, sp), &
                     a_sp(1, 1), row_a, col_a, desca, &
                     b_sp(1, 1), row_b, col_b, descb, &
                     REAL(beta, sp), &
                     c_sp(1, 1), row_c, col_c, descc)
      ELSE IF (all_double) THEN
         CALL pdgemm(transa, transb, m, n, k, alpha, &
                     a, row_a, col_a, desca, &
                     b, row_b, col_b, descb, &
                     beta, &
                     c, row_c, col_c, descc)
      ELSE
         cpabort("Mixed precision gemm NYI")
      END IF

#else

      IF (all_single) THEN
         lda = SIZE(a_sp, 1)
         ldb = SIZE(b_sp, 1)
         ldc = SIZE(c_sp, 1)
         CALL sgemm(transa, transb, m, n, k, REAL(alpha, sp), &
                    a_sp(row_a, col_a), lda, &
                    b_sp(row_b, col_b), ldb, &
                    REAL(beta, sp), &
                    c_sp(row_c, col_c), ldc)
      ELSE IF (all_double) THEN
         lda = SIZE(a, 1)
         ldb = SIZE(b, 1)
         ldc = SIZE(c, 1)
         CALL dgemm(transa, transb, m, n, k, alpha, &
                    a(row_a, col_a), lda, &
                    b(row_b, col_b), ldb, &
                    beta, &
                    c(row_c, col_c), ldc)
      ELSE
         cpabort("Mixed precision gemm NYI")
      END IF

#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_gemm


! **************************************************************************************************

!> \brief computes matrix_c = beta * matrix_c + alpha *  matrix_a  *  matrix_b

!>      computes matrix_c = beta * matrix_c + alpha *  matrix_b  *  matrix_a

!>      where matrix_a is symmetric

!> \param side : 'L' -> matrix_a is on the left 'R' -> matrix_a is on the right

!>      alpha,beta :: can be 0.0_dp and 1.0_dp

!> \param uplo triangular format

!> \param m ...

!> \param n ...

!> \param alpha ...

!> \param matrix_a : m x m matrix

!> \param matrix_b : m x n matrix

!> \param beta ...

!> \param matrix_c : m x n matrix

!> \author Matthias Krack

!> \note

!>      matrix_c should have no overlap with matrix_a, matrix_b

!>      all matrices in QS are triangular according to uplo

!>      matrix_a is always an m x m matrix

!>      typically slower than cp_fm_gemm (especially in parallel easily 50 percent)

! **************************************************************************************************


   SUBROUTINE cp_fm_symm(side, uplo, m, n, alpha, matrix_a, matrix_b, beta, matrix_c)

      ! Thin wrapper that forwards all arguments to pdsymm (parallel build) or
      ! dsymm (serial build), operating on the double precision local data of the
      ! three matrices starting at their (1, 1) element.

      CHARACTER(LEN=1), INTENT(IN)             :: side, uplo
      INTEGER, INTENT(IN)                      :: m, n
      REAL(kind=dp), INTENT(IN)                :: alpha
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a, matrix_b
      REAL(kind=dp), INTENT(IN)                :: beta
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_c

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_symm'

      INTEGER                                  :: handle
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: loc_a, loc_b, loc_c
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc_a, desc_b, desc_c
#else
      INTEGER                                  :: ld_a, ld_b, ld_c
#endif

      CALL timeset(routinen, handle)

      loc_c => matrix_c%local_data
      loc_b => matrix_b%local_data
      loc_a => matrix_a%local_data

#if defined(__parallel)

      ! Array descriptors of the distributed matrices
      desc_a(:) = matrix_a%matrix_struct%descriptor(:)
      desc_b(:) = matrix_b%matrix_struct%descriptor(:)
      desc_c(:) = matrix_c%matrix_struct%descriptor(:)

      CALL pdsymm(side, uplo, m, n, alpha, &
                  loc_a(1, 1), 1, 1, desc_a, &
                  loc_b(1, 1), 1, 1, desc_b, &
                  beta, &
                  loc_c(1, 1), 1, 1, desc_c)

#else

      ! Leading dimensions as recorded in the matrix structures
      ld_a = matrix_a%matrix_struct%local_leading_dimension
      ld_b = matrix_b%matrix_struct%local_leading_dimension
      ld_c = matrix_c%matrix_struct%local_leading_dimension

      CALL dsymm(side, uplo, m, n, alpha, &
                 loc_a(1, 1), ld_a, &
                 loc_b(1, 1), ld_b, &
                 beta, &
                 loc_c(1, 1), ld_c)

#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_symm


! **************************************************************************************************

!> \brief computes the Frobenius norm of matrix_a

!> \param matrix_a : m x n matrix

!> \return ...

!> \author VW

! **************************************************************************************************


   FUNCTION cp_fm_frobenius_norm(matrix_a) RESULT(norm)
      ! Square root of the sum of squares of all entries of the double precision
      ! local data; in the parallel build the squared sums are added up over
      ! the matrix's para_env before the root is taken.
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a
      REAL(kind=dp)                            :: norm

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_frobenius_norm'

      INTEGER                                  :: handle, n_local
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: local_a
      REAL(kind=dp), EXTERNAL                  :: ddot
#if defined(__parallel)
      TYPE(mp_comm_type)                       :: comm
#endif

      CALL timeset(routinen, handle)

      local_a => matrix_a%local_data
      n_local = SIZE(local_a, 1)*SIZE(local_a, 2)

      ! local contribution: dot product of the flattened block with itself
      norm = 0.0_dp
      norm = ddot(n_local, local_a(1, 1), 1, local_a(1, 1), 1)

#if defined(__parallel)
      comm = matrix_a%matrix_struct%para_env
      CALL comm%sum(norm)
#endif

      norm = SQRT(norm)

      CALL timestop(handle)

   END FUNCTION cp_fm_frobenius_norm


! **************************************************************************************************

!> \brief performs a rank-k update of a symmetric matrix_c

!>         matrix_c = beta * matrix_c + alpha * matrix_a * transpose ( matrix_a )

!> \param uplo : 'U'   ('L')

!> \param trans : 'N'  ('T')

!> \param k : number of cols to use in matrix_a

!>      ia,ja ::  1,1 (could be used for selecting subblock of a)

!> \param alpha ...

!> \param matrix_a ...

!> \param ia ...

!> \param ja ...

!> \param beta ...

!> \param matrix_c ...

!> \author Matthias Krack

! **************************************************************************************************


   SUBROUTINE cp_fm_syrk(uplo, trans, k, alpha, matrix_a, ia, ja, beta, matrix_c)
      ! Thin wrapper around pdsyrk (parallel build) or dsyrk (serial build).
      ! The order n of the update is the global row count of matrix_c; ia and ja
      ! give the first row and column of the part of matrix_a that is used.
      CHARACTER(LEN=1), INTENT(IN)             :: uplo, trans
      INTEGER, INTENT(IN)                      :: k
      REAL(kind=dp), INTENT(IN)                :: alpha
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a
      INTEGER, INTENT(IN)                      :: ia, ja
      REAL(kind=dp), INTENT(IN)                :: beta
      TYPE(cp_fm_type), INTENT(IN)             :: matrix_c

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_syrk'

      INTEGER                                  :: handle, order
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: loc_a, loc_c
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc_a, desc_c
#else
      INTEGER                                  :: ld_a, ld_c
#endif

      CALL timeset(routinen, handle)

      loc_a => matrix_a%local_data
      loc_c => matrix_c%local_data
      order = matrix_c%matrix_struct%nrow_global

#if defined(__parallel)

      desc_a(:) = matrix_a%matrix_struct%descriptor(:)
      desc_c(:) = matrix_c%matrix_struct%descriptor(:)

      ! the sub-matrix origin is handed over as global indices
      CALL pdsyrk(uplo, trans, order, k, alpha, &
                  loc_a(1, 1), ia, ja, desc_a, &
                  beta, &
                  loc_c(1, 1), 1, 1, desc_c)

#else

      ld_a = SIZE(loc_a, 1)
      ld_c = SIZE(loc_c, 1)

      ! the sub-matrix origin is handed over as the first array element
      CALL dsyrk(uplo, trans, order, k, alpha, &
                 loc_a(ia, ja), ld_a, &
                 beta, &
                 loc_c(1, 1), ld_c)

#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_syrk


! **************************************************************************************************

!> \brief computes the schur product of two matrices

!>       c_ij = a_ij * b_ij

!> \param matrix_a first factor

!> \param matrix_b second factor

!> \param matrix_c result; its local data is overwritten with the element-wise product

!> \author Joost VandeVondele

! **************************************************************************************************


   SUBROUTINE cp_fm_schur_product(matrix_a, matrix_b, matrix_c)

      TYPE(cp_fm_type), INTENT(IN)                       :: matrix_a, matrix_b, matrix_c

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_schur_product'

      INTEGER                                            :: handle, icol_local, irow_local, mypcol, &
                                                            myprow, ncol_local, nrow_local
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: a, b, c
      TYPE(cp_blacs_env_type), POINTER                   :: context

      CALL timeset(routineN, handle)

      ! row/column coordinates of this process, as stored in the context of matrix_a
      context => matrix_a%matrix_struct%context
      myprow = context%mepos(1)
      mypcol = context%mepos(2)

      a => matrix_a%local_data
      b => matrix_b%local_data
      c => matrix_c%local_data

      ! extent of the local block is taken from matrix_a only; matrix_b and
      ! matrix_c are not checked against it
      nrow_local = matrix_a%matrix_struct%nrow_locals(myprow)
      ncol_local = matrix_a%matrix_struct%ncol_locals(mypcol)

      ! element-wise product; the first index runs in the inner loop
      DO icol_local = 1, ncol_local
         DO irow_local = 1, nrow_local
            c(irow_local, icol_local) = a(irow_local, icol_local)*b(irow_local, icol_local)
         END DO
      END DO

      CALL timestop(handle)

   END SUBROUTINE cp_fm_schur_product


! **************************************************************************************************

!> \brief returns the trace of matrix_a^T matrix_b, i.e

!>      sum_{i,j}(matrix_a(i,j)*matrix_b(i,j))

!> \param matrix_a a matrix

!> \param matrix_b another matrix

!> \param trace the computed trace, summed over all processes of the communicator of matrix_a

!> \par History

!>      11.06.2001 Creation (Matthias Krack)

!>      12.2002 added doc [fawzi]

!> \author Matthias Krack

!> \note

!>      note the transposition of matrix_a!

! **************************************************************************************************


   SUBROUTINE cp_fm_trace_a0b0t0(matrix_a, matrix_b, trace)

      TYPE(cp_fm_type), INTENT(IN)                       :: matrix_a, matrix_b
      REAL(KIND=dp), INTENT(OUT)                         :: trace

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a0b0t0'

      INTEGER                                            :: handle, mypcol, myprow, ncol_local, &
                                                            nrow_local
      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: a, b
      REAL(KIND=sp), DIMENSION(:, :), POINTER            :: a_sp, b_sp
      TYPE(cp_blacs_env_type), POINTER                   :: context
      TYPE(mp_comm_type)                                 :: group

      CALL timeset(routineN, handle)

      ! row/column coordinates of this process, as stored in the context of matrix_a
      context => matrix_a%matrix_struct%context
      myprow = context%mepos(1)
      mypcol = context%mepos(2)

      ! communicator used for the final reduction
      group = matrix_a%matrix_struct%para_env

      a => matrix_a%local_data
      b => matrix_b%local_data

      a_sp => matrix_a%local_data_sp
      b_sp => matrix_b%local_data_sp

      ! only the part of the local blocks that both matrices have in common is used
      nrow_local = MIN(matrix_a%matrix_struct%nrow_locals(myprow), &
                       matrix_b%matrix_struct%nrow_locals(myprow))
      ncol_local = MIN(matrix_a%matrix_struct%ncol_locals(mypcol), &
                       matrix_b%matrix_struct%ncol_locals(mypcol))

      ! Local contribution. Whenever single-precision data is involved the products
      ! are formed explicitly and added up with accurate_sum; only the pure
      ! double-precision case goes through accurate_dot_product.
      IF (matrix_a%use_sp .AND. matrix_b%use_sp) THEN
         ! the product is formed in single precision and converted afterwards
         trace = accurate_sum(REAL(a_sp(1:nrow_local, 1:ncol_local)* &
                                   b_sp(1:nrow_local, 1:ncol_local), dp))
      ELSE IF (matrix_a%use_sp .AND. .NOT. matrix_b%use_sp) THEN
         trace = accurate_sum(REAL(a_sp(1:nrow_local, 1:ncol_local), dp)* &
                              b(1:nrow_local, 1:ncol_local))
      ELSE IF (.NOT. matrix_a%use_sp .AND. matrix_b%use_sp) THEN
         trace = accurate_sum(a(1:nrow_local, 1:ncol_local)* &
                              REAL(b_sp(1:nrow_local, 1:ncol_local), dp))
      ELSE
         trace = accurate_dot_product(a(1:nrow_local, 1:ncol_local), &
                                      b(1:nrow_local, 1:ncol_local))
      END IF

      ! add up the contributions of all processes
      CALL group%sum(trace)

      CALL timestop(handle)

   END SUBROUTINE cp_fm_trace_a0b0t0


! **************************************************************************************************

!> \brief Compute trace(k) = Tr (matrix_a(k)^T matrix_b) for each pair of matrices A_k and B.

!> \param matrix_a list of A matrices

!> \param matrix_b B matrix

!> \param trace    computed traces

!> \par History

!>    * 08.2018 forked from cp_fm_trace() [Sergey Chulkov]

!> \note \parblock

!>      Computing the trace requires collective communication between involved MPI processes

!>      that implies a synchronisation point between them. The aim of this subroutine is to reduce

!>      the amount of time wasted in such synchronisation by performing one large collective

!>      operation which involves all the matrices in question.

!>

!>      The subroutine's suffix reflects dimensionality of dummy arrays; 'a1b0t1' means that

!>      the dummy variables 'matrix_a' and 'trace' are 1-dimensional arrays, while the variable

!>      'matrix_b' is a single matrix.

!>      \endparblock

! **************************************************************************************************


      SUBROUTINE cp_fm_trace_a1b0t1_a(matrix_a, matrix_b, trace)
         TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: matrix_a
         TYPE(cp_fm_type), INTENT(IN)                       :: matrix_b
         REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace

         CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b0t1_a'

         INTEGER                                            :: handle, idx, n_matrices, &
                                                               ncols_local, nrows_local
         LOGICAL                                            :: a_is_sp, b_is_sp
         REAL(kind=dp), DIMENSION(:, :), POINTER            :: blk_a, blk_b
         REAL(kind=sp), DIMENSION(:, :), POINTER            :: blk_a_sp, blk_b_sp
         TYPE(mp_comm_type)                                 :: group

         CALL timeset(routineN, handle)

         ! one trace per A matrix
         n_matrices = SIZE(trace)
         cpassert(SIZE(matrix_a) == n_matrices)

         ! The local block of B is common to all traces: select it once, in the
         ! precision B is stored in. Its local extent is also applied to every A(i).
         b_is_sp = matrix_b%use_sp
         CALL cp_fm_get_info(matrix_b, nrow_local=nrows_local, ncol_local=ncols_local)

         IF (.NOT. b_is_sp) THEN
            blk_b => matrix_b%local_data(1:nrows_local, 1:ncols_local)
         ELSE
            blk_b_sp => matrix_b%local_data_sp(1:nrows_local, 1:ncols_local)
         END IF

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(idx, blk_a, blk_a_sp, a_is_sp), &
!$OMP             SHARED(blk_b, blk_b_sp, matrix_a, matrix_b), &
!$OMP             SHARED(ncols_local, nrows_local, n_matrices, trace, b_is_sp)
         DO idx = 1, n_matrices

            a_is_sp = matrix_a(idx)%use_sp

            ! assume that the matrices A(i) and B have identical shapes and distribution schemes
            IF (a_is_sp .NEQV. b_is_sp) THEN
               ! mixed single/double precision pairs are not supported here
               cpabort("Matrices A and B are of different types")
            ELSE IF (a_is_sp) THEN
               blk_a_sp => matrix_a(idx)%local_data_sp(1:nrows_local, 1:ncols_local)
               trace(idx) = accurate_dot_product(blk_a_sp, blk_b_sp)
            ELSE
               blk_a => matrix_a(idx)%local_data(1:nrows_local, 1:ncols_local)
               trace(idx) = accurate_dot_product(blk_a, blk_b)
            END IF
         END DO
!$OMP END PARALLEL DO

         ! a single reduction over the communicator of B covers all traces
         group = matrix_b%matrix_struct%para_env
         CALL group%sum(trace)

         CALL timestop(handle)

      END SUBROUTINE cp_fm_trace_a1b0t1_a


      SUBROUTINE cp_fm_trace_a1b0t1_p(matrix_a, matrix_b, trace)
         TYPE(cp_fm_p_type), DIMENSION(:), INTENT(IN)       :: matrix_a
         TYPE(cp_fm_type), INTENT(IN)                       :: matrix_b
         REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace

         CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b0t1_p'

         INTEGER                                            :: handle, idx, n_matrices, &
                                                               ncols_local, nrows_local
         LOGICAL                                            :: a_is_sp, b_is_sp
         REAL(kind=dp), DIMENSION(:, :), POINTER            :: blk_a, blk_b
         REAL(kind=sp), DIMENSION(:, :), POINTER            :: blk_a_sp, blk_b_sp
         TYPE(mp_comm_type)                                 :: group

         ! Variant of the a1b0t1 trace whose A matrices are given as an array of
         ! pointer wrappers (accessed through the %matrix component).

         CALL timeset(routineN, handle)

         ! one trace per A matrix
         n_matrices = SIZE(trace)
         cpassert(SIZE(matrix_a) == n_matrices)

         ! The local block of B is common to all traces: select it once, in the
         ! precision B is stored in. Its local extent is also applied to every A(i).
         b_is_sp = matrix_b%use_sp
         CALL cp_fm_get_info(matrix_b, nrow_local=nrows_local, ncol_local=ncols_local)

         IF (.NOT. b_is_sp) THEN
            blk_b => matrix_b%local_data(1:nrows_local, 1:ncols_local)
         ELSE
            blk_b_sp => matrix_b%local_data_sp(1:nrows_local, 1:ncols_local)
         END IF

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(idx, blk_a, blk_a_sp, a_is_sp), &
!$OMP             SHARED(blk_b, blk_b_sp, matrix_a, matrix_b), &
!$OMP             SHARED(ncols_local, nrows_local, n_matrices, trace, b_is_sp)
         DO idx = 1, n_matrices

            a_is_sp = matrix_a(idx)%matrix%use_sp

            ! assume that the matrices A(i) and B have identical shapes and distribution schemes
            IF (a_is_sp .NEQV. b_is_sp) THEN
               ! mixed single/double precision pairs are not supported here
               cpabort("Matrices A and B are of different types")
            ELSE IF (a_is_sp) THEN
               blk_a_sp => matrix_a(idx)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
               trace(idx) = accurate_dot_product(blk_a_sp, blk_b_sp)
            ELSE
               blk_a => matrix_a(idx)%matrix%local_data(1:nrows_local, 1:ncols_local)
               trace(idx) = accurate_dot_product(blk_a, blk_b)
            END IF
         END DO
!$OMP END PARALLEL DO

         ! a single reduction over the communicator of B covers all traces
         group = matrix_b%matrix_struct%para_env
         CALL group%sum(trace)

         CALL timestop(handle)

      END SUBROUTINE cp_fm_trace_a1b0t1_p


! **************************************************************************************************

!> \brief Compute trace(k) = Tr (matrix_a(k)^T matrix_b(k)) for each pair of matrices A_k and B_k.

!> \param matrix_a list of A matrices

!> \param matrix_b list of B matrices

!> \param trace    computed traces

!> \param accurate use accurate_dot_product() if .TRUE. (default); otherwise a plain SUM of the element-wise product

!> \par History

!>    * 11.2016 forked from cp_fm_trace() [Sergey Chulkov]

!> \note \parblock

!>      Computing the trace requires collective communication between involved MPI processes

!>      that implies a synchronisation point between them. The aim of this subroutine is to reduce

!>      the amount of time wasted in such synchronisation by performing one large collective

!>      operation which involves all the matrices in question.

!>

!>      The subroutine's suffix reflects dimensionality of dummy arrays; 'a1b1t1' means that

!>      all dummy variables (matrix_a, matrix_b, and trace) are 1-dimensional arrays.

!>      \endparblock

! **************************************************************************************************


         SUBROUTINE cp_fm_trace_a1b1t1_aa(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: matrix_a
            TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: matrix_b
            REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b1t1_aa'

            INTEGER                                            :: handle, imatrix, n_matrices, &
                                                                  ncols_local, nrows_local
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            CALL timeset(routineN, handle)

            ! one trace per (A(i), B(i)) pair
            n_matrices = SIZE(trace)
            cpassert(SIZE(matrix_a) == n_matrices)
            cpassert(SIZE(matrix_b) == n_matrices)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(imatrix, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, n_matrices, trace, use_accurate_sum)
            DO imatrix = 1, n_matrices
               ! the local extent of A(i) is applied to B(i) as well
               CALL cp_fm_get_info(matrix_a(imatrix), nrow_local=nrows_local, ncol_local=ncols_local)

               use_sp_a = matrix_a(imatrix)%use_sp
               use_sp_b = matrix_b(imatrix)%use_sp

               ! assume that the matrices A(i) and B(i) have identical shapes and distribution schemes
               IF (use_sp_a .AND. use_sp_b) THEN
                  ldata_a_sp => matrix_a(imatrix)%local_data_sp(1:nrows_local, 1:ncols_local)
                  ldata_b_sp => matrix_b(imatrix)%local_data_sp(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a_sp, ldata_b_sp)
                  ELSE
                     trace(imatrix) = SUM(ldata_a_sp*ldata_b_sp)
                  END IF
               ELSE IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                  ldata_a => matrix_a(imatrix)%local_data(1:nrows_local, 1:ncols_local)
                  ldata_b => matrix_b(imatrix)%local_data(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a, ldata_b)
                  ELSE
                     trace(imatrix) = SUM(ldata_a*ldata_b)
                  END IF
               ELSE
                  cpabort("Matrices A and B are of different types")
               END IF
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from the first A matrix. With empty lists there
            ! is no such matrix and nothing to reduce, so the reduction is skipped.
            IF (n_matrices > 0) THEN
               group = matrix_a(1)%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_trace_a1b1t1_aa


         SUBROUTINE cp_fm_trace_a1b1t1_ap(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: matrix_a
            TYPE(cp_fm_p_type), DIMENSION(:), INTENT(IN)       :: matrix_b
            REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b1t1_ap'

            INTEGER                                            :: handle, imatrix, n_matrices, &
                                                                  ncols_local, nrows_local
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            ! Variant with A given as an array of matrices and B as an array of
            ! pointer wrappers (accessed through the %matrix component).

            CALL timeset(routineN, handle)

            ! one trace per (A(i), B(i)) pair
            n_matrices = SIZE(trace)
            cpassert(SIZE(matrix_a) == n_matrices)
            cpassert(SIZE(matrix_b) == n_matrices)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(imatrix, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, n_matrices, trace, use_accurate_sum)
            DO imatrix = 1, n_matrices
               ! the local extent of A(i) is applied to B(i) as well
               CALL cp_fm_get_info(matrix_a(imatrix), nrow_local=nrows_local, ncol_local=ncols_local)

               use_sp_a = matrix_a(imatrix)%use_sp
               use_sp_b = matrix_b(imatrix)%matrix%use_sp

               ! assume that the matrices A(i) and B(i) have identical shapes and distribution schemes
               IF (use_sp_a .AND. use_sp_b) THEN
                  ldata_a_sp => matrix_a(imatrix)%local_data_sp(1:nrows_local, 1:ncols_local)
                  ldata_b_sp => matrix_b(imatrix)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a_sp, ldata_b_sp)
                  ELSE
                     trace(imatrix) = SUM(ldata_a_sp*ldata_b_sp)
                  END IF
               ELSE IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                  ldata_a => matrix_a(imatrix)%local_data(1:nrows_local, 1:ncols_local)
                  ldata_b => matrix_b(imatrix)%matrix%local_data(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a, ldata_b)
                  ELSE
                     trace(imatrix) = SUM(ldata_a*ldata_b)
                  END IF
               ELSE
                  cpabort("Matrices A and B are of different types")
               END IF
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from the first A matrix. With empty lists there
            ! is no such matrix and nothing to reduce, so the reduction is skipped.
            IF (n_matrices > 0) THEN
               group = matrix_a(1)%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_trace_a1b1t1_ap


         SUBROUTINE cp_fm_trace_a1b1t1_pa(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_p_type), DIMENSION(:), INTENT(IN)       :: matrix_a
            TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: matrix_b
            REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b1t1_pa'

            INTEGER                                            :: handle, imatrix, n_matrices, &
                                                                  ncols_local, nrows_local
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            ! Variant with A given as an array of pointer wrappers (accessed through
            ! the %matrix component) and B as an array of matrices.

            CALL timeset(routineN, handle)

            ! one trace per (A(i), B(i)) pair
            n_matrices = SIZE(trace)
            cpassert(SIZE(matrix_a) == n_matrices)
            cpassert(SIZE(matrix_b) == n_matrices)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(imatrix, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, n_matrices, trace, use_accurate_sum)
            DO imatrix = 1, n_matrices
               ! the local extent of A(i) is applied to B(i) as well
               CALL cp_fm_get_info(matrix_a(imatrix)%matrix, nrow_local=nrows_local, ncol_local=ncols_local)

               use_sp_a = matrix_a(imatrix)%matrix%use_sp
               use_sp_b = matrix_b(imatrix)%use_sp

               ! assume that the matrices A(i) and B(i) have identical shapes and distribution schemes
               IF (use_sp_a .AND. use_sp_b) THEN
                  ldata_a_sp => matrix_a(imatrix)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                  ldata_b_sp => matrix_b(imatrix)%local_data_sp(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a_sp, ldata_b_sp)
                  ELSE
                     trace(imatrix) = SUM(ldata_a_sp*ldata_b_sp)
                  END IF
               ELSE IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                  ldata_a => matrix_a(imatrix)%matrix%local_data(1:nrows_local, 1:ncols_local)
                  ldata_b => matrix_b(imatrix)%local_data(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a, ldata_b)
                  ELSE
                     trace(imatrix) = SUM(ldata_a*ldata_b)
                  END IF
               ELSE
                  cpabort("Matrices A and B are of different types")
               END IF
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from the first A matrix. With empty lists there
            ! is no such matrix and nothing to reduce, so the reduction is skipped.
            IF (n_matrices > 0) THEN
               group = matrix_a(1)%matrix%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_trace_a1b1t1_pa


         SUBROUTINE cp_fm_trace_a1b1t1_pp(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_p_type), DIMENSION(:), INTENT(IN)       :: matrix_a
            TYPE(cp_fm_p_type), DIMENSION(:), INTENT(IN)       :: matrix_b
            REAL(kind=dp), DIMENSION(:), INTENT(OUT)           :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_trace_a1b1t1_pp'

            INTEGER                                            :: handle, imatrix, n_matrices, &
                                                                  ncols_local, nrows_local
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            ! Variant with both A and B given as arrays of pointer wrappers
            ! (accessed through the %matrix component).

            CALL timeset(routineN, handle)

            ! one trace per (A(i), B(i)) pair
            n_matrices = SIZE(trace)
            cpassert(SIZE(matrix_a) == n_matrices)
            cpassert(SIZE(matrix_b) == n_matrices)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(imatrix, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, n_matrices, trace, use_accurate_sum)
            DO imatrix = 1, n_matrices
               ! the local extent of A(i) is applied to B(i) as well
               CALL cp_fm_get_info(matrix_a(imatrix)%matrix, nrow_local=nrows_local, ncol_local=ncols_local)

               use_sp_a = matrix_a(imatrix)%matrix%use_sp
               use_sp_b = matrix_b(imatrix)%matrix%use_sp

               ! assume that the matrices A(i) and B(i) have identical shapes and distribution schemes
               IF (use_sp_a .AND. use_sp_b) THEN
                  ldata_a_sp => matrix_a(imatrix)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                  ldata_b_sp => matrix_b(imatrix)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a_sp, ldata_b_sp)
                  ELSE
                     trace(imatrix) = SUM(ldata_a_sp*ldata_b_sp)
                  END IF
               ELSE IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                  ldata_a => matrix_a(imatrix)%matrix%local_data(1:nrows_local, 1:ncols_local)
                  ldata_b => matrix_b(imatrix)%matrix%local_data(1:nrows_local, 1:ncols_local)
                  IF (use_accurate_sum) THEN
                     trace(imatrix) = accurate_dot_product(ldata_a, ldata_b)
                  ELSE
                     trace(imatrix) = SUM(ldata_a*ldata_b)
                  END IF
               ELSE
                  cpabort("Matrices A and B are of different types")
               END IF
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from the first A matrix. With empty lists there
            ! is no such matrix and nothing to reduce, so the reduction is skipped.
            IF (n_matrices > 0) THEN
               group = matrix_a(1)%matrix%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_trace_a1b1t1_pp


! **************************************************************************************************

!> \brief Compute trace(i,j) = \sum_k Tr (matrix_a(k,i)^T matrix_b(k,j)).

!> \param matrix_a list of A matrices

!> \param matrix_b list of B matrices

!> \param trace    computed traces

!> \param accurate use accurate_dot_product() if .TRUE. (default); otherwise a plain SUM of the element-wise product

! **************************************************************************************************


         SUBROUTINE cp_fm_contracted_trace_a2b2t2_aa(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_type), DIMENSION(:, :), INTENT(IN)      :: matrix_a
            TYPE(cp_fm_type), DIMENSION(:, :), INTENT(IN)      :: matrix_b
            REAL(kind=dp), DIMENSION(:, :), INTENT(OUT)        :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_contracted_trace_a2b2t2_aa'

            INTEGER                                            :: handle, ia, ib, iz, na, nb, ncols_local, &
                                                                  nrows_local, nz
            INTEGER(kind=int_8)                                :: ib8, itrace, na8, ntraces
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp)                                      :: t
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            CALL timeset(routineN, handle)

            ! the first dimension of both lists is the contracted one
            nz = SIZE(matrix_a, 1)
            cpassert(SIZE(matrix_b, 1) == nz)

            na = SIZE(matrix_a, 2)
            nb = SIZE(matrix_b, 2)
            cpassert(SIZE(trace, 1) == na)
            cpassert(SIZE(trace, 2) == nb)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

            ! here we use one running index (itrace) instead of two (ia, ib) in order to
            ! improve load balance between shared-memory threads.
            ! The product is formed in 64-bit arithmetic so that it cannot overflow
            ! the default integer kind before being stored in ntraces.
            na8 = INT(na, kind=int_8)
            ntraces = na8*INT(nb, kind=int_8)

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(ia, ib, ib8, itrace, iz, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, t, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, na, na8, nb, ntraces, nz, trace, use_accurate_sum)
            DO itrace = 1, ntraces
               ! recover (ia, ib) from itrace = (ib - 1)*na + ia
               ib8 = (itrace - 1)/na8
               ia = INT(itrace - ib8*na8)
               ib = INT(ib8) + 1

               t = 0.0_dp
               DO iz = 1, nz
                  ! the local extent of A(iz, ia) is applied to B(iz, ib) as well
                  CALL cp_fm_get_info(matrix_a(iz, ia), nrow_local=nrows_local, ncol_local=ncols_local)
                  use_sp_a = matrix_a(iz, ia)%use_sp
                  use_sp_b = matrix_b(iz, ib)%use_sp

                  ! assume that the matrices A(iz, ia) and B(iz, ib) have identical shapes and distribution schemes
                  IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                     ldata_a => matrix_a(iz, ia)%local_data(1:nrows_local, 1:ncols_local)
                     ldata_b => matrix_b(iz, ib)%local_data(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a, ldata_b)
                     ELSE
                        t = t + SUM(ldata_a*ldata_b)
                     END IF
                  ELSE IF (use_sp_a .AND. use_sp_b) THEN
                     ldata_a_sp => matrix_a(iz, ia)%local_data_sp(1:nrows_local, 1:ncols_local)
                     ldata_b_sp => matrix_b(iz, ib)%local_data_sp(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a_sp, ldata_b_sp)
                     ELSE
                        t = t + SUM(ldata_a_sp*ldata_b_sp)
                     END IF
                  ELSE
                     cpabort("Matrices A and B are of different types")
                  END IF
               END DO
               trace(ia, ib) = t
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from matrix_a(1, 1). If matrix_a has no elements
            ! every local contribution is zero (or trace is empty), so the reduction
            ! is skipped instead of indexing out of bounds.
            IF (SIZE(matrix_a) > 0) THEN
               group = matrix_a(1, 1)%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_contracted_trace_a2b2t2_aa


         SUBROUTINE cp_fm_contracted_trace_a2b2t2_ap(matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_type), DIMENSION(:, :), INTENT(IN)      :: matrix_a
            TYPE(cp_fm_p_type), DIMENSION(:, :), INTENT(IN)    :: matrix_b
            REAL(kind=dp), DIMENSION(:, :), INTENT(OUT)        :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_contracted_trace_a2b2t2_ap'

            INTEGER                                            :: handle, ia, ib, iz, na, nb, ncols_local, &
                                                                  nrows_local, nz
            INTEGER(kind=int_8)                                :: ib8, itrace, na8, ntraces
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp)                                      :: t
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            ! Variant with A given as a 2-D array of matrices and B as a 2-D array of
            ! pointer wrappers (accessed through the %matrix component):
            ! trace(ia, ib) = sum over iz of the element-wise product of A(iz, ia) and B(iz, ib).

            CALL timeset(routineN, handle)

            ! the first dimension of both lists is the contracted one
            nz = SIZE(matrix_a, 1)
            cpassert(SIZE(matrix_b, 1) == nz)

            na = SIZE(matrix_a, 2)
            nb = SIZE(matrix_b, 2)
            cpassert(SIZE(trace, 1) == na)
            cpassert(SIZE(trace, 2) == nb)

            ! accurate summation is the default; the caller may opt out
            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

            ! here we use one running index (itrace) instead of two (ia, ib) in order to
            ! improve load balance between shared-memory threads.
            ! The product is formed in 64-bit arithmetic so that it cannot overflow
            ! the default integer kind before being stored in ntraces.
            na8 = INT(na, kind=int_8)
            ntraces = na8*INT(nb, kind=int_8)

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(ia, ib, ib8, itrace, iz, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, t, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, na, na8, nb, ntraces, nz, trace, use_accurate_sum)
            DO itrace = 1, ntraces
               ! recover (ia, ib) from itrace = (ib - 1)*na + ia
               ib8 = (itrace - 1)/na8
               ia = INT(itrace - ib8*na8)
               ib = INT(ib8) + 1

               t = 0.0_dp
               DO iz = 1, nz
                  ! the local extent of A(iz, ia) is applied to B(iz, ib) as well
                  CALL cp_fm_get_info(matrix_a(iz, ia), nrow_local=nrows_local, ncol_local=ncols_local)
                  use_sp_a = matrix_a(iz, ia)%use_sp
                  use_sp_b = matrix_b(iz, ib)%matrix%use_sp

                  ! assume that the matrices A(iz, ia) and B(iz, ib) have identical shapes and distribution schemes
                  IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                     ldata_a => matrix_a(iz, ia)%local_data(1:nrows_local, 1:ncols_local)
                     ldata_b => matrix_b(iz, ib)%matrix%local_data(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a, ldata_b)
                     ELSE
                        t = t + SUM(ldata_a*ldata_b)
                     END IF
                  ELSE IF (use_sp_a .AND. use_sp_b) THEN
                     ldata_a_sp => matrix_a(iz, ia)%local_data_sp(1:nrows_local, 1:ncols_local)
                     ldata_b_sp => matrix_b(iz, ib)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a_sp, ldata_b_sp)
                     ELSE
                        t = t + SUM(ldata_a_sp*ldata_b_sp)
                     END IF
                  ELSE
                     cpabort("Matrices A and B are of different types")
                  END IF
               END DO
               trace(ia, ib) = t
            END DO
!$OMP END PARALLEL DO

            ! The communicator is taken from matrix_a(1, 1). If matrix_a has no elements
            ! every local contribution is zero (or trace is empty), so the reduction
            ! is skipped instead of indexing out of bounds.
            IF (SIZE(matrix_a) > 0) THEN
               group = matrix_a(1, 1)%matrix_struct%para_env
               CALL group%sum(trace)
            END IF

            CALL timestop(handle)

         END SUBROUTINE cp_fm_contracted_trace_a2b2t2_ap


         !> \brief Contracted trace of two 2-D arrays of full matrices
         !>        (variant: matrix_a holds cp_fm_p_type wrappers, matrix_b holds cp_fm_type):
         !>           trace(ia, ib) = SUM_{iz} SUM_{i,j} A(iz, ia)_{ij} * B(iz, ib)_{ij}
         !> \param matrix_a array of matrices A; SIZE(matrix_a, 1) = nz, SIZE(matrix_a, 2) = na
         !> \param matrix_b array of matrices B; SIZE(matrix_b, 1) = nz, SIZE(matrix_b, 2) = nb
         !> \param trace    result of shape (na, nb); the locally accumulated values are finally
         !>                 combined with group%sum over the para_env of matrix_a(1, 1)
         !> \param accurate use accurate_dot_product (default, .TRUE.) or a plain SUM (.FALSE.)
         !> \note  A(iz, ia) and B(iz, ib) are assumed to have identical shapes and distributions.
         !>        Mixing single- and double-precision matrices aborts.
         SUBROUTINE cp_fm_contracted_trace_a2b2t2_pa (matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_p_type), DIMENSION(:, :), INTENT(IN)   :: matrix_a
            TYPE(cp_fm_type), DIMENSION(:, :), INTENT(IN)   :: matrix_b
            REAL(kind=dp), DIMENSION(:, :), INTENT(OUT)        :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_contracted_trace_a2b2t2_pa'

            INTEGER                                            :: handle, ia, ib, iz, na, nb, ncols_local, &
                                                                  nrows_local, nz
            INTEGER(kind=int_8)                                :: ib8, itrace, na8, ntraces
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp)                                      :: t
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            CALL timeset(routineN, handle)

            ! the contracted (first) dimension must agree between A and B
            nz = SIZE(matrix_a, 1)
            cpassert(SIZE(matrix_b, 1) == nz)

            na = SIZE(matrix_a, 2)
            nb = SIZE(matrix_b, 2)
            cpassert(SIZE(trace, 1) == na)
            cpassert(SIZE(trace, 2) == nb)

            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

            ! here we use one running index (itrace) instead of two (ia, ib) in order to
            ! improve load balance between shared-memory threads.
            ! The product is formed in 64-bit arithmetic: na*nb evaluated with default integers
            ! could overflow before being stored in the int_8 loop bound.
            na8 = INT(na, kind=int_8)
            ntraces = na8*INT(nb, kind=int_8)

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(ia, ib, ib8, itrace, iz, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, t, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, na, na8, nb, ntraces, nz, trace, use_accurate_sum)
            DO itrace = 1, ntraces
               ! unfold the running index: itrace = (ib - 1)*na + ia
               ib8 = (itrace - 1)/na8
               ia = INT(itrace - ib8*na8)
               ib = INT(ib8) + 1

               t = 0.0_dp
               DO iz = 1, nz
                  CALL cp_fm_get_info(matrix_a(iz, ia)%matrix, nrow_local=nrows_local, ncol_local=ncols_local)
                  use_sp_a = matrix_a(iz, ia)%matrix%use_sp
                  use_sp_b = matrix_b(iz, ib)%use_sp

                  ! assume that the matrices A(iz, ia) and B(iz, ib) have identical shapes and distribution schemes
                  IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                     ! both matrices in double precision
                     ldata_a => matrix_a(iz, ia)%matrix%local_data(1:nrows_local, 1:ncols_local)
                     ldata_b => matrix_b(iz, ib)%local_data(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a, ldata_b)
                     ELSE
                        t = t + SUM(ldata_a*ldata_b)
                     END IF
                  ELSE IF (use_sp_a .AND. use_sp_b) THEN
                     ! both matrices in single precision; the partial sum is accumulated in dp
                     ldata_a_sp => matrix_a(iz, ia)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                     ldata_b_sp => matrix_b(iz, ib)%local_data_sp(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a_sp, ldata_b_sp)
                     ELSE
                        t = t + SUM(ldata_a_sp*ldata_b_sp)
                     END IF
                  ELSE
                     cpabort("Matrices A and B are of different types")
                  END IF
               END DO
               trace(ia, ib) = t
            END DO
!$OMP END PARALLEL DO

            ! combine the per-process partial results
            group = matrix_a(1, 1)%matrix%matrix_struct%para_env
            CALL group%sum(trace)

            CALL timestop(handle)

         END SUBROUTINE cp_fm_contracted_trace_a2b2t2_pa


         !> \brief Contracted trace of two 2-D arrays of full matrices
         !>        (variant: both matrix_a and matrix_b hold cp_fm_p_type wrappers):
         !>           trace(ia, ib) = SUM_{iz} SUM_{i,j} A(iz, ia)_{ij} * B(iz, ib)_{ij}
         !> \param matrix_a array of matrices A; SIZE(matrix_a, 1) = nz, SIZE(matrix_a, 2) = na
         !> \param matrix_b array of matrices B; SIZE(matrix_b, 1) = nz, SIZE(matrix_b, 2) = nb
         !> \param trace    result of shape (na, nb); the locally accumulated values are finally
         !>                 combined with group%sum over the para_env of matrix_a(1, 1)
         !> \param accurate use accurate_dot_product (default, .TRUE.) or a plain SUM (.FALSE.)
         !> \note  A(iz, ia) and B(iz, ib) are assumed to have identical shapes and distributions.
         !>        Mixing single- and double-precision matrices aborts.
         SUBROUTINE cp_fm_contracted_trace_a2b2t2_pp (matrix_a, matrix_b, trace, accurate)
            TYPE(cp_fm_p_type), DIMENSION(:, :), INTENT(IN)   :: matrix_a
            TYPE(cp_fm_p_type), DIMENSION(:, :), INTENT(IN)   :: matrix_b
            REAL(kind=dp), DIMENSION(:, :), INTENT(OUT)        :: trace
            LOGICAL, INTENT(IN), OPTIONAL                      :: accurate

            CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_contracted_trace_a2b2t2_pp'

            INTEGER                                            :: handle, ia, ib, iz, na, nb, ncols_local, &
                                                                  nrows_local, nz
            INTEGER(kind=int_8)                                :: ib8, itrace, na8, ntraces
            LOGICAL                                            :: use_accurate_sum, use_sp_a, use_sp_b
            REAL(kind=dp)                                      :: t
            REAL(kind=dp), DIMENSION(:, :), POINTER            :: ldata_a, ldata_b
            REAL(kind=sp), DIMENSION(:, :), POINTER            :: ldata_a_sp, ldata_b_sp
            TYPE(mp_comm_type)                                 :: group

            CALL timeset(routineN, handle)

            ! the contracted (first) dimension must agree between A and B
            nz = SIZE(matrix_a, 1)
            cpassert(SIZE(matrix_b, 1) == nz)

            na = SIZE(matrix_a, 2)
            nb = SIZE(matrix_b, 2)
            cpassert(SIZE(trace, 1) == na)
            cpassert(SIZE(trace, 2) == nb)

            use_accurate_sum = .TRUE.
            IF (PRESENT(accurate)) use_accurate_sum = accurate

            ! here we use one running index (itrace) instead of two (ia, ib) in order to
            ! improve load balance between shared-memory threads.
            ! The product is formed in 64-bit arithmetic: na*nb evaluated with default integers
            ! could overflow before being stored in the int_8 loop bound.
            na8 = INT(na, kind=int_8)
            ntraces = na8*INT(nb, kind=int_8)

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(ia, ib, ib8, itrace, iz, ldata_a, ldata_a_sp, ldata_b, ldata_b_sp, ncols_local), &
!$OMP             PRIVATE(nrows_local, t, use_sp_a, use_sp_b), &
!$OMP             SHARED(matrix_a, matrix_b, na, na8, nb, ntraces, nz, trace, use_accurate_sum)
            DO itrace = 1, ntraces
               ! unfold the running index: itrace = (ib - 1)*na + ia
               ib8 = (itrace - 1)/na8
               ia = INT(itrace - ib8*na8)
               ib = INT(ib8) + 1

               t = 0.0_dp
               DO iz = 1, nz
                  CALL cp_fm_get_info(matrix_a(iz, ia)%matrix, nrow_local=nrows_local, ncol_local=ncols_local)
                  use_sp_a = matrix_a(iz, ia)%matrix%use_sp
                  use_sp_b = matrix_b(iz, ib)%matrix%use_sp

                  ! assume that the matrices A(iz, ia) and B(iz, ib) have identical shapes and distribution schemes
                  IF (.NOT. use_sp_a .AND. .NOT. use_sp_b) THEN
                     ! both matrices in double precision
                     ldata_a => matrix_a(iz, ia)%matrix%local_data(1:nrows_local, 1:ncols_local)
                     ldata_b => matrix_b(iz, ib)%matrix%local_data(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a, ldata_b)
                     ELSE
                        t = t + SUM(ldata_a*ldata_b)
                     END IF
                  ELSE IF (use_sp_a .AND. use_sp_b) THEN
                     ! both matrices in single precision; the partial sum is accumulated in dp
                     ldata_a_sp => matrix_a(iz, ia)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                     ldata_b_sp => matrix_b(iz, ib)%matrix%local_data_sp(1:nrows_local, 1:ncols_local)
                     IF (use_accurate_sum) THEN
                        t = t + accurate_dot_product(ldata_a_sp, ldata_b_sp)
                     ELSE
                        t = t + SUM(ldata_a_sp*ldata_b_sp)
                     END IF
                  ELSE
                     cpabort("Matrices A and B are of different types")
                  END IF
               END DO
               trace(ia, ib) = t
            END DO
!$OMP END PARALLEL DO

            ! combine the per-process partial results
            group = matrix_a(1, 1)%matrix%matrix_struct%para_env
            CALL group%sum(trace)

            CALL timestop(handle)

         END SUBROUTINE cp_fm_contracted_trace_a2b2t2_pp


! **************************************************************************************************

!> \brief multiplies in place by a triangular matrix:

!>       matrix_b = alpha op(triangular_matrix) matrix_b

!>      or (if side='R')

!>       matrix_b = alpha matrix_b op(triangular_matrix)

!>      op(triangular_matrix) is:

!>       triangular_matrix (if transpose_tr=.false. and invert_tr=.false.)

!>       triangular_matrix^T (if transpose_tr=.true. and invert_tr=.false.)

!>       triangular_matrix^(-1) (if transpose_tr=.false. and invert_tr=.true.)

!>       triangular_matrix^(-T) (if transpose_tr=.true. and invert_tr=.true.)

!> \param triangular_matrix the triangular matrix that multiplies the other

!> \param matrix_b the matrix that gets multiplied and stores the result

!> \param side on which side of matrix_b stays op(triangular_matrix)

!>        (defaults to 'L')

!> \param transpose_tr if the triangular matrix should be transposed

!>        (defaults to false)

!> \param invert_tr if the triangular matrix should be inverted

!>        (defaults to false)

!> \param uplo_tr if triangular_matrix is stored in the upper ('U') or

!>        lower ('L') triangle (defaults to 'U')

!> \param unit_diag_tr if the diagonal elements of triangular_matrix should

!>        be assumed to be 1 (defaults to false)

!> \param n_rows the number of rows of the result (defaults to

!>        size(matrix_b,1))

!> \param n_cols the number of columns of the result (defaults to

!>        size(matrix_b,2))

!> \param alpha scalar prefactor applied to the product (defaults to 1.0)

!> \par History

!>      08.2002 created [fawzi]

!> \author Fawzi Mohamed

!> \note

!>      needs an mpi env

! **************************************************************************************************


   SUBROUTINE cp_fm_triangular_multiply(triangular_matrix, matrix_b, side, &
                                        transpose_tr, invert_tr, uplo_tr, unit_diag_tr, n_rows, n_cols, &
                                        alpha)
      TYPE(cp_fm_type), INTENT(IN)                       :: triangular_matrix, matrix_b
      CHARACTER, INTENT(IN), OPTIONAL                    :: side
      LOGICAL, INTENT(IN), OPTIONAL                      :: transpose_tr, invert_tr
      CHARACTER, INTENT(IN), OPTIONAL                    :: uplo_tr
      LOGICAL, INTENT(IN), OPTIONAL                      :: unit_diag_tr
      INTEGER, INTENT(IN), OPTIONAL                      :: n_rows, n_cols
      REAL(kind=dp), INTENT(IN), OPTIONAL                :: alpha

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_triangular_multiply'

      CHARACTER                                          :: diag_flag, side_flag, trans_flag, uplo_flag
      INTEGER                                            :: handle, ncol_b, nrow_b
      LOGICAL                                            :: solve
      REAL(kind=dp)                                      :: prefactor

      CALL timeset(routineN, handle)

      ! problem size: global shape of matrix_b unless the caller restricts it
      CALL cp_fm_get_info(matrix_b, nrow_global=nrow_b, ncol_global=ncol_b)
      IF (PRESENT(n_rows)) nrow_b = n_rows
      IF (PRESENT(n_cols)) ncol_b = n_cols

      ! character flags handed to the (P)BLAS routines, each with its documented default
      side_flag = 'L'
      IF (PRESENT(side)) side_flag = side

      uplo_flag = 'U'
      IF (PRESENT(uplo_tr)) uplo_flag = uplo_tr

      trans_flag = 'N'
      IF (PRESENT(transpose_tr)) trans_flag = MERGE('T', 'N', transpose_tr)

      diag_flag = 'N'
      IF (PRESENT(unit_diag_tr)) diag_flag = MERGE('U', 'N', unit_diag_tr)

      prefactor = 1.0_dp
      IF (PRESENT(alpha)) prefactor = alpha

      ! invert_tr selects the triangular solve (trsm) instead of the product (trmm)
      solve = .FALSE.
      IF (PRESENT(invert_tr)) solve = invert_tr

#if defined(__parallel)
      IF (solve) THEN
         CALL pdtrsm(side_flag, uplo_flag, trans_flag, diag_flag, nrow_b, ncol_b, prefactor, &
                     triangular_matrix%local_data(1, 1), 1, 1, &
                     triangular_matrix%matrix_struct%descriptor, &
                     matrix_b%local_data(1, 1), 1, 1, &
                     matrix_b%matrix_struct%descriptor(1))
      ELSE
         CALL pdtrmm(side_flag, uplo_flag, trans_flag, diag_flag, nrow_b, ncol_b, prefactor, &
                     triangular_matrix%local_data(1, 1), 1, 1, &
                     triangular_matrix%matrix_struct%descriptor, &
                     matrix_b%local_data(1, 1), 1, 1, &
                     matrix_b%matrix_struct%descriptor(1))
      END IF
#else
      ! serial build: the leading dimensions are the first extents of the local arrays
      IF (solve) THEN
         CALL dtrsm(side_flag, uplo_flag, trans_flag, diag_flag, nrow_b, ncol_b, prefactor, &
                    triangular_matrix%local_data(1, 1), &
                    SIZE(triangular_matrix%local_data, 1), &
                    matrix_b%local_data(1, 1), SIZE(matrix_b%local_data, 1))
      ELSE
         CALL dtrmm(side_flag, uplo_flag, trans_flag, diag_flag, nrow_b, ncol_b, prefactor, &
                    triangular_matrix%local_data(1, 1), &
                    SIZE(triangular_matrix%local_data, 1), &
                    matrix_b%local_data(1, 1), SIZE(matrix_b%local_data, 1))
      END IF
#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_triangular_multiply


! **************************************************************************************************

!> \brief scales a matrix

!>      matrix_a = alpha * matrix_a

!> \param alpha ...

!> \param matrix_a ...

!> \note

!>      use cp_fm_set_all to zero (avoids problems with nan)

! **************************************************************************************************


   SUBROUTINE cp_fm_scale(alpha, matrix_a)
      REAL(kind=dp), INTENT(IN)                          :: alpha
      TYPE(cp_fm_type), INTENT(IN)                       :: matrix_a

      CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_scale'

      INTEGER                                            :: handle, n_local
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: local_block

      CALL timeset(routineN, handle)

      ! Only the double-precision local data is touched: a single DSCAL call
      ! multiplies every locally stored element by alpha.
      local_block => matrix_a%local_data
      n_local = SIZE(local_block)
      CALL dscal(n_local, alpha, local_block, 1)

      CALL timestop(handle)

   END SUBROUTINE cp_fm_scale


! **************************************************************************************************

!> \brief transposes a matrix

!>      matrixt = matrix ^ T

!> \param matrix ...

!> \param matrixt ...

!> \note

!>      all matrix elements are transposed (see cp_fm_uplo_to_full to symmetrise a matrix)

! **************************************************************************************************


   SUBROUTINE cp_fm_transpose(matrix, matrixt)
      TYPE(cp_fm_type), INTENT(IN)             :: matrix, matrixt

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_transpose'

      INTEGER                                  :: handle, ncol_src, ncol_dst, &
                                                  nrow_src, nrow_dst
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: src, dst
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc_src, desc_dst
#elif !defined(__MKL)
      INTEGER                                  :: icol, irow
#endif

      ! the global shape of the target must be the swapped global shape of the source
      nrow_src = matrix%matrix_struct%nrow_global
      ncol_src = matrix%matrix_struct%ncol_global
      nrow_dst = matrixt%matrix_struct%nrow_global
      ncol_dst = matrixt%matrix_struct%ncol_global
      cpassert(nrow_src == ncol_dst)
      cpassert(nrow_dst == ncol_src)

      CALL timeset(routineN, handle)

      src => matrix%local_data
      dst => matrixt%local_data

#if defined(__parallel)
      ! distributed case: PBLAS transpose with beta = 0, i.e. the target is overwritten
      desc_src(:) = matrix%matrix_struct%descriptor(:)
      desc_dst(:) = matrixt%matrix_struct%descriptor(:)
      CALL pdtran(ncol_src, nrow_src, 1.0_dp, src(1, 1), 1, 1, desc_src, 0.0_dp, dst(1, 1), 1, 1, desc_dst)
#elif defined(__MKL)
      ! serial case with MKL: out-of-place transposed copy
      CALL mkl_domatcopy('C', 'T', nrow_src, ncol_src, 1.0_dp, src(1, 1), nrow_src, dst(1, 1), ncol_dst)
#else
      ! serial fallback: explicit element-wise transposition
      DO icol = 1, ncol_src
         DO irow = 1, nrow_src
            dst(icol, irow) = src(irow, icol)
         END DO
      END DO
#endif
      CALL timestop(handle)

   END SUBROUTINE cp_fm_transpose


! **************************************************************************************************

!> \brief given a triangular matrix according to uplo, computes the corresponding full matrix

!> \param matrix the triangular matrix as input, the full matrix as output

!> \param work a matrix of the same size as matrix

!> \param uplo triangular format; defaults to 'U'

!> \author Matthias Krack

!> \note

!>       the opposite triangular part is irrelevant

! **************************************************************************************************


   SUBROUTINE cp_fm_uplo_to_full(matrix, work, uplo)

      ! Completes a square matrix of which only one triangle (selected by uplo) is meaningful:
      ! on exit the other triangle holds the mirrored elements. Both matrix and work must be
      ! square and use the same precision (use_sp). In the parallel build work is overwritten
      ! and used as scratch space for the transposition; in the serial build it is only checked.

      TYPE(cp_fm_type), INTENT(IN)             :: matrix, work
      CHARACTER, INTENT(IN), OPTIONAL          :: uplo

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_uplo_to_full'

      CHARACTER                                :: myuplo
      INTEGER                                  :: handle, icol_global, irow_global, &
                                                  mypcol, myprow, ncol_global, &
                                                  npcol, nprow, nrow_global
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: a
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: a_sp
      TYPE(cp_blacs_env_type), POINTER         :: context

#if defined(__parallel)
      INTEGER                                  :: icol_local, irow_local, &
                                                  ncol_block, ncol_local, &
                                                  nrow_block, nrow_local
      INTEGER, DIMENSION(9)                    :: desca, descc
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: c
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: c_sp
#endif

      myuplo = 'U'
      IF (PRESENT(uplo)) myuplo = uplo

      ! both matrices have to be square (note: nrow_global/ncol_global end up holding the
      ! global shape of work) and of the same precision
      nrow_global = matrix%matrix_struct%nrow_global
      ncol_global = matrix%matrix_struct%ncol_global
      cpassert(nrow_global == ncol_global)
      nrow_global = work%matrix_struct%nrow_global
      ncol_global = work%matrix_struct%ncol_global
      cpassert(nrow_global == ncol_global)
      cpassert(matrix%use_sp .EQV. work%use_sp)

      CALL timeset(routineN, handle)

      context => matrix%matrix_struct%context
      myprow = context%mepos(1)
      mypcol = context%mepos(2)
      nprow = context%num_pe(1)
      npcol = context%num_pe(2)

#if defined(__parallel)

      nrow_block = matrix%matrix_struct%nrow_block
      ncol_block = matrix%matrix_struct%ncol_block

      nrow_local = matrix%matrix_struct%nrow_locals(myprow)
      ncol_local = matrix%matrix_struct%ncol_locals(mypcol)

      a => work%local_data
      a_sp => work%local_data_sp
      desca(:) = work%matrix_struct%descriptor(:)
      c => matrix%local_data
      c_sp => matrix%local_data_sp
      descc(:) = matrix%matrix_struct%descriptor(:)

      ! Step 1: zero the irrelevant triangle and halve the diagonal, so that adding the
      !         transpose in step 3 restores the diagonal and fills the opposite triangle.
      DO icol_local = 1, ncol_local
         icol_global = matrix%matrix_struct%col_indices(icol_local)
         DO irow_local = 1, nrow_local
            irow_global = matrix%matrix_struct%row_indices(irow_local)
            IF (MERGE(irow_global > icol_global, irow_global < icol_global, (myuplo == "U") .OR. (myuplo == "u"))) THEN
               IF (matrix%use_sp) THEN
                  c_sp(irow_local, icol_local) = 0.0_sp
               ELSE
                  c(irow_local, icol_local) = 0.0_dp
               END IF
            ELSE IF (irow_global == icol_global) THEN
               IF (matrix%use_sp) THEN
                  c_sp(irow_local, icol_local) = 0.5_sp*c_sp(irow_local, icol_local)
               ELSE
                  c(irow_local, icol_local) = 0.5_dp*c(irow_local, icol_local)
               END IF
            END IF
         END DO
      END DO

      ! Step 2: copy the prepared local data into the work matrix
      DO icol_local = 1, ncol_local
         DO irow_local = 1, nrow_local
            IF (matrix%use_sp) THEN
               a_sp(irow_local, icol_local) = c_sp(irow_local, icol_local)
            ELSE
               a(irow_local, icol_local) = c(irow_local, icol_local)
            END IF
         END DO
      END DO

      ! Step 3: matrix = work^T + matrix (alpha = beta = 1)
      IF (matrix%use_sp) THEN
         CALL pstran(nrow_global, ncol_global, 1.0_sp, a_sp(1, 1), 1, 1, desca, 1.0_sp, c_sp(1, 1), 1, 1, descc)
      ELSE
         CALL pdtran(nrow_global, ncol_global, 1.0_dp, a(1, 1), 1, 1, desca, 1.0_dp, c(1, 1), 1, 1, descc)
      END IF

#else

      a => matrix%local_data
      a_sp => matrix%local_data_sp

      IF ((myuplo == "U") .OR. (myuplo == "u")) THEN
         ! upper triangle is the input: mirror element (irow, icol), irow < icol, to (icol, irow)
         DO irow_global = 1, nrow_global
            DO icol_global = irow_global + 1, ncol_global
               IF (matrix%use_sp) THEN
                  a_sp(icol_global, irow_global) = a_sp(irow_global, icol_global)
               ELSE
                  a(icol_global, irow_global) = a(irow_global, icol_global)
               END IF
            END DO
         END DO
      ELSE
         ! lower triangle is the input: mirror element (irow, icol), irow > icol, to (icol, irow).
         ! (The assignment used to run in the opposite direction, which overwrote the input
         ! triangle with the irrelevant upper one.)
         DO icol_global = 1, ncol_global
            DO irow_global = icol_global + 1, nrow_global
               IF (matrix%use_sp) THEN
                  a_sp(icol_global, irow_global) = a_sp(irow_global, icol_global)
               ELSE
                  a(icol_global, irow_global) = a(irow_global, icol_global)
               END IF
            END DO
         END DO
      END IF

#endif
      CALL timestop(handle)

   END SUBROUTINE cp_fm_uplo_to_full


! **************************************************************************************************

!> \brief scales column i of matrix a with scaling(i)

!> \param matrixa ...

!> \param scaling : an array used for scaling the columns,

!>                  SIZE(scaling) determines the number of columns to be scaled

!> \author Joost VandeVondele

!> \note

!>      this is very useful as a first step in the computation of C = sum_i alpha_i A_i transpose (A_i)

!>      that is a rank-k update (cp_fm_syrk , cp_sm_plus_fm_fm_t)

!>      this procedure can be up to 20 times faster than calling cp_fm_syrk n times

!>      where every vector has a different prefactor

! **************************************************************************************************


   SUBROUTINE cp_fm_column_scale(matrixa, scaling)
      TYPE(cp_fm_type), INTENT(IN)             :: matrixa
      REAL(kind=dp), DIMENSION(:), INTENT(IN)  :: scaling

      INTEGER                                  :: mypcol, myprow, ncol_global, npcol, &
                                                  nprow, nrow_loc, nscale
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: blk_dp
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: blk_sp
#if defined(__parallel)
      INTEGER                                  :: icol_global, icol_local, &
                                                  ipcol, iprow, irow_local
#else
      INTEGER                                  :: icol
#endif

      ! position of this process in the process grid and the grid extents
      nprow = matrixa%matrix_struct%context%num_pe(1)
      npcol = matrixa%matrix_struct%context%num_pe(2)
      myprow = matrixa%matrix_struct%context%mepos(1)
      mypcol = matrixa%matrix_struct%context%mepos(2)

      ncol_global = matrixa%matrix_struct%ncol_global

      ! never scale more columns than the matrix has or than factors were supplied
      nscale = MIN(SIZE(scaling), ncol_global)

      ! length of a local column, taken from whichever precision is in use
      blk_dp => matrixa%local_data
      blk_sp => matrixa%local_data_sp
      IF (matrixa%use_sp) THEN
         nrow_loc = SIZE(blk_sp, 1)
      ELSE
         nrow_loc = SIZE(blk_dp, 1)
      END IF

#if defined(__parallel)

      DO icol_global = 1, nscale
         ! infog2l yields the process column (ipcol) and the local column index (icol_local)
         ! for global column icol_global; only the owning process column scales it
         CALL infog2l(1, icol_global, matrixa%matrix_struct%descriptor, &
                      nprow, npcol, myprow, mypcol, &
                      irow_local, icol_local, iprow, ipcol)
         IF (ipcol /= mypcol) CYCLE
         IF (matrixa%use_sp) THEN
            CALL sscal(nrow_loc, REAL(scaling(icol_global), sp), blk_sp(:, icol_local), 1)
         ELSE
            CALL dscal(nrow_loc, scaling(icol_global), blk_dp(:, icol_local), 1)
         END IF
      END DO
#else
      ! serial build: column icol of the local data is scaled by scaling(icol)
      DO icol = 1, nscale
         IF (matrixa%use_sp) THEN
            CALL sscal(nrow_loc, REAL(scaling(icol), sp), blk_sp(:, icol), 1)
         ELSE
            CALL dscal(nrow_loc, scaling(icol), blk_dp(:, icol), 1)
         END IF
      END DO
#endif

   END SUBROUTINE cp_fm_column_scale


! **************************************************************************************************

!> \brief scales row i of matrix a with scaling(i)

!> \param matrixa ...

!> \param scaling : an array used for scaling the rows, indexed by global row;
!>                  SIZE(scaling) must equal the global number of rows

!> \author JGH

!> \note

! **************************************************************************************************


   SUBROUTINE cp_fm_row_scale(matrixa, scaling)
      ! Multiplies every element of global row i of matrixa by scaling(i), in place.
      ! scaling must provide one factor per global row (asserted below).
      TYPE(cp_fm_type), INTENT(IN)             :: matrixa
      REAL(kind=dp), DIMENSION(:), INTENT(IN)  :: scaling

      INTEGER                                  :: n, m, nrow_global, nrow_local, ncol_local
      INTEGER, DIMENSION(:), POINTER           :: row_indices
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: a
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: a_sp
#if defined(__parallel)
      INTEGER                                  :: irow_global, icol, irow
#else
      INTEGER                                  :: j
#endif

      CALL cp_fm_get_info(matrixa, row_indices=row_indices, nrow_global=nrow_global, &
                          nrow_local=nrow_local, ncol_local=ncol_local)
      cpassert(SIZE(scaling) == nrow_global)

      ! extents of the local storage, taken from whichever precision is in use
      a => matrixa%local_data
      a_sp => matrixa%local_data_sp
      IF (matrixa%use_sp) THEN
         n = SIZE(a_sp, 1)
         m = SIZE(a_sp, 2)
      ELSE
         n = SIZE(a, 1)
         m = SIZE(a, 2)
      END IF

#if defined(__parallel)
      ! row_indices maps a local row to its global row, which selects the scaling factor
      IF (matrixa%use_sp) THEN
         ! single precision: operate on the sp storage (this branch used to scale the
         ! double-precision array instead, leaving the sp data untouched)
         DO icol = 1, ncol_local
            DO irow = 1, nrow_local
               irow_global = row_indices(irow)
               a_sp(irow, icol) = REAL(scaling(irow_global), sp)*a_sp(irow, icol)
            END DO
         END DO
      ELSE
         DO icol = 1, ncol_local
            DO irow = 1, nrow_local
               irow_global = row_indices(irow)
               a(irow, icol) = scaling(irow_global)*a(irow, icol)
            END DO
         END DO
      END IF
#else
      ! serial build: local row i is scaled by scaling(i)
      ! NOTE(review): presumably local and global row indices coincide here - confirm
      IF (matrixa%use_sp) THEN
         DO j = 1, m
            a_sp(1:n, j) = REAL(scaling(1:n), sp)*a_sp(1:n, j)
         END DO
      ELSE
         DO j = 1, m
            a(1:n, j) = scaling(1:n)*a(1:n, j)
         END DO
      END IF
#endif

   END SUBROUTINE cp_fm_row_scale


! **************************************************************************************************

!> \brief Inverts a cp_fm_type matrix, optionally returning the determinant of the input matrix

!> \param matrix_a the matrix to invert

!> \param matrix_inverse the inverse of matrix_a

!> \param det_a the determinant of matrix_a

!> \param eps_svd optional parameter to activate SVD-based inversion, singular values below eps_svd

!>                are screened

!> \param eigval optionally return matrix eigenvalues/singular values

!> \par History

!>      note of Jan Wilhelm (12.2015)

!>      - computation of determinant corrected

!>      - determinant only computed if det_a is present

!>      12.2016 added option to use SVD instead of LU [Nico Holmberg]

!>      - Use cp_fm_get_diag instead of n times cp_fm_get_element (A. Bussy)

!> \author Florian Schiffmann(02.2007)

! **************************************************************************************************


   SUBROUTINE cp_fm_invert(matrix_a, matrix_inverse, det_a, eps_svd, eigval)

      ! Stores the inverse of matrix_a (or, when a non-zero eps_svd is given, an SVD based
      ! pseudo-inverse) in matrix_inverse.
      !   matrix_a       : matrix to be inverted; the ScaLAPACK factorisations below act on a
      !                    working copy (matrix_lu), not on matrix_a itself
      !   matrix_inverse : receives the result
      !   det_a          : optional; set to the (pseudo-)determinant computed along the way
      !   eps_svd        : optional; absent or exactly zero selects the LU route, any other value
      !                    selects the SVD route, where singular values below eps_svd are quenched
      !   eigval         : optional; must be passed in disassociated. With ScaLAPACK it is
      !                    allocated here and filled with the diagonal of the LU factor (LU route)
      !                    or with the singular values (SVD route).

      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a, matrix_inverse
      REAL(KIND=dp), INTENT(OUT), OPTIONAL     :: det_a
      REAL(KIND=dp), INTENT(IN), OPTIONAL      :: eps_svd
      REAL(KIND=dp), DIMENSION(:), POINTER, &
         INTENT(INOUT), OPTIONAL               :: eigval

      INTEGER                                  :: ndim
      INTEGER, ALLOCATABLE, DIMENSION(:)       :: pivots
      REAL(KIND=dp)                            :: det_value, svd_threshold
      REAL(KIND=dp), DIMENSION(:, :), POINTER  :: lu_data
      TYPE(cp_fm_type)                         :: matrix_lu

#if defined(__parallel)
      TYPE(cp_fm_type)                         :: u_mat, vt_mat, sigma_inv, sigma_inv_ut
      TYPE(mp_comm_type)                       :: comm
      INTEGER                                  :: idx, info, lwork, n_swaps
      INTEGER, DIMENSION(9)                    :: desc
      LOGICAL                                  :: any_quenched
      REAL(KIND=dp), DIMENSION(:), POINTER     :: diag_vals
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: work
#else
      LOGICAL                                  :: fix_sign
      REAL(KIND=dp)                            :: inv_error
#endif

      IF (PRESENT(eps_svd)) THEN
         svd_threshold = eps_svd
      ELSE
         svd_threshold = 0.0_dp
      END IF

      ! Working copy of A that the factorisation routines are allowed to overwrite
      CALL cp_fm_create(matrix=matrix_lu, &
                        matrix_struct=matrix_a%matrix_struct, &
                        name="A_lu"//TRIM(ADJUSTL(cp_to_string(1)))//"MATRIX")
      CALL cp_fm_to_fm(matrix_a, matrix_lu)

      lu_data => matrix_lu%local_data
      ndim = matrix_lu%matrix_struct%nrow_global
      ALLOCATE (pivots(ndim + matrix_a%matrix_struct%nrow_block))
      pivots(:) = 0

#if defined(__parallel)
      IF (svd_threshold == 0.0_dp) THEN
         ! ---- LU route -------------------------------------------------------------------------
         desc(:) = matrix_lu%matrix_struct%descriptor(:)
         CALL pdgetrf(ndim, ndim, lu_data, 1, 1, desc, pivots, info)

         IF (PRESENT(det_a) .OR. PRESENT(eigval)) THEN
            ALLOCATE (diag_vals(ndim))
            diag_vals(:) = 0.0_dp
            CALL cp_fm_get_diag(matrix_lu, diag_vals)

            ! Product of the LU diagonal; the sign is fixed afterwards from the number of pivots
            ! that differ from their own index.
            ! NOTE(review): pivots is compared against the plain loop index and the count is then
            ! summed over para_env - confirm this yields the right parity on multi-process grids.
            n_swaps = 0
            det_value = 1.0_dp
            DO idx = 1, ndim
               det_value = det_value*diag_vals(idx)
               IF (pivots(idx) /= idx) n_swaps = n_swaps + 1
            END DO

            IF (PRESENT(eigval)) THEN
               cpassert(.NOT. ASSOCIATED(eigval))
               ALLOCATE (eigval(ndim))
               eigval(:) = diag_vals
            END IF
            DEALLOCATE (diag_vals)

            comm = matrix_lu%matrix_struct%para_env
            CALL comm%sum(n_swaps)

            det_value = det_value*(-1.0_dp)**n_swaps
         END IF

         ! Right-hand side for the solve: presumably the identity (alpha = 0 everywhere,
         ! beta = 1 on the diagonal), so that pdgetrs leaves A^-1 in matrix_inverse.
         CALL cp_fm_set_all(matrix_inverse, 0.0_dp, 1.0_dp)
         CALL pdgetrs('N', ndim, ndim, lu_data, 1, 1, desc, pivots, &
                      matrix_inverse%local_data, 1, 1, desc, info)
      ELSE
         ! ---- SVD route ------------------------------------------------------------------------
         CALL cp_fm_create(matrix=u_mat, &
                           matrix_struct=matrix_a%matrix_struct, &
                           name="LEFT_SINGULAR_MATRIX")
         CALL cp_fm_set_all(u_mat, alpha=0.0_dp)
         CALL cp_fm_create(matrix=vt_mat, &
                           matrix_struct=matrix_a%matrix_struct, &
                           name="RIGHT_SINGULAR_MATRIX")
         CALL cp_fm_set_all(vt_mat, alpha=0.0_dp)

         ALLOCATE (diag_vals(ndim))
         diag_vals(:) = 0.0_dp
         desc(:) = matrix_lu%matrix_struct%descriptor(:)

         ! First call only asks for the optimal workspace size (lwork = -1)
         ALLOCATE (work(1))
         lwork = -1
         CALL pdgesvd('V', 'V', ndim, ndim, lu_data, 1, 1, desc, diag_vals, u_mat%local_data, &
                      1, 1, desc, vt_mat%local_data, 1, 1, desc, work, lwork, info)
         lwork = INT(work(1))
         DEALLOCATE (work)
         ALLOCATE (work(lwork))

         ! Actual decomposition
         CALL pdgesvd('V', 'V', ndim, ndim, lu_data, 1, 1, desc, diag_vals, u_mat%local_data, &
                      1, 1, desc, vt_mat%local_data, 1, 1, desc, work, lwork, info)
         ! info == ndim + 1 implies a homogeneity error when the number of procs is large;
         ! it is tolerated here (the original author suspects it is harmless).
         IF (info /= 0 .AND. info /= ndim + 1) THEN
            cpabort("Singular value decomposition of matrix failed.")
         END IF

         ! Build Sigma^-1 (quenched singular values map to zero) and the pseudo-determinant
         CALL cp_fm_create(matrix=sigma_inv, &
                           matrix_struct=matrix_a%matrix_struct, &
                           name="SINGULAR_VALUE_MATRIX")
         CALL cp_fm_set_all(sigma_inv, alpha=0.0_dp)
         det_value = 1.0_dp
         any_quenched = .FALSE.
         IF (PRESENT(eigval)) THEN
            cpassert(.NOT. ASSOCIATED(eigval))
            ALLOCATE (eigval(ndim))
            eigval(:) = diag_vals
         END IF
         DO idx = 1, ndim
            IF (diag_vals(idx) < svd_threshold) THEN
               diag_vals(idx) = 0.0_dp
               any_quenched = .TRUE.
            ELSE
               det_value = det_value*diag_vals(idx)
               diag_vals(idx) = 1.0_dp/diag_vals(idx)
            END IF
            CALL cp_fm_set_element(sigma_inv, idx, idx, diag_vals(idx))
         END DO
         DEALLOCATE (diag_vals)

         IF (any_quenched) THEN
            CALL cp_warn(__location__, &
                         "Linear dependencies were detected in the SVD inversion of matrix "// &
                         TRIM(ADJUSTL(matrix_a%name))// &
                         ". At least one singular value has been quenched.")
         END IF

         ! Sigma^-1 * U^T
         CALL cp_fm_create(matrix=sigma_inv_ut, &
                           matrix_struct=matrix_a%matrix_struct, &
                           name="SINGULAR_VALUE_MATRIX")
         CALL cp_fm_set_all(sigma_inv_ut, alpha=0.0_dp)
         CALL pdgemm('N', 'T', ndim, ndim, ndim, 1.0_dp, sigma_inv%local_data, 1, 1, desc, &
                     u_mat%local_data, 1, 1, desc, 0.0_dp, sigma_inv_ut%local_data, 1, 1, desc)

         ! A^-1 = V * (Sigma^-1 * U^T)
         CALL cp_fm_set_all(matrix_inverse, alpha=0.0_dp)
         CALL pdgemm('T', 'N', ndim, ndim, ndim, 1.0_dp, vt_mat%local_data, 1, 1, desc, &
                     sigma_inv_ut%local_data, 1, 1, desc, 0.0_dp, matrix_inverse%local_data, 1, 1, desc)

         ! Release temporaries of the SVD route
         DEALLOCATE (work)
         CALL cp_fm_release(u_mat)
         CALL cp_fm_release(vt_mat)
         CALL cp_fm_release(sigma_inv)
         CALL cp_fm_release(sigma_inv_ut)
      END IF
#else
      IF (svd_threshold == 0.0_dp) THEN
         ! Inverse from the mathlib helper, determinant from an LU decomposition of the copy
         fix_sign = .TRUE.
         CALL invert_matrix(matrix_a%local_data, matrix_inverse%local_data, &
                            eval_error=inv_error)
         CALL cp_fm_lu_decompose(matrix_lu, det_value, correct_sign=fix_sign)
         IF (PRESENT(eigval)) THEN
            CALL cp_abort(__location__, &
                          "NYI. Eigenvalues not available for return without SCALAPACK.")
         END IF
      ELSE
         ! eps_svd is necessarily present here, since the threshold is non-zero
         CALL get_pseudo_inverse_svd(matrix_a%local_data, matrix_inverse%local_data, eps_svd, &
                                     det_value, eigval)
      END IF
#endif

      CALL cp_fm_release(matrix_lu)
      DEALLOCATE (pivots)
      IF (PRESENT(det_a)) det_a = det_value

   END SUBROUTINE cp_fm_invert


! **************************************************************************************************

!> \brief inverts a triangular matrix

!> \param matrix_a ...

!> \param uplo_tr triangular format; defaults to 'U'

!> \author MI

! **************************************************************************************************


   SUBROUTINE cp_fm_triangular_invert(matrix_a, uplo_tr)

      ! Inverts a triangular matrix by handing the local data of matrix_a to (p)dtrtri.
      !   matrix_a : triangular matrix; its local data are passed to (p)dtrtri as the in/out array
      !   uplo_tr  : optional; which triangle holds the data, 'U' when absent
      ! The diagonal is always treated as non-unit ('N').

      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a
      CHARACTER, INTENT(IN), OPTIONAL          :: uplo_tr

      CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_triangular_invert'

      CHARACTER                                :: unit_diag, uplo
      INTEGER                                  :: handle, info, ncol_global
      REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desca
#else
      INTEGER                                  :: lda
#endif

      CALL timeset(routineN, handle)

      unit_diag = 'N'
      uplo = 'U'
      IF (PRESENT(uplo_tr)) uplo = uplo_tr

      ncol_global = matrix_a%matrix_struct%ncol_global

      a => matrix_a%local_data

#if defined(__parallel)
      desca(:) = matrix_a%matrix_struct%descriptor(:)

      CALL pdtrtri(uplo, unit_diag, ncol_global, a(1, 1), 1, 1, desca, info)
#else
      ! The leading dimension is the row extent of the storage, not the order of the
      ! triangular matrix; the two differ when the matrix has more rows than columns.
      lda = SIZE(a, 1)
      CALL dtrtri(uplo, unit_diag, ncol_global, a(1, 1), lda, info)
#endif
      ! NOTE(review): info is not inspected, so a singular diagonal (info > 0) goes unnoticed
      ! by callers - confirm whether that is intended.

      CALL timestop(handle)

   END SUBROUTINE cp_fm_triangular_invert


! **************************************************************************************************

!> \brief  performs a QR factorization of the input rectangular matrix A or of a submatrix of A;
!>         on output the computed triangular matrix R is stored in the NxN part of the submatrix sub(A).
!>         M and N give the dimension of the submatrix that has to be factorized (MxN) with M>=N

!> \param matrix_a ...

!> \param matrix_r ...

!> \param nrow_fact ...

!> \param ncol_fact ...

!> \param first_row ...

!> \param first_col ...

!> \author MI

! **************************************************************************************************


   SUBROUTINE cp_fm_qr_factorization(matrix_a, matrix_r, nrow_fact, ncol_fact, first_row, first_col, uplo)

      ! QR-factorizes matrix_a (or the nrow x ncol submatrix starting at first_row, first_col)
      ! in place with (p)dgeqrf and copies the ncol x ncol triangular factor R, with the
      ! entries below its diagonal zeroed, into the leading block of matrix_r.
      !   matrix_a             : matrix to factorize; overwritten by the factorization
      !   matrix_r             : receives R in its block (1:ncol, 1:ncol)
      !   nrow_fact, ncol_fact : optional size of the submatrix, default: global matrix size
      !   first_row, first_col : optional origin of the submatrix, default (1, 1); without
      !                          ScaLAPACK only (1, 1) is supported
      !   uplo                 : optional, default 'U' (see the note where it is used)

      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a, matrix_r
      INTEGER, INTENT(IN), OPTIONAL            :: nrow_fact, ncol_fact, &
                                                  first_row, first_col
      CHARACTER, INTENT(IN), OPTIONAL          :: uplo

      CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_qr_factorization'

      CHARACTER                                :: myuplo
      INTEGER                                  :: handle, i, icol, info, irow, &
                                                  j, lwork, ncol, &
                                                  ndim, nrow
      REAL(dp), ALLOCATABLE, DIMENSION(:)      :: tau, work
      REAL(dp), ALLOCATABLE, DIMENSION(:, :)   :: r_mat
      REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desca
#else
      INTEGER                                  :: lda
#endif

      CALL timeset(routineN, handle)

      myuplo = 'U'
      IF (PRESENT(uplo)) myuplo = uplo

      ncol = matrix_a%matrix_struct%ncol_global
      nrow = matrix_a%matrix_struct%nrow_global

      a => matrix_a%local_data

      IF (PRESENT(nrow_fact)) nrow = nrow_fact
      IF (PRESENT(ncol_fact)) ncol = ncol_fact
      irow = 1
      IF (PRESENT(first_row)) irow = first_row
      icol = 1
      IF (PRESENT(first_col)) icol = first_col

      cpassert(nrow >= ncol)
      ndim = SIZE(a, 2)
      ALLOCATE (tau(ndim))

      ! A workspace query (lwork = -1) only writes work(1); a one-element buffer is
      ! sufficient and stays valid even when there are no local columns (ndim == 0).
      lwork = -1
      ALLOCATE (work(1))

#if defined(__parallel)

      desca(:) = matrix_a%matrix_struct%descriptor(:)

      CALL pdgeqrf(nrow, ncol, a, irow, icol, desca, tau, work, lwork, info)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))
      CALL pdgeqrf(nrow, ncol, a, irow, icol, desca, tau, work, lwork, info)

#else
      ! dgeqrf always starts at element (1, 1) of the array it is given, so a shifted
      ! submatrix cannot be honoured in this branch.
      cpassert(irow == 1 .AND. icol == 1)
      lda = SIZE(a, 1)
      CALL dgeqrf(nrow, ncol, a, lda, tau, work, lwork, info)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))
      CALL dgeqrf(nrow, ncol, a, lda, tau, work, lwork, info)

#endif
      cpassert(info == 0)

      ! (p)dgeqrf leaves R on and above the diagonal of the factorized submatrix, i.e. at
      ! (irow, icol) of matrix_a, so that is where it has to be read from.
      ALLOCATE (r_mat(ncol, ncol))
      CALL cp_fm_get_submatrix(matrix_a, r_mat, irow, icol, ncol, ncol)
      ! NOTE(review): both branches below clear the same entries (row index > column index),
      ! so uplo currently has no effect on the result - confirm what 'L' was meant to do.
      IF ((myuplo == "U") .OR. (myuplo == "u")) THEN
         DO i = 1, ncol
         DO j = i + 1, ncol
            r_mat(j, i) = 0.0_dp
         END DO
         END DO
      ELSE
         DO j = 1, ncol
         DO i = j + 1, ncol
            r_mat(i, j) = 0.0_dp
         END DO
         END DO
      END IF
      CALL cp_fm_set_submatrix(matrix_r, r_mat, 1, 1, ncol, ncol)

      DEALLOCATE (tau, work, r_mat)

      CALL timestop(handle)

   END SUBROUTINE cp_fm_qr_factorization


! **************************************************************************************************

!> \brief computes the solution to A*b=A_general using lu decomposition

!> \param matrix_a input matrix; will be overwritten

!> \param general_a contains the result

!> \author Florian Schiffmann

! **************************************************************************************************


   SUBROUTINE cp_fm_solve(matrix_a, general_a)

      ! Solves A * X = B by LU factorization: (p)dgetrf followed by (p)dgetrs.
      !   matrix_a  : coefficient matrix A; its local data are overwritten by the factorization
      !   general_a : right-hand sides B on entry; holds the result on exit

      TYPE(cp_fm_type), INTENT(IN)             :: matrix_a, general_a

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_solve'

      INTEGER                                  :: handle, info, n_rhs, order
      INTEGER, ALLOCATABLE, DIMENSION(:)       :: pivots
      REAL(KIND=dp), DIMENSION(:, :), POINTER  :: lhs, rhs
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc_lhs, desc_rhs
#else
      INTEGER                                  :: ld_lhs, ld_rhs
#endif

      CALL timeset(routineN, handle)

      order = matrix_a%matrix_struct%nrow_global
      n_rhs = general_a%matrix_struct%ncol_global
      lhs => matrix_a%local_data
      rhs => general_a%local_data

      ! Pivot storage: global order plus one row block
      ALLOCATE (pivots(order + matrix_a%matrix_struct%nrow_block))

#if defined(__parallel)
      desc_lhs(:) = matrix_a%matrix_struct%descriptor(:)
      desc_rhs(:) = general_a%matrix_struct%descriptor(:)
      CALL pdgetrf(order, order, lhs, 1, 1, desc_lhs, pivots, info)
      CALL pdgetrs("N", order, n_rhs, lhs, 1, 1, desc_lhs, pivots, rhs, &
                   1, 1, desc_rhs, info)
#else
      ld_lhs = SIZE(lhs, 1)
      ld_rhs = SIZE(rhs, 1)
      CALL dgetrf(order, order, lhs, ld_lhs, pivots, info)
      CALL dgetrs("N", order, n_rhs, lhs, ld_lhs, pivots, rhs, ld_rhs, info)
#endif
      ! info is deliberately not checked: per the original author's note, a non-zero
      ! value here just signals a zero diagonal element.
      DEALLOCATE (pivots)

      CALL timestop(handle)

   END SUBROUTINE cp_fm_solve


! **************************************************************************************************

!> \brief Convenience function. Computes the matrix multiplications needed

!>        for the multiplication of complex matrices.

!>        C = beta * C + alpha * ( A  ** transa ) * ( B ** transb )

!> \param transa : 'N' -> normal   'T' -> transpose

!>      alpha,beta :: can be 0.0_dp and 1.0_dp

!> \param transb ...

!> \param m ...

!> \param n ...

!> \param k ...

!> \param alpha ...

!> \param A_re m x k matrix ( ! for transa = 'N'), real part

!> \param A_im m x k matrix ( ! for transa = 'N'), imaginary part

!> \param B_re k x n matrix ( ! for transb = 'N'), real part
!> \param B_im k x n matrix ( ! for transb = 'N'), imaginary part

!> \param beta ...

!> \param C_re m x n matrix, real part

!> \param C_im m x n matrix, imaginary part

!> \param a_first_col ...

!> \param a_first_row ...

!> \param b_first_col : the k x n matrix starts at col b_first_col of matrix_b (avoid usage)

!> \param b_first_row ...

!> \param c_first_col ...

!> \param c_first_row ...

!> \author Samuel Andermatt

!> \note

!>      C should have no overlap with A, B

! **************************************************************************************************


   SUBROUTINE cp_complex_fm_gemm(transa, transb, m, n, k, alpha, A_re, A_im, B_re, B_im, beta, &
                                 C_re, C_im, a_first_col, a_first_row, b_first_col, b_first_row, c_first_col, &
                                 c_first_row)

      ! Complex matrix product assembled from four real cp_fm_gemm calls, with the real and
      ! imaginary parts kept in separate matrices:
      !    C_re = beta*C_re + alpha*op(A_re)*op(B_re) - alpha*op(A_im)*op(B_im)
      !    C_im = beta*C_im + alpha*op(A_re)*op(B_im) + alpha*op(A_im)*op(B_re)
      ! transa/transb, the sizes m, n, k and all optional first_row/first_col offsets are
      ! forwarded unchanged to every cp_fm_gemm call.

      CHARACTER(LEN=1), INTENT(IN)                       :: transa, transb
      INTEGER, INTENT(IN)                                :: m, n, k
      REAL(KIND=dp), INTENT(IN)                          :: alpha
      TYPE(cp_fm_type), INTENT(IN)                       :: A_re, A_im, B_re, B_im
      REAL(KIND=dp), INTENT(IN)                          :: beta
      TYPE(cp_fm_type), INTENT(IN)                       :: C_re, C_im
      INTEGER, INTENT(IN), OPTIONAL                      :: a_first_col, a_first_row, b_first_col, &
                                                            b_first_row, c_first_col, c_first_row

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_complex_fm_gemm'

      INTEGER                                            :: timer_handle

      CALL timeset(routineN, timer_handle)

      ! Real part, first term: scales the incoming C_re by beta
      CALL cp_fm_gemm(transa, transb, m, n, k, alpha, A_re, B_re, beta, C_re, &
                      a_first_col=a_first_col, a_first_row=a_first_row, &
                      b_first_col=b_first_col, b_first_row=b_first_row, &
                      c_first_col=c_first_col, c_first_row=c_first_row)
      ! Real part, second term: accumulated on top (beta = 1) with opposite sign
      CALL cp_fm_gemm(transa, transb, m, n, k, -alpha, A_im, B_im, 1.0_dp, C_re, &
                      a_first_col=a_first_col, a_first_row=a_first_row, &
                      b_first_col=b_first_col, b_first_row=b_first_row, &
                      c_first_col=c_first_col, c_first_row=c_first_row)
      ! Imaginary part, first term: scales the incoming C_im by beta
      CALL cp_fm_gemm(transa, transb, m, n, k, alpha, A_re, B_im, beta, C_im, &
                      a_first_col=a_first_col, a_first_row=a_first_row, &
                      b_first_col=b_first_col, b_first_row=b_first_row, &
                      c_first_col=c_first_col, c_first_row=c_first_row)
      ! Imaginary part, second term: accumulated on top (beta = 1)
      CALL cp_fm_gemm(transa, transb, m, n, k, alpha, A_im, B_re, 1.0_dp, C_im, &
                      a_first_col=a_first_col, a_first_row=a_first_row, &
                      b_first_col=b_first_col, b_first_row=b_first_row, &
                      c_first_col=c_first_col, c_first_row=c_first_row)

      CALL timestop(timer_handle)

   END SUBROUTINE cp_complex_fm_gemm


! **************************************************************************************************

!> \brief inverts a matrix using LU decomposition

!>        the input matrix will be overwritten

!> \param matrix   : input a general square non-singular matrix, outputs its inverse

!> \param info_out : optional, if present outputs the info from (p)dgetri / (p)sgetri

!> \author Lianheng Tong

! **************************************************************************************************

   SUBROUTINE cp_fm_lu_invert(matrix, info_out)

      ! In-place inversion of a square matrix: LU factorization with (p)?getrf followed by
      ! (p)?getri, in single or double precision depending on matrix%use_sp.
      !   matrix   : on entry the matrix to invert, on exit its local data hold the result
      !   info_out : optional; receives the info value of the (p)?getri inversion step. When
      !              absent, a non-zero info from the inversion aborts. A failed LU
      !              factorization always aborts.

      TYPE(cp_fm_type), INTENT(IN)             :: matrix
      INTEGER, INTENT(OUT), OPTIONAL           :: info_out

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_lu_invert'

      INTEGER :: nrows_global, handle, info, lwork
      INTEGER, DIMENSION(:), ALLOCATABLE       :: ipivot
      REAL(KIND=dp), DIMENSION(:, :), POINTER  :: mat
      REAL(KIND=sp), DIMENSION(:, :), POINTER  :: mat_sp
      REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: work
      REAL(KIND=sp), DIMENSION(:), ALLOCATABLE :: work_sp
#if defined(__parallel)
      INTEGER                                  :: liwork
      INTEGER, DIMENSION(9)                    :: desca
      INTEGER, DIMENSION(:), ALLOCATABLE       :: iwork
#else
      INTEGER                                  :: lda
#endif

      CALL timeset(routineN, handle)

      mat => matrix%local_data
      mat_sp => matrix%local_data_sp
      nrows_global = matrix%matrix_struct%nrow_global
      cpassert(nrows_global .EQ. matrix%matrix_struct%ncol_global)
      ! ScaLAPACK wants LOCr(M_A) + MB_A pivot entries; global rows plus one row block is a
      ! safe upper bound (same sizing as in cp_fm_invert and cp_fm_solve).
      ALLOCATE (ipivot(nrows_global + matrix%matrix_struct%nrow_block))

      ! do LU decomposition
#if defined(__parallel)
      desca = matrix%matrix_struct%descriptor
      IF (matrix%use_sp) THEN
         CALL psgetrf(nrows_global, nrows_global, &
                      mat_sp, 1, 1, desca, ipivot, info)
      ELSE
         CALL pdgetrf(nrows_global, nrows_global, &
                      mat, 1, 1, desca, ipivot, info)
      END IF
#else
      ! Take the leading dimension from the array that is actually handed to LAPACK
      IF (matrix%use_sp) THEN
         lda = SIZE(mat_sp, 1)
         CALL sgetrf(nrows_global, nrows_global, &
                     mat_sp, lda, ipivot, info)
      ELSE
         lda = SIZE(mat, 1)
         CALL dgetrf(nrows_global, nrows_global, &
                     mat, lda, ipivot, info)
      END IF
#endif
      IF (info /= 0) THEN
         CALL cp_abort(__location__, "LU decomposition has failed")
      END IF

      ! do inversion
      ! One-element buffer for the workspace query; it must be the array of the precision
      ! that is used (and later deallocated) below.
      IF (matrix%use_sp) THEN
         ALLOCATE (work_sp(1))
      ELSE
         ALLOCATE (work(1))
      END IF
#if defined(__parallel)
      ALLOCATE (iwork(1))
      ! Workspace query: lwork = liwork = -1 returns the required sizes in work(1), iwork(1)
      IF (matrix%use_sp) THEN
         CALL psgetri(nrows_global, mat_sp, 1, 1, desca, &
                      ipivot, work_sp, -1, iwork, -1, info)
         lwork = INT(work_sp(1))
         DEALLOCATE (work_sp)
         ALLOCATE (work_sp(lwork))
      ELSE
         CALL pdgetri(nrows_global, mat, 1, 1, desca, &
                      ipivot, work, -1, iwork, -1, info)
         lwork = INT(work(1))
         DEALLOCATE (work)
         ALLOCATE (work(lwork))
      END IF
      liwork = INT(iwork(1))
      DEALLOCATE (iwork)
      ALLOCATE (iwork(liwork))
      IF (matrix%use_sp) THEN
         CALL psgetri(nrows_global, mat_sp, 1, 1, desca, &
                      ipivot, work_sp, lwork, iwork, liwork, info)
      ELSE
         CALL pdgetri(nrows_global, mat, 1, 1, desca, &
                      ipivot, work, lwork, iwork, liwork, info)
      END IF
      DEALLOCATE (iwork)
#else
      IF (matrix%use_sp) THEN
         CALL sgetri(nrows_global, mat_sp, lda, &
                     ipivot, work_sp, -1, info)
         lwork = INT(work_sp(1))
         DEALLOCATE (work_sp)
         ALLOCATE (work_sp(lwork))
         CALL sgetri(nrows_global, mat_sp, lda, &
                     ipivot, work_sp, lwork, info)
      ELSE
         CALL dgetri(nrows_global, mat, lda, &
                     ipivot, work, -1, info)
         lwork = INT(work(1))
         DEALLOCATE (work)
         ALLOCATE (work(lwork))
         CALL dgetri(nrows_global, mat, lda, &
                     ipivot, work, lwork, info)
      END IF
#endif
      IF (matrix%use_sp) THEN
         DEALLOCATE (work_sp)
      ELSE
         DEALLOCATE (work)
      END IF
      DEALLOCATE (ipivot)

      IF (PRESENT(info_out)) THEN
         info_out = info
      ELSE
         IF (info /= 0) &
            CALL cp_abort(__location__, "LU inversion has failed")
      END IF

      CALL timestop(handle)

   END SUBROUTINE cp_fm_lu_invert


! **************************************************************************************************

!> \brief norm of matrix using (p)dlange

!> \param matrix   : input a general matrix

!> \param mode     : 'M' max abs element value,

!>                   '1' or 'O' one norm, i.e. maximum column sum

!>                   'I' infinity norm, i.e. maximum row sum

!>                   'F' or 'E' Frobenius norm, i.e. sqrt of sum of all squares of elements

!> \return : the norm according to mode

!> \author Lianheng Tong

! **************************************************************************************************


   FUNCTION cp_fm_norm(matrix, mode) RESULT(res)

      ! Returns a norm of the matrix as computed by (p)dlange, or by (p)slange when the
      ! matrix is stored in single precision (the result is then converted to dp).
      !   matrix : matrix whose norm is requested
      !   mode   : norm selector forwarded to ?lange; accepted are 'M', '1', 'O', 'I', 'F',
      !            'E' and the lower-case letters; any other value aborts

      TYPE(cp_fm_type), INTENT(IN) :: matrix
      CHARACTER, INTENT(IN) :: mode
      REAL(KIND=dp) :: res

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_norm'

      INTEGER :: nrows, ncols, handle, lwork, nrows_local, ncols_local
      REAL(KIND=sp) :: norm_sp
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: data_dp
      REAL(KIND=sp), DIMENSION(:, :), POINTER :: data_sp
      REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: scratch_dp
      REAL(KIND=sp), DIMENSION(:), ALLOCATABLE :: scratch_sp
#if defined(__parallel)
      INTEGER, DIMENSION(9) :: desca
#else
      INTEGER :: lda
#endif

      CALL timeset(routineN, handle)

      CALL cp_fm_get_info(matrix=matrix, &
                          nrow_global=nrows, &
                          ncol_global=ncols, &
                          nrow_local=nrows_local, &
                          ncol_local=ncols_local)
      data_dp => matrix%local_data
      data_sp => matrix%local_data_sp

      ! Scratch length handed to ?lange; only the one-norm and the infinity-norm use sizes
      ! that depend on the matrix, and those differ between the two builds.
      SELECT CASE (mode)
      CASE ('M', 'm', 'F', 'f', 'E', 'e')
         lwork = 1
      CASE ('1', 'O', 'o')
#if defined(__parallel)
         lwork = ncols_local
#else
         lwork = 1
#endif
      CASE ('I', 'i')
#if defined(__parallel)
         lwork = nrows_local
#else
         lwork = nrows
#endif
      CASE DEFAULT
         cpabort("mode input is not valid")
      END SELECT

#if defined(__parallel)
      desca = matrix%matrix_struct%descriptor
      IF (.NOT. matrix%use_sp) THEN
         ALLOCATE (scratch_dp(lwork))
         res = pdlange(mode, nrows, ncols, data_dp, 1, 1, desca, scratch_dp)
         DEALLOCATE (scratch_dp)
      ELSE
         ALLOCATE (scratch_sp(lwork))
         norm_sp = pslange(mode, nrows, ncols, data_sp, 1, 1, desca, scratch_sp)
         DEALLOCATE (scratch_sp)
         res = REAL(norm_sp, KIND=dp)
      END IF
#else
      IF (.NOT. matrix%use_sp) THEN
         ALLOCATE (scratch_dp(lwork))
         lda = SIZE(data_dp, 1)
         res = dlange(mode, nrows, ncols, data_dp, lda, scratch_dp)
         DEALLOCATE (scratch_dp)
      ELSE
         ALLOCATE (scratch_sp(lwork))
         lda = SIZE(data_sp, 1)
         norm_sp = slange(mode, nrows, ncols, data_sp, lda, scratch_sp)
         DEALLOCATE (scratch_sp)
         res = REAL(norm_sp, KIND=dp)
      END IF
#endif

      CALL timestop(handle)

   END FUNCTION cp_fm_norm


! **************************************************************************************************

!> \brief trace of a matrix using pdlatra

!> \param matrix   : input a square matrix

!> \return : the trace

!> \author Lianheng Tong

! **************************************************************************************************

   FUNCTION cp_fm_latra(matrix) RESULT(res)

      ! Trace of a square matrix. With ScaLAPACK it is obtained from pdlatra / pslatra,
      ! otherwise the diagonal of the local data is summed directly. For single precision
      ! storage the trace is accumulated in sp and converted to dp at the end.
      !   matrix : square matrix (asserted)

      TYPE(cp_fm_type), INTENT(IN) :: matrix
      REAL(KIND=dp) :: res

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_latra'

      INTEGER :: n_row, n_col, handle
      REAL(KIND=sp) :: trace_sp
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: data_dp
      REAL(KIND=sp), DIMENSION(:, :), POINTER :: data_sp
#if defined(__parallel)
      INTEGER, DIMENSION(9) :: desca
#else
      INTEGER :: idiag
#endif

      CALL timeset(routineN, handle)

      n_row = matrix%matrix_struct%nrow_global
      n_col = matrix%matrix_struct%ncol_global
      cpassert(n_row .EQ. n_col)
      data_dp => matrix%local_data
      data_sp => matrix%local_data_sp

#if defined(__parallel)
      desca = matrix%matrix_struct%descriptor
      IF (.NOT. matrix%use_sp) THEN
         res = pdlatra(n_row, data_dp, 1, 1, desca)
      ELSE
         trace_sp = pslatra(n_row, data_sp, 1, 1, desca)
         res = REAL(trace_sp, KIND=dp)
      END IF
#else
      IF (.NOT. matrix%use_sp) THEN
         res = 0.0_dp
         DO idiag = 1, n_row
            res = res + data_dp(idiag, idiag)
         END DO
      ELSE
         trace_sp = 0.0_sp
         DO idiag = 1, n_row
            trace_sp = trace_sp + data_sp(idiag, idiag)
         END DO
         res = REAL(trace_sp, KIND=dp)
      END IF
#endif

      CALL timestop(handle)

   END FUNCTION cp_fm_latra


! **************************************************************************************************

!> \brief compute a QR factorization with column pivoting of a M-by-N distributed matrix

!>        sub( A ) = A(IA:IA+M-1,JA:JA+N-1)

!> \param matrix   : input M-by-N distributed matrix sub( A ) which is to be factored

!> \param tau      : scalar factors TAU of the elementary reflectors. TAU is tied to the distributed matrix A

!> \param nrow ...

!> \param ncol ...

!> \param first_row ...

!> \param first_col ...

!> \author MI

! **************************************************************************************************


   SUBROUTINE cp_fm_pdgeqpf(matrix, tau, nrow, ncol, first_row, first_col)

      ! QR factorization with column pivoting of the nrow x ncol submatrix of matrix that
      ! starts at (first_row, first_col): pdgeqpf with ScaLAPACK, dgeqp3 otherwise.
      !   matrix               : matrix to factorize; its local data are overwritten
      !   tau                  : caller-provided array, zeroed here and then filled with the
      !                          scalar factors of the elementary reflectors
      !   nrow, ncol           : size of the submatrix
      !   first_row, first_col : origin of the submatrix; must both be 1 without ScaLAPACK
      ! The pivot indices are computed but not returned to the caller.

      TYPE(cp_fm_type), INTENT(IN)                       :: matrix
      REAL(KIND=dp), DIMENSION(:), POINTER               :: tau
      INTEGER, INTENT(IN)                                :: nrow, ncol, first_row, first_col

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_pdgeqpf'

      INTEGER                                            :: handle
      INTEGER                                            :: info, lwork
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ipiv
      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: a
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: work
#if defined(__parallel)
      INTEGER, DIMENSION(9) :: descc
#else
      INTEGER :: lda
#endif

      CALL timeset(routineN, handle)

      a => matrix%local_data
      ! The workspace query (lwork = -1) only writes work(1); one element is enough and
      ! stays valid for nrow == 0.
      lwork = -1
      ALLOCATE (work(1))
      ! pdgeqpf documents IPIV with local dimension LOCc(JA+N-1), which can exceed ncol for a
      ! shifted submatrix; first_col + ncol - 1 is an upper bound and equals ncol for first_col == 1.
      ALLOCATE (ipiv(first_col + ncol - 1))
      info = 0

#if defined(__parallel)
      descc(:) = matrix%matrix_struct%descriptor(:)
      ! Call SCALAPACK routine to get optimal work dimension
      CALL pdgeqpf(nrow, ncol, a, first_row, first_col, descc, ipiv, tau, work, lwork, info)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))
      tau = 0.0_dp
      ipiv = 0

      ! Call SCALAPACK routine to get QR decomposition of CTs
      CALL pdgeqpf(nrow, ncol, a, first_row, first_col, descc, ipiv, tau, work, lwork, info)
#else
      cpassert(first_row == 1 .AND. first_col == 1)
      lda = SIZE(a, 1)
      CALL dgeqp3(nrow, ncol, a, lda, ipiv, tau, work, lwork, info)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))
      tau = 0.0_dp
      ! All-zero pivot input: no column is forced to the front
      ipiv = 0
      CALL dgeqp3(nrow, ncol, a, lda, ipiv, tau, work, lwork, info)
#endif
      cpassert(info == 0)

      DEALLOCATE (work)
      DEALLOCATE (ipiv)

      CALL timestop(handle)

   END SUBROUTINE cp_fm_pdgeqpf


! **************************************************************************************************

!> \brief generates an M-by-N real distributed matrix Q denoting A(IA:IA+M-1,JA:JA+N-1)

!>         with orthonormal columns, which is defined as the first N columns of a product of K

!>         elementary reflectors of order M

!> \param matrix : On entry, the j-th column must contain the vector which defines the elementary reflector

!>                  as returned from PDGEQRF

!>                 On exit it contains  the M-by-N distributed matrix Q

!> \param tau :   contains the scalar factors TAU of elementary reflectors  as returned by PDGEQRF

!> \param nrow ...

!> \param first_row ...

!> \param first_col ...

!> \author MI

! **************************************************************************************************


   SUBROUTINE cp_fm_pdorgqr(matrix, tau, nrow, first_row, first_col)

      ! Generates the matrix Q from elementary reflectors with pdorgqr (ScaLAPACK) or dorgqr;
      ! the routine is called with M = N = K = nrow.
      !   matrix               : on entry holds the reflector vectors, its local data are
      !                          overwritten by the (p)dorgqr call
      !   tau                  : scalar factors of the reflectors
      !   nrow                 : order of the problem (used for rows, columns and reflectors)
      !   first_row, first_col : origin of the submatrix; must both be 1 without ScaLAPACK

      TYPE(cp_fm_type), INTENT(IN)                       :: matrix
      REAL(KIND=dp), DIMENSION(:), POINTER               :: tau
      INTEGER, INTENT(IN)                                :: nrow, first_row, first_col

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_pdorgqr'

      INTEGER                                            :: handle
      INTEGER                                            :: info, lwork
      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: a
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: work
#if defined(__parallel)
      INTEGER, DIMENSION(9) :: descc
#else
      INTEGER :: lda
#endif

      CALL timeset(routineN, handle)

      a => matrix%local_data
      ! The workspace query (lwork = -1) only writes work(1); one element is enough and
      ! stays valid for nrow == 0.
      lwork = -1
      ALLOCATE (work(1))
      info = 0

#if defined(__parallel)
      descc(:) = matrix%matrix_struct%descriptor(:)

      ! Workspace query
      CALL pdorgqr(nrow, nrow, nrow, a, first_row, first_col, descc, tau, work, lwork, info)
      cpassert(info == 0)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))

      ! Call SCALAPACK routine to get Q
      CALL pdorgqr(nrow, nrow, nrow, a, first_row, first_col, descc, tau, work, lwork, info)
#else
      cpassert(first_row == 1 .AND. first_col == 1)
      lda = SIZE(a, 1)
      ! Workspace query, then the actual generation of Q
      CALL dorgqr(nrow, nrow, nrow, a, lda, tau, work, lwork, info)
      lwork = INT(work(1))
      DEALLOCATE (work)
      ALLOCATE (work(lwork))
      CALL dorgqr(nrow, nrow, nrow, a, lda, tau, work, lwork, info)
#endif
      cpassert(info == 0)

      DEALLOCATE (work)
      CALL timestop(handle)

   END SUBROUTINE cp_fm_pdorgqr


! **************************************************************************************************

!> \brief Applies a planar rotation defined by cs and sn to the i'th and j'th rows.

!> \param matrix ...

!> \param irow ...

!> \param jrow ...

!> \param cs cosine of the rotation angle

!> \param sn sine of the rotation angle

!> \author Ole Schuett

! **************************************************************************************************


   SUBROUTINE cp_fm_rot_rows(matrix, irow, jrow, cs, sn)

      ! Applies the plane rotation (cs, sn) to rows irow and jrow of the matrix, in place.
      ! With ScaLAPACK and more than one process in the context pdrot is used, otherwise drot
      ! acts directly on the local data.
      !   matrix     : matrix whose rows are rotated
      !   irow, jrow : global indices of the two rows
      !   cs, sn     : cosine and sine of the rotation angle

      TYPE(cp_fm_type), INTENT(IN)             :: matrix
      INTEGER, INTENT(IN)                      :: irow, jrow
      REAL(dp), INTENT(IN)                     :: cs, sn

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_rot_rows'
      INTEGER                                  :: handle, ncol, nrow

#if defined(__parallel)
      INTEGER                                  :: info, lwork
      INTEGER, DIMENSION(9)                    :: desc
      REAL(dp), DIMENSION(:), ALLOCATABLE      :: work
#endif
      CALL timeset(routineN, handle)
      CALL cp_fm_get_info(matrix, nrow_global=nrow, ncol_global=ncol)
#if defined(__parallel)
      IF (1 /= matrix%matrix_struct%context%n_pid) THEN
         lwork = 2*ncol + 1
         ALLOCATE (work(lwork))
         desc(:) = matrix%matrix_struct%descriptor(:)
         ! pdrot selects a row vector through an increment equal to the global number of
         ! rows (M_X), not the number of columns; the two only coincide for square matrices.
         CALL pdrot(ncol, &
                    matrix%local_data(1, 1), irow, 1, desc, nrow, &
                    matrix%local_data(1, 1), jrow, 1, desc, nrow, &
                    cs, sn, work, lwork, info)
         cpassert(info == 0)
         DEALLOCATE (work)
      ELSE
#endif
         ! Consecutive elements of a row are one leading dimension apart in column-major
         ! storage, so the stride is the row extent of the local data.
         CALL drot(ncol, matrix%local_data(irow, 1), SIZE(matrix%local_data, 1), &
                   matrix%local_data(jrow, 1), SIZE(matrix%local_data, 1), cs, sn)
#if defined(__parallel)
      END IF
#endif
      CALL timestop(handle)

   END SUBROUTINE cp_fm_rot_rows


! **************************************************************************************************

!> \brief Applies a planar rotation defined by cs and sn to the i'th and j'th columns.

!> \param matrix ...

!> \param icol ...

!> \param jcol ...

!> \param cs cosine of the rotation angle

!> \param sn sine of the rotation angle

!> \author Ole Schuett

! **************************************************************************************************


   SUBROUTINE cp_fm_rot_cols(matrix, icol, jcol, cs, sn)
      ! Applies the planar rotation (cs, sn) in place to columns icol and jcol
      ! of matrix, over all nrow_global rows.
      !   matrix : full matrix whose local data is overwritten
      !   icol   : global index of the first column
      !   jcol   : global index of the second column
      !   cs, sn : cosine and sine of the rotation angle

      TYPE(cp_fm_type), INTENT(IN)             :: matrix
      INTEGER, INTENT(IN)                      :: icol, jcol
      REAL(dp), INTENT(IN)                     :: cs, sn

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_rot_cols'

      INTEGER                                  :: handle, n_elem
#if defined(__parallel)
      INTEGER                                  :: ierr, n_scratch
      INTEGER, DIMENSION(9)                    :: desc_mat
      REAL(dp), DIMENSION(:), ALLOCATABLE      :: scratch
#endif

      CALL timeset(routinen, handle)

      ! A column vector has one element per global row.
      CALL cp_fm_get_info(matrix, nrow_global=n_elem)

#if defined(__parallel)
      IF (matrix%matrix_struct%context%n_pid == 1) THEN
#endif
         ! Single process (or serial build): columns are contiguous, so plain
         ! BLAS with unit stride is used.
         CALL drot(n_elem, matrix%local_data(1, icol), 1, matrix%local_data(1, jcol), 1, cs, sn)
#if defined(__parallel)
      ELSE
         ! Several processes: let ScaLAPACK rotate the distributed columns.
         desc_mat(:) = matrix%matrix_struct%descriptor(:)
         n_scratch = 2*n_elem + 1
         ALLOCATE (scratch(n_scratch))
         CALL pdrot(n_elem, &
                    matrix%local_data(1, 1), 1, icol, desc_mat, 1, &
                    matrix%local_data(1, 1), 1, jcol, desc_mat, 1, &
                    cs, sn, scratch, n_scratch, ierr)
         cpassert(ierr == 0)
         DEALLOCATE (scratch)
      END IF
#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_rot_cols


! **************************************************************************************************

!> \brief Orthonormalizes selected rows and columns of a full matrix, matrix_a

!> \param matrix_a ...

!> \param B ...

!> \param nrows number of rows to select, optional, defaults to size(B,1)

!> \param ncols number of columns to select, optional, defaults to size(B,2)

!> \param start_row starting index of rows, optional, defaults to 1

!> \param start_col starting index of columns, optional, defaults to 1

!> \param do_norm ...

!> \param do_print ...

! **************************************************************************************************


   SUBROUTINE cp_fm_gram_schmidt_orthonorm(matrix_a, B, nrows, ncols, start_row, start_col, &
                                           do_norm, do_print)
      ! Copies the locally stored columns of matrix_a that fall into the
      ! requested global column range into b, orthogonalizes them one after the
      ! other against the previously processed local columns (Gram-Schmidt) and
      ! finally sums b over all processes of the parallel environment.
      !   matrix_a  : source full matrix (not modified)
      !   b         : result; column k holds global column start_col + k - 1
      !   nrows     : number of rows requested (default SIZE(b,1))
      !   ncols     : number of columns requested (default SIZE(b,2))
      !   start_row : first global row (default 1)
      !   start_col : first global column (default 1)
      !   do_norm   : normalize every orthogonalized column (default .true.)
      !   do_print  : print column index and norm; only honoured when
      !               normalizing and a default output unit is available
      ! NOTE(review): the local row range is determined below but never applied;
      ! every copy takes all rows of local_data, so SIZE(b,1) has to match
      ! SIZE(matrix_a%local_data,1) - confirm with the callers.

      TYPE(cp_fm_type), INTENT(IN)                       :: matrix_a
      REAL(kind=dp), DIMENSION(:, :), INTENT(OUT)        :: b
      INTEGER, INTENT(IN), OPTIONAL                      :: nrows, ncols, start_row, start_col
      LOGICAL, INTENT(IN), OPTIONAL                      :: do_norm, do_print

      CHARACTER(len=*), PARAMETER :: routinen = 'cp_fm_Gram_Schmidt_orthonorm'

      INTEGER :: end_col_global, end_col_local, end_row_global, end_row_local, handle, i, j, &
                 j_col, ncol_global, ncol_local, nrow_global, nrow_local, start_col_global, &
                 start_col_local, start_row_global, start_row_local, this_col, unit_nr
      INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
      LOGICAL                                            :: my_do_norm, my_do_print
      REAL(kind=dp)                                      :: norm
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: a

      CALL timeset(routinen, handle)

      my_do_norm = .true.
      IF (PRESENT(do_norm)) my_do_norm = do_norm

      ! Norms are only computed when normalizing, so printing requires it.
      my_do_print = .false.
      IF (PRESENT(do_print) .AND. (my_do_norm)) my_do_print = do_print

      unit_nr = -1
      IF (my_do_print) THEN
         unit_nr = cp_logger_get_default_unit_nr()
         IF (unit_nr < 1) my_do_print = .false.
      END IF

      IF (SIZE(b) /= 0) THEN

         ! b is INTENT(OUT) and only the locally held columns are written
         ! below. Zero it first so that the untouched entries are defined and
         ! contribute nothing to the global sum at the end.
         b(:, :) = 0.0_dp

         IF (PRESENT(nrows)) THEN
            nrow_global = nrows
         ELSE
            nrow_global = SIZE(b, 1)
         END IF

         IF (PRESENT(ncols)) THEN
            ncol_global = ncols
         ELSE
            ncol_global = SIZE(b, 2)
         END IF

         IF (PRESENT(start_row)) THEN
            start_row_global = start_row
         ELSE
            start_row_global = 1
         END IF

         IF (PRESENT(start_col)) THEN
            start_col_global = start_col
         ELSE
            start_col_global = 1
         END IF

         end_row_global = start_row_global + nrow_global - 1
         end_col_global = start_col_global + ncol_global - 1

         ! From here on nrow_global/ncol_global hold the matrix dimensions.
         CALL cp_fm_get_info(matrix=matrix_a, &
                             nrow_global=nrow_global, ncol_global=ncol_global, &
                             nrow_local=nrow_local, ncol_local=ncol_local, &
                             row_indices=row_indices, col_indices=col_indices)

         ! Clip the requested range to the matrix.
         IF (end_row_global > nrow_global) THEN
            end_row_global = nrow_global
         END IF
         IF (end_col_global > ncol_global) THEN
            end_col_global = ncol_global
         END IF

         ! find out row/column indices of locally stored matrix elements that
         ! need to be copied.
         ! Arrays row_indices and col_indices are assumed to be sorted in
         ! ascending order
         DO start_row_local = 1, nrow_local
            IF (row_indices(start_row_local) >= start_row_global) EXIT
         END DO

         DO end_row_local = start_row_local, nrow_local
            IF (row_indices(end_row_local) > end_row_global) EXIT
         END DO
         end_row_local = end_row_local - 1

         DO start_col_local = 1, ncol_local
            IF (col_indices(start_col_local) >= start_col_global) EXIT
         END DO

         DO end_col_local = start_col_local, ncol_local
            IF (col_indices(end_col_local) > end_col_global) EXIT
         END DO
         end_col_local = end_col_local - 1

         a => matrix_a%local_data

         ! Skip the local work when this process stores no column of the
         ! requested range; col_indices(start_col_local) would otherwise be
         ! read past its end or address a column outside the range.
         IF (start_col_local <= end_col_local) THEN

            ! The first local column has nothing to be orthogonalized against.
            this_col = col_indices(start_col_local) - start_col_global + 1
            b(:, this_col) = a(:, start_col_local)

            IF (my_do_norm) THEN
               norm = sqrt(accurate_dot_product(b(:, this_col), b(:, this_col)))
               b(:, this_col) = b(:, this_col)/norm
               IF (my_do_print) WRITE (unit_nr, '(I3,F8.3)') this_col, norm
            END IF

            DO i = start_col_local + 1, end_col_local
               this_col = col_indices(i) - start_col_global + 1
               b(:, this_col) = a(:, i)

               ! Remove the components along all previously processed local
               ! columns, one projection at a time.
               DO j = start_col_local, i - 1
                  j_col = col_indices(j) - start_col_global + 1
                  b(:, this_col) = b(:, this_col) - &
                                   accurate_dot_product(b(:, j_col), b(:, this_col))* &
                                   b(:, j_col)/accurate_dot_product(b(:, j_col), b(:, j_col))
               END DO

               IF (my_do_norm) THEN
                  norm = sqrt(accurate_dot_product(b(:, this_col), b(:, this_col)))
                  b(:, this_col) = b(:, this_col)/norm
                  IF (my_do_print) WRITE (unit_nr, '(I3,F8.3)') this_col, norm
               END IF

            END DO

         END IF

         ! Collective call: every process takes part, also those without local
         ! columns in the range.
         CALL matrix_a%matrix_struct%para_env%sum(b)

      END IF

      CALL timestop(handle)

   END SUBROUTINE cp_fm_gram_schmidt_orthonorm


! **************************************************************************************************

!> \brief Cholesky decomposition

!> \param fm_matrix ...

!> \param n ...

!> \param uplo triangular format; defaults to 'U'

! **************************************************************************************************


   SUBROUTINE cp_fm_potrf(fm_matrix, n, uplo)
      ! Cholesky-factorizes the leading n x n part of fm_matrix in place by
      ! calling (P)SPOTRF or (P)DPOTRF, depending on fm_matrix%use_sp.
      !   fm_matrix : matrix to factorize; its local data is overwritten
      !   n         : order of the matrix passed to the library routine
      !   uplo      : triangle selector handed to the library routine,
      !               'U' when absent
      ! Aborts when the library routine returns a non-zero info.

      TYPE(cp_fm_type)                         :: fm_matrix
      INTEGER, INTENT(IN)                      :: n
      CHARACTER, INTENT(IN), OPTIONAL          :: uplo

      CHARACTER                                :: triangle
      INTEGER                                  :: ierr
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: mat_dp
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: mat_sp
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc
#endif

      IF (PRESENT(uplo)) THEN
         triangle = uplo
      ELSE
         triangle = 'U'
      END IF

#if defined(__parallel)
      desc(:) = fm_matrix%matrix_struct%descriptor(:)
#endif

      IF (fm_matrix%use_sp) THEN
         ! single precision storage
         mat_sp => fm_matrix%local_data_sp
#if defined(__parallel)
         CALL pspotrf(triangle, n, mat_sp(1, 1), 1, 1, desc, ierr)
#else
         CALL spotrf(triangle, n, mat_sp(1, 1), SIZE(mat_sp, 1), ierr)
#endif
      ELSE
         ! double precision storage
         mat_dp => fm_matrix%local_data
#if defined(__parallel)
         CALL pdpotrf(triangle, n, mat_dp(1, 1), 1, 1, desc, ierr)
#else
         CALL dpotrf(triangle, n, mat_dp(1, 1), SIZE(mat_dp, 1), ierr)
#endif
      END IF

      IF (ierr /= 0) THEN
         cpabort("Cholesky decomposition failed. Matrix ill-conditioned?")
      END IF

   END SUBROUTINE cp_fm_potrf


! **************************************************************************************************

!> \brief Inverts a symmetric positive definite matrix given its Cholesky factor (wraps (P)xPOTRI)

!> \param fm_matrix the matrix to invert (triangular matrix according to uplo)

!> \param n size of the matrix to invert

!> \param uplo triangular format; defaults to 'U'

! **************************************************************************************************


   SUBROUTINE cp_fm_potri(fm_matrix, n, uplo)
      ! Calls (P)SPOTRI or (P)DPOTRI, depending on fm_matrix%use_sp, on the
      ! leading n x n part of fm_matrix; the local data is overwritten.
      !   fm_matrix : matrix handed to the library routine
      !   n         : order of the matrix
      !   uplo      : triangle selector handed to the library routine,
      !               'U' when absent
      ! A non-zero info from the library routine trips an assertion.

      TYPE(cp_fm_type)                         :: fm_matrix
      INTEGER, INTENT(IN)                      :: n
      CHARACTER, INTENT(IN), OPTIONAL          :: uplo

      CHARACTER                                :: triangle
      INTEGER                                  :: ierr
      REAL(kind=dp), DIMENSION(:, :), POINTER  :: mat_dp
      REAL(kind=sp), DIMENSION(:, :), POINTER  :: mat_sp
#if defined(__parallel)
      INTEGER, DIMENSION(9)                    :: desc
#endif

      IF (PRESENT(uplo)) THEN
         triangle = uplo
      ELSE
         triangle = 'U'
      END IF

#if defined(__parallel)
      desc(:) = fm_matrix%matrix_struct%descriptor(:)
#endif

      IF (fm_matrix%use_sp) THEN
         ! single precision storage
         mat_sp => fm_matrix%local_data_sp
#if defined(__parallel)
         CALL pspotri(triangle, n, mat_sp(1, 1), 1, 1, desc, ierr)
#else
         CALL spotri(triangle, n, mat_sp(1, 1), SIZE(mat_sp, 1), ierr)
#endif
      ELSE
         ! double precision storage
         mat_dp => fm_matrix%local_data
#if defined(__parallel)
         CALL pdpotri(triangle, n, mat_dp(1, 1), 1, 1, desc, ierr)
#else
         CALL dpotri(triangle, n, mat_dp(1, 1), SIZE(mat_dp, 1), ierr)
#endif
      END IF

      cpassert(ierr == 0)

   END SUBROUTINE cp_fm_potri


! **************************************************************************************************

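!> \brief Copies the first neig columns of fm_matrix to fm_matrixout and then solves with (op == "SOLVE") or multiplies by (any other op) the upper triangular matrix fm_matrixb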

!> \param fm_matrix ...

!> \param neig ...

!> \param fm_matrixb ...

!> \param fm_matrixout ...

!> \param op ...

!> \param pos ...

!> \param transa ...

! **************************************************************************************************


   SUBROUTINE cp_fm_cholesky_restore(fm_matrix, neig, fm_matrixb, fm_matrixout, op, pos, transa)
      ! Copies the first neig columns of fm_matrix into fm_matrixout and then
      ! applies the upper triangular matrix stored in fm_matrixb to them:
      !   op == "SOLVE" : triangular solve   ((P)xTRSM)
      !   otherwise     : triangular product ((P)xTRMM)
      !   fm_matrix    : input matrix, n = nrow_global rows
      !   neig         : number of columns to process
      !   fm_matrixb   : triangular (Cholesky) matrix, upper triangle is used
      !   fm_matrixout : result
      !   op           : operation selector, see above
      !   pos          : side argument passed on to xTRSM/xTRMM
      !   transa       : transpose flag passed on to xTRSM/xTRMM
      ! Single precision routines are used when fm_matrix%use_sp is set.

      TYPE(cp_fm_type)                               :: fm_matrix
      TYPE(cp_fm_type)                               :: fm_matrixb
      TYPE(cp_fm_type)                               :: fm_matrixout
      INTEGER, INTENT(IN)                            :: neig
      CHARACTER(LEN=*), INTENT(IN)                   :: op
      CHARACTER(LEN=*), INTENT(IN)                   :: pos
      CHARACTER(LEN=*), INTENT(IN)                   :: transa

      REAL(kind=dp), DIMENSION(:, :), POINTER        :: a, b, outm
      REAL(kind=sp), DIMENSION(:, :), POINTER        :: a_sp, b_sp, outm_sp
      INTEGER                                        :: n
      REAL(kind=dp)                                  :: alpha
#if defined(__parallel)
      INTEGER                                        :: i
      INTEGER, DIMENSION(9)                          :: desca, descb, descout
#endif

      ! notice b is the cholesky guy
      a => fm_matrix%local_data
      b => fm_matrixb%local_data
      outm => fm_matrixout%local_data
      a_sp => fm_matrix%local_data_sp
      b_sp => fm_matrixb%local_data_sp
      outm_sp => fm_matrixout%local_data_sp

      n = fm_matrix%matrix_struct%nrow_global
      alpha = 1.0_dp

#if defined(__parallel)
      desca(:) = fm_matrix%matrix_struct%descriptor(:)
      descb(:) = fm_matrixb%matrix_struct%descriptor(:)
      descout(:) = fm_matrixout%matrix_struct%descriptor(:)

      ! Copy the distributed input column by column.
      DO i = 1, neig
         IF (fm_matrix%use_sp) THEN
            CALL pscopy(n, a_sp(1, 1), 1, i, desca, 1, outm_sp(1, 1), 1, i, descout, 1)
         ELSE
            CALL pdcopy(n, a(1, 1), 1, i, desca, 1, outm(1, 1), 1, i, descout, 1)
         END IF
      END DO

      IF (op .EQ. "SOLVE") THEN
         IF (fm_matrix%use_sp) THEN
            CALL pstrsm(pos, 'U', transa, 'N', n, neig, real(alpha, sp), b_sp(1, 1), 1, 1, descb, &
                        outm_sp(1, 1), 1, 1, descout)
         ELSE
            CALL pdtrsm(pos, 'U', transa, 'N', n, neig, alpha, b(1, 1), 1, 1, descb, outm(1, 1), 1, 1, descout)
         END IF
      ELSE
         IF (fm_matrix%use_sp) THEN
            CALL pstrmm(pos, 'U', transa, 'N', n, neig, real(alpha, sp), b_sp(1, 1), 1, 1, descb, &
                        outm_sp(1, 1), 1, 1, descout)
         ELSE
            CALL pdtrmm(pos, 'U', transa, 'N', n, neig, alpha, b(1, 1), 1, 1, descb, outm(1, 1), 1, 1, descout)
         END IF
      END IF
#else
      ! The flat copy treats both arrays as contiguous n x neig blocks, i.e.
      ! it relies on a leading dimension of n for a and outm.
      IF (fm_matrix%use_sp) THEN
         CALL scopy(neig*n, a_sp(1, 1), 1, outm_sp(1, 1), 1)
      ELSE
         CALL dcopy(neig*n, a(1, 1), 1, outm(1, 1), 1)
      END IF

      ! The leading dimension of the triangular matrix is taken from its own
      ! array in both branches: with pos = 'R' it is of order neig and its
      ! leading dimension need not be n.
      IF (op .EQ. "SOLVE") THEN
         IF (fm_matrix%use_sp) THEN
            CALL strsm(pos, 'U', transa, 'N', n, neig, real(alpha, sp), b_sp(1, 1), SIZE(b_sp, 1), outm_sp(1, 1), n)
         ELSE
            CALL dtrsm(pos, 'U', transa, 'N', n, neig, alpha, b(1, 1), SIZE(b, 1), outm(1, 1), n)
         END IF
      ELSE
         IF (fm_matrix%use_sp) THEN
            CALL strmm(pos, 'U', transa, 'N', n, neig, real(alpha, sp), b_sp(1, 1), SIZE(b_sp, 1), outm_sp(1, 1), n)
         ELSE
            CALL dtrmm(pos, 'U', transa, 'N', n, neig, alpha, b(1, 1), SIZE(b, 1), outm(1, 1), n)
         END IF
      END IF
#endif

   END SUBROUTINE cp_fm_cholesky_restore


! **************************************************************************************************

!> \brief Calculates

!>        yv = alpha*amat*xv + beta*yv

!>        where amat: fm matrix

!>              xv  : vector replicated

!>              yv  : vector replicated

!>        Defaults: alpha = 1, beta = 0

! **************************************************************************************************


   SUBROUTINE cp_fm_matvec(amat, xv, yv, alpha, beta)
      ! Computes yv = alpha*amat*xv + beta*yv for a full matrix amat and
      ! vectors xv, yv that are replicated on every process.
      !   amat  : na x nc full matrix (double precision storage only)
      !   xv    : input vector, must have nc elements
      !   yv    : in/out vector, must have na elements
      !   alpha : scale factor of the product, default 1
      !   beta  : scale factor of the incoming yv, default 0; with beta == 0
      !           the incoming values of yv are not read
      ! Aborts for single precision matrices and for mismatching dimensions.

      TYPE(cp_fm_type), INTENT(IN)                   :: amat
      REAL(kind=dp), DIMENSION(:), INTENT(IN)        :: xv
      REAL(kind=dp), DIMENSION(:), INTENT(INOUT)     :: yv
      REAL(kind=dp), OPTIONAL, INTENT(IN)            :: alpha, beta

      INTEGER                                        :: na, nc, nx, ny
      REAL(kind=dp)                                  :: aval, bval
#if defined(__parallel)
      INTEGER                                        :: nrl, ncl, ic, ir
      INTEGER, DIMENSION(:), POINTER                 :: rind, cind
      REAL(kind=dp), DIMENSION(:), ALLOCATABLE       :: xvl, yvl, yvm
#endif

      IF (amat%use_sp) THEN
         cpabort("cp_fm_matvec: SP option not available")
      END IF

      aval = 1.0_dp
      IF (PRESENT(alpha)) aval = alpha
      bval = 0.0_dp
      IF (PRESENT(beta)) bval = beta

      CALL cp_fm_get_info(amat, nrow_global=na, ncol_global=nc)
      nx = SIZE(xv)
      ny = SIZE(yv)

      ! xv is multiplied from the right and needs one element per column of
      ! amat, yv receives the product and needs one element per row.
      IF ((na /= ny) .OR. (nc /= nx)) THEN
         cpabort("cp_fm_matvec: incompatible dimensions")
      END IF

#if defined(__parallel)
      CALL cp_fm_get_info(amat, nrow_local=nrl, ncol_local=ncl, &
                          row_indices=rind, col_indices=cind)
      ALLOCATE (xvl(ncl), yvl(nrl), yvm(ny))

      ! Gather the entries of xv that belong to the locally stored columns.
      DO ic = 1, ncl
         xvl(ic) = xv(cind(ic))
      END DO

      ! Only the logical nrl x ncl local block takes part in the product.
      ! NOTE(review): local_data may be allocated larger than that block
      ! (padding) - the explicit section is correct in either case.
      yvl(1:nrl) = matmul(amat%local_data(1:nrl, 1:ncl), xvl(1:ncl))

      ! Scatter the partial results to their global rows and add up the
      ! contributions of all processes.
      yvm = 0.0_dp
      DO ir = 1, nrl
         yvm(rind(ir)) = yvl(ir)
      END DO
      CALL amat%matrix_struct%para_env%sum(yvm)

      IF (bval == 0.0_dp) THEN
         yv = aval*yvm
      ELSE
         yv = bval*yv + aval*yvm
      END IF
#else
      IF (bval == 0.0_dp) THEN
         yv = aval*matmul(amat%local_data(1:na, 1:nc), xv)
      ELSE
         yv = bval*yv + aval*matmul(amat%local_data(1:na, 1:nc), xv)
      END IF
#endif

   END SUBROUTINE cp_fm_matvec


END MODULE cp_fm_basic_linalg

dgemm
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Definition grid_cpu_task_list.c:214

cp_fm_basic_linalg::cp_fm_contracted_trace
Definition cp_fm_basic_linalg.F:85

cp_fm_basic_linalg::cp_fm_trace
Definition cp_fm_basic_linalg.F:75

cp_fm_types::cp_fm_release
Definition cp_fm_types.F:87

cp_fm_types::cp_fm_to_fm
Definition cp_fm_types.F:82

cp_log_handling::cp_to_string
Definition cp_log_handling.F:90

kahan_sum::accurate_dot_product
Definition kahan_sum.F:52

kahan_sum::accurate_sum
Definition kahan_sum.F:41

mathlib::invert_matrix
Definition mathlib.F:70

cp_blacs_env
methods related to the blacs parallel environment
Definition cp_blacs_env.F:15

cp_fm_basic_linalg
Basic linear algebra operations for full matrices.
Definition cp_fm_basic_linalg.F:14

cp_fm_basic_linalg::cp_fm_rot_rows
subroutine, public cp_fm_rot_rows(matrix, irow, jrow, cs, sn)
Applies a planar rotation defined by cs and sn to the i'th and j'th rows.
Definition cp_fm_basic_linalg.F:2880

cp_fm_basic_linalg::cp_fm_row_scale
subroutine, public cp_fm_row_scale(matrixa, scaling)
scales row i of matrix a with scaling(i)
Definition cp_fm_basic_linalg.F:1948

cp_fm_basic_linalg::cp_fm_gemm
subroutine, public cp_fm_gemm(transa, transb, m, n, k, alpha, matrix_a, matrix_b, beta, matrix_c, a_first_col, a_first_row, b_first_col, b_first_row, c_first_col, c_first_row)
computes matrix_c = beta * matrix_c + alpha * ( matrix_a ** transa ) * ( matrix_b ** transb )
Definition cp_fm_basic_linalg.F:440

cp_fm_basic_linalg::cp_fm_column_scale
subroutine, public cp_fm_column_scale(matrixa, scaling)
scales column i of matrix a with scaling(i)
Definition cp_fm_basic_linalg.F:1885

cp_fm_basic_linalg::cp_fm_rot_cols
subroutine, public cp_fm_rot_cols(matrix, icol, jcol, cs, sn)
Applies a planar rotation defined by cs and sn to the i'th and j'th columns.
Definition cp_fm_basic_linalg.F:2924

cp_fm_basic_linalg::cp_fm_solve
subroutine, public cp_fm_solve(matrix_a, general_a)
computes the solution to A*b=A_general using LU decomposition
Definition cp_fm_basic_linalg.F:2338

cp_fm_basic_linalg::cp_fm_pdgeqpf
subroutine, public cp_fm_pdgeqpf(matrix, tau, nrow, ncol, first_row, first_col)
compute a QR factorization with column pivoting of a M-by-N distributed matrix sub( A ) = A(IA:IA+M-1...
Definition cp_fm_basic_linalg.F:2747

cp_fm_basic_linalg::cp_fm_frobenius_norm
real(kind=dp) function, public cp_fm_frobenius_norm(matrix_a)
computes the Frobenius norm of matrix_a
Definition cp_fm_basic_linalg.F:616

cp_fm_basic_linalg::cp_fm_det
subroutine, public cp_fm_det(matrix_a, det_a)
Computes the determinant (with a correct sign even in parallel environment!) of a real square matrix.
Definition cp_fm_basic_linalg.F:98

cp_fm_basic_linalg::cp_fm_transpose
subroutine, public cp_fm_transpose(matrix, matrixt)
transposes a matrix matrixt = matrix ^ T
Definition cp_fm_basic_linalg.F:1695

cp_fm_basic_linalg::cp_fm_qr_factorization
subroutine, public cp_fm_qr_factorization(matrix_a, matrix_r, nrow_fact, ncol_fact, first_row, first_col, uplo)
performs a QR factorization of the input rectangular matrix A or of a submatrix of A the computed tri...
Definition cp_fm_basic_linalg.F:2245

cp_fm_basic_linalg::cp_fm_gram_schmidt_orthonorm
subroutine, public cp_fm_gram_schmidt_orthonorm(matrix_a, b, nrows, ncols, start_row, start_col, do_norm, do_print)
Orthonormalizes selected rows and columns of a full matrix, matrix_a.
Definition cp_fm_basic_linalg.F:2971

cp_fm_basic_linalg::cp_fm_syrk
subroutine, public cp_fm_syrk(uplo, trans, k, alpha, matrix_a, ia, ja, beta, matrix_c)
performs a rank-k update of a symmetric matrix_c matrix_c = beta * matrix_c + alpha * matrix_a * tran...
Definition cp_fm_basic_linalg.F:660

cp_fm_basic_linalg::cp_fm_potrf
subroutine, public cp_fm_potrf(fm_matrix, n, uplo)
Cholesky decomposition.
Definition cp_fm_basic_linalg.F:3105

cp_fm_basic_linalg::cp_fm_potri
subroutine, public cp_fm_potri(fm_matrix, n, uplo)
Inverts a symmetric positive definite matrix given its Cholesky factor (wraps (P)xPOTRI).
Definition cp_fm_basic_linalg.F:3148

cp_fm_basic_linalg::cp_fm_geadd
subroutine, public cp_fm_geadd(alpha, trans, matrix_a, beta, matrix_b)
interface to BLACS geadd: matrix_b = beta*matrix_b + alpha*opt(matrix_a) where opt(matrix_a) can be e...
Definition cp_fm_basic_linalg.F:271

cp_fm_basic_linalg::cp_fm_schur_product
subroutine, public cp_fm_schur_product(matrix_a, matrix_b, matrix_c)
computes the schur product of two matrices c_ij = a_ij * b_ij
Definition cp_fm_basic_linalg.F:713

cp_fm_basic_linalg::cp_fm_norm
real(kind=dp) function, public cp_fm_norm(matrix, mode)
norm of matrix using (p)dlange
Definition cp_fm_basic_linalg.F:2595

cp_fm_basic_linalg::cp_fm_cholesky_restore
subroutine, public cp_fm_cholesky_restore(fm_matrix, neig, fm_matrixb, fm_matrixout, op, pos, transa)
...
Definition cp_fm_basic_linalg.F:3193

cp_fm_basic_linalg::cp_fm_scale_and_add
subroutine, public cp_fm_scale_and_add(alpha, matrix_a, beta, matrix_b)
calc A <- alpha*A + beta*B optimized for alpha == 1.0 (just add beta*B) and beta == 0....
Definition cp_fm_basic_linalg.F:167

cp_fm_basic_linalg::cp_fm_uplo_to_full
subroutine, public cp_fm_uplo_to_full(matrix, work, uplo)
given a triangular matrix according to uplo, computes the corresponding full matrix
Definition cp_fm_basic_linalg.F:1747

cp_fm_basic_linalg::cp_fm_invert
subroutine, public cp_fm_invert(matrix_a, matrix_inverse, det_a, eps_svd, eigval)
Inverts a cp_fm_type matrix, optionally returning the determinant of the input matrix.
Definition cp_fm_basic_linalg.F:2019

cp_fm_basic_linalg::cp_complex_fm_gemm
subroutine, public cp_complex_fm_gemm(transa, transb, m, n, k, alpha, a_re, a_im, b_re, b_im, beta, c_re, c_im, a_first_col, a_first_row, b_first_col, b_first_row, c_first_col, c_first_row)
Convenience function. Computes the matrix multiplications needed for the multiplication of complex ma...
Definition cp_fm_basic_linalg.F:2410

cp_fm_basic_linalg::cp_fm_scale
subroutine, public cp_fm_scale(alpha, matrix_a)
scales a matrix matrix_a = alpha * matrix_b
Definition cp_fm_basic_linalg.F:1665

cp_fm_basic_linalg::cp_fm_triangular_invert
subroutine, public cp_fm_triangular_invert(matrix_a, uplo_tr)
inverts a triangular matrix
Definition cp_fm_basic_linalg.F:2197

cp_fm_basic_linalg::cp_fm_symm
subroutine, public cp_fm_symm(side, uplo, m, n, alpha, matrix_a, matrix_b, beta, matrix_c)
computes matrix_c = beta * matrix_c + alpha * matrix_a * matrix_b computes matrix_c = beta * matrix_c...
Definition cp_fm_basic_linalg.F:563

cp_fm_basic_linalg::cp_fm_matvec
subroutine, public cp_fm_matvec(amat, xv, yv, alpha, beta)
Calculates yv = alpha*amat*xv + beta*yv where amat: fm matrix xv : vector replicated yv : vector repl...
Definition cp_fm_basic_linalg.F:3281

cp_fm_basic_linalg::cp_fm_triangular_multiply
subroutine, public cp_fm_triangular_multiply(triangular_matrix, matrix_b, side, transpose_tr, invert_tr, uplo_tr, unit_diag_tr, n_rows, n_cols, alpha)
multiplies in place by a triangular matrix: matrix_b = alpha op(triangular_matrix) matrix_b or (if si...
Definition cp_fm_basic_linalg.F:1577

cp_fm_basic_linalg::cp_fm_pdorgqr
subroutine, public cp_fm_pdorgqr(matrix, tau, nrow, first_row, first_col)
generates an M-by-N real distributed matrix Q denoting A(IA:IA+M-1,JA:JA+N-1) with orthonormal column...
Definition cp_fm_basic_linalg.F:2819

cp_fm_struct
represent the structure of a full matrix
Definition cp_fm_struct.F:14

cp_fm_struct::cp_fm_struct_equivalent
logical function, public cp_fm_struct_equivalent(fmstruct1, fmstruct2)
returns true if the two matrix structures are equivalent, false otherwise.
Definition cp_fm_struct.F:388

cp_fm_types
represent a full matrix distributed on many processors
Definition cp_fm_types.F:15

cp_fm_types::cp_fm_get_diag
subroutine, public cp_fm_get_diag(matrix, diag)
returns the diagonal elements of a fm
Definition cp_fm_types.F:563

cp_fm_types::cp_fm_get_info
subroutine, public cp_fm_get_info(matrix, name, nrow_global, ncol_global, nrow_block, ncol_block, nrow_local, ncol_local, row_indices, col_indices, local_data, context, nrow_locals, ncol_locals, matrix_struct, para_env)
returns all kind of information about the full matrix
Definition cp_fm_types.F:1009

cp_fm_types::cp_fm_set_submatrix
subroutine, public cp_fm_set_submatrix(fm, new_values, start_row, start_col, n_rows, n_cols, alpha, beta, transpose)
sets a submatrix of a full matrix fm(start_row:start_row+n_rows,start_col:start_col+n_cols) = alpha*o...
Definition cp_fm_types.F:761

cp_fm_types::cp_fm_set_all
subroutine, public cp_fm_set_all(matrix, alpha, beta)
set all elements of a matrix to the same value, and optionally the diagonal to a different one
Definition cp_fm_types.F:528

cp_fm_types::cp_fm_get_submatrix
subroutine, public cp_fm_get_submatrix(fm, target_m, start_row, start_col, n_rows, n_cols, transpose)
gets a submatrix of a full matrix op(target_m)(1:n_rows,1:n_cols) =fm(start_row:start_row+n_rows,...
Definition cp_fm_types.F:894

cp_fm_types::cp_fm_set_element
subroutine, public cp_fm_set_element(matrix, irow_global, icol_global, alpha)
sets an element of a matrix
Definition cp_fm_types.F:693

cp_fm_types::cp_fm_create
subroutine, public cp_fm_create(matrix, matrix_struct, name, use_sp)
creates a new full matrix with the given structure
Definition cp_fm_types.F:164

cp_log_handling
various routines to log and control the output. The idea is that decisions about where to log should ...
Definition cp_log_handling.F:41

cp_log_handling::cp_logger_get_default_unit_nr
recursive integer function, public cp_logger_get_default_unit_nr(logger, local, skip_not_ionode)
asks the default unit number of the given logger. try to use cp_logger_get_unit_nr
Definition cp_log_handling.F:567

kahan_sum
sums arrays of real/complex numbers with much reduced round-off as compared to a naive implementation...
Definition kahan_sum.F:29

kinds
Defines the basic variable types.
Definition kinds.F:23

kinds::int_8
integer, parameter, public int_8
Definition kinds.F:54

kinds::dp
integer, parameter, public dp
Definition kinds.F:34

kinds::sp
integer, parameter, public sp
Definition kinds.F:33

machine
Machine interface based on Fortran 2003 and POSIX.
Definition machine.F:17

machine::m_memory
subroutine, public m_memory(mem)
Returns the total amount of memory [bytes] in use, if known, zero otherwise.
Definition machine.F:440

mathlib
Collection of simple mathematical functions and subroutines.
Definition mathlib.F:15

mathlib::get_pseudo_inverse_svd
subroutine, public get_pseudo_inverse_svd(a, a_pinverse, rskip, determinant, sval)
returns the pseudoinverse of a real, square matrix using singular value decomposition
Definition mathlib.F:938

mathlib::diag
subroutine, public diag(n, a, d, v)
Diagonalize matrix a. The eigenvalues are returned in vector d and the eigenvectors are returned in m...
Definition mathlib.F:1496

message_passing
Interface to the message passing library MPI.
Definition message_passing.F:23

cp_blacs_env::cp_blacs_env_type
represent a blacs multidimensional parallel environment (for the mpi corrispective see cp_paratypes/m...
Definition cp_blacs_env.F:53

cp_fm_types::cp_fm_p_type
just to build arrays of pointers to matrices
Definition cp_fm_types.F:129

cp_fm_types::cp_fm_type
represent a full matrix
Definition cp_fm_types.F:113

message_passing::mp_comm_type
Definition message_passing.F:189