#include "../base/base_uses.f90"
   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_diag_utils'
   ! Container for the information on a (potential) redistribution
   TYPE, PUBLIC :: cp_fm_redistribute_info
      INTEGER :: matrix_order = -1
      INTEGER :: num_pe_old = -1
      INTEGER :: num_pe_new = -1
      INTEGER :: num_pe_opt = -1
      INTEGER :: num_pe_max_nz_col = -1
      LOGICAL :: redistribute = .FALSE.
   CONTAINS
      PROCEDURE, PASS(self) :: write => cp_fm_redistribute_info_write
   END TYPE cp_fm_redistribute_info
   ! Module-internal settings and work data for the redistribution
   TYPE cp_fm_redistribute_type
      INTEGER :: a = -1, x = -1
      LOGICAL :: should_print = .FALSE.
      LOGICAL :: elpa_force_redistribute = .FALSE.
      TYPE(mp_para_env_type), POINTER :: para_env_new => NULL()
      TYPE(cp_blacs_env_type), POINTER :: blacs_env_new => NULL()
      INTEGER, DIMENSION(:), POINTER :: group_distribution => NULL(), &
                                        group_partition => NULL()
   END TYPE cp_fm_redistribute_type
   TYPE(cp_fm_redistribute_type), PRIVATE, &
      SAVE :: work_redistribute
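   ! Note that work_redistribute is module-global, SAVEd state: cp_fm_redistribute_start fills it
   ! (split communicator, BLACS environment, group bookkeeping), and cp_fm_redistribute_end /
   ! cp_fm_redistribute_work_finalize consume and reset it, so the two calls are meant to be paired.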
   SUBROUTINE cp_fm_redistribute_info_write(self, io_unit)
      CLASS(cp_fm_redistribute_info), INTENT(IN) :: self
      INTEGER, INTENT(IN) :: io_unit
      WRITE (unit=io_unit, fmt="(A)") ""
      WRITE (unit=io_unit, fmt="(T2,A,T71,I10)") &
         "CP_FM_DIAG| Number of processes over which the matrix is distributed ", self%num_pe_old, &
         "CP_FM_DIAG| Matrix order ", self%matrix_order
      WRITE (unit=io_unit, fmt="(T2,A,T71,I10)") &
         "CP_FM_DIAG| Optimal number of CPUs ", self%num_pe_opt
      IF (self%num_pe_max_nz_col < 0) THEN
         WRITE (unit=io_unit, fmt="(T2,A,T71,A10)") &
            "CP_FM_DIAG| Maximum number of CPUs (with non-zero columns) ", "<N/A>"
      ELSE
         WRITE (unit=io_unit, fmt="(T2,A,T71,I10)") &
            "CP_FM_DIAG| Maximum number of CPUs (with non-zero columns): ", self%num_pe_max_nz_col
      END IF
      IF (self%redistribute) THEN
         WRITE (unit=io_unit, fmt="(T2,A,T71,I10)") &
            "CP_FM_DIAG| Number of processes for the redistribution ", self%num_pe_new
      ELSE
         WRITE (unit=io_unit, fmt="(T2,A)") &
            "CP_FM_DIAG| The matrix will NOT be redistributed"
      END IF
      WRITE (unit=io_unit, fmt="(A)") ""

   END SUBROUTINE cp_fm_redistribute_info_write
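   ! With the formats above, the report written to io_unit looks roughly like this
   ! (the numbers are purely illustrative):
   !
   !  CP_FM_DIAG| Number of processes over which the matrix is distributed          1024
   !  CP_FM_DIAG| Matrix order                                                      6000
   !  CP_FM_DIAG| Optimal number of CPUs                                             100
   !  CP_FM_DIAG| Maximum number of CPUs (with non-zero columns):                    512
   !  CP_FM_DIAG| Number of processes for the redistribution                         100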
   SUBROUTINE cp_fm_redistribute_work_finalize(has_redistributed)
      LOGICAL, INTENT(IN) :: has_redistributed

      IF (ASSOCIATED(work_redistribute%group_distribution)) THEN
         IF (has_redistributed) THEN
            ! the BLACS environment only exists on the ranks that held the redistributed matrix
            CALL cp_blacs_env_release(work_redistribute%blacs_env_new)
         END IF
         CALL work_redistribute%para_env_new%free()
         DEALLOCATE (work_redistribute%para_env_new)
         DEALLOCATE (work_redistribute%group_distribution)
         DEALLOCATE (work_redistribute%group_partition)
      END IF
      ! reset the work structure to its default (empty) state
      work_redistribute = cp_fm_redistribute_type()

   END SUBROUTINE cp_fm_redistribute_work_finalize
   SUBROUTINE cp_fm_redistribute_init(a, x, should_print, elpa_force_redistribute)
      INTEGER, INTENT(IN) :: a, x
      LOGICAL, INTENT(IN) :: should_print, elpa_force_redistribute

      ! reset the work structure to its default state before storing the new settings
      work_redistribute = cp_fm_redistribute_type()
      work_redistribute%a = a
      work_redistribute%x = x
      work_redistribute%should_print = should_print
      work_redistribute%elpa_force_redistribute = elpa_force_redistribute

   END SUBROUTINE cp_fm_redistribute_init
   PURE FUNCTION cp_fm_diag_get_optimal_ncpu(size) RESULT(ncpu)
      INTEGER, INTENT(IN) :: size
      INTEGER :: ncpu

      ncpu = ((size + work_redistribute%a*work_redistribute%x - 1)/ &
              (work_redistribute%a*work_redistribute%x))*work_redistribute%a

   END FUNCTION cp_fm_diag_get_optimal_ncpu
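   ! The expression above is an integer ceiling, ncpu = CEILING(REAL(size)/REAL(a*x))*a, i.e. roughly
   ! one process per x rows of the matrix, rounded up to a multiple of a. A minimal worked example,
   ! assuming purely illustrative parameters a = 4 and x = 60 (the real values are whatever was
   ! passed to cp_fm_redistribute_init):
   !    size = 1000  ->  ncpu = ((1000 + 239)/240)*4 = 5*4 = 20
   !    size =  240  ->  ncpu = (( 240 + 239)/240)*4 = 1*4 =  4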
#if defined(__parallel)
   FUNCTION cp_fm_max_ncpu_non_zero_column(matrix) RESULT(ncpu)
      TYPE(cp_fm_type), INTENT(IN) :: matrix
      INTEGER :: ncpu

      INTEGER :: gcd_max, ipe, jpe, ncol_block, &
                 ncol_global, npcol, nrow_block, &
                 nrow_global, num_pe_old, nzero
      INTEGER, DIMENSION(:), POINTER :: ncol_locals
      INTEGER, EXTERNAL :: numroc

      NULLIFY (ncol_locals)
      ! start from the current layout: how many processes own at least one column?
      CALL cp_fm_get_info(matrix, ncol_locals=ncol_locals, &
                          nrow_global=nrow_global, ncol_global=ncol_global, &
                          nrow_block=nrow_block, ncol_block=ncol_block)
      nzero = COUNT(ncol_locals == 0)
      num_pe_old = matrix%matrix_struct%para_env%num_pe
      ncpu = num_pe_old - nzero

      ! keep the number of CPUs even
      ncpu = ncpu - MODULO(ncpu, 2)

      ! if no process owns an empty column block, the current number of CPUs is already fine
      IF (ncpu == num_pe_old) &
         RETURN

      ! otherwise, iteratively shrink ncpu: mimic the most square process grid that would be
      ! created for ncpu processes and count the grid columns to which NUMROC assigns no
      ! matrix columns
      DO WHILE (ncpu > 1)
         gcd_max = -1
         DO ipe = 1, CEILING(SQRT(REAL(ncpu, dp)))
            jpe = ncpu/ipe
            IF (ipe*jpe .NE. ncpu) &
               CYCLE
            IF (gcd(ipe, jpe) >= gcd_max) THEN
               npcol = jpe
               gcd_max = gcd(ipe, jpe)
            END IF
         END DO

         ! count the grid columns that would end up without any matrix columns
         nzero = 0
         DO ipe = 0, npcol - 1
            IF (numroc(ncol_global, ncol_block, ipe, 0, npcol) == 0) &
               nzero = nzero + 1
         END DO

         IF (nzero == 0) &
            EXIT

         ncpu = ncpu - nzero
         ! keep the number of CPUs even
         ncpu = ncpu - MODULO(ncpu, 2)
      END DO

   END FUNCTION cp_fm_max_ncpu_non_zero_column
#endif
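   ! Illustration of the zero-column check in cp_fm_max_ncpu_non_zero_column (values purely
   ! illustrative): for ncol_global = 10 and ncol_block = 4 on a grid with npcol = 4 process
   ! columns, NUMROC assigns 4, 4, 2 and 0 columns to the four grid columns, so one grid column
   ! is empty and ncpu is reduced; a layout whose grid columns all receive at least one block
   ! (e.g. npcol = 3, giving 4, 4, 2) passes the check.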
   SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, &
                                       caller_is_elpa, redist_info)

      TYPE(cp_fm_type), INTENT(IN) :: matrix, eigenvectors
      TYPE(cp_fm_type), INTENT(OUT) :: matrix_new, eigenvectors_new
      LOGICAL, OPTIONAL, INTENT(IN) :: caller_is_elpa
      TYPE(cp_fm_redistribute_info), OPTIONAL, INTENT(OUT) :: redist_info

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_redistribute_start'

      INTEGER :: handle
      LOGICAL :: is_elpa
#if defined(__parallel)
      REAL(KIND=dp) :: fake_local_data(1, 1)
      INTEGER :: fake_descriptor(9), mepos_old, &
                 io_unit, ngroups, ncol_block, blksize, nrow_block
      TYPE(cp_fm_struct_type), POINTER :: fm_struct_new
      TYPE(mp_para_env_type), POINTER :: para_env
      TYPE(cp_fm_redistribute_info) :: rdinfo
      TYPE(cp_logger_type), POINTER :: logger
#endif

      CALL timeset(routineN, handle)

      is_elpa = .FALSE.
      IF (PRESENT(caller_is_elpa)) THEN
#if defined(__elpa)
         is_elpa = caller_is_elpa
#else
         CPABORT("CP2K compiled without the ELPA library.")
#endif
      END IF

#if defined(__parallel)
      logger => cp_get_default_logger()
      io_unit = cp_logger_get_default_io_unit(logger)

      para_env => matrix%matrix_struct%para_env
      mepos_old = para_env%mepos
      ! block sizes stay untouched (-1) unless ELPA forces a power-of-two block size below
      blksize = -1
      nrow_block = -1
      ncol_block = -1

      rdinfo%matrix_order = matrix%matrix_struct%nrow_global
      rdinfo%num_pe_old = para_env%num_pe
      rdinfo%num_pe_opt = cp_fm_diag_get_optimal_ncpu(rdinfo%matrix_order)
      rdinfo%num_pe_new = rdinfo%num_pe_opt
      rdinfo%num_pe_max_nz_col = -1
      rdinfo%redistribute = .FALSE.

      IF (is_elpa) THEN
         ! with ELPA there is no need to shrink the process count for performance reasons,
         ! so by default keep all processes
         rdinfo%num_pe_new = rdinfo%num_pe_old

         ! but ELPA fails if a process column owns no matrix columns
         rdinfo%num_pe_max_nz_col = cp_fm_max_ncpu_non_zero_column(matrix)

         ! redistribute to the ScaLAPACK-optimal number of CPUs only if the user forces it and it
         ! is safe; otherwise shrink only as far as needed to avoid empty process columns
         IF (work_redistribute%elpa_force_redistribute .AND. rdinfo%num_pe_opt < rdinfo%num_pe_max_nz_col) THEN
            rdinfo%num_pe_new = rdinfo%num_pe_opt
         ELSE IF (rdinfo%num_pe_old > rdinfo%num_pe_max_nz_col) THEN
            rdinfo%num_pe_new = rdinfo%num_pe_max_nz_col
         END IF

         ! ELPA also wants a power-of-two block size
         CALL cp_fm_get_info(matrix, ncol_block=ncol_block, nrow_block=nrow_block)
         blksize = 1
         DO WHILE (2*blksize <= MIN(nrow_block, ncol_block))
            blksize = blksize*2
         END DO
         nrow_block = blksize
         ncol_block = blksize
      END IF

      ! redistribute only if fewer processes will be used or the block size has to change
      rdinfo%redistribute = (rdinfo%num_pe_old > rdinfo%num_pe_new) .OR. (blksize >= 0 .AND. &
                            ((blksize /= matrix%matrix_struct%ncol_block) .OR. (blksize /= matrix%matrix_struct%nrow_block)))

      IF (work_redistribute%should_print .AND. io_unit > 0) THEN
         IF (is_elpa) THEN
            IF (work_redistribute%elpa_force_redistribute) THEN
               WRITE (unit=io_unit, fmt="(T2,A,T78,A3)") &
                  "CP_FM_DIAG| Force redistribute (ELPA):", "YES"
            ELSE
               WRITE (unit=io_unit, fmt="(T2,A,T79,A2)") &
                  "CP_FM_DIAG| Force redistribute (ELPA):", "NO"
            END IF
         END IF
         CALL rdinfo%write(io_unit)
      END IF

      IF (rdinfo%redistribute) THEN
         ! split the communicator: the first num_pe_new tasks will hold the redistributed matrix
         ALLOCATE (work_redistribute%group_distribution(0:rdinfo%num_pe_old - 1))
         ALLOCATE (work_redistribute%group_partition(0:1))
         work_redistribute%group_partition = (/rdinfo%num_pe_new, rdinfo%num_pe_old - rdinfo%num_pe_new/)
         ALLOCATE (work_redistribute%para_env_new)
         CALL work_redistribute%para_env_new%from_split( &
            comm=para_env, ngroups=ngroups, group_distribution=work_redistribute%group_distribution, &
            n_subgroups=2, group_partition=work_redistribute%group_partition)

         IF (work_redistribute%group_distribution(mepos_old) == 0) THEN
            ! this rank belongs to the working subgroup: create a BLACS environment and the
            ! new, smaller matrices on it
            NULLIFY (work_redistribute%blacs_env_new)
            CALL cp_blacs_env_create(blacs_env=work_redistribute%blacs_env_new, para_env=work_redistribute%para_env_new)

            NULLIFY (fm_struct_new)
            IF (nrow_block == -1 .OR. ncol_block == -1) THEN
               CALL cp_fm_struct_create(fmstruct=fm_struct_new, &
                                        para_env=work_redistribute%para_env_new, &
                                        context=work_redistribute%blacs_env_new, &
                                        nrow_global=rdinfo%matrix_order, ncol_global=rdinfo%matrix_order, &
                                        ncol_block=ncol_block, nrow_block=nrow_block)
            ELSE
               CALL cp_fm_struct_create(fmstruct=fm_struct_new, &
                                        para_env=work_redistribute%para_env_new, &
                                        context=work_redistribute%blacs_env_new, &
                                        nrow_global=rdinfo%matrix_order, ncol_global=rdinfo%matrix_order, &
                                        ncol_block=ncol_block, nrow_block=nrow_block, force_block=.TRUE.)
            END IF
            CALL cp_fm_create(matrix_new, matrix_struct=fm_struct_new, name="yevd_new_mat")
            CALL cp_fm_create(eigenvectors_new, matrix_struct=fm_struct_new, name="yevd_new_vec")
            CALL cp_fm_struct_release(fm_struct_new)

            ! copy the matrix to the new distribution
            CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
                          matrix%matrix_struct%descriptor, &
                          matrix_new%local_data(1, 1), 1, 1, matrix_new%matrix_struct%descriptor, &
                          matrix%matrix_struct%context)
         ELSE
            ! ranks outside the working subgroup still own part of the data and must take part in
            ! the transfer; they pass a fake descriptor (all entries -1) and dummy local data
            fake_descriptor = -1
            CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
                          matrix%matrix_struct%descriptor, &
                          fake_local_data(1, 1), 1, 1, fake_descriptor, &
                          matrix%matrix_struct%context)
         END IF
      ELSE
         ! no redistribution needed, simply pass the original matrices along
         matrix_new = matrix
         eigenvectors_new = eigenvectors
      END IF

      IF (PRESENT(redist_info)) &
         redist_info = rdinfo
#else

      MARK_USED(matrix)
      MARK_USED(eigenvectors)
      MARK_USED(matrix_new)
      MARK_USED(eigenvectors_new)
      MARK_USED(redist_info)
      CPABORT("Routine called in non-parallel case.")
#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_redistribute_start
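   ! A small worked illustration of the communicator split in cp_fm_redistribute_start (numbers
   ! are purely illustrative): with rdinfo%num_pe_old = 16 and rdinfo%num_pe_new = 10,
   ! group_partition = (/10, 6/) divides the 16 ranks of para_env into two subgroups; after
   ! from_split(), group_distribution(mepos_old) is 0 for the 10 ranks that will hold the
   ! redistributed matrix and 1 for the remaining 6, which only assist the pdgemr2d transfer
   ! through the fake descriptor and otherwise skip the diagonalization.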
   SUBROUTINE cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)

      TYPE(cp_fm_type), INTENT(IN) :: matrix, eigenvectors
      REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: eig
      TYPE(cp_fm_type), INTENT(INOUT) :: matrix_new, eigenvectors_new

      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_redistribute_end'

      INTEGER :: handle
#if defined(__parallel)
      REAL(KIND=dp) :: fake_local_data(1, 1)
      INTEGER :: fake_descriptor(9), mepos_old, n
      TYPE(mp_para_env_type), POINTER :: para_env
#endif

      CALL timeset(routineN, handle)

#if defined(__parallel)
      ! only do something if the matrix was actually redistributed in cp_fm_redistribute_start
      IF (ASSOCIATED(work_redistribute%group_distribution)) THEN
         n = matrix%matrix_struct%nrow_global
         para_env => matrix%matrix_struct%para_env
         mepos_old = para_env%mepos

         IF (work_redistribute%group_distribution(mepos_old) == 0) THEN
            ! copy the eigenvectors back to the original distribution and release the work matrices
            CALL pdgemr2d(n, n, eigenvectors_new%local_data(1, 1), 1, 1, eigenvectors_new%matrix_struct%descriptor, &
                          eigenvectors%local_data(1, 1), 1, 1, eigenvectors%matrix_struct%descriptor, &
                          eigenvectors%matrix_struct%context)
            CALL cp_fm_release(matrix_new)
            CALL cp_fm_release(eigenvectors_new)
         ELSE
            ! ranks outside the working subgroup receive their share through a fake source descriptor
            fake_descriptor = -1
            CALL pdgemr2d(n, n, fake_local_data(1, 1), 1, 1, fake_descriptor, &
                          eigenvectors%local_data(1, 1), 1, 1, eigenvectors%matrix_struct%descriptor, &
                          eigenvectors%matrix_struct%context)
         END IF
         CALL cp_fm_redistribute_work_finalize(work_redistribute%group_distribution(mepos_old) == 0)

         ! the eigenvalues were computed only on the working subgroup, broadcast them to everyone
         CALL para_env%bcast(eig, 0)
      END IF
#else

      MARK_USED(matrix)
      MARK_USED(eigenvectors)
      MARK_USED(eig)
      MARK_USED(matrix_new)
      MARK_USED(eigenvectors_new)
      CPABORT("Routine called in non-parallel case.")
#endif

      CALL timestop(handle)

   END SUBROUTINE cp_fm_redistribute_end
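   ! A minimal usage sketch of the pair of routines above (the caller-side variable names and the
   ! solver call are hypothetical; the real call sites are CP2K's diagonalization wrappers):
   !
   !    CALL cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, &
   !                                  caller_is_elpa=.TRUE.)
   !    IF (ASSOCIATED(matrix_new%matrix_struct)) THEN
   !       ! only ranks that received a piece of the redistributed matrix run the eigensolver
   !       CALL some_eigensolver(matrix_new, eigenvectors_new, eig)
   !    END IF
   !    CALL cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)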