37 USE omp_lib,
ONLY: omp_get_max_threads
39#include "../base/base_uses.f90"
42 USE elpa_constants,
ONLY: elpa_2stage_real_invalid, &
43 elpa_2stage_real_default, &
44 elpa_2stage_real_generic, &
45 elpa_2stage_real_generic_simple, &
46 elpa_2stage_real_bgp, &
47 elpa_2stage_real_bgq, &
48 elpa_2stage_real_sse_assembly, &
49 elpa_2stage_real_sse_block2, &
50 elpa_2stage_real_sse_block4, &
51 elpa_2stage_real_sse_block6, &
52 elpa_2stage_real_avx_block2, &
53 elpa_2stage_real_avx_block4, &
54 elpa_2stage_real_avx_block6, &
55 elpa_2stage_real_avx2_block2, &
56 elpa_2stage_real_avx2_block4, &
57 elpa_2stage_real_avx2_block6, &
58 elpa_2stage_real_avx512_block2, &
59 elpa_2stage_real_avx512_block4, &
60 elpa_2stage_real_avx512_block6, &
61 elpa_2stage_real_nvidia_gpu, &
62 elpa_2stage_real_amd_gpu, &
63 elpa_2stage_real_intel_gpu_sycl
65 USE elpa,
ONLY: elpa_t, elpa_solver_2stage, &
66 elpa_init, elpa_uninit, &
67 elpa_allocate, elpa_deallocate, elpa_ok
74 CHARACTER(len=*),
PARAMETER,
PRIVATE :: moduleN =
'cp_fm_elpa'
77 INTEGER,
DIMENSION(21),
PARAMETER :: elpa_kernel_ids = [ &
78 elpa_2stage_real_invalid, &
79 elpa_2stage_real_generic, &
80 elpa_2stage_real_generic_simple, &
81 elpa_2stage_real_bgp, &
82 elpa_2stage_real_bgq, &
83 elpa_2stage_real_sse_assembly, &
84 elpa_2stage_real_sse_block2, &
85 elpa_2stage_real_sse_block4, &
86 elpa_2stage_real_sse_block6, &
87 elpa_2stage_real_avx_block2, &
88 elpa_2stage_real_avx_block4, &
89 elpa_2stage_real_avx_block6, &
90 elpa_2stage_real_avx2_block2, &
91 elpa_2stage_real_avx2_block4, &
92 elpa_2stage_real_avx2_block6, &
93 elpa_2stage_real_avx512_block2, &
94 elpa_2stage_real_avx512_block4, &
95 elpa_2stage_real_avx512_block6, &
96 elpa_2stage_real_nvidia_gpu, &
97 elpa_2stage_real_amd_gpu, &
98 elpa_2stage_real_intel_gpu_sycl]
100 CHARACTER(len=14),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
101 elpa_kernel_names = [character(len=14) :: &
124 CHARACTER(len=44),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
125 elpa_kernel_descriptions = [character(len=44) :: &
126 "Automatically selected kernel", &
128 "Simplified generic kernel", &
129 "Kernel optimized for IBM BGP", &
130 "Kernel optimized for IBM BGQ", &
131 "Kernel optimized for x86_64/SSE", &
132 "Kernel optimized for x86_64/SSE (block=2)", &
133 "Kernel optimized for x86_64/SSE (block=4)", &
134 "Kernel optimized for x86_64/SSE (block=6)", &
135 "Kernel optimized for Intel AVX (block=2)", &
136 "Kernel optimized for Intel AVX (block=4)", &
137 "Kernel optimized for Intel AVX (block=6)", &
138 "Kernel optimized for Intel AVX2 (block=2)", &
139 "Kernel optimized for Intel AVX2 (block=4)", &
140 "Kernel optimized for Intel AVX2 (block=6)", &
141 "Kernel optimized for Intel AVX-512 (block=2)", &
142 "Kernel optimized for Intel AVX-512 (block=4)", &
143 "Kernel optimized for Intel AVX-512 (block=6)", &
144 "Kernel targeting Nvidia GPUs", &
145 "Kernel targeting AMD GPUs", &
146 "Kernel targeting Intel GPUs"]
149 INTEGER,
DIMENSION(1),
PARAMETER :: elpa_kernel_ids = [-1]
150 CHARACTER(len=14),
DIMENSION(1),
PARAMETER :: elpa_kernel_names = [
"AUTO"]
151 CHARACTER(len=44),
DIMENSION(1),
PARAMETER :: elpa_kernel_descriptions = [
"Automatically selected kernel"]
155 INTEGER,
SAVE :: elpa_kernel = elpa_kernel_ids(1)
157 LOGICAL,
SAVE :: elpa_qr = .false., &
158 elpa_qr_unsafe = .false., &
159 elpa_should_print = .false.
167 elpa_kernel_descriptions, &
178 IF (elpa_init(20180525) /= elpa_ok) &
179 cpabort(
"The linked ELPA library does not support the required API version")
181 cpabort(
"Initialization of ELPA library requested but not enabled during build")
192 cpabort(
"Finalization of ELPA library requested but not enabled during build")
201 INTEGER,
INTENT(IN) :: requested_kernel
206 elpa_kernel = requested_kernel
209 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
214 elpa_kernel = elpa_2stage_real_sse_block4
216 elpa_kernel = elpa_2stage_real_avx_block4
218 elpa_kernel = elpa_2stage_real_avx2_block4
220 elpa_kernel = elpa_2stage_real_avx512_block4
225#if defined (__ELPA_NVIDIA_GPU)
226 elpa_kernel = elpa_2stage_real_nvidia_gpu
228#if defined (__ELPA_AMD_GPU)
229 elpa_kernel = elpa_2stage_real_amd_gpu
231#if defined (__ELPA_INTEL_GPU)
232 elpa_kernel = elpa_2stage_real_intel_gpu_sycl
236 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
237 elpa_kernel = elpa_2stage_real_default
241 mark_used(requested_kernel)
254 LOGICAL,
INTENT(IN) :: use_qr, use_qr_unsafe
257 elpa_qr_unsafe = use_qr_unsafe
266 LOGICAL,
INTENT(IN) :: flag
268 elpa_should_print = flag
278 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
279 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
282 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa'
285 TYPE(
cp_fm_type) :: eigenvectors_new, matrix_new
288 CALL timeset(routinen, handle)
296 caller_is_elpa=.true., redist_info=rdinfo)
299 IF (
ASSOCIATED(matrix_new%matrix_struct)) &
300 CALL cp_fm_diag_elpa_base(matrix_new, eigenvectors_new, eigenvalues, rdinfo)
305 CALL timestop(handle)
308 mark_used(eigenvectors)
309 mark_used(eigenvalues)
311 cpabort(
"CP2K compiled without the ELPA library.")
323 SUBROUTINE cp_fm_diag_elpa_base(matrix, eigenvectors, eigenvalues, rdinfo)
325 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
326 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
329 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa_base'
333 CLASS(elpa_t),
POINTER :: elpa_obj
334 CHARACTER(len=default_string_length) :: kernel_name
339 nblk, neig, io_unit, &
341 LOGICAL :: use_qr, check_eigenvalues
342 REAL(kind=
dp),
DIMENSION(:),
ALLOCATABLE :: eval, eval_noqr
344 TYPE(
cp_fm_type) :: matrix_noqr, eigenvectors_noqr
346 REAL(kind=
dp),
PARAMETER :: th = 1.0e-14_dp
347 INTEGER,
DIMENSION(:),
POINTER :: ncol_locals
349 CALL timeset(routinen, handle)
351 NULLIFY (ncol_locals)
353 check_eigenvalues = .false.
358 n = matrix%matrix_struct%nrow_global
359 context => matrix%matrix_struct%context
360 group = matrix%matrix_struct%para_env
362 myprow = context%mepos(1)
363 mypcol = context%mepos(2)
369 local_leading_dimension=n_rows, &
372 ncol_locals=ncol_locals)
375 IF (io_unit > 0 .AND. any(ncol_locals == 0))
THEN
376 CALL rdinfo%write(io_unit)
378 cpabort(
"ELPA [pre-fail]: Problem contains processor column with zero width.")
381 neig =
SIZE(eigenvalues, 1)
389 use_qr = elpa_qr .AND. (
modulo(n, 2) .EQ. 0)
391 IF (.NOT. elpa_qr_unsafe) &
392 use_qr = use_qr .AND. (n .GE. 64) .AND. (nblk .GE. 64)
395 IF (use_qr .AND. elpa_qr_unsafe .AND. elpa_should_print) &
396 check_eigenvalues = .true.
398 CALL matrix%matrix_struct%para_env%bcast(check_eigenvalues)
400 IF (check_eigenvalues)
THEN
402 ALLOCATE (eval_noqr(n))
403 CALL cp_fm_create(matrix=matrix_noqr, matrix_struct=matrix%matrix_struct)
405 CALL cp_fm_create(matrix=eigenvectors_noqr, matrix_struct=eigenvectors%matrix_struct)
409 IF (io_unit > 0 .AND. elpa_should_print)
THEN
410 WRITE (unit=io_unit, fmt=
"(/,T2,A)") &
411 "ELPA| Matrix diagonalization information"
415 kernel_name =
"id: "//trim(adjustl(
cp_to_string(elpa_kernel)))
416 DO i = 1,
SIZE(elpa_kernel_ids)
417 IF (elpa_kernel_ids(i) == elpa_kernel)
THEN
418 kernel_name = elpa_kernel_names(i)
422 WRITE (unit=io_unit, fmt=
"(T2,A,T71,I10)") &
423 "ELPA| Matrix order (NA) ", n, &
424 "ELPA| Matrix block size (NBLK) ", nblk, &
425 "ELPA| Number of eigenvectors (NEV) ", neig, &
426 "ELPA| Local rows (LOCAL_NROWS) ", n_rows, &
427 "ELPA| Local columns (LOCAL_NCOLS) ", n_cols
428 WRITE (unit=io_unit, fmt=
"(T2,A,T61,A20)") &
429 "ELPA| Kernel ", adjustr(trim(kernel_name))
431 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
432 "ELPA| QR step requested ",
"YES"
434 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
435 "ELPA| QR step requested ",
"NO"
439 IF (elpa_qr_unsafe)
THEN
440 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
441 "ELPA| Use potentially unsafe QR ",
"YES"
443 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
444 "ELPA| Use potentially unsafe QR ",
"NO"
447 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
448 "ELPA| Matrix is suitable for QR ",
"YES"
450 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
451 "ELPA| Matrix is suitable for QR ",
"NO"
453 IF (.NOT. use_qr)
THEN
454 IF (
modulo(n, 2) /= 0)
THEN
455 WRITE (unit=io_unit, fmt=
"(T2,A)") &
456 "ELPA| Matrix order is NOT even"
458 IF ((nblk < 64) .AND. (.NOT. elpa_qr_unsafe))
THEN
459 WRITE (unit=io_unit, fmt=
"(T2,A)") &
460 "ELPA| Matrix block size is NOT 64 or greater"
463 IF ((nblk < 64) .AND. elpa_qr_unsafe)
THEN
464 WRITE (unit=io_unit, fmt=
"(T2,A)") &
465 "ELPA| Matrix block size check was bypassed"
474 elpa_obj => elpa_allocate()
476 CALL elpa_obj%set(
"na", n, success)
477 cpassert(success == elpa_ok)
479 CALL elpa_obj%set(
"nev", neig, success)
480 cpassert(success == elpa_ok)
482 CALL elpa_obj%set(
"local_nrows", n_rows, success)
483 cpassert(success == elpa_ok)
485 CALL elpa_obj%set(
"local_ncols", n_cols, success)
486 cpassert(success == elpa_ok)
488 CALL elpa_obj%set(
"nblk", nblk, success)
489 cpassert(success == elpa_ok)
491 CALL elpa_obj%set(
"mpi_comm_parent", group%get_handle(), success)
492 cpassert(success == elpa_ok)
494 CALL elpa_obj%set(
"process_row", myprow, success)
495 cpassert(success == elpa_ok)
497 CALL elpa_obj%set(
"process_col", mypcol, success)
498 cpassert(success == elpa_ok)
500 success = elpa_obj%setup()
501 cpassert(success == elpa_ok)
503 CALL elpa_obj%set(
"solver", elpa_solver_2stage, success)
504 cpassert(success == elpa_ok)
507 IF (elpa_kernel == elpa_2stage_real_nvidia_gpu)
THEN
508 CALL elpa_obj%set(
"nvidia-gpu", 1, success)
509 cpassert(success == elpa_ok)
511 IF (elpa_kernel == elpa_2stage_real_amd_gpu)
THEN
512 CALL elpa_obj%set(
"amd-gpu", 1, success)
513 cpassert(success == elpa_ok)
515 IF (elpa_kernel == elpa_2stage_real_intel_gpu_sycl)
THEN
516 CALL elpa_obj%set(
"intel-gpu", 1, success)
517 cpassert(success == elpa_ok)
520 CALL elpa_obj%set(
"real_kernel", elpa_kernel, success)
521 cpwarn_if(success /= elpa_ok,
"Setting real_kernel for ELPA failed")
524 CALL elpa_obj%set(
"qr", 1, success)
525 cpassert(success == elpa_ok)
529 IF (elpa_obj%can_set(
"omp_threads", omp_get_max_threads()) == elpa_ok)
THEN
530 CALL elpa_obj%set(
"omp_threads", omp_get_max_threads(), success)
531 cpassert(success == elpa_ok)
534 CALL elpa_obj%eigenvectors(matrix%local_data, eval, eigenvectors%local_data, success)
535 IF (success /= elpa_ok) &
536 cpabort(
"ELPA failed to diagonalize a matrix")
538 IF (check_eigenvalues)
THEN
540 CALL elpa_obj%set(
"qr", 0, success)
541 cpassert(success == elpa_ok)
543 CALL elpa_obj%eigenvectors(matrix_noqr%local_data, eval_noqr, eigenvectors_noqr%local_data, success)
544 IF (success /= elpa_ok) &
545 cpabort(
"ELPA failed to diagonalize a matrix even without QR decomposition")
547 IF (any(abs(eval(1:neig) - eval_noqr(1:neig)) .GT. th)) &
548 cpabort(
"Eigenvalues calculated with QR decomp. in ELPA are wrong. Disable ELPA_QR_UNSAFE.")
550 DEALLOCATE (eval_noqr)
555 CALL elpa_deallocate(elpa_obj, success)
556 cpassert(success == elpa_ok)
558 eigenvalues(1:neig) = eval(1:neig)
561 CALL timestop(handle)
563 END SUBROUTINE cp_fm_diag_elpa_base
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
methods related to the blacs parallel environment
basic linear algebra operations for full matrices
subroutine, public cp_fm_uplo_to_full(matrix, work, uplo)
given a triangular matrix according to uplo, computes the corresponding full matrix
Auxiliary tools to redistribute cp_fm_type matrices before and after diagonalization....
subroutine, public cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)
Redistributes eigenvectors and eigenvalues back to the original communicator group.
subroutine, public cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, caller_is_elpa, redist_info)
Determines the optimal number of CPUs for matrix diagonalization and redistributes the input matrices...
subroutine, public set_elpa_print(flag)
Sets a flag that determines if additional information about the ELPA diagonalization should be printe...
subroutine, public set_elpa_kernel(requested_kernel)
Sets the active ELPA kernel.
subroutine, public cp_fm_diag_elpa(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the ELPA library.
subroutine, public set_elpa_qr(use_qr, use_qr_unsafe)
Sets flags that determines if ELPA should try to use QR during diagonalization If use_qr = ....
subroutine, public initialize_elpa_library()
Initialize the ELPA library.
subroutine, public finalize_elpa_library()
Finalize the ELPA library.
represent the structure of a full matrix
subroutine, public cp_fm_struct_get(fmstruct, para_env, context, descriptor, ncol_block, nrow_block, nrow_global, ncol_global, first_p_pos, row_indices, col_indices, nrow_local, ncol_local, nrow_locals, ncol_locals, local_leading_dimension)
returns the values of various attributes of the matrix structure
represent a full matrix distributed on many processors
subroutine, public cp_fm_write_info(matrix, io_unit)
Write nicely formatted info about the FM to the given I/O unit (including the underlying FM struct)
subroutine, public cp_fm_create(matrix, matrix_struct, name, use_sp)
creates a new full matrix with the given structure
various routines to log and control the output. The idea is that decisions about where to log should ...
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
type(cp_logger_type) function, pointer, public cp_get_default_logger()
returns the default logger
Defines the basic variable types.
integer, parameter, public dp
integer, parameter, public default_string_length
Machine interface based on Fortran 2003 and POSIX.
integer, parameter, public machine_x86_avx
integer, parameter, public machine_x86_sse4
integer, parameter, public machine_cpu_generic
integer, parameter, public machine_x86_avx2
pure integer function, public m_cpuid()
Target architecture or instruction set extension according to CPU-check at runtime.
integer, parameter, public machine_x86
Interface to the message passing library MPI.
represent a blacs multidimensional parallel environment (for the mpi corrispective see cp_paratypes/m...
type of a logger, at the moment it contains just a print level starting at which level it should be l...