36 USE omp_lib,
ONLY: omp_get_max_threads
37#if defined(__HAS_IEEE_EXCEPTIONS)
38 USE ieee_exceptions,
ONLY: ieee_get_halting_mode, &
39 ieee_set_halting_mode, &
42#include "../base/base_uses.f90"
45 USE elpa_constants,
ONLY: elpa_solver_1stage, elpa_solver_2stage, elpa_ok, &
46 elpa_2stage_real_invalid, &
47 elpa_2stage_real_default, &
48 elpa_2stage_real_generic, &
49 elpa_2stage_real_generic_simple, &
50 elpa_2stage_real_bgp, &
51 elpa_2stage_real_bgq, &
52 elpa_2stage_real_sse_assembly, &
53 elpa_2stage_real_sse_block2, &
54 elpa_2stage_real_sse_block4, &
55 elpa_2stage_real_sse_block6, &
56 elpa_2stage_real_avx_block2, &
57 elpa_2stage_real_avx_block4, &
58 elpa_2stage_real_avx_block6, &
59 elpa_2stage_real_avx2_block2, &
60 elpa_2stage_real_avx2_block4, &
61 elpa_2stage_real_avx2_block6, &
62 elpa_2stage_real_avx512_block2, &
63 elpa_2stage_real_avx512_block4, &
64 elpa_2stage_real_avx512_block6, &
65 elpa_2stage_real_nvidia_gpu, &
66 elpa_2stage_real_amd_gpu, &
67 elpa_2stage_real_intel_gpu_sycl
69 USE elpa,
ONLY: elpa_t, elpa_init, elpa_uninit, &
70 elpa_allocate, elpa_deallocate
77 CHARACTER(len=*),
PARAMETER,
PRIVATE :: moduleN =
'cp_fm_elpa'
80 INTEGER,
DIMENSION(21),
PARAMETER :: elpa_kernel_ids = [ &
81 elpa_2stage_real_invalid, &
82 elpa_2stage_real_generic, &
83 elpa_2stage_real_generic_simple, &
84 elpa_2stage_real_bgp, &
85 elpa_2stage_real_bgq, &
86 elpa_2stage_real_sse_assembly, &
87 elpa_2stage_real_sse_block2, &
88 elpa_2stage_real_sse_block4, &
89 elpa_2stage_real_sse_block6, &
90 elpa_2stage_real_avx_block2, &
91 elpa_2stage_real_avx_block4, &
92 elpa_2stage_real_avx_block6, &
93 elpa_2stage_real_avx2_block2, &
94 elpa_2stage_real_avx2_block4, &
95 elpa_2stage_real_avx2_block6, &
96 elpa_2stage_real_avx512_block2, &
97 elpa_2stage_real_avx512_block4, &
98 elpa_2stage_real_avx512_block6, &
99 elpa_2stage_real_nvidia_gpu, &
100 elpa_2stage_real_amd_gpu, &
101 elpa_2stage_real_intel_gpu_sycl]
103 CHARACTER(len=14),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
104 elpa_kernel_names = [character(len=14) :: &
127 CHARACTER(len=44),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
128 elpa_kernel_descriptions = [character(len=44) :: &
129 "Automatically selected kernel", &
131 "Simplified generic kernel", &
132 "Kernel optimized for IBM BGP", &
133 "Kernel optimized for IBM BGQ", &
134 "Kernel optimized for x86_64/SSE", &
135 "Kernel optimized for x86_64/SSE (block=2)", &
136 "Kernel optimized for x86_64/SSE (block=4)", &
137 "Kernel optimized for x86_64/SSE (block=6)", &
138 "Kernel optimized for Intel AVX (block=2)", &
139 "Kernel optimized for Intel AVX (block=4)", &
140 "Kernel optimized for Intel AVX (block=6)", &
141 "Kernel optimized for Intel AVX2 (block=2)", &
142 "Kernel optimized for Intel AVX2 (block=4)", &
143 "Kernel optimized for Intel AVX2 (block=6)", &
144 "Kernel optimized for Intel AVX-512 (block=2)", &
145 "Kernel optimized for Intel AVX-512 (block=4)", &
146 "Kernel optimized for Intel AVX-512 (block=6)", &
147 "Kernel targeting Nvidia GPUs", &
148 "Kernel targeting AMD GPUs", &
149 "Kernel targeting Intel GPUs"]
151 INTEGER,
DIMENSION(1),
PARAMETER :: elpa_kernel_ids = [-1]
152 CHARACTER(len=14),
DIMENSION(1),
PARAMETER :: elpa_kernel_names = [
"AUTO"]
153 CHARACTER(len=44),
DIMENSION(1),
PARAMETER :: elpa_kernel_descriptions = [
"Automatically selected kernel"]
157 INTEGER,
SAVE :: elpa_kernel = elpa_kernel_ids(1)
161 LOGICAL,
SAVE :: elpa_qr_unsafe = .true., &
165#if defined(__OFFLOAD_OPENCL)
178 elpa_kernel_descriptions, &
192 LOGICAL,
INTENT(IN),
OPTIONAL :: one_stage, qr, should_print
195 IF (elpa_init(20180525) /= elpa_ok) &
196 cpabort(
"The linked ELPA library does not support the required API version")
198 IF (
PRESENT(should_print))
elpa_print = should_print
203 mark_used(should_print)
204 cpabort(
"Initialization of ELPA library requested but not enabled during build")
215 cpabort(
"Finalization of ELPA library requested but not enabled during build")
224 INTEGER,
INTENT(IN) :: requested_kernel
229 elpa_kernel = requested_kernel
232 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
237 elpa_kernel = elpa_2stage_real_sse_block4
239 elpa_kernel = elpa_2stage_real_avx_block4
241 elpa_kernel = elpa_2stage_real_avx2_block4
243 elpa_kernel = elpa_2stage_real_avx512_block4
248#if !defined(__NO_OFFLOAD_ELPA)
249#if defined(__OFFLOAD_CUDA)
250 elpa_kernel = elpa_2stage_real_nvidia_gpu
252#if defined(__OFFLOAD_HIP)
253 elpa_kernel = elpa_2stage_real_amd_gpu
255#if defined(__OFFLOAD_OPENCL)
256 elpa_kernel = elpa_2stage_real_intel_gpu_sycl
260 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
261 elpa_kernel = elpa_2stage_real_default
265 mark_used(requested_kernel)
276 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
277 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
280 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa'
283 TYPE(
cp_fm_type) :: eigenvectors_new, matrix_new
286 CALL timeset(routinen, handle)
294 caller_is_elpa=.true., redist_info=rdinfo)
297 IF (
ASSOCIATED(matrix_new%matrix_struct)) &
298 CALL cp_fm_diag_elpa_base(matrix_new, eigenvectors_new, eigenvalues, rdinfo)
303 CALL timestop(handle)
307 mark_used(eigenvectors)
309 cpabort(
"CP2K compiled without the ELPA library.")
321 SUBROUTINE cp_fm_diag_elpa_base(matrix, eigenvectors, eigenvalues, rdinfo)
323 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
324 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
327 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa_base'
331 CLASS(elpa_t),
POINTER :: elpa_obj
332 CHARACTER(len=default_string_length) :: kernel_name
337 nblk, neig, io_unit, &
339 LOGICAL :: use_qr, check_eigenvalues
340 REAL(kind=
dp),
DIMENSION(:),
ALLOCATABLE :: eval, eval_noqr
342 TYPE(
cp_fm_type) :: matrix_noqr, eigenvectors_noqr
344 REAL(kind=
dp),
PARAMETER :: th = 1.0e-14_dp
345 INTEGER,
DIMENSION(:),
POINTER :: ncol_locals
346#if defined(__HAS_IEEE_EXCEPTIONS)
347 LOGICAL,
DIMENSION(5) :: halt
350 CALL timeset(routinen, handle)
352 NULLIFY (ncol_locals)
354 check_eigenvalues = .false.
359 n = matrix%matrix_struct%nrow_global
360 context => matrix%matrix_struct%context
361 group = matrix%matrix_struct%para_env
363 myprow = context%mepos(1)
364 mypcol = context%mepos(2)
370 local_leading_dimension=n_rows, &
373 ncol_locals=ncol_locals)
376 IF (io_unit > 0 .AND. any(ncol_locals == 0))
THEN
377 CALL rdinfo%write(io_unit)
379 cpabort(
"ELPA [pre-fail]: Problem contains processor column with zero width.")
382 neig =
SIZE(eigenvalues, 1)
392 IF (.NOT. elpa_qr_unsafe) &
393 use_qr = use_qr .AND. (n >= 64) .AND. (nblk >= 64)
396 IF (use_qr .AND. elpa_qr_unsafe .AND.
elpa_print) &
397 check_eigenvalues = .true.
399 CALL matrix%matrix_struct%para_env%bcast(check_eigenvalues)
401 IF (check_eigenvalues)
THEN
403 ALLOCATE (eval_noqr(n))
404 CALL cp_fm_create(matrix=matrix_noqr, matrix_struct=matrix%matrix_struct)
406 CALL cp_fm_create(matrix=eigenvectors_noqr, matrix_struct=eigenvectors%matrix_struct)
411 WRITE (unit=io_unit, fmt=
"(/,T2,A)") &
412 "ELPA| Matrix diagonalization information"
416 kernel_name =
"id: "//trim(adjustl(
cp_to_string(elpa_kernel)))
417 DO i = 1,
SIZE(elpa_kernel_ids)
418 IF (elpa_kernel_ids(i) == elpa_kernel)
THEN
419 kernel_name = elpa_kernel_names(i)
423 WRITE (unit=io_unit, fmt=
"(T2,A,T71,I10)") &
424 "ELPA| Matrix order (NA) ", n, &
425 "ELPA| Matrix block size (NBLK) ", nblk, &
426 "ELPA| Number of eigenvectors (NEV) ", neig, &
427 "ELPA| Local rows (LOCAL_NROWS) ", n_rows, &
428 "ELPA| Local columns (LOCAL_NCOLS) ", n_cols
429 WRITE (unit=io_unit, fmt=
"(T2,A,T61,A20)") &
430 "ELPA| Kernel ", adjustr(trim(kernel_name))
432 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
433 "ELPA| QR step requested ",
"YES"
435 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
436 "ELPA| QR step requested ",
"NO"
441 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
442 "ELPA| Matrix is suitable for QR ",
"YES"
444 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
445 "ELPA| Matrix is suitable for QR ",
"NO"
447 IF (.NOT. use_qr)
THEN
448 IF (
modulo(n, 2) /= 0)
THEN
449 WRITE (unit=io_unit, fmt=
"(T2,A)") &
450 "ELPA| Matrix order is NOT even"
452 IF ((nblk < 64) .AND. (.NOT. elpa_qr_unsafe))
THEN
453 WRITE (unit=io_unit, fmt=
"(T2,A)") &
454 "ELPA| Matrix block size is NOT 64 or greater"
457 IF ((nblk < 64) .AND. elpa_qr_unsafe)
THEN
458 WRITE (unit=io_unit, fmt=
"(T2,A)") &
459 "ELPA| Matrix block size check was bypassed"
468 elpa_obj => elpa_allocate()
470 CALL elpa_obj%set(
"na", n, success)
471 cpassert(success == elpa_ok)
473 CALL elpa_obj%set(
"nev", neig, success)
474 cpassert(success == elpa_ok)
476 CALL elpa_obj%set(
"local_nrows", n_rows, success)
477 cpassert(success == elpa_ok)
479 CALL elpa_obj%set(
"local_ncols", n_cols, success)
480 cpassert(success == elpa_ok)
482 CALL elpa_obj%set(
"nblk", nblk, success)
483 cpassert(success == elpa_ok)
485 CALL elpa_obj%set(
"mpi_comm_parent", group%get_handle(), success)
486 cpassert(success == elpa_ok)
488 CALL elpa_obj%set(
"process_row", myprow, success)
489 cpassert(success == elpa_ok)
491 CALL elpa_obj%set(
"process_col", mypcol, success)
492 cpassert(success == elpa_ok)
494 success = elpa_obj%setup()
495 cpassert(success == elpa_ok)
497 CALL elpa_obj%set(
"solver", &
500 IF (success /= elpa_ok) &
501 cpabort(
"Setting solver for ELPA failed")
504 SELECT CASE (elpa_kernel)
505 CASE (elpa_2stage_real_nvidia_gpu)
506 CALL elpa_obj%set(
"nvidia-gpu", 1, success)
507 cpassert(success == elpa_ok)
508 CASE (elpa_2stage_real_amd_gpu)
509 CALL elpa_obj%set(
"amd-gpu", 1, success)
510 cpassert(success == elpa_ok)
511 CASE (elpa_2stage_real_intel_gpu_sycl)
512 CALL elpa_obj%set(
"intel-gpu", 1, success)
513 cpassert(success == elpa_ok)
517 CALL elpa_obj%set(
"real_kernel", elpa_kernel, success)
518 cpwarn_if(success /= elpa_ok,
"Setting real_kernel for ELPA failed")
521 CALL elpa_obj%set(
"qr", 1, success)
522 cpassert(success == elpa_ok)
527 IF (elpa_obj%can_set(
"omp_threads", omp_get_max_threads()) == elpa_ok)
THEN
528 CALL elpa_obj%set(
"omp_threads", omp_get_max_threads(), success)
529 cpassert(success == elpa_ok)
533#if defined(__HAS_IEEE_EXCEPTIONS)
534 CALL ieee_get_halting_mode(ieee_all, halt)
535 CALL ieee_set_halting_mode(ieee_all, .false.)
537 CALL elpa_obj%eigenvectors(matrix%local_data, eval, eigenvectors%local_data, success)
538#if defined(__HAS_IEEE_EXCEPTIONS)
539 CALL ieee_set_halting_mode(ieee_all, halt)
542 IF (success /= elpa_ok) &
543 cpabort(
"ELPA failed to diagonalize a matrix")
545 IF (check_eigenvalues)
THEN
547 CALL elpa_obj%set(
"qr", 0, success)
548 cpassert(success == elpa_ok)
550 CALL elpa_obj%eigenvectors(matrix_noqr%local_data, eval_noqr, eigenvectors_noqr%local_data, success)
551 IF (success /= elpa_ok) &
552 cpabort(
"ELPA failed to diagonalize a matrix even without QR decomposition")
554 IF (any(abs(eval(1:neig) - eval_noqr(1:neig)) > th)) &
555 cpabort(
"ELPA failed to calculate Eigenvalues with ELPA's QR decomposition")
557 DEALLOCATE (eval_noqr)
562 CALL elpa_deallocate(elpa_obj, success)
563 cpassert(success == elpa_ok)
565 eigenvalues(1:neig) = eval(1:neig)
568 CALL timestop(handle)
570 END SUBROUTINE cp_fm_diag_elpa_base
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
methods related to the blacs parallel environment
Basic linear algebra operations for full matrices.
subroutine, public cp_fm_uplo_to_full(matrix, work, uplo)
given a triangular matrix according to uplo, computes the corresponding full matrix
Auxiliary tools to redistribute cp_fm_type matrices before and after diagonalization....
subroutine, public cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)
Redistributes eigenvectors and eigenvalues back to the original communicator group.
subroutine, public cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, caller_is_elpa, redist_info)
Determines the optimal number of CPUs for matrix diagonalization and redistributes the input matrices...
subroutine, public set_elpa_kernel(requested_kernel)
Sets the active ELPA kernel.
logical, save, public elpa_qr
subroutine, public cp_fm_diag_elpa(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the ELPA library.
logical, save, public elpa_print
subroutine, public finalize_elpa_library()
Finalize the ELPA library.
logical, save, public elpa_one_stage
subroutine, public initialize_elpa_library(one_stage, qr, should_print)
Initialize the ELPA library.
represent the structure of a full matrix
subroutine, public cp_fm_struct_get(fmstruct, para_env, context, descriptor, ncol_block, nrow_block, nrow_global, ncol_global, first_p_pos, row_indices, col_indices, nrow_local, ncol_local, nrow_locals, ncol_locals, local_leading_dimension)
returns the values of various attributes of the matrix structure
represent a full matrix distributed on many processors
subroutine, public cp_fm_create(matrix, matrix_struct, name, use_sp, nrow, ncol, set_zero)
creates a new full matrix with the given structure
subroutine, public cp_fm_write_info(matrix, io_unit)
Write nicely formatted info about the FM to the given I/O unit (including the underlying FM struct)
various routines to log and control the output. The idea is that decisions about where to log should ...
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
type(cp_logger_type) function, pointer, public cp_get_default_logger()
returns the default logger
Defines the basic variable types.
integer, parameter, public dp
integer, parameter, public default_string_length
Machine interface based on Fortran 2003 and POSIX.
integer, parameter, public machine_x86_avx
integer, parameter, public machine_x86_sse4
integer, parameter, public machine_cpu_generic
integer, parameter, public machine_x86_avx2
pure integer function, public m_cpuid()
Target architecture or instruction set extension according to CPU-check at runtime.
integer, parameter, public machine_x86
Interface to the message passing library MPI.
represent a blacs multidimensional parallel environment (for the mpi corrispective see cp_paratypes/m...
type of a logger, at the moment it contains just a print level starting at which level it should be l...