36 USE omp_lib,
ONLY: omp_get_max_threads
37#if defined(__HAS_IEEE_EXCEPTIONS)
38 USE ieee_exceptions,
ONLY: ieee_get_halting_mode, &
39 ieee_set_halting_mode, &
42#include "../base/base_uses.f90"
45 USE elpa_constants,
ONLY: elpa_solver_1stage, elpa_solver_2stage, elpa_ok, &
46 elpa_2stage_real_invalid, &
47 elpa_2stage_real_default, &
48 elpa_2stage_real_generic, &
49 elpa_2stage_real_generic_simple, &
50 elpa_2stage_real_bgp, &
51 elpa_2stage_real_bgq, &
52 elpa_2stage_real_sse_assembly, &
53 elpa_2stage_real_sse_block2, &
54 elpa_2stage_real_sse_block4, &
55 elpa_2stage_real_sse_block6, &
56 elpa_2stage_real_avx_block2, &
57 elpa_2stage_real_avx_block4, &
58 elpa_2stage_real_avx_block6, &
59 elpa_2stage_real_avx2_block2, &
60 elpa_2stage_real_avx2_block4, &
61 elpa_2stage_real_avx2_block6, &
62 elpa_2stage_real_avx512_block2, &
63 elpa_2stage_real_avx512_block4, &
64 elpa_2stage_real_avx512_block6, &
65 elpa_2stage_real_nvidia_gpu, &
66 elpa_2stage_real_amd_gpu, &
67 elpa_2stage_real_intel_gpu_sycl
69 USE elpa,
ONLY: elpa_t, elpa_init, elpa_uninit, &
70 elpa_allocate, elpa_deallocate
77 CHARACTER(len=*),
PARAMETER,
PRIVATE :: moduleN =
'cp_fm_elpa'
80 INTEGER,
DIMENSION(21),
PARAMETER :: elpa_kernel_ids = [ &
81 elpa_2stage_real_invalid, &
82 elpa_2stage_real_generic, &
83 elpa_2stage_real_generic_simple, &
84 elpa_2stage_real_bgp, &
85 elpa_2stage_real_bgq, &
86 elpa_2stage_real_sse_assembly, &
87 elpa_2stage_real_sse_block2, &
88 elpa_2stage_real_sse_block4, &
89 elpa_2stage_real_sse_block6, &
90 elpa_2stage_real_avx_block2, &
91 elpa_2stage_real_avx_block4, &
92 elpa_2stage_real_avx_block6, &
93 elpa_2stage_real_avx2_block2, &
94 elpa_2stage_real_avx2_block4, &
95 elpa_2stage_real_avx2_block6, &
96 elpa_2stage_real_avx512_block2, &
97 elpa_2stage_real_avx512_block4, &
98 elpa_2stage_real_avx512_block6, &
99 elpa_2stage_real_nvidia_gpu, &
100 elpa_2stage_real_amd_gpu, &
101 elpa_2stage_real_intel_gpu_sycl]
103 CHARACTER(len=14),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
104 elpa_kernel_names = [character(len=14) :: &
127 CHARACTER(len=44),
DIMENSION(SIZE(elpa_kernel_ids)),
PARAMETER :: &
128 elpa_kernel_descriptions = [character(len=44) :: &
129 "Automatically selected kernel", &
131 "Simplified generic kernel", &
132 "Kernel optimized for IBM BGP", &
133 "Kernel optimized for IBM BGQ", &
134 "Kernel optimized for x86_64/SSE", &
135 "Kernel optimized for x86_64/SSE (block=2)", &
136 "Kernel optimized for x86_64/SSE (block=4)", &
137 "Kernel optimized for x86_64/SSE (block=6)", &
138 "Kernel optimized for Intel AVX (block=2)", &
139 "Kernel optimized for Intel AVX (block=4)", &
140 "Kernel optimized for Intel AVX (block=6)", &
141 "Kernel optimized for Intel AVX2 (block=2)", &
142 "Kernel optimized for Intel AVX2 (block=4)", &
143 "Kernel optimized for Intel AVX2 (block=6)", &
144 "Kernel optimized for Intel AVX-512 (block=2)", &
145 "Kernel optimized for Intel AVX-512 (block=4)", &
146 "Kernel optimized for Intel AVX-512 (block=6)", &
147 "Kernel targeting Nvidia GPUs", &
148 "Kernel targeting AMD GPUs", &
149 "Kernel targeting Intel GPUs"]
151 INTEGER,
DIMENSION(1),
PARAMETER :: elpa_kernel_ids = [-1]
152 CHARACTER(len=14),
DIMENSION(1),
PARAMETER :: elpa_kernel_names = [
"AUTO"]
153 CHARACTER(len=44),
DIMENSION(1),
PARAMETER :: elpa_kernel_descriptions = [
"Automatically selected kernel"]
157 INTEGER,
SAVE :: elpa_kernel = elpa_kernel_ids(1)
161 LOGICAL,
SAVE :: elpa_should_print = .false., &
162 elpa_qr_unsafe = .true., &
165#if defined(__OFFLOAD_OPENCL)
178 elpa_kernel_descriptions, &
190 LOGICAL,
INTENT(IN),
OPTIONAL :: one_stage, qr
193 IF (elpa_init(20180525) /= elpa_ok) &
194 cpabort(
"The linked ELPA library does not support the required API version")
198 cpabort(
"Initialization of ELPA library requested but not enabled during build")
209 cpabort(
"Finalization of ELPA library requested but not enabled during build")
218 INTEGER,
INTENT(IN) :: requested_kernel
223 elpa_kernel = requested_kernel
226 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
231 elpa_kernel = elpa_2stage_real_sse_block4
233 elpa_kernel = elpa_2stage_real_avx_block4
235 elpa_kernel = elpa_2stage_real_avx2_block4
237 elpa_kernel = elpa_2stage_real_avx512_block4
242#if !defined(__NO_OFFLOAD_ELPA)
243#if defined(__OFFLOAD_CUDA)
244 elpa_kernel = elpa_2stage_real_nvidia_gpu
246#if defined(__OFFLOAD_HIP)
247 elpa_kernel = elpa_2stage_real_amd_gpu
249#if defined(__OFFLOAD_OPENCL)
250 elpa_kernel = elpa_2stage_real_intel_gpu_sycl
254 IF (elpa_kernel == elpa_2stage_real_invalid)
THEN
255 elpa_kernel = elpa_2stage_real_default
259 mark_used(requested_kernel)
269 LOGICAL,
INTENT(IN) :: flag
271 elpa_should_print = flag
281 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
282 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
285 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa'
288 TYPE(
cp_fm_type) :: eigenvectors_new, matrix_new
291 CALL timeset(routinen, handle)
299 caller_is_elpa=.true., redist_info=rdinfo)
302 IF (
ASSOCIATED(matrix_new%matrix_struct)) &
303 CALL cp_fm_diag_elpa_base(matrix_new, eigenvectors_new, eigenvalues, rdinfo)
308 CALL timestop(handle)
312 mark_used(eigenvectors)
314 cpabort(
"CP2K compiled without the ELPA library.")
326 SUBROUTINE cp_fm_diag_elpa_base(matrix, eigenvectors, eigenvalues, rdinfo)
328 TYPE(
cp_fm_type),
INTENT(IN) :: matrix, eigenvectors
329 REAL(kind=
dp),
DIMENSION(:),
INTENT(OUT) :: eigenvalues
332 CHARACTER(len=*),
PARAMETER :: routinen =
'cp_fm_diag_elpa_base'
336 CLASS(elpa_t),
POINTER :: elpa_obj
337 CHARACTER(len=default_string_length) :: kernel_name
342 nblk, neig, io_unit, &
344 LOGICAL :: use_qr, check_eigenvalues
345 REAL(kind=
dp),
DIMENSION(:),
ALLOCATABLE :: eval, eval_noqr
347 TYPE(
cp_fm_type) :: matrix_noqr, eigenvectors_noqr
349 REAL(kind=
dp),
PARAMETER :: th = 1.0e-14_dp
350 INTEGER,
DIMENSION(:),
POINTER :: ncol_locals
351#if defined(__HAS_IEEE_EXCEPTIONS)
352 LOGICAL,
DIMENSION(5) :: halt
355 CALL timeset(routinen, handle)
357 NULLIFY (ncol_locals)
359 check_eigenvalues = .false.
364 n = matrix%matrix_struct%nrow_global
365 context => matrix%matrix_struct%context
366 group = matrix%matrix_struct%para_env
368 myprow = context%mepos(1)
369 mypcol = context%mepos(2)
375 local_leading_dimension=n_rows, &
378 ncol_locals=ncol_locals)
381 IF (io_unit > 0 .AND. any(ncol_locals == 0))
THEN
382 CALL rdinfo%write(io_unit)
384 cpabort(
"ELPA [pre-fail]: Problem contains processor column with zero width.")
387 neig =
SIZE(eigenvalues, 1)
397 IF (.NOT. elpa_qr_unsafe) &
398 use_qr = use_qr .AND. (n >= 64) .AND. (nblk >= 64)
401 IF (use_qr .AND. elpa_qr_unsafe .AND. elpa_should_print) &
402 check_eigenvalues = .true.
404 CALL matrix%matrix_struct%para_env%bcast(check_eigenvalues)
406 IF (check_eigenvalues)
THEN
408 ALLOCATE (eval_noqr(n))
409 CALL cp_fm_create(matrix=matrix_noqr, matrix_struct=matrix%matrix_struct)
411 CALL cp_fm_create(matrix=eigenvectors_noqr, matrix_struct=eigenvectors%matrix_struct)
415 IF (io_unit > 0 .AND. elpa_should_print)
THEN
416 WRITE (unit=io_unit, fmt=
"(/,T2,A)") &
417 "ELPA| Matrix diagonalization information"
421 kernel_name =
"id: "//trim(adjustl(
cp_to_string(elpa_kernel)))
422 DO i = 1,
SIZE(elpa_kernel_ids)
423 IF (elpa_kernel_ids(i) == elpa_kernel)
THEN
424 kernel_name = elpa_kernel_names(i)
428 WRITE (unit=io_unit, fmt=
"(T2,A,T71,I10)") &
429 "ELPA| Matrix order (NA) ", n, &
430 "ELPA| Matrix block size (NBLK) ", nblk, &
431 "ELPA| Number of eigenvectors (NEV) ", neig, &
432 "ELPA| Local rows (LOCAL_NROWS) ", n_rows, &
433 "ELPA| Local columns (LOCAL_NCOLS) ", n_cols
434 WRITE (unit=io_unit, fmt=
"(T2,A,T61,A20)") &
435 "ELPA| Kernel ", adjustr(trim(kernel_name))
437 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
438 "ELPA| QR step requested ",
"YES"
440 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
441 "ELPA| QR step requested ",
"NO"
446 WRITE (unit=io_unit, fmt=
"(T2,A,T78,A3)") &
447 "ELPA| Matrix is suitable for QR ",
"YES"
449 WRITE (unit=io_unit, fmt=
"(T2,A,T79,A2)") &
450 "ELPA| Matrix is suitable for QR ",
"NO"
452 IF (.NOT. use_qr)
THEN
453 IF (
modulo(n, 2) /= 0)
THEN
454 WRITE (unit=io_unit, fmt=
"(T2,A)") &
455 "ELPA| Matrix order is NOT even"
457 IF ((nblk < 64) .AND. (.NOT. elpa_qr_unsafe))
THEN
458 WRITE (unit=io_unit, fmt=
"(T2,A)") &
459 "ELPA| Matrix block size is NOT 64 or greater"
462 IF ((nblk < 64) .AND. elpa_qr_unsafe)
THEN
463 WRITE (unit=io_unit, fmt=
"(T2,A)") &
464 "ELPA| Matrix block size check was bypassed"
473 elpa_obj => elpa_allocate()
475 CALL elpa_obj%set(
"na", n, success)
476 cpassert(success == elpa_ok)
478 CALL elpa_obj%set(
"nev", neig, success)
479 cpassert(success == elpa_ok)
481 CALL elpa_obj%set(
"local_nrows", n_rows, success)
482 cpassert(success == elpa_ok)
484 CALL elpa_obj%set(
"local_ncols", n_cols, success)
485 cpassert(success == elpa_ok)
487 CALL elpa_obj%set(
"nblk", nblk, success)
488 cpassert(success == elpa_ok)
490 CALL elpa_obj%set(
"mpi_comm_parent", group%get_handle(), success)
491 cpassert(success == elpa_ok)
493 CALL elpa_obj%set(
"process_row", myprow, success)
494 cpassert(success == elpa_ok)
496 CALL elpa_obj%set(
"process_col", mypcol, success)
497 cpassert(success == elpa_ok)
499 success = elpa_obj%setup()
500 cpassert(success == elpa_ok)
502 CALL elpa_obj%set(
"solver", &
505 cpassert(success == elpa_ok)
508 SELECT CASE (elpa_kernel)
509 CASE (elpa_2stage_real_nvidia_gpu)
510 CALL elpa_obj%set(
"nvidia-gpu", 1, success)
511 cpassert(success == elpa_ok)
512 CASE (elpa_2stage_real_amd_gpu)
513 CALL elpa_obj%set(
"amd-gpu", 1, success)
514 cpassert(success == elpa_ok)
515 CASE (elpa_2stage_real_intel_gpu_sycl)
516 CALL elpa_obj%set(
"intel-gpu", 1, success)
517 cpassert(success == elpa_ok)
521 CALL elpa_obj%set(
"real_kernel", elpa_kernel, success)
522 cpwarn_if(success /= elpa_ok,
"Setting real_kernel for ELPA failed")
525 CALL elpa_obj%set(
"qr", 1, success)
526 cpassert(success == elpa_ok)
531 IF (elpa_obj%can_set(
"omp_threads", omp_get_max_threads()) == elpa_ok)
THEN
532 CALL elpa_obj%set(
"omp_threads", omp_get_max_threads(), success)
533 cpassert(success == elpa_ok)
537#if defined(__HAS_IEEE_EXCEPTIONS)
538 CALL ieee_get_halting_mode(ieee_all, halt)
539 CALL ieee_set_halting_mode(ieee_all, .false.)
541 CALL elpa_obj%eigenvectors(matrix%local_data, eval, eigenvectors%local_data, success)
542#if defined(__HAS_IEEE_EXCEPTIONS)
543 CALL ieee_set_halting_mode(ieee_all, halt)
546 IF (success /= elpa_ok) &
547 cpabort(
"ELPA failed to diagonalize a matrix")
549 IF (check_eigenvalues)
THEN
551 CALL elpa_obj%set(
"qr", 0, success)
552 cpassert(success == elpa_ok)
554 CALL elpa_obj%eigenvectors(matrix_noqr%local_data, eval_noqr, eigenvectors_noqr%local_data, success)
555 IF (success /= elpa_ok) &
556 cpabort(
"ELPA failed to diagonalize a matrix even without QR decomposition")
558 IF (any(abs(eval(1:neig) - eval_noqr(1:neig)) > th)) &
559 cpabort(
"ELPA failed to calculate Eigenvalues with ELPA's QR decomposition")
561 DEALLOCATE (eval_noqr)
566 CALL elpa_deallocate(elpa_obj, success)
567 cpassert(success == elpa_ok)
569 eigenvalues(1:neig) = eval(1:neig)
572 CALL timestop(handle)
574 END SUBROUTINE cp_fm_diag_elpa_base
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
methods related to the blacs parallel environment
Basic linear algebra operations for full matrices.
subroutine, public cp_fm_uplo_to_full(matrix, work, uplo)
given a triangular matrix according to uplo, computes the corresponding full matrix
Auxiliary tools to redistribute cp_fm_type matrices before and after diagonalization....
subroutine, public cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)
Redistributes eigenvectors and eigenvalues back to the original communicator group.
subroutine, public cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, caller_is_elpa, redist_info)
Determines the optimal number of CPUs for matrix diagonalization and redistributes the input matrices...
subroutine, public set_elpa_print(flag)
Sets a flag that determines if additional information about the ELPA diagonalization should be printe...
subroutine, public set_elpa_kernel(requested_kernel)
Sets the active ELPA kernel.
logical, save, public elpa_qr
subroutine, public cp_fm_diag_elpa(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the ELPA library.
subroutine, public initialize_elpa_library(one_stage, qr)
Initialize the ELPA library.
subroutine, public finalize_elpa_library()
Finalize the ELPA library.
logical, save, public elpa_one_stage
represent the structure of a full matrix
subroutine, public cp_fm_struct_get(fmstruct, para_env, context, descriptor, ncol_block, nrow_block, nrow_global, ncol_global, first_p_pos, row_indices, col_indices, nrow_local, ncol_local, nrow_locals, ncol_locals, local_leading_dimension)
returns the values of various attributes of the matrix structure
represent a full matrix distributed on many processors
subroutine, public cp_fm_create(matrix, matrix_struct, name, use_sp, nrow, ncol, set_zero)
creates a new full matrix with the given structure
subroutine, public cp_fm_write_info(matrix, io_unit)
Write nicely formatted info about the FM to the given I/O unit (including the underlying FM struct)
various routines to log and control the output. The idea is that decisions about where to log should ...
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
type(cp_logger_type) function, pointer, public cp_get_default_logger()
returns the default logger
Defines the basic variable types.
integer, parameter, public dp
integer, parameter, public default_string_length
Machine interface based on Fortran 2003 and POSIX.
integer, parameter, public machine_x86_avx
integer, parameter, public machine_x86_sse4
integer, parameter, public machine_cpu_generic
integer, parameter, public machine_x86_avx2
pure integer function, public m_cpuid()
Target architecture or instruction set extension according to CPU-check at runtime.
integer, parameter, public machine_x86
Interface to the message passing library MPI.
represent a blacs multidimensional parallel environment (for the mpi corrispective see cp_paratypes/m...
type of a logger, at the moment it contains just a print level starting at which level it should be l...