realspace_grid_types.F
!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                             !
!   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
!                                                                                                  !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \note
!>      Basic type for real space grid methods
!> \par History
!>      JGH (22-May-2002) : New routine rs_grid_zero
!>      JGH (12-Jun-2002) : Bug fix for mpi groups
!>      JGH (19-Jun-2003) : Added routine for task distribution
!>      JGH (23-Nov-2003) : Added routine for task loop separation
!> \author JGH (18-Mar-2001)
! **************************************************************************************************
MODULE realspace_grid_types
   USE cp_array_utils, ONLY: cp_1d_r_p_type
   USE cp_log_handling, ONLY: cp_to_string
   USE kahan_sum, ONLY: accurate_sum
   USE kinds, ONLY: dp, &
                    int_8
   USE machine, ONLY: m_memory
   USE mathlib, ONLY: det_3x3
   USE message_passing, ONLY: mp_comm_null, &
                              mp_comm_type, &
                              mp_request_null, &
                              mp_request_type, &
                              mp_waitall, &
                              mp_waitany
   USE offload_api, ONLY: offload_buffer_type, &
                          offload_create_buffer
   USE pw_grid_types, ONLY: pw_mode_local, &
                            pw_grid_type
   USE pw_grids, ONLY: pw_grid_release, &
                       pw_grid_retain
   USE pw_methods, ONLY: pw_integrate_function
   USE pw_types, ONLY: pw_r3d_rs_type
   USE util, ONLY: get_limit

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

#include "../base/base_uses.f90"

   IMPLICIT NONE

   PRIVATE
   PUBLIC :: realspace_grid_type, &
             realspace_grid_desc_type, &
             realspace_grid_p_type, &
             realspace_grid_desc_p_type, &
             realspace_grid_input_type

   PUBLIC :: transfer_rs2pw, &
             transfer_pw2rs, &
             rs_grid_zero, &
             rs_grid_create, &
             rs_grid_create_descriptor, &
             rs_grid_retain_descriptor, &
             rs_grid_reorder_ranks, &
             rs_grid_locate_rank, &
             rs_grid_print

   INTEGER, PARAMETER, PUBLIC :: rsgrid_distributed = 0, &
                                 rsgrid_replicated = 1, &
                                 rsgrid_automatic = 2

   LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'

! **************************************************************************************************
   TYPE realspace_grid_input_type
      INTEGER :: distribution_type = rsgrid_replicated
      INTEGER :: distribution_layout(3) = -1
      REAL(kind=dp) :: memory_factor = 0.0_dp
      LOGICAL :: lock_distribution = .false.
      INTEGER :: nsmax = -1
      REAL(kind=dp) :: halo_reduction_factor = 1.0_dp
   END TYPE realspace_grid_input_type

! **************************************************************************************************
   TYPE realspace_grid_desc_type
      TYPE(pw_grid_type), POINTER :: pw => null() ! the pw grid

      INTEGER :: ref_count = 0 ! reference count

      INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
      INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
      INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
      INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds

      INTEGER :: border = 0 ! border points

      INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
      REAL(kind=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
      REAL(kind=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
      LOGICAL :: orthorhombic = .true. ! grid symmetry

      LOGICAL :: parallel = .true. ! whether the corresponding pw grid is distributed
      LOGICAL :: distributed = .true. ! whether the rs grid is distributed
      ! these MPI related quantities are only meaningful depending on how the grid has been laid out
      ! they are most useful for fully distributed grids, where they reflect the topology of the grid
      TYPE(mp_comm_type) :: group = mp_comm_null
      INTEGER :: my_pos = -1
      LOGICAL :: group_head = .false.
      INTEGER :: group_size = 0
      INTEGER, DIMENSION(3) :: group_dim = -1
      INTEGER, DIMENSION(3) :: group_coor = -1
      INTEGER, DIMENSION(3) :: neighbours = -1
      ! only meaningful on distributed grids
      ! a list of bounds for each CPU
      INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
      INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
      ! a mapping from linear rank to 3d coord
      INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
      INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
      ! mappings from a grid index to the owning CPU coordinate (these make it easy
      ! to figure out on which rank a given point of the grid resides)
      INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
      INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
      INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord

      INTEGER :: my_virtual_pos = -1
      INTEGER, DIMENSION(3) :: virtual_group_coor = -1

      INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual

   END TYPE realspace_grid_desc_type

   TYPE realspace_grid_type

      TYPE(realspace_grid_desc_type), POINTER :: desc => null()

      INTEGER :: ngpts_local = -1 ! local dimensions
      INTEGER, DIMENSION(3) :: npts_local = -1
      INTEGER, DIMENSION(3) :: lb_local = -1
      INTEGER, DIMENSION(3) :: ub_local = -1
      INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
      INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data

      INTEGER, DIMENSION(:), ALLOCATABLE :: px, py, pz ! index translators
      TYPE(offload_buffer_type) :: buffer = offload_buffer_type() ! owner of the grid's memory
      REAL(kind=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => null() ! the grid (pointer to buffer%host_buffer)

   END TYPE realspace_grid_type

! **************************************************************************************************
   TYPE realspace_grid_p_type
      TYPE(realspace_grid_type), POINTER :: rs_grid => null()
   END TYPE realspace_grid_p_type

   TYPE realspace_grid_desc_p_type
      TYPE(realspace_grid_desc_type), POINTER :: rs_desc => null()
   END TYPE realspace_grid_desc_p_type

CONTAINS

! **************************************************************************************************
!> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
!>        only possible if rs_grid is a distributed grid
!> \param rs_desc ...
!> \param rank_in ...
!> \param shift ...
!> \return ...
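!> \note A worked example (hypothetical layout, not from the source): on a
!>       2x2x3 cartesian process grid (group_dim = (/2, 2, 3/)), shifting a
!>       task at coordinate (0, 0, 0) by (/0, 0, -1/) wraps via MODULO to
!>       coordinate (0, 0, 2), and the result is the rank stored in
!>       coord2rank(0, 0, 2).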
! **************************************************************************************************
   PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
      TYPE(realspace_grid_desc_type), INTENT(IN) :: rs_desc
      INTEGER, INTENT(IN) :: rank_in
      INTEGER, DIMENSION(3), INTENT(IN) :: shift
      INTEGER :: rank_out

      INTEGER :: coord(3)

      coord = modulo(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
      rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
   END FUNCTION rs_grid_locate_rank

! **************************************************************************************************
!> \brief Determine the setup of real space grids - this is divided up into the
!>        creation of a descriptor and the actual grid itself (see rs_grid_create)
!> \param desc ...
!> \param pw_grid ...
!> \param input_settings ...
!> \param border_points ...
!> \par History
!>      JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
!>      Iain Bethune (05-Sep-2008) : modified cut heuristic
!>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
!>      - Create a descriptor for realspace grids with a number of border
!>        points as exactly given by the optional argument border_points.
!>        These grids are always distributed.
!>        (27.11.2013, Matthias Krack)
!> \author JGH (18-Mar-2001)
! **************************************************************************************************
   SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
      TYPE(realspace_grid_desc_type), POINTER :: desc
      TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
      TYPE(realspace_grid_input_type), INTENT(IN) :: input_settings
      INTEGER, INTENT(IN), OPTIONAL :: border_points

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'

      INTEGER :: border_size, dir, handle, i, j, k, l, &
                 lb(2), min_npts_real, n_slices(3), &
                 n_slices_tmp(3), nmin
      LOGICAL :: overlap
      REAL(kind=dp) :: ratio, ratio_best, volume, volume_dist

      CALL timeset(routineN, handle)

      IF (PRESENT(border_points)) THEN
         border_size = border_points
      ELSE
         border_size = 0
      END IF

      ALLOCATE (desc)

      CALL pw_grid%para%group%sync()

      desc%pw => pw_grid
      CALL pw_grid_retain(desc%pw)

      desc%dh = pw_grid%dh
      desc%dh_inv = pw_grid%dh_inv
      desc%orthorhombic = pw_grid%orthorhombic
      desc%ref_count = 1

      IF (pw_grid%para%mode == pw_mode_local) THEN
         ! The corresponding group has dimension 1
         ! All operations will be done locally
         desc%npts = pw_grid%npts
         desc%ngpts = product(int(desc%npts, kind=int_8))
         desc%lb = pw_grid%bounds(1, :)
         desc%ub = pw_grid%bounds(2, :)
         desc%border = border_size
         IF (border_size == 0) THEN
            desc%perd = 1
         ELSE
            desc%perd = 0
         END IF
         desc%parallel = .false.
         desc%distributed = .false.
         desc%group = mp_comm_null
         desc%group_size = 1
         desc%group_head = .true.
         desc%group_dim = 1
         desc%group_coor = 0
         desc%my_pos = 0
      ELSE
         ! group size of desc grid
         ! global grid dimensions are still the same
         desc%group_size = pw_grid%para%group_size
         desc%npts = pw_grid%npts
         desc%ngpts = product(int(desc%npts, kind=int_8))
         desc%lb = pw_grid%bounds(1, :)
         desc%ub = pw_grid%bounds(2, :)

         ! this is the eventual border size
         IF (border_size == 0) THEN
            nmin = (input_settings%nsmax + 1)/2
            nmin = max(0, nint(nmin*input_settings%halo_reduction_factor))
         ELSE
            ! Set explicitly the requested border size
            nmin = border_size
         END IF

         IF (input_settings%distribution_type == rsgrid_replicated) THEN

            n_slices = 1
            IF (border_size > 0) THEN
               CALL cp_abort(__LOCATION__, &
                             "An explicit border size > 0 is not yet working for "// &
                             "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
                             "distributed for RS_GRID explicitly.")
            END IF

         ELSE

            n_slices = 1
            ratio_best = -huge(ratio_best)

            ! don't allow distributions with more processors than real grid points
            DO k = 1, min(desc%npts(3), desc%group_size)
               DO j = 1, min(desc%npts(2), desc%group_size)
                  i = min(desc%npts(1), desc%group_size/(j*k))
                  n_slices_tmp = (/i, j, k/)

                  ! skip factorisations that don't match the actual number of CPUs
                  IF (product(n_slices_tmp) .NE. desc%group_size) cycle

                  ! check whether an input constraint was given,
                  ! i.e. any layout entry that is not -1 must be fulfilled
                  IF (.NOT. all(pack(n_slices_tmp == input_settings%distribution_layout, &
                                     (/-1, -1, -1/) /= input_settings%distribution_layout) &
                                )) cycle

                  ! We cannot work with a grid that has more local than global grid points.
                  ! This can happen when a halo region wraps around and overlaps with the other halo.
                  overlap = .false.
                  DO dir = 1, 3
                     IF (n_slices_tmp(dir) > 1) THEN
                        DO l = 0, n_slices_tmp(dir) - 1
                           lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
                           IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .true.
                        END DO
                     END IF
                  END DO
                  IF (overlap) cycle

                  ! a heuristic optimisation to reduce the memory usage:
                  ! we go for the smallest local to real volume, i.e.
                  ! volume of the box without the wings / volume of the box with the wings,
                  ! with prefactors to promote fewer cuts in the Z dimension
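                  ! A numeric sketch (hypothetical values, not from the source):
                  ! with npts = (/100, 100, 100/) and nmin = 10,
                  !   n_slices_tmp = (/1, 1, 2/) gives ratio = 50/(50 + 2*1.03*10) ~ 0.71,
                  !   n_slices_tmp = (/2, 1, 1/) gives ratio = 50/(50 + 2*1.06*10) ~ 0.70,
                  ! so the Z-direction cut wins, as intended by the prefactors.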
                  ratio = product(real(desc%npts, kind=dp)/n_slices_tmp)/ &
                          product(real(desc%npts, kind=dp)/n_slices_tmp + &
                                  merge((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
                  IF (ratio > ratio_best) THEN
                     ratio_best = ratio
                     n_slices = n_slices_tmp
                  END IF

               END DO
            END DO

            ! if automatic, we can still decide this is a replicated grid
            ! if the memory gain (or the gain in messages) is too small.
            IF (input_settings%distribution_type == rsgrid_automatic) THEN
               volume = product(real(desc%npts, kind=dp))
               volume_dist = product(real(desc%npts, kind=dp)/n_slices + &
                                     merge((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
               IF (volume < volume_dist*input_settings%memory_factor) THEN
                  n_slices = 1
               END IF
            END IF

         END IF

         desc%group_dim(:) = n_slices(:)
         CALL desc%group%from_dup(pw_grid%para%group)
         desc%group_size = desc%group%num_pe
         desc%my_pos = desc%group%mepos

         IF (all(n_slices == 1)) THEN
            ! CASE 1 : only one slice: we do not need overlapping regions and special
            ! recombination of the total density
            desc%border = border_size
            IF (border_size == 0) THEN
               desc%perd = 1
            ELSE
               desc%perd = 0
            END IF
            desc%distributed = .false.
            desc%parallel = .true.
            desc%group_head = pw_grid%para%group_head
            desc%group_coor(:) = 0
            desc%my_virtual_pos = 0

            ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
            ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
            ! Start with no reordering
            DO i = 0, desc%group_size - 1
               desc%virtual2real(i) = i
               desc%real2virtual(i) = i
            END DO
         ELSE
            ! CASE 2 : general case
            ! periodicity is no longer enforced in arbitrary directions
            IF (border_size == 0) THEN
               desc%perd = 1
               DO dir = 1, 3
                  IF (n_slices(dir) > 1) desc%perd(dir) = 0
               END DO
            ELSE
               desc%perd(:) = 0
            END IF
            ! we keep a border of nmin points
            desc%border = nmin
            ! we are going parallel on the real space grid
            desc%parallel = .true.
            desc%distributed = .true.
            desc%group_head = (desc%my_pos == 0)

            ! set up global info about the distribution
            ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
            ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
            ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
            ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
            ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
            ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
            ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))

            DO i = 0, desc%group_size - 1
               ! Calculate coordinates in a row-major order (to be SMP-friendly)
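               ! e.g. for group_dim = (/2, 2, 2/), rank i = 5 maps to coordinate
               ! (1, 0, 1): 5/(2*2) = 1, MODULO(5, 2*2)/2 = 0, MODULO(5, 2) = 1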
               desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
               desc%rank2coord(2, i) = modulo(i, desc%group_dim(2)*desc%group_dim(3)) &
                                       /desc%group_dim(3)
               desc%rank2coord(3, i) = modulo(i, desc%group_dim(3))

               IF (i == desc%my_pos) THEN
                  desc%group_coor = desc%rank2coord(:, i)
               END IF

               desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
               ! the lb_global and ub_global correspond to lb_real and ub_real of each task
               desc%lb_global(:, i) = desc%lb
               desc%ub_global(:, i) = desc%ub
               DO dir = 1, 3
                  IF (desc%group_dim(dir) .GT. 1) THEN
                     lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
                     desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
                     desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
                  END IF
               END DO
            END DO

            ! map a grid point to a CPU coord
            DO dir = 1, 3
               DO l = 0, desc%group_dim(dir) - 1
                  IF (desc%group_dim(dir) .GT. 1) THEN
                     lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
                     lb = lb + desc%lb(dir) - 1
                  ELSE
                     lb(1) = desc%lb(dir)
                     lb(2) = desc%ub(dir)
                  END IF
                  SELECT CASE (dir)
                  CASE (1)
                     desc%x2coord(lb(1):lb(2)) = l
                  CASE (2)
                     desc%y2coord(lb(1):lb(2)) = l
                  CASE (3)
                     desc%z2coord(lb(1):lb(2)) = l
                  END SELECT
               END DO
            END DO

            ! an upper bound for the number of neighbours the border is overlapping with
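            ! e.g. a border of 20 points over cells at least 15 points wide
            ! overlaps with at most (20 + 15 - 1)/15 = 2 neighbours per side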
            DO dir = 1, 3
               desc%neighbours(dir) = 0
               IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
                  min_npts_real = huge(0)
                  DO l = 0, n_slices(dir) - 1
                     lb = get_limit(desc%npts(dir), n_slices(dir), l)
                     min_npts_real = min(lb(2) - lb(1) + 1, min_npts_real)
                  END DO
                  desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
               END IF
            END DO

            ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
            ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
            ! Start with no reordering
            DO i = 0, desc%group_size - 1
               desc%virtual2real(i) = i
               desc%real2virtual(i) = i
            END DO

            desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
            desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)

         END IF
      END IF

      CALL timestop(handle)

   END SUBROUTINE rs_grid_create_descriptor

! **************************************************************************************************
!> \brief Allocate a realspace grid structure that matches a given descriptor
!> \param rs ...
!> \param desc ...
! **************************************************************************************************
   SUBROUTINE rs_grid_create(rs, desc)
      TYPE(realspace_grid_type), INTENT(OUT) :: rs
      TYPE(realspace_grid_desc_type), INTENT(INOUT), &
         TARGET :: desc

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create'

      INTEGER :: handle

      CALL timeset(routineN, handle)

      rs%desc => desc
      CALL rs_grid_retain_descriptor(rs%desc)

      IF (desc%pw%para%mode == pw_mode_local) THEN
         ! The corresponding group has dimension 1
         ! All operations will be done locally
         rs%lb_real = desc%lb
         rs%ub_real = desc%ub
         rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
         rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
         rs%npts_local = rs%ub_local - rs%lb_local + 1
         rs%ngpts_local = product(rs%npts_local)
      END IF

      IF (all(rs%desc%group_dim == 1)) THEN
         ! CASE 1 : only one slice: we do not need overlapping regions and special
         ! recombination of the total density
         rs%lb_real = desc%lb
         rs%ub_real = desc%ub
         rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
         rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
         rs%npts_local = rs%ub_local - rs%lb_local + 1
         rs%ngpts_local = product(rs%npts_local)
      ELSE
         ! CASE 2 : general case
         ! extract some more derived quantities about the local grid
         rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
         rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
         rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
         rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
         rs%npts_local = rs%ub_local - rs%lb_local + 1
         rs%ngpts_local = product(rs%npts_local)
      END IF

      CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
      rs%r(rs%lb_local(1):rs%ub_local(1), &
           rs%lb_local(2):rs%ub_local(2), &
           rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
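      ! the bounds-remapping pointer assignment above views the flat host buffer
      ! as a 3D array over the local (halo-extended) bounds; e.g. with a
      ! hypothetical lb_local = (/-10, -10, -10/), the element
      ! rs%r(-10, -10, -10) aliases rs%buffer%host_buffer(1)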

      ALLOCATE (rs%px(desc%npts(1)))
      ALLOCATE (rs%py(desc%npts(2)))
      ALLOCATE (rs%pz(desc%npts(3)))

      CALL timestop(handle)

   END SUBROUTINE rs_grid_create

! **************************************************************************************************
!> \brief Defines a new ordering of ranks on this realspace grid, recalculating
!>        the data bounds and reallocating the grid. As a result, each MPI process
!>        now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
!>        and a virtual rank (the rank of the process where the data now owned by this
!>        process would reside in an ordinary cartesian distribution).
!>        NB: since the required grid size may change, the caller should be sure to
!>        release and recreate the corresponding rs_grids.
!>        The desc%real2virtual and desc%virtual2real arrays can be used to map
!>        a physical rank to the 'rank' of data owned by that process and vice versa.
!> \param desc ...
!> \param real2virtual ...
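!> \note Minimal usage sketch (hypothetical values, not from the source): with
!>       group_size = 2, CALL rs_grid_reorder_ranks(desc, (/1, 0/)) swaps the
!>       two data blocks, so process 0 afterwards owns the block an ordinary
!>       cartesian layout would place on rank 1, and vice versa.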
!> \par History
!>      04-2009 created [Iain Bethune]
!>      (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
! **************************************************************************************************
   PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)

      TYPE(realspace_grid_desc_type), INTENT(INOUT) :: desc
      INTEGER, DIMENSION(:), INTENT(IN) :: real2virtual

      INTEGER :: i

      desc%real2virtual(:) = real2virtual

      DO i = 0, desc%group_size - 1
         desc%virtual2real(desc%real2virtual(i)) = i
      END DO

      desc%my_virtual_pos = desc%real2virtual(desc%my_pos)

      IF (.NOT. all(desc%group_dim == 1)) THEN
         desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
      END IF

   END SUBROUTINE

! **************************************************************************************************
!> \brief Print information on grids to output
!> \param rs ...
!> \param iounit ...
!> \author JGH (17-May-2007)
! **************************************************************************************************
   SUBROUTINE rs_grid_print(rs, iounit)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      INTEGER, INTENT(IN) :: iounit

      INTEGER :: dir, i, nn
      REAL(kind=dp) :: pp(3)

      IF (rs%desc%parallel) THEN
         IF (iounit > 0) THEN
            WRITE (iounit, '(/,A,T71,I10)') &
               " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
            DO i = 1, 3
               WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
                  i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
            END DO
            IF (.NOT. rs%desc%distributed) THEN
               WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
               WRITE (iounit, '(A,T71,I10)') &
                  " RS_GRID| Group size ", rs%desc%group_dim(2)
            ELSE
               DO dir = 1, 3
                  IF (rs%desc%perd(dir) /= 1) THEN
                     WRITE (iounit, '(A,T71,I3,A)') &
                        " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
                     WRITE (iounit, '(A,T71,I10)') &
                        " RS_GRID| Real space distribution along direction ", dir
                     WRITE (iounit, '(A,T71,I10)') &
                        " RS_GRID| Border size ", rs%desc%border
                  END IF
               END DO
            END IF
         END IF
         IF (rs%desc%distributed) THEN
            DO dir = 1, 3
               IF (rs%desc%perd(dir) /= 1) THEN
                  nn = rs%npts_local(dir)
                  CALL rs%desc%group%sum(nn)
                  pp(1) = real(nn, kind=dp)/real(product(rs%desc%group_dim), kind=dp)
                  nn = rs%npts_local(dir)
                  CALL rs%desc%group%max(nn)
                  pp(2) = real(nn, kind=dp)
                  nn = rs%npts_local(dir)
                  CALL rs%desc%group%min(nn)
                  pp(3) = real(nn, kind=dp)
                  IF (iounit > 0) THEN
                     WRITE (iounit, '(A,T48,A)') " RS_GRID| Distribution", &
                        " Average Max Min"
                     WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID| Planes ", &
                        pp(1), nint(pp(2)), nint(pp(3))
                  END IF
               END IF
            END DO
!           WRITE ( iounit, '(/)' )
         END IF
      ELSE
         IF (iounit > 0) THEN
            WRITE (iounit, '(/,A,T71,I10)') &
               " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
            DO i = 1, 3
               WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
                  i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
            END DO
!           WRITE ( iounit, '(/)' )
         END IF
      END IF

   END SUBROUTINE rs_grid_print

! **************************************************************************************************
!> \brief Transfer data from a realspace grid to a planewave grid, dispatching
!>        to the distributed, replicated, or serial implementation
!> \param rs ...
!> \param pw ...
! **************************************************************************************************
   SUBROUTINE transfer_rs2pw(rs, pw)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw

      CHARACTER(len=*), PARAMETER :: routineN = 'transfer_rs2pw'

      INTEGER :: handle, handle2, i

      CALL timeset(routineN, handle2)
      CALL timeset(routineN//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)

      IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
         CPABORT("Different rs and pw identifiers")

      IF (rs%desc%distributed) THEN
         CALL transfer_rs2pw_distributed(rs, pw)
      ELSE IF (rs%desc%parallel) THEN
         CALL transfer_rs2pw_replicated(rs, pw)
      ELSE ! treat simple serial case locally
         IF (rs%desc%border == 0) THEN
            CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
         ELSE
            CPASSERT(lbound(pw%array, 3) .EQ. rs%lb_real(3))
!$OMP PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
            DO i = rs%lb_real(3), rs%ub_real(3)
               pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
                                        rs%lb_real(2):rs%ub_real(2), i)
            END DO
!$OMP END PARALLEL DO
         END IF
      END IF

      CALL timestop(handle)
      CALL timestop(handle2)

   END SUBROUTINE transfer_rs2pw

! **************************************************************************************************
!> \brief Transfer data from a planewave grid to a realspace grid, dispatching
!>        to the distributed, replicated, or serial implementation
!> \param rs ...
!> \param pw ...
! **************************************************************************************************
   SUBROUTINE transfer_pw2rs(rs, pw)

      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(IN) :: pw

      CHARACTER(len=*), PARAMETER :: routineN = 'transfer_pw2rs'

      INTEGER :: handle, handle2, i, im, j, jm, k, km

      CALL timeset(routineN, handle2)
      CALL timeset(routineN//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)

      IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
         CPABORT("Different rs and pw identifiers")

      IF (rs%desc%distributed) THEN
         CALL transfer_pw2rs_distributed(rs, pw)
      ELSE IF (rs%desc%parallel) THEN
         CALL transfer_pw2rs_replicated(rs, pw)
      ELSE ! treat simple serial case locally
         IF (rs%desc%border == 0) THEN
            CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
         ELSE
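            ! copy the interior and fill the halo by folding each out-of-range
            ! index back into the real grid; e.g. with npts(3) = 100 and
            ! lb_real(3) = 1, the halo plane k = 0 is read from its periodic
            ! image km = 100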
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(i,im,j,jm,k,km) &
!$OMP SHARED(pw,rs)
            DO k = rs%lb_local(3), rs%ub_local(3)
               IF (k < rs%lb_real(3)) THEN
                  km = k + rs%desc%npts(3)
               ELSE IF (k > rs%ub_real(3)) THEN
                  km = k - rs%desc%npts(3)
               ELSE
                  km = k
               END IF
               DO j = rs%lb_local(2), rs%ub_local(2)
                  IF (j < rs%lb_real(2)) THEN
                     jm = j + rs%desc%npts(2)
                  ELSE IF (j > rs%ub_real(2)) THEN
                     jm = j - rs%desc%npts(2)
                  ELSE
                     jm = j
                  END IF
                  DO i = rs%lb_local(1), rs%ub_local(1)
                     IF (i < rs%lb_real(1)) THEN
                        im = i + rs%desc%npts(1)
                     ELSE IF (i > rs%ub_real(1)) THEN
                        im = i - rs%desc%npts(1)
                     ELSE
                        im = i
                     END IF
                     rs%r(i, j, k) = pw%array(im, jm, km)
                  END DO
               END DO
            END DO
!$OMP END PARALLEL DO
         END IF
      END IF

      CALL timestop(handle)
      CALL timestop(handle2)

   END SUBROUTINE transfer_pw2rs

! **************************************************************************************************
!> \brief transfer from a realspace grid to a planewave grid
!> \param rs ...
!> \param pw ...
! **************************************************************************************************
   SUBROUTINE transfer_rs2pw_replicated(rs, pw)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw

      INTEGER :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
                 source
      INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
      INTEGER, DIMENSION(3) :: lb, ub
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray

      ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
                 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
                 grid => rs%r)
         ALLOCATE (rcount(0:np - 1))
         DO ip = 1, np
            rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
         END DO
         nma = maxval(rcount(0:np - 1))
         ALLOCATE (sendbuf(nma), recvbuf(nma))
         sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind

         ! sample peak memory
         CALL m_memory()

         dest = modulo(mepos + 1, np)
         source = modulo(mepos - 1, np)
         sendbuf = 0.0_dp

         DO ip = 1, np

            lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
            ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
            ! this loop takes about the same time as the message passing call
            ! notice that the range of ix is only a small fraction of the first index of grid
            ! therefore it seems faster to have the second index as the innermost loop
            ! if this runs on many cpus
            ! tested on itanium, pentium4, opteron, ultrasparc...
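            ! flattened-index sketch: with s = ub - lb + 1, element (ix, iy, iz)
            ! lands at (iz - lb(3))*s(1)*s(2) + (iy - lb(2))*s(1) + (ix - lb(1)) + 1;
            ! the loop below builds this incrementally, stepping ii by s(1) per iy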
            s = ub - lb + 1
            DO iz = lb(3), ub(3)
               DO ix = lb(1), ub(1)
                  ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
                  DO iy = lb(2), ub(2)
                     sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
                     ii = ii + s(1)
                  END DO
               END DO
            END DO
            IF (ip .EQ. np) EXIT
            CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
            CALL move_alloc(sendbuf, swaparray)
            CALL move_alloc(recvbuf, sendbuf)
            CALL move_alloc(swaparray, recvbuf)
         END DO
         nn = rcount(mepos)
      END ASSOCIATE

      CALL dcopy(nn, sendbuf, 1, pw%array, 1)

      DEALLOCATE (rcount)
      DEALLOCATE (sendbuf)
      DEALLOCATE (recvbuf)

   END SUBROUTINE transfer_rs2pw_replicated

! **************************************************************************************************
!> \brief transfer from a planewave grid to a realspace grid
!> \param rs ...
!> \param pw ...
! **************************************************************************************************
   SUBROUTINE transfer_pw2rs_replicated(rs, pw)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(IN) :: pw

      INTEGER :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
                 k, km, nma, nn, source
      INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
      INTEGER, DIMENSION(3) :: lb, ub
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
      TYPE(mp_request_type), DIMENSION(2) :: req

      ASSOCIATE (np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
                 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
                 grid => rs%r)
         ALLOCATE (rcount(0:np - 1))
         DO ip = 1, np
            rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
         END DO
         nma = maxval(rcount(0:np - 1))
         ALLOCATE (sendbuf(nma), recvbuf(nma))
         sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind

         ! sample peak memory
         CALL m_memory()

         nn = rcount(mepos)
         CALL dcopy(nn, pw%array, 1, sendbuf, 1)

         dest = modulo(mepos + 1, np)
         source = modulo(mepos - 1, np)

         DO ip = 0, np - 1
            ! we must shift the buffer only np-1 times around
            IF (ip .NE. np - 1) THEN
               CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
                                    req(1), req(2), 13)
            END IF
            lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
            ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
            ii = 0
            ! this loop takes about the same time as the message passing call
            DO iz = lb(3), ub(3)
               DO iy = lb(2), ub(2)
                  DO ix = lb(1), ub(1)
                     ii = ii + 1
                     grid(ix, iy, iz) = sendbuf(ii)
                  END DO
               END DO
            END DO
            IF (ip .NE. np - 1) THEN
               CALL mp_waitall(req)
            END IF
            CALL move_alloc(sendbuf, swaparray)
            CALL move_alloc(recvbuf, sendbuf)
            CALL move_alloc(swaparray, recvbuf)
         END DO
         IF (rs%desc%border > 0) THEN
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(i,im,j,jm,k,km) &
!$OMP SHARED(rs)
            DO k = rs%lb_local(3), rs%ub_local(3)
               IF (k < rs%lb_real(3)) THEN
                  km = k + rs%desc%npts(3)
               ELSE IF (k > rs%ub_real(3)) THEN
                  km = k - rs%desc%npts(3)
               ELSE
                  km = k
               END IF
               DO j = rs%lb_local(2), rs%ub_local(2)
                  IF (j < rs%lb_real(2)) THEN
                     jm = j + rs%desc%npts(2)
                  ELSE IF (j > rs%ub_real(2)) THEN
                     jm = j - rs%desc%npts(2)
                  ELSE
                     jm = j
                  END IF
                  DO i = rs%lb_local(1), rs%ub_local(1)
                     IF (i < rs%lb_real(1)) THEN
                        im = i + rs%desc%npts(1)
                     ELSE IF (i > rs%ub_real(1)) THEN
                        im = i - rs%desc%npts(1)
                     ELSE
                        im = i
                     END IF
                     rs%r(i, j, k) = rs%r(im, jm, km)
                  END DO
               END DO
            END DO
!$OMP END PARALLEL DO
         END IF
      END ASSOCIATE

      DEALLOCATE (rcount)
      DEALLOCATE (sendbuf)
      DEALLOCATE (recvbuf)

   END SUBROUTINE transfer_pw2rs_replicated

! **************************************************************************************************
!> \brief does the rs2pw transfer in the case where the rs grid is
!>        distributed (3D domain decomposition)
!> \param rs ...
!> \param pw ...
!> \par History
!>      12.2007 created [Matt Watkins]
!>      9.2008 reduced amount of halo data sent [Iain Bethune]
!>      10.2008 added non-blocking communication [Iain Bethune]
!>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
!>      12.2009 added OMP and sparse alltoall [Iain Bethune]
!>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
!> \note
!>      the transfer is a two step procedure. For example, for the rs2pw transfer:
!>
!>      1) halo exchange in 3D, so that the local part of the rs_grid contains the full data
!>      2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
!>
!>      the halo exchange is the most expensive part on a large number of CPUs. A peculiarity
!>      of this halo exchange is that the border region is rather large (e.g. 20 points) and
!>      that it might overlap with the central domain of several CPUs (i.e. next nearest neighbours)
! **************************************************************************************************
   SUBROUTINE transfer_rs2pw_distributed(rs, pw)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(IN) :: pw

      CHARACTER(LEN=200) :: error_string
      INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
                 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
      INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
                                            send_disps, send_sizes, ushifts
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
      INTEGER, DIMENSION(2) :: neighbours, pos
      INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
                               lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
      LOGICAL, DIMENSION(3) :: halo_swapped
      REAL(kind=dp) :: pw_sum, rs_sum
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
                                                        send_buf_3d_down, send_buf_3d_up
      TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
      TYPE(mp_request_type), DIMENSION(4) :: req

      num_threads = 1
      my_id = 0

      ! safety check, to be removed once we're absolutely sure the routine is correct
      IF (debug_this_module) THEN
         rs_sum = accurate_sum(rs%r)*abs(det_3x3(rs%desc%dh))
         CALL rs%desc%group%sum(rs_sum)
      END IF

      halo_swapped = .false.
      ! We don't need to send the 'edges' of the halos that have already been sent.
      ! Halos are contiguous in memory in the z-direction only, so swap these first,
      ! and send less data in the y and x directions, which are more expensive.

      DO idir = 3, 1, -1

         IF (rs%desc%perd(idir) .NE. 1) THEN

            ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
            ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))

            ushifts = 0
            dshifts = 0

            ! the loop bound makes sure we don't try to send data to ourselves
            DO n_shifts = 1, min(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)

               ! need to take into account the possibly varying widths of neighbouring cells;
               ! ushifts and dshifts accumulate the real sizes of the cells shifted over
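               ! e.g. (hypothetical widths, not from the source) with neighbouring
               ! cells of 15 points along idir, dshifts accumulates to
               ! (/0, 15, 30, .../), i.e. dshifts(n) is the total width of the
               ! n nearest lower neighbours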
               position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
               neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
               dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)

               position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
               neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
               ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)

               ! The border data has to be sent to/received from the neighbours.
               ! First we calculate the source and destination processes for the shift.
               ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking.

               CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)

               lb_send_down(:) = rs%lb_local(:)
               lb_recv_down(:) = rs%lb_local(:)
               ub_recv_down(:) = rs%ub_local(:)
               ub_send_down(:) = rs%ub_local(:)

               IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
                  ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
                  lb_send_down(idir) = max(lb_send_down(idir), &
                                           lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))

                  ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
                  lb_recv_down(idir) = max(lb_recv_down(idir) + rs%desc%border, &
                                           ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
               ELSE
                  lb_send_down(idir) = 0
                  ub_send_down(idir) = -1
                  lb_recv_down(idir) = 0
                  ub_recv_down(idir) = -1
               END IF

               DO i = 1, 3
                  IF (halo_swapped(i)) THEN
                     lb_send_down(i) = rs%lb_real(i)
                     ub_send_down(i) = rs%ub_real(i)
                     lb_recv_down(i) = rs%lb_real(i)
                     ub_recv_down(i) = rs%ub_real(i)
                  END IF
               END DO

               ! post the receive
               ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
                                          lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
               CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))

               ! now allocate, pack and send the send buffer
               nn = product(ub_send_down - lb_send_down + 1)
               ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
                                          lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))

!$OMP PARALLEL DEFAULT(NONE), &
!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
!$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
!$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
!$ my_id = omp_get_thread_num()
               IF (my_id < num_threads) THEN
                  lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
                  ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1

                  send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
                                   lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
                                                 lb_send_down(2):ub_send_down(2), lb:ub)
               END IF
!$OMP END PARALLEL

               CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))

               ! now for the other direction
               CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)

               lb_send_up(:) = rs%lb_local(:)
               lb_recv_up(:) = rs%lb_local(:)
               ub_recv_up(:) = rs%ub_local(:)
               ub_send_up(:) = rs%ub_local(:)

               IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN

                  lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
                  ub_send_up(idir) = min(ub_send_up(idir), &
                                         ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))

                  lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
                  ub_recv_up(idir) = min(ub_recv_up(idir) - rs%desc%border, &
                                         lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
               ELSE
                  lb_send_up(idir) = 0
                  ub_send_up(idir) = -1
                  lb_recv_up(idir) = 0
                  ub_recv_up(idir) = -1
               END IF

               DO i = 1, 3
                  IF (halo_swapped(i)) THEN
                     lb_send_up(i) = rs%lb_real(i)
                     ub_send_up(i) = rs%ub_real(i)
                     lb_recv_up(i) = rs%lb_real(i)
                     ub_recv_up(i) = rs%ub_real(i)
                  END IF
               END DO

               ! post the receive
               ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
                                        lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
               CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))

               ! now allocate, pack and send the send buffer
               nn = product(ub_send_up - lb_send_up + 1)
               ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
                                        lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))

!$OMP PARALLEL DEFAULT(NONE), &
!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
!$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
!$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
!$ my_id = omp_get_thread_num()
               IF (my_id < num_threads) THEN
                  lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
                  ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1

                  send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
                                 lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
                                               lb_send_up(2):ub_send_up(2), lb:ub)
               END IF
!$OMP END PARALLEL

               CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))

               ! wait for a recv to complete, then we can unpack

               DO i = 1, 2

                  CALL mp_waitany(req(1:2), completed)

                  IF (completed .EQ. 1) THEN

                     ! only some procs may need later shifts
                     IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
                        ! Sum the data in the RS Grid
!$OMP PARALLEL DEFAULT(NONE), &
!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
!$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
!$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
!$ my_id = omp_get_thread_num()
                        IF (my_id < num_threads) THEN
                           lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
                           ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1

                           rs%r(lb_recv_down(1):ub_recv_down(1), &
                                lb_recv_down(2):ub_recv_down(2), lb:ub) = &
                              rs%r(lb_recv_down(1):ub_recv_down(1), &
                                   lb_recv_down(2):ub_recv_down(2), lb:ub) + &
                              recv_buf_3d_down(:, :, lb:ub)
                        END IF
!$OMP END PARALLEL
                     END IF
                     DEALLOCATE (recv_buf_3d_down)
                  ELSE

                     ! only some procs may need later shifts
                     IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
                        ! Sum the data in the RS Grid
!$OMP PARALLEL DEFAULT(NONE), &
!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
!$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
!$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
!$ my_id = omp_get_thread_num()
                        IF (my_id < num_threads) THEN
                           lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
                           ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1

                           rs%r(lb_recv_up(1):ub_recv_up(1), &
                                lb_recv_up(2):ub_recv_up(2), lb:ub) = &
                              rs%r(lb_recv_up(1):ub_recv_up(1), &
                                   lb_recv_up(2):ub_recv_up(2), lb:ub) + &
                              recv_buf_3d_up(:, :, lb:ub)
                        END IF
!$OMP END PARALLEL
                     END IF
                     DEALLOCATE (recv_buf_3d_up)
                  END IF

               END DO

               ! make sure the sends have completed before we deallocate

               CALL mp_waitall(req(3:4))

               DEALLOCATE (send_buf_3d_down)
               DEALLOCATE (send_buf_3d_up)
            END DO

            DEALLOCATE (dshifts)
            DEALLOCATE (ushifts)

         END IF

         halo_swapped(idir) = .true.

      END DO

      ! This is the real redistribution
      ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))

      ! work out the pw grid points each proc holds
      DO i = 0, pw%pw_grid%para%group_size - 1
         bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
         bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
         bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
         bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
      END DO
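      ! the pw grid uses (roughly) zero-centred indices, so the 1-based limits
      ! from para%bo are shifted by npts/2 + 1; e.g. with npts(1) = 100, the
      ! first point maps to 1 - 50 - 1 = -50 and that direction runs from -50 to 49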

      ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
      ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
      ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
      send_tasks(:, 1) = 1
      send_tasks(:, 2) = 0
      send_tasks(:, 3) = 1
      send_tasks(:, 4) = 0
      send_tasks(:, 5) = 1
      send_tasks(:, 6) = 0
      send_sizes = 0
      recv_sizes = 0

      my_rs_rank = rs%desc%my_pos
      my_pw_rank = pw%pw_grid%para%rs_mpo

      ! find the processors that should hold our data
      ! (should be part of the rs grid type)
      ! this is a loop over real ranks (i.e. the in-order cartesian ranks);
      ! do the recv and send tasks in two separate loops, which will
      ! load balance better for OpenMP with large numbers of MPI tasks

!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
!$OMP SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
      DO i = 0, rs%desc%group_size - 1

         coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
         ! calculate the rs grid points on each processor
         ! coords is the part of the grid that rank i actually holds
         DO idir = 1, 3
            pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
            pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
            lb_send(idir) = pos(1)
            ub_send(idir) = pos(2)
         END DO

         IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
         IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
         IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
         IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle

         recv_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
         recv_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
         recv_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
         recv_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
         recv_tasks(i, 5) = lb_send(3)
         recv_tasks(i, 6) = ub_send(3)
         recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
                         (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)

      END DO
!$OMP END PARALLEL DO

      coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
      DO idir = 1, 3
         pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
         pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
         lb_send(idir) = pos(1)
         ub_send(idir) = pos(2)
      END DO

      lb_recv(:) = lb_send(:)
      ub_recv(:) = ub_send(:)
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
      DO j = 0, pw%pw_grid%para%group_size - 1

         IF (lb_send(1) .GT. bounds(j, 2)) cycle
         IF (ub_send(1) .LT. bounds(j, 1)) cycle
         IF (lb_send(2) .GT. bounds(j, 4)) cycle
         IF (ub_send(2) .LT. bounds(j, 3)) cycle

         send_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
         send_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
         send_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
         send_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
         send_tasks(j, 5) = lb_send(3)
         send_tasks(j, 6) = ub_send(3)
         send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
                         (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)

      END DO
!$OMP END PARALLEL DO

      send_disps(0) = 0
      recv_disps(0) = 0
      DO i = 1, pw%pw_grid%para%group_size - 1
         send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
         recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
      END DO

      CPASSERT(sum(send_sizes) == product(ub_recv - lb_recv + 1))

      ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
      ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))

      DO i = 0, rs%desc%group_size - 1
         IF (send_sizes(i) .NE. 0) THEN
            ALLOCATE (send_bufs(i)%array(send_sizes(i)))
         ELSE
            NULLIFY (send_bufs(i)%array)
         END IF
         IF (recv_sizes(i) .NE. 0) THEN
            ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
         ELSE
            NULLIFY (recv_bufs(i)%array)
         END IF
      END DO

      ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
      recv_reqs = mp_request_null

      DO i = 0, rs%desc%group_size - 1
         IF (recv_sizes(i) .NE. 0) THEN
            CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
         END IF
      END DO

      ! do packing
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP PRIVATE(k,z,y,x), &
!$OMP SHARED(rs,send_tasks,send_bufs,send_disps)
      DO i = 0, rs%desc%group_size - 1
         k = 0
         DO z = send_tasks(i, 5), send_tasks(i, 6)
            DO y = send_tasks(i, 3), send_tasks(i, 4)
               DO x = send_tasks(i, 1), send_tasks(i, 2)
                  k = k + 1
                  send_bufs(i)%array(k) = rs%r(x, y, z)
               END DO
            END DO
         END DO
      END DO
!$OMP END PARALLEL DO

      ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
      send_reqs = mp_request_null

      DO i = 0, rs%desc%group_size - 1
         IF (send_sizes(i) .NE. 0) THEN
            CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
         END IF
      END DO

      ! do unpacking
      ! no OMP here, so we can unpack each message as it arrives
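      ! mp_waitany returns the 1-based index of the completed request, while the
      ! buffers are indexed from 0, hence the completed - 1 offsets below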
      DO i = 0, rs%desc%group_size - 1
         IF (recv_sizes(i) .EQ. 0) cycle

         CALL mp_waitany(recv_reqs, completed)
         k = 0
         DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
            DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
               DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
                  k = k + 1
                  pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
               END DO
            END DO
         END DO
      END DO

      CALL mp_waitall(send_reqs)

      DEALLOCATE (recv_reqs)
      DEALLOCATE (send_reqs)

      DO i = 0, rs%desc%group_size - 1
         IF (ASSOCIATED(send_bufs(i)%array)) THEN
            DEALLOCATE (send_bufs(i)%array)
         END IF
         IF (ASSOCIATED(recv_bufs(i)%array)) THEN
            DEALLOCATE (recv_bufs(i)%array)
         END IF
      END DO

      DEALLOCATE (send_bufs)
      DEALLOCATE (recv_bufs)
      DEALLOCATE (send_tasks)
      DEALLOCATE (send_sizes)
      DEALLOCATE (send_disps)
      DEALLOCATE (recv_tasks)
      DEALLOCATE (recv_sizes)
      DEALLOCATE (recv_disps)

      IF (debug_this_module) THEN
         ! safety check, to be removed once we're absolutely sure the routine is correct
         pw_sum = pw_integrate_function(pw)
         IF (abs(pw_sum - rs_sum)/max(1.0_dp, abs(pw_sum), abs(rs_sum)) > epsilon(rs_sum)*1000) THEN
            WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
               rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, abs(pw_sum - rs_sum)
            CALL cp_abort(__LOCATION__, &
                          error_string//" Please report this bug ... quick workaround: use "// &
                          "DISTRIBUTION_TYPE REPLICATED")
         END IF
      END IF

   END SUBROUTINE transfer_rs2pw_distributed

! **************************************************************************************************
!> \brief does the pw2rs transfer in the case where the rs grid is
!>        distributed (3D domain decomposition)
!> \param rs ...
!> \param pw ...
!> \par History
!>      12.2007 created [Matt Watkins]
!>      9.2008 reduced amount of halo data sent [Iain Bethune]
!>      10.2008 added non-blocking communication [Iain Bethune]
!>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
!>      12.2009 added OMP and sparse alltoall [Iain Bethune]
!>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
!> \note
!>      the transfer is a two step procedure. For example, for the rs2pw transfer:
!>
!>      1) halo exchange in 3D, so that the local part of the rs_grid contains the full data
!>      2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
!>
!>      the halo exchange is the most expensive part on a large number of CPUs. A peculiarity
!>      of this halo exchange is that the border region is rather large (e.g. 20 points) and
!>      that it might overlap with the central domain of several CPUs (i.e. next nearest neighbours)
! **************************************************************************************************
   SUBROUTINE transfer_pw2rs_distributed(rs, pw)
      TYPE(realspace_grid_type), INTENT(IN) :: rs
      TYPE(pw_r3d_rs_type), INTENT(IN) :: pw

      INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
                 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
      INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
                                            send_disps, send_sizes, ushifts
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
      INTEGER, DIMENSION(2) :: neighbours, pos
      INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
                               lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
      LOGICAL, DIMENSION(3) :: halo_swapped
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
                                                        send_buf_3d_down, send_buf_3d_up
      TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
      TYPE(mp_request_type), DIMENSION(4) :: req

      num_threads = 1
      my_id = 0

      CALL rs_grid_zero(rs)
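      ! the grid was zeroed above (halo included) because the scatter below
      ! only writes into the real local region of each process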

      ! This is the real redistribution

      ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))

      DO i = 0, pw%pw_grid%para%group_size - 1
         bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
         bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
         bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
         bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
      END DO

      ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
      ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
      ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
      ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))

      send_tasks = 0
      send_tasks(:, 1) = 1
      send_tasks(:, 2) = 0
      send_tasks(:, 3) = 1
      send_tasks(:, 4) = 0
      send_tasks(:, 5) = 1
      send_tasks(:, 6) = 0
      send_sizes = 0

      recv_tasks = 0
      recv_tasks(:, 1) = 1
      recv_tasks(:, 2) = 0
      recv_tasks(:, 3) = 1
      recv_tasks(:, 4) = 0
      recv_tasks(:, 5) = 1
      recv_tasks(:, 6) = 0
      recv_sizes = 0

1502  my_rs_rank = rs%desc%my_pos
1503  my_pw_rank = pw%pw_grid%para%rs_mpo
1504 
1505  ! find the processors that should hold our data
1506  ! should be part of the rs grid type
1507  ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1508  ! do the recv and send tasks in two separate loops which will
1509  ! load balance better for OpenMP with large numbers of MPI tasks
1510 
1511  ! this is the reverse of rs2pw: what were the sends are now the recvs
1512 
1513 !$OMP PARALLEL DO DEFAULT(NONE), &
1514 !$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1515 !$OMP SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
1516  DO i = 0, pw%pw_grid%para%group_size - 1
1517 
1518  coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1519  ! calculate the real rs grid points on each processor
1520  ! coords is the part of the grid that rank i actually holds
1521  DO idir = 1, 3
1522  pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1523  pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1524  lb_send(idir) = pos(1)
1525  ub_send(idir) = pos(2)
1526  END DO
1527 
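 ! clip rank i's rs patch against our own pw slab (bounds(my_rs_rank,:)):
 ! ranks whose patch does not intersect ours are skipped, otherwise the
 ! max/min intersection below defines the extents of the message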
1528  IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
1529  IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
1530  IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle
1531  IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
1532 
1533  send_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
1534  send_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
1535  send_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
1536  send_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
1537  send_tasks(i, 5) = lb_send(3)
1538  send_tasks(i, 6) = ub_send(3)
1539  send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
1540  (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
1541 
1542  END DO
1543 !$OMP END PARALLEL DO
1544 
1545  coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1546  DO idir = 1, 3
1547  pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1548  pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1549  lb_send(idir) = pos(1)
1550  ub_send(idir) = pos(2)
1551  END DO
1552 
1553  lb_recv(:) = lb_send(:)
1554  ub_recv(:) = ub_send(:)
1555 
1556 !$OMP PARALLEL DO DEFAULT(NONE), &
1557 !$OMP SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
1558  DO j = 0, pw%pw_grid%para%group_size - 1
1559 
1560  IF (ub_send(1) .LT. bounds(j, 1)) cycle
1561  IF (lb_send(1) .GT. bounds(j, 2)) cycle
1562  IF (ub_send(2) .LT. bounds(j, 3)) cycle
1563  IF (lb_send(2) .GT. bounds(j, 4)) cycle
1564 
1565  recv_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
1566  recv_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
1567  recv_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
1568  recv_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
1569  recv_tasks(j, 5) = lb_send(3)
1570  recv_tasks(j, 6) = ub_send(3)
1571  recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
1572  (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
1573 
1574  END DO
1575 !$OMP END PARALLEL DO
1576 
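 ! displacements are the exclusive prefix sum of the message sizes, i.e. the
 ! offset of each rank's block in a contiguous buffer (e.g. sizes 3,0,5,2
 ! give displacements 0,3,3,8)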
1577  send_disps(0) = 0
1578  recv_disps(0) = 0
1579  DO i = 1, pw%pw_grid%para%group_size - 1
1580  send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1581  recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1582  END DO
1583 
1584  cpassert(sum(recv_sizes) == product(ub_recv - lb_recv + 1))
1585 
1586  ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1587  ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1588 
1589  DO i = 0, rs%desc%group_size - 1
1590  IF (send_sizes(i) .NE. 0) THEN
1591  ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1592  ELSE
1593  NULLIFY (send_bufs(i)%array)
1594  END IF
1595  IF (recv_sizes(i) .NE. 0) THEN
1596  ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1597  ELSE
1598  NULLIFY (recv_bufs(i)%array)
1599  END IF
1600  END DO
1601 
1602  ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1603  recv_reqs = mp_request_null
1604 
1605  DO i = 0, rs%desc%group_size - 1
1606  IF (recv_sizes(i) .NE. 0) THEN
1607  CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1608  END IF
1609  END DO
1610 
1611  ! do packing
1612 !$OMP PARALLEL DO DEFAULT(NONE), &
1613 !$OMP PRIVATE(k,z,y,x), &
1614 !$OMP SHARED(pw,rs,send_tasks,send_bufs,send_disps)
1615  DO i = 0, rs%desc%group_size - 1
1616  k = 0
1617  DO z = send_tasks(i, 5), send_tasks(i, 6)
1618  DO y = send_tasks(i, 3), send_tasks(i, 4)
1619  DO x = send_tasks(i, 1), send_tasks(i, 2)
1620  k = k + 1
1621  send_bufs(i)%array(k) = pw%array(x, y, z)
1622  END DO
1623  END DO
1624  END DO
1625  END DO
1626 !$OMP END PARALLEL DO
1627 
1628  ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1629  send_reqs = mp_request_null
1630 
1631  DO i = 0, rs%desc%group_size - 1
1632  IF (send_sizes(i) .NE. 0) THEN
1633  CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1634  END IF
1635  END DO
1636 
1637  ! do unpacking
1638  ! no OMP here so we can unpack each message as it arrives
1639 
1640  DO i = 0, rs%desc%group_size - 1
1641  IF (recv_sizes(i) .EQ. 0) cycle
1642 
1643  CALL mp_waitany(recv_reqs, completed)
1644  k = 0
1645  DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1646  DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1647  DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1648  k = k + 1
1649  rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
1650  END DO
1651  END DO
1652  END DO
1653  END DO
1654 
1655  CALL mp_waitall(send_reqs)
1656 
1657  DEALLOCATE (recv_reqs)
1658  DEALLOCATE (send_reqs)
1659 
1660  DO i = 0, rs%desc%group_size - 1
1661  IF (ASSOCIATED(send_bufs(i)%array)) THEN
1662  DEALLOCATE (send_bufs(i)%array)
1663  END IF
1664  IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1665  DEALLOCATE (recv_bufs(i)%array)
1666  END IF
1667  END DO
1668 
1669  DEALLOCATE (send_bufs)
1670  DEALLOCATE (recv_bufs)
1671  DEALLOCATE (send_tasks)
1672  DEALLOCATE (send_sizes)
1673  DEALLOCATE (send_disps)
1674  DEALLOCATE (recv_tasks)
1675  DEALLOCATE (recv_sizes)
1676  DEALLOCATE (recv_disps)
1677 
1678  ! now pass wings around
1679  halo_swapped = .false.
1680 
1681  DO idir = 1, 3
1682 
1683  IF (rs%desc%perd(idir) /= 1) THEN
1684 
1685  ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
1686  ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
1687  ushifts = 0
1688  dshifts = 0
1689 
1690  DO n_shifts = 1, rs%desc%neighbours(idir)
1691 
1692  ! need to take into account the possibly varying widths of neighbouring cells:
1693  ! dshifts and ushifts hold the cumulative widths of the cells up to n_shifts
1693  ! neighbours away, downwards and upwards respectively
1694 
1695  position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1696  neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1697  dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1698 
1699  position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1700  neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1701  ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1702 
1703  ! The border data has to be sent to / received from the neighbours
1704  ! First we calculate the source and destination processes for the shift
1705  ! The first shift is "downwards"
1706 
1707  CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1708 
1709  lb_send_down(:) = rs%lb_local(:)
1710  ub_send_down(:) = rs%ub_local(:)
1711  lb_recv_down(:) = rs%lb_local(:)
1712  ub_recv_down(:) = rs%ub_local(:)
1713 
1714  IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1715  lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
1716  ub_send_down(idir) = min(ub_send_down(idir) - rs%desc%border, &
1717  lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1718 
1719  lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1720  ub_recv_down(idir) = min(ub_recv_down(idir), &
1721  ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
1722  ELSE
1723  lb_send_down(idir) = 0
1724  ub_send_down(idir) = -1
1725  lb_recv_down(idir) = 0
1726  ub_recv_down(idir) = -1
1727  END IF
1728 
1729  DO i = 1, 3
1730  IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1731  lb_send_down(i) = rs%lb_real(i)
1732  ub_send_down(i) = rs%ub_real(i)
1733  lb_recv_down(i) = rs%lb_real(i)
1734  ub_recv_down(i) = rs%ub_real(i)
1735  END IF
1736  END DO
1737 
1738  ! allocate the recv buffer
1739  nn = product(ub_recv_down - lb_recv_down + 1)
1740  ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1741  lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1742 
1743  ! recv buffer is now ready, so post the receive
1744  CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1745 
1746  ! now allocate, pack, and send the send buffer
1747  nn = product(ub_send_down - lb_send_down + 1)
1748  ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1749  lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1750 
1751 !$OMP PARALLEL DEFAULT(NONE), &
1752 !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1753 !$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1754 !$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1755 !$ my_id = omp_get_thread_num()
1756  IF (my_id < num_threads) THEN
1757  lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1758  ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1759 
1760  send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1761  lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1762  lb_send_down(2):ub_send_down(2), lb:ub)
1763  END IF
1764 !$OMP END PARALLEL
1765 
1766  CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1767 
1768  ! Now for the other direction
1769 
1770  CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1771 
1772  lb_send_up(:) = rs%lb_local(:)
1773  ub_send_up(:) = rs%ub_local(:)
1774  lb_recv_up(:) = rs%lb_local(:)
1775  ub_recv_up(:) = rs%ub_local(:)
1776 
1777  IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1778  ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
1779  lb_send_up(idir) = max(lb_send_up(idir) + rs%desc%border, &
1780  ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1781 
1782  ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1783  lb_recv_up(idir) = max(lb_recv_up(idir), &
1784  lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
1785  ELSE
1786  lb_send_up(idir) = 0
1787  ub_send_up(idir) = -1
1788  lb_recv_up(idir) = 0
1789  ub_recv_up(idir) = -1
1790  END IF
1791 
1792  DO i = 1, 3
1793  IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1794  lb_send_up(i) = rs%lb_real(i)
1795  ub_send_up(i) = rs%ub_real(i)
1796  lb_recv_up(i) = rs%lb_real(i)
1797  ub_recv_up(i) = rs%ub_real(i)
1798  END IF
1799  END DO
1800 
1801  ! allocate the recv buffer
1802  nn = product(ub_recv_up - lb_recv_up + 1)
1803  ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1804  lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1805 
1806  ! recv buffer is now ready, so post the receive
1807 
1808  CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1809 
1810  ! now allocate, pack, and send the send buffer
1811  nn = product(ub_send_up - lb_send_up + 1)
1812  ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1813  lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1814 
1815 !$OMP PARALLEL DEFAULT(NONE), &
1816 !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1817 !$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1818 !$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1819 !$ my_id = omp_get_thread_num()
1820  IF (my_id < num_threads) THEN
1821  lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1822  ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1823 
1824  send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1825  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1826  lb_send_up(2):ub_send_up(2), lb:ub)
1827  END IF
1828 !$OMP END PARALLEL
1829 
1830  CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1831 
1832  ! wait for a recv to complete, then we can unpack
1833 
1834  DO i = 1, 2
1835 
1836  CALL mp_waitany(req(1:2), completed)
1837 
1838  IF (completed .EQ. 1) THEN
1839 
1840  ! only some procs may need later shifts
1841  IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1842 
1843  ! Copy the received halo data into the rs grid
1844 !$OMP PARALLEL DEFAULT(NONE), &
1845 !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1846 !$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1847 !$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1848 !$ my_id = omp_get_thread_num()
1849  IF (my_id < num_threads) THEN
1850  lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1851  ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1852 
1853  rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
1854  lb:ub) = recv_buf_3d_down(:, :, lb:ub)
1855  END IF
1856 !$OMP END PARALLEL
1857  END IF
1858 
1859  DEALLOCATE (recv_buf_3d_down)
1860  ELSE
1861 
1862  ! only some procs may need later shifts
1863  IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1864 
1865  ! Copy the received halo data into the rs grid
1866 !$OMP PARALLEL DEFAULT(NONE), &
1867 !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1868 !$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1869 !$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1870 !$ my_id = omp_get_thread_num()
1871  IF (my_id < num_threads) THEN
1872  lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1873  ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1874 
1875  rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
1876  lb:ub) = recv_buf_3d_up(:, :, lb:ub)
1877  END IF
1878 !$OMP END PARALLEL
1879  END IF
1880 
1881  DEALLOCATE (recv_buf_3d_up)
1882  END IF
1883  END DO
1884 
1885  CALL mp_waitall(req(3:4))
1886 
1887  DEALLOCATE (send_buf_3d_down)
1888  DEALLOCATE (send_buf_3d_up)
1889  END DO
1890 
1891  DEALLOCATE (ushifts)
1892  DEALLOCATE (dshifts)
1893  END IF
1894 
1895  halo_swapped(idir) = .true.
1896 
1897  END DO
1898 
1899  END SUBROUTINE transfer_pw2rs_distributed
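
!--------------------------------------------------------------------------------------------------!
! The pack/unpack loops above run x fastest, then y, then z, which matches
! Fortran's column-major storage, so both the grid and the 1D buffers are
! traversed contiguously. A minimal, self-contained illustration of that
! ordering (not part of the module):
!
!    PROGRAM demo_pack_order
!       IMPLICIT NONE
!       REAL :: a(2, 3, 2), buf(12)
!       INTEGER :: x, y, z, k
!       CALL RANDOM_NUMBER(a)
!       k = 0
!       DO z = 1, 2
!          DO y = 1, 3
!             DO x = 1, 2
!                k = k + 1
!                buf(k) = a(x, y, z)            ! same order as RESHAPE(a, [12])
!             END DO
!          END DO
!       END DO
!       PRINT *, ALL(buf == RESHAPE(a, [12]))   ! prints T
!    END PROGRAM demo_pack_order
!--------------------------------------------------------------------------------------------------!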
1900 
1901 ! **************************************************************************************************
1902 !> \brief Initialize grid to zero
1903 !> \param rs ...
1904 !> \par History
1905 !> none
1906 !> \author JGH (23-Mar-2002)
1907 ! **************************************************************************************************
1908  SUBROUTINE rs_grid_zero(rs)
1909 
1910  TYPE(realspace_grid_type), INTENT(IN) :: rs
1911 
1912  CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_zero'
1913 
1914  INTEGER :: handle, i, j, k, l(3), u(3)
1915 
1916  CALL timeset(routinen, handle)
1917  l(1) = lbound(rs%r, 1); l(2) = lbound(rs%r, 2); l(3) = lbound(rs%r, 3)
1918  u(1) = ubound(rs%r, 1); u(2) = ubound(rs%r, 2); u(3) = ubound(rs%r, 3)
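 ! zero via an explicit, OpenMP-parallel triple loop rather than an array
 ! assignment: this spreads the work over threads and, on NUMA machines,
 ! should also give first-touch page placement matching later threaded access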
1919 !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1920 !$OMP PRIVATE(i,j,k) &
1921 !$OMP SHARED(rs,l,u)
1922  DO k = l(3), u(3)
1923  DO j = l(2), u(2)
1924  DO i = l(1), u(1)
1925  rs%r(i, j, k) = 0.0_dp
1926  END DO
1927  END DO
1928  END DO
1929 !$OMP END PARALLEL DO
1930  CALL timestop(handle)
1931 
1932  END SUBROUTINE rs_grid_zero
1933 
1934 ! **************************************************************************************************
1935 !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
1936 !> \param rs1 ...
1937 !> \param rs2 ...
1938 !> \param rs3 ...
1939 !> \param scalar ...
1940 !> \par History
1941 !> none
1942 !> \author
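!> \par Example
!>    a typical call, accumulating the pointwise product of two grids, scaled
!>    by 0.5, into rs1 (the routine is a no-op when scalar is exactly zero):
!> \code
!>   CALL rs_grid_mult_and_add(rs1, rs2, rs3, 0.5_dp)
!> \endcode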
1943 ! **************************************************************************************************
1944  SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
1945 
1946  TYPE(realspace_grid_type), INTENT(IN) :: rs1, rs2, rs3
1947  REAL(dp), INTENT(IN) :: scalar
1948 
1949  CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_mult_and_add'
1950 
1951  INTEGER :: handle, i, j, k, l(3), u(3)
1952 
1953 !-----------------------------------------------------------------------------!
1954 
1955  CALL timeset(routinen, handle)
1956  IF (scalar .NE. 0.0_dp) THEN
1957  l(1) = lbound(rs1%r, 1); l(2) = lbound(rs1%r, 2); l(3) = lbound(rs1%r, 3)
1958  u(1) = ubound(rs1%r, 1); u(2) = ubound(rs1%r, 2); u(3) = ubound(rs1%r, 3)
1959 !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1960 !$OMP PRIVATE(i,j,k) &
1961 !$OMP SHARED(rs1,rs2,rs3,scalar,l,u)
1962  DO k = l(3), u(3)
1963  DO j = l(2), u(2)
1964  DO i = l(1), u(1)
1965  rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
1966  END DO
1967  END DO
1968  END DO
1969 !$OMP END PARALLEL DO
1970  END IF
1971  CALL timestop(handle)
1972  END SUBROUTINE rs_grid_mult_and_add
1973 
1974 ! **************************************************************************************************
1975 !> \brief Set box matrix info for real space grid
1976 !> This is needed for variable cell simulations
1977 !> \param pw_grid ...
1978 !> \param rs ...
1979 !> \par History
1980 !> none
1981 !> \author JGH (15-May-2007)
1982 ! **************************************************************************************************
1983  SUBROUTINE rs_grid_set_box(pw_grid, rs)
1984 
1985  TYPE(pw_grid_type), INTENT(IN), TARGET :: pw_grid
1986  TYPE(realspace_grid_type), INTENT(IN) :: rs
1987 
1988  cpassert(ASSOCIATED(rs%desc%pw, pw_grid))
1989  rs%desc%dh = pw_grid%dh
1990  rs%desc%dh_inv = pw_grid%dh_inv
1991 
1992  END SUBROUTINE rs_grid_set_box
1993 
1994 ! **************************************************************************************************
1995 !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
1996 !> \param rs_desc the grid descriptor to retain
1997 !> \par History
1998 !> 04.2009 created [Iain Bethune]
1999 !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2000 ! **************************************************************************************************
2001  SUBROUTINE rs_grid_retain_descriptor(rs_desc)
2002  TYPE(realspace_grid_desc_type), INTENT(INOUT) :: rs_desc
2003 
2004  cpassert(rs_desc%ref_count > 0)
2005  rs_desc%ref_count = rs_desc%ref_count + 1
2006  END SUBROUTINE rs_grid_retain_descriptor
2007 
2008 ! **************************************************************************************************
2009 !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
2010 !> \param rs_grid the rs grid to release
2011 !> \par History
2012 !> 03.2003 created [fawzi]
2013 !> \author fawzi
2014 ! **************************************************************************************************
2015  SUBROUTINE rs_grid_release(rs_grid)
2016  TYPE(realspace_grid_type), INTENT(INOUT) :: rs_grid
2017 
2018  CALL rs_grid_release_descriptor(rs_grid%desc)
2019 
2020  CALL offload_free_buffer(rs_grid%buffer)
2021  NULLIFY (rs_grid%r)
2022 
2023  IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
2024  IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
2025  IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
2026  END SUBROUTINE rs_grid_release
2027 
2028 ! **************************************************************************************************
2029 !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
2030 !> \param rs_desc the rs grid descriptor to release
2031 !> \par History
2032 !> 04.2009 created [Iain Bethune]
2033 !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
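!> \par Example
!>    a sketch of the reference-counting discipline (together with
!>    rs_grid_retain_descriptor): every owner holds one reference, and the
!>    descriptor is deallocated, and the pointer nullified, on the last release
!> \code
!>   CALL rs_grid_retain_descriptor(desc)   ! second owner: ref_count 1 -> 2
!>   CALL rs_grid_release_descriptor(desc)  ! ref_count 2 -> 1, still allocated
!>   CALL rs_grid_release_descriptor(desc)  ! ref_count 1 -> 0, deallocated
!> \endcode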
2034 ! **************************************************************************************************
2035  SUBROUTINE rs_grid_release_descriptor(rs_desc)
2036  TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2037 
2038  IF (ASSOCIATED(rs_desc)) THEN
2039  cpassert(rs_desc%ref_count > 0)
2040  rs_desc%ref_count = rs_desc%ref_count - 1
2041  IF (rs_desc%ref_count == 0) THEN
2042 
2043  CALL pw_grid_release(rs_desc%pw)
2044 
2045  IF (rs_desc%parallel) THEN
2046  ! release the group communicator
2047  CALL rs_desc%group%free()
2048 
2049  DEALLOCATE (rs_desc%virtual2real)
2050  DEALLOCATE (rs_desc%real2virtual)
2051  END IF
2052 
2053  IF (rs_desc%distributed) THEN
2054  DEALLOCATE (rs_desc%rank2coord)
2055  DEALLOCATE (rs_desc%coord2rank)
2056  DEALLOCATE (rs_desc%lb_global)
2057  DEALLOCATE (rs_desc%ub_global)
2058  DEALLOCATE (rs_desc%x2coord)
2059  DEALLOCATE (rs_desc%y2coord)
2060  DEALLOCATE (rs_desc%z2coord)
2061  END IF
2062 
2063  DEALLOCATE (rs_desc)
2064  END IF
2065  END IF
2066  NULLIFY (rs_desc)
2067  END SUBROUTINE rs_grid_release_descriptor
2068 
2069 ! **************************************************************************************************
2070 !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
2071 !> done in virtual coordinates, and the corresponding real ranks are returned
2072 !> \param rs_grid ...
2073 !> \param dir ...
2074 !> \param disp ...
2075 !> \param source ...
2076 !> \param dest ...
2077 !> \par History
2078 !> 04.2009 created [Iain Bethune]
2079 !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2080 ! **************************************************************************************************
2081  PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
2082 
2083  TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2084  INTEGER, INTENT(IN) :: dir, disp
2085  INTEGER, INTENT(OUT) :: source, dest
2086 
2087  INTEGER, DIMENSION(3) :: shift_coords
2088 
2089  shift_coords = rs_grid%desc%virtual_group_coor
2090  shift_coords(dir) = modulo(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
2091  dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2092  shift_coords = rs_grid%desc%virtual_group_coor
2093  shift_coords(dir) = modulo(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
2094  source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2095 
2096  END SUBROUTINE
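
!--------------------------------------------------------------------------------------------------!
! The shift itself is plain modular arithmetic on the virtual process ring. A
! minimal, self-contained illustration of the wrap-around (not part of the
! module; the routine above additionally maps virtual coordinates to real
! ranks through coord2rank and virtual2real):
!
!    PROGRAM demo_ring_shift
!       IMPLICIT NONE
!       INTEGER, PARAMETER :: np = 4            ! ranks along one direction
!       INTEGER :: me, disp, source, dest
!       me = 2; disp = 3
!       dest = MODULO(me + disp, np)            ! where our data is sent
!       source = MODULO(me - disp, np)          ! whose data we receive
!       PRINT *, source, dest                   ! prints 3 1
!    END PROGRAM demo_ring_shift
!--------------------------------------------------------------------------------------------------!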
2097 
2098 ! **************************************************************************************************
2099 !> \brief returns the maximum number of points in the local grid of any process
2100 !> to account for the case where the grid may later be reordered
2101 !> \param desc ...
2102 !> \return ...
2103 !> \par History
2104 !> 10.2011 created [Iain Bethune]
2105 ! **************************************************************************************************
2106  FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
2107  TYPE(realspace_grid_desc_type), INTENT(IN) :: desc
2108  INTEGER :: max_ngpts
2109 
2110  CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_max_ngpts'
2111 
2112  INTEGER :: handle, i
2113  INTEGER, DIMENSION(3) :: lb, ub
2114 
2115  CALL timeset(routinen, handle)
2116 
2117  max_ngpts = 0
2118  IF ((desc%pw%para%mode == pw_mode_local) .OR. &
2119  (all(desc%group_dim == 1))) THEN
2120  cpassert(product(int(desc%npts, kind=int_8)) < huge(1))
2121  max_ngpts = product(desc%npts)
2122  ELSE
2123  DO i = 0, desc%group_size - 1
2124  lb = desc%lb_global(:, i)
2125  ub = desc%ub_global(:, i)
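 ! borders ("wings") exist only in non-periodic, i.e. distributed, directions:
 ! there perd is 0 and (1 - perd) extends the bounds by the border width,
 ! while periodic directions (perd = 1) are left untouched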
2126  lb = lb - desc%border*(1 - desc%perd)
2127  ub = ub + desc%border*(1 - desc%perd)
2128  cpassert(product(int(ub - lb + 1, kind=int_8)) < huge(1))
2129  max_ngpts = max(max_ngpts, product(ub - lb + 1))
2130  END DO
2131  END IF
2132 
2133  CALL timestop(handle)
2134 
2135  END FUNCTION rs_grid_max_ngpts
2136 
2137 ! **************************************************************************************************
2138 !> \brief ...
2139 !> \param rs_grid ...
2140 !> \param h_inv ...
2141 !> \param ra ...
2142 !> \param offset ...
2143 !> \param group_size ...
2144 !> \param my_pos ...
2145 !> \return ...
2146 ! **************************************************************************************************
2147  PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
2148  TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2149  REAL(kind=dp), DIMENSION(3, 3), INTENT(IN) :: h_inv
2150  REAL(kind=dp), DIMENSION(3), INTENT(IN) :: ra
2151  INTEGER, INTENT(IN), OPTIONAL :: offset, group_size, my_pos
2152 
2153  INTEGER :: dir, lb(3), location(3), tp(3), ub(3)
2154 
2155  res = .false.
2156 
2157  IF (.NOT. all(rs_grid%desc%perd == 1)) THEN
2158  DO dir = 1, 3
2159  ! bounds of the local grid, with the halo ('wings') stripped in non-periodic (distributed) directions
2160  tp(dir) = floor(dot_product(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
2161  tp(dir) = modulo(tp(dir), rs_grid%desc%npts(dir))
2162  IF (rs_grid%desc%perd(dir) .NE. 1) THEN
2163  lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
2164  ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
2165  ELSE
2166  lb(dir) = rs_grid%lb_local(dir)
2167  ub(dir) = rs_grid%ub_local(dir)
2168  END IF
2169  ! distributed grid, only map if it is local to the grid
2170  location(dir) = tp(dir) + rs_grid%desc%lb(dir)
2171  END DO
2172  IF (all(lb(:) <= location(:)) .AND. all(location(:) <= ub(:))) THEN
2173  res = .true.
2174  END IF
2175  ELSE
2176  IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
2177  ! not distributed, just a round-robin distribution over the full set of CPUs
2178  IF (modulo(offset, group_size) == my_pos) res = .true.
2179  END IF
2180  END IF
2181 
2182  END FUNCTION map_gaussian_here
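
!--------------------------------------------------------------------------------------------------!
! For replicated grids the decision above reduces to a round-robin deal of
! tasks over the group. A minimal, self-contained illustration (not part of
! the module; hypothetical values: 10 tasks, 4 ranks, we are rank 2):
!
!    PROGRAM demo_round_robin
!       IMPLICIT NONE
!       INTEGER, PARAMETER :: group_size = 4, my_pos = 2
!       INTEGER :: offset
!       DO offset = 0, 9
!          IF (MODULO(offset, group_size) == my_pos) &
!             PRINT *, 'rank', my_pos, 'maps task', offset   ! tasks 2 and 6
!       END DO
!    END PROGRAM demo_round_robin
!--------------------------------------------------------------------------------------------------!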
2183 
2184 END MODULE realspace_grid_types