realspace_grid_types.F (git:33f85d8)
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> Basic type for real space grid methods
11!> \par History
12!> JGH (22-May-2002) : New routine rs_grid_zero
13!> JGH (12-Jun-2002) : Bug fix for mpi groups
14!> JGH (19-Jun-2003) : Added routine for task distribution
15!> JGH (23-Nov-2003) : Added routine for task loop separation
16!> \author JGH (18-Mar-2001)
17! **************************************************************************************************
21 USE kahan_sum, ONLY: accurate_sum
22 USE kinds, ONLY: dp,&
23 int_8
24 USE machine, ONLY: m_memory
25 USE mathlib, ONLY: det_3x3
26 USE message_passing, ONLY: mp_comm_null,&
35 USE pw_grid_types, ONLY: pw_mode_local,&
37 USE pw_grids, ONLY: pw_grid_release,&
40 USE pw_types, ONLY: pw_r3d_rs_type
41 USE util, ONLY: get_limit
42
43!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
44
45#include "../base/base_uses.f90"
46
47 IMPLICIT NONE
48
49 PRIVATE
50 PUBLIC :: realspace_grid_type, &
55
56 PUBLIC :: transfer_rs2pw, &
71
72 INTEGER, PARAMETER, PUBLIC :: rsgrid_distributed = 0, &
75
76 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
77 CHARACTER(len=*), PARAMETER, PRIVATE :: modulen = 'realspace_grid_types'
78
79! **************************************************************************************************
81 INTEGER :: distribution_type = rsgrid_replicated
82 INTEGER :: distribution_layout(3) = -1
83 REAL(kind=dp) :: memory_factor = 0.0_dp
84 LOGICAL :: lock_distribution = .false.
85 INTEGER :: nsmax = -1
86 REAL(kind=dp) :: halo_reduction_factor = 1.0_dp
88
89! **************************************************************************************************
91 TYPE(pw_grid_type), POINTER :: pw => null() ! the pw grid
92
93 INTEGER :: ref_count = 0 ! reference count
94
95 INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
96 INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
97 INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
98 INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
99
100 INTEGER :: border = 0 ! border points
101
102 INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
103 REAL(kind=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
104 REAL(kind=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
105 LOGICAL :: orthorhombic = .true. ! grid symmetry
106
107 LOGICAL :: parallel = .true. ! whether the corresponding pw grid is distributed
108 LOGICAL :: distributed = .true. ! whether the rs grid is distributed
109 ! these MPI related quantities are only meaningful depending on how the grid has been laid out
110 ! they are most useful for fully distributed grids, where they reflect the topology of the grid
111 TYPE(mp_comm_type) :: group = mp_comm_null
112 INTEGER :: my_pos = -1
113 INTEGER :: group_size = 0
114 INTEGER, DIMENSION(3) :: group_dim = -1
115 INTEGER, DIMENSION(3) :: group_coor = -1
116 INTEGER, DIMENSION(3) :: neighbours = -1
117 ! only meaningful on distributed grids
118 ! a list of bounds for each CPU
119 INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
120 INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
121 ! a mapping from linear rank to 3d coord
122 INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
123 INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
124 ! a mapping from grid index to process coordinate (which makes it easy to figure out on which rank a given grid point resides)
125 INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
126 INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
127 INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
128
129 INTEGER :: my_virtual_pos = -1
130 INTEGER, DIMENSION(3) :: virtual_group_coor = -1
131
132 INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
133
135
137
138 TYPE(realspace_grid_desc_type), POINTER :: desc => null()
139
140 INTEGER :: ngpts_local = -1 ! local dimensions
141 INTEGER, DIMENSION(3) :: npts_local = -1
142 INTEGER, DIMENSION(3) :: lb_local = -1
143 INTEGER, DIMENSION(3) :: ub_local = -1
144 INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
145 INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
146
147 INTEGER, DIMENSION(:), ALLOCATABLE :: px, py, pz ! index translators
148 TYPE(offload_buffer_type) :: buffer = offload_buffer_type() ! owner of the grid's memory
149 REAL(kind=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => null() ! the grid (pointer to buffer%host_buffer)
150
151 END TYPE realspace_grid_type
152
153! **************************************************************************************************
155 TYPE(realspace_grid_type), POINTER :: rs_grid => null()
156 END TYPE realspace_grid_p_type
157
159 TYPE(realspace_grid_desc_type), POINTER :: rs_desc => null()
161
162CONTAINS
163
164! **************************************************************************************************
165!> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
166!> only possible if rs_grid is a distributed grid
167!> \param rs_desc ...
168!> \param rank_in ...
169!> \param shift ...
170!> \return ...
171! **************************************************************************************************
172 PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
173 TYPE(realspace_grid_desc_type), INTENT(IN) :: rs_desc
174 INTEGER, INTENT(IN) :: rank_in
175 INTEGER, DIMENSION(3), INTENT(IN) :: shift
176 INTEGER :: rank_out
177
178 INTEGER :: coord(3)
179
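! wrap the shifted Cartesian coordinate periodically in each dimension, then map it back to a linear rank via coord2rank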
180 coord = modulo(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
181 rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
182 END FUNCTION rs_grid_locate_rank
183
184! **************************************************************************************************
185!> \brief Determine the setup of real space grids - this is divided up into the
186!> creation of a descriptor and the actual grid itself (see rs_grid_create)
187!> \param desc ...
188!> \param pw_grid ...
189!> \param input_settings ...
190!> \param border_points ...
191!> \par History
192!> JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
193!> Iain Bethune (05-Sep-2008) : modified cut heuristic
194!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
195!> - Create a descriptor for realspace grids with a number of border
196!> points as exactly given by the optional argument border_points.
197!> These grids are always distributed.
198!> (27.11.2013, Matthias Krack)
199!> \author JGH (18-Mar-2001)
200! **************************************************************************************************
201 SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
202 TYPE(realspace_grid_desc_type), POINTER :: desc
203 TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
204 TYPE(realspace_grid_input_type), INTENT(IN) :: input_settings
205 INTEGER, INTENT(IN), OPTIONAL :: border_points
206
207 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_grid_create_descriptor'
208
209 INTEGER :: border_size, dir, handle, i, j, k, l, &
210 lb(2), min_npts_real, n_slices(3), &
211 n_slices_tmp(3), nmin
212 LOGICAL :: overlap
213 REAL(kind=dp) :: ratio, ratio_best, volume, volume_dist
214
215 CALL timeset(routinen, handle)
216
217 IF (PRESENT(border_points)) THEN
218 border_size = border_points
219 ELSE
220 border_size = 0
221 END IF
222
223 ALLOCATE (desc)
224
225 CALL pw_grid%para%group%sync()
226
227 desc%pw => pw_grid
228 CALL pw_grid_retain(desc%pw)
229
230 desc%dh = pw_grid%dh
231 desc%dh_inv = pw_grid%dh_inv
232 desc%orthorhombic = pw_grid%orthorhombic
233 desc%ref_count = 1
234
235 IF (pw_grid%para%mode == pw_mode_local) THEN
236 ! The corresponding group has dimension 1
237 ! All operations will be done locally
238 desc%npts = pw_grid%npts
239 desc%ngpts = product(int(desc%npts, kind=int_8))
240 desc%lb = pw_grid%bounds(1, :)
241 desc%ub = pw_grid%bounds(2, :)
242 desc%border = border_size
243 IF (border_size == 0) THEN
244 desc%perd = 1
245 ELSE
246 desc%perd = 0
247 END IF
248 desc%parallel = .false.
249 desc%distributed = .false.
250 desc%group = mp_comm_null
251 desc%group_size = 1
252 desc%group_dim = 1
253 desc%group_coor = 0
254 desc%my_pos = 0
255 ELSE
256 ! group size of desc grid
257 ! global grid dimensions are still the same
258 desc%group_size = pw_grid%para%group%num_pe
259 desc%npts = pw_grid%npts
260 desc%ngpts = product(int(desc%npts, kind=int_8))
261 desc%lb = pw_grid%bounds(1, :)
262 desc%ub = pw_grid%bounds(2, :)
263
264 ! this is the eventual border size
265 IF (border_size == 0) THEN
266 nmin = (input_settings%nsmax + 1)/2
267 nmin = max(0, nint(nmin*input_settings%halo_reduction_factor))
268 ELSE
269 ! Set explicitly the requested border size
270 nmin = border_size
271 END IF
272
273 IF (input_settings%distribution_type == rsgrid_replicated) THEN
274
275 n_slices = 1
276 IF (border_size > 0) THEN
277 CALL cp_abort(__location__, &
278 "An explicit border size > 0 is not yet working for "// &
279 "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
280 "distributed for RS_GRID explicitly.")
281 END IF
282
283 ELSE
284
285 n_slices = 1
286 ratio_best = -huge(ratio_best)
287
288 ! don't allow distributions with more processors than real grid points
289 DO k = 1, min(desc%npts(3), desc%group_size)
290 DO j = 1, min(desc%npts(2), desc%group_size)
291 i = min(desc%npts(1), desc%group_size/(j*k))
292 n_slices_tmp = (/i, j, k/)
293
294 ! skip decompositions that do not match the actual number of CPUs
295 IF (product(n_slices_tmp) .NE. desc%group_size) cycle
296
297 ! check whether an input constraint has been given
298 ! i.e. if a layout entry is not -1 we need to fulfil it
299 IF (.NOT. all(pack(n_slices_tmp == input_settings%distribution_layout, &
300 (/-1, -1, -1/) /= input_settings%distribution_layout) &
301 )) cycle
302
303 ! We can not work with a grid that has more local than global grid points.
304 ! This can happen when a halo region wraps around and overlaps with the other halo.
305 overlap = .false.
306 DO dir = 1, 3
307 IF (n_slices_tmp(dir) > 1) THEN
308 DO l = 0, n_slices_tmp(dir) - 1
309 lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
310 IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .true.
311 END DO
312 END IF
313 END DO
314 IF (overlap) cycle
315
316 ! a heuristic optimisation to reduce the memory usage
317 ! we go for the smallest local to real volume
318 ! volume of the box without the wings / volume of the box with the wings
319 ! with prefactors to promote fewer cuts in the Z dimension
320 ratio = product(real(desc%npts, kind=dp)/n_slices_tmp)/ &
321 product(real(desc%npts, kind=dp)/n_slices_tmp + &
322 merge((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
323 IF (ratio > ratio_best) THEN
324 ratio_best = ratio
325 n_slices = n_slices_tmp
326 END IF
327
328 END DO
329 END DO
330
331 ! if automatic, we can still decide this is a replicated grid
332 ! if the memory gain (or the gain in messages) is too small.
333 IF (input_settings%distribution_type == rsgrid_automatic) THEN
334 volume = product(real(desc%npts, kind=dp))
335 volume_dist = product(real(desc%npts, kind=dp)/n_slices + &
336 merge((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
337 IF (volume < volume_dist*input_settings%memory_factor) THEN
338 n_slices = 1
339 END IF
340 END IF
341
342 END IF
343
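! at this point n_slices holds the selected 3D processor decomposition; (1,1,1) means the grid stays replicated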
344 desc%group_dim(:) = n_slices(:)
345 CALL desc%group%from_dup(pw_grid%para%group)
346 desc%group_size = desc%group%num_pe
347 desc%my_pos = desc%group%mepos
348
349 IF (all(n_slices == 1)) THEN
350 ! CASE 1 : only one slice: we do not need overlapping regions and special
351 ! recombination of the total density
352 desc%border = border_size
353 IF (border_size == 0) THEN
354 desc%perd = 1
355 ELSE
356 desc%perd = 0
357 END IF
358 desc%distributed = .false.
359 desc%parallel = .true.
360 desc%group_coor(:) = 0
361 desc%my_virtual_pos = 0
362
363 ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
364 ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
365 ! Start with no reordering
366 DO i = 0, desc%group_size - 1
367 desc%virtual2real(i) = i
368 desc%real2virtual(i) = i
369 END DO
370 ELSE
371 ! CASE 2 : general case
372 ! periodicity is no longer enforced in all directions
373 IF (border_size == 0) THEN
374 desc%perd = 1
375 DO dir = 1, 3
376 IF (n_slices(dir) > 1) desc%perd(dir) = 0
377 END DO
378 ELSE
379 desc%perd(:) = 0
380 END IF
381 ! we keep a border of nmin points
382 desc%border = nmin
383 ! we are going parallel on the real space grid
384 desc%parallel = .true.
385 desc%distributed = .true.
386
387 ! set up global info about the distribution
388 ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
389 ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
390 ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
391 ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
392 ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
393 ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
394 ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
395
396 DO i = 0, desc%group_size - 1
397 ! Calculate coordinates in a row-major order (to be SMP-friendly)
398 desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
399 desc%rank2coord(2, i) = modulo(i, desc%group_dim(2)*desc%group_dim(3)) &
400 /desc%group_dim(3)
401 desc%rank2coord(3, i) = modulo(i, desc%group_dim(3))
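! e.g. for group_dim = (/2, 2, 2/) rank 5 maps to the coordinates (1, 0, 1)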
402
403 IF (i == desc%my_pos) THEN
404 desc%group_coor = desc%rank2coord(:, i)
405 END IF
406
407 desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
408 ! the lb_global and ub_global correspond to lb_real and ub_real of each task
409 desc%lb_global(:, i) = desc%lb
410 desc%ub_global(:, i) = desc%ub
411 DO dir = 1, 3
412 IF (desc%group_dim(dir) .GT. 1) THEN
413 lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
414 desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
415 desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
416 END IF
417 END DO
418 END DO
419
420 ! map a grid point to a CPU coord
421 DO dir = 1, 3
422 DO l = 0, desc%group_dim(dir) - 1
423 IF (desc%group_dim(dir) .GT. 1) THEN
424 lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
425 lb = lb + desc%lb(dir) - 1
426 ELSE
427 lb(1) = desc%lb(dir)
428 lb(2) = desc%ub(dir)
429 END IF
430 SELECT CASE (dir)
431 CASE (1)
432 desc%x2coord(lb(1):lb(2)) = l
433 CASE (2)
434 desc%y2coord(lb(1):lb(2)) = l
435 CASE (3)
436 desc%z2coord(lb(1):lb(2)) = l
437 END SELECT
438 END DO
439 END DO
440
441 ! an upper bound for the number of neighbours the border is overlapping with
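! e.g. a border of 20 points and a smallest local extent of 12 points give ceiling(20/12) = 2 neighbours per direction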
442 DO dir = 1, 3
443 desc%neighbours(dir) = 0
444 IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
445 min_npts_real = huge(0)
446 DO l = 0, n_slices(dir) - 1
447 lb = get_limit(desc%npts(dir), n_slices(dir), l)
448 min_npts_real = min(lb(2) - lb(1) + 1, min_npts_real)
449 END DO
450 desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
451 END IF
452 END DO
453
454 ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
455 ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
456 ! Start with no reordering
457 DO i = 0, desc%group_size - 1
458 desc%virtual2real(i) = i
459 desc%real2virtual(i) = i
460 END DO
461
462 desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
463 desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
464
465 END IF
466 END IF
467
468 CALL timestop(handle)
469
470 END SUBROUTINE rs_grid_create_descriptor
471
472! **************************************************************************************************
473!> \brief ...
474!> \param rs ...
475!> \param desc ...
476! **************************************************************************************************
477 SUBROUTINE rs_grid_create(rs, desc)
478 TYPE(realspace_grid_type), INTENT(OUT) :: rs
479 TYPE(realspace_grid_desc_type), INTENT(INOUT), &
480 TARGET :: desc
481
482 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_grid_create'
483
484 INTEGER :: handle
485
486 CALL timeset(routinen, handle)
487
488 rs%desc => desc
489 CALL rs_grid_retain_descriptor(rs%desc)
490
491 IF (desc%pw%para%mode == pw_mode_local) THEN
492 ! The corresponding group has dimension 1
493 ! All operations will be done locally
494 rs%lb_real = desc%lb
495 rs%ub_real = desc%ub
496 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
497 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
498 rs%npts_local = rs%ub_local - rs%lb_local + 1
499 rs%ngpts_local = product(rs%npts_local)
500 END IF
501
502 IF (all(rs%desc%group_dim == 1)) THEN
503 ! CASE 1 : only one slice: we do not need overlapping regions and special
504 ! recombination of the total density
505 rs%lb_real = desc%lb
506 rs%ub_real = desc%ub
507 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
508 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
509 rs%npts_local = rs%ub_local - rs%lb_local + 1
510 rs%ngpts_local = product(rs%npts_local)
511 ELSE
512 ! CASE 2 : general case
513 ! extract some more derived quantities about the local grid
514 rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
515 rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
516 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
517 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
518 rs%npts_local = rs%ub_local - rs%lb_local + 1
519 rs%ngpts_local = product(rs%npts_local)
520 END IF
521
522 CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
523 rs%r(rs%lb_local(1):rs%ub_local(1), &
524 rs%lb_local(2):rs%ub_local(2), &
525 rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
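! rs%r is a bounds-remapped rank-3 pointer into the flat offload buffer, so the grid is addressed directly with its (lb_local:ub_local) index ranges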
526
527 ALLOCATE (rs%px(desc%npts(1)))
528 ALLOCATE (rs%py(desc%npts(2)))
529 ALLOCATE (rs%pz(desc%npts(3)))
530
531 CALL timestop(handle)
532
533 END SUBROUTINE rs_grid_create
534
535! **************************************************************************************************
536!> \brief Defines a new ordering of ranks on this realspace grid, recalculating
537!> the data bounds and reallocating the grid. As a result, each MPI process
538!> now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
539!> and a virtual rank (the rank of the process where the data now owned by this
540!> process would reside in an ordinary cartesian distribution).
541!> NB. Since the grid size required may change, the caller should be sure to release
542!> and recreate the corresponding rs_grids
543!> The desc%real2virtual and desc%virtual2real arrays can be used to map
544!> a physical rank to the 'rank' of data owned by that process and vice versa
545!> \param desc ...
546!> \param real2virtual ...
547!> \par History
548!> 04-2009 created [Iain Bethune]
549!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
550! **************************************************************************************************
551 PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
552
553 TYPE(realspace_grid_desc_type), INTENT(INOUT) :: desc
554 INTEGER, DIMENSION(:), INTENT(IN) :: real2virtual
555
556 INTEGER :: i
557
558 desc%real2virtual(:) = real2virtual
559
560 DO i = 0, desc%group_size - 1
561 desc%virtual2real(desc%real2virtual(i)) = i
562 END DO
563
564 desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
565
566 IF (.NOT. all(desc%group_dim == 1)) THEN
567 desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
568 END IF
569
570 END SUBROUTINE
571
572! **************************************************************************************************
573!> \brief Print information on grids to output
574!> \param rs ...
575!> \param iounit ...
576!> \author JGH (17-May-2007)
577! **************************************************************************************************
578 SUBROUTINE rs_grid_print(rs, iounit)
579 TYPE(realspace_grid_type), INTENT(IN) :: rs
580 INTEGER, INTENT(in) :: iounit
581
582 INTEGER :: dir, i, nn
583 REAL(kind=dp) :: pp(3)
584
585 IF (rs%desc%parallel) THEN
586 IF (iounit > 0) THEN
587 WRITE (iounit, '(/,A,T71,I10)') &
588 " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
589 DO i = 1, 3
590 WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
591 i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
592 END DO
593 IF (.NOT. rs%desc%distributed) THEN
594 WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
595 WRITE (iounit, '(A,T71,I10)') &
596 " RS_GRID| Group size ", rs%desc%group_dim(2)
597 ELSE
598 DO dir = 1, 3
599 IF (rs%desc%perd(dir) /= 1) THEN
600 WRITE (iounit, '(A,T71,I3,A)') &
601 " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
602 WRITE (iounit, '(A,T71,I10)') &
603 " RS_GRID| Real space distribution along direction ", dir
604 WRITE (iounit, '(A,T71,I10)') &
605 " RS_GRID| Border size ", rs%desc%border
606 END IF
607 END DO
608 END IF
609 END IF
610 IF (rs%desc%distributed) THEN
611 DO dir = 1, 3
612 IF (rs%desc%perd(dir) /= 1) THEN
613 nn = rs%npts_local(dir)
614 CALL rs%desc%group%sum(nn)
615 pp(1) = real(nn, kind=dp)/real(product(rs%desc%group_dim), kind=dp)
616 nn = rs%npts_local(dir)
617 CALL rs%desc%group%max(nn)
618 pp(2) = real(nn, kind=dp)
619 nn = rs%npts_local(dir)
620 CALL rs%desc%group%min(nn)
621 pp(3) = real(nn, kind=dp)
622 IF (iounit > 0) THEN
623 WRITE (iounit, '(A,T48,A)') " RS_GRID| Distribution", &
624 " Average Max Min"
625 WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID| Planes ", &
626 pp(1), nint(pp(2)), nint(pp(3))
627 END IF
628 END IF
629 END DO
630! WRITE ( iounit, '(/)' )
631 END IF
632 ELSE
633 IF (iounit > 0) THEN
634 WRITE (iounit, '(/,A,T71,I10)') &
635 " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
636 DO i = 1, 3
637 WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
638 i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
639 END DO
640! WRITE ( iounit, '(/)' )
641 END IF
642 END IF
643
644 END SUBROUTINE rs_grid_print
645
646! **************************************************************************************************
647!> \brief ...
648!> \param rs ...
649!> \param pw ...
650! **************************************************************************************************
651 SUBROUTINE transfer_rs2pw(rs, pw)
652 TYPE(realspace_grid_type), INTENT(IN) :: rs
653 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
654
655 CHARACTER(len=*), PARAMETER :: routinen = 'transfer_rs2pw'
656
657 INTEGER :: handle, handle2, i
658
659 CALL timeset(routinen, handle2)
660 CALL timeset(routinen//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)
661
662 IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
663 cpabort("Different rs and pw identifiers")
664
665 IF (rs%desc%distributed) THEN
666 CALL transfer_rs2pw_distributed(rs, pw)
667 ELSE IF (rs%desc%parallel) THEN
668 CALL transfer_rs2pw_replicated(rs, pw)
669 ELSE ! treat simple serial case locally
670 IF (rs%desc%border == 0) THEN
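! without a border the rs and pw arrays have identical shape, so a flat copy suffices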
671 CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
672 ELSE
673 cpassert(lbound(pw%array, 3) .EQ. rs%lb_real(3))
674!$OMP PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
675 DO i = rs%lb_real(3), rs%ub_real(3)
676 pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
677 rs%lb_real(2):rs%ub_real(2), i)
678 END DO
679!$OMP END PARALLEL DO
680 END IF
681 END IF
682
683 CALL timestop(handle)
684 CALL timestop(handle2)
685
686 END SUBROUTINE transfer_rs2pw
687
688! **************************************************************************************************
689!> \brief ...
690!> \param rs ...
691!> \param pw ...
692! **************************************************************************************************
693 SUBROUTINE transfer_pw2rs(rs, pw)
694
695 TYPE(realspace_grid_type), INTENT(IN) :: rs
696 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
697
698 CHARACTER(len=*), PARAMETER :: routinen = 'transfer_pw2rs'
699
700 INTEGER :: handle, handle2, i, im, j, jm, k, km
701
702 CALL timeset(routinen, handle2)
703 CALL timeset(routinen//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)
704
705 IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
706 cpabort("Different rs and pw identifiers")
707
708 IF (rs%desc%distributed) THEN
709 CALL transfer_pw2rs_distributed(rs, pw)
710 ELSE IF (rs%desc%parallel) THEN
711 CALL transfer_pw2rs_replicated(rs, pw)
712 ELSE ! treat simple serial case locally
713 IF (rs%desc%border == 0) THEN
714 CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
715 ELSE
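! fill the whole local grid including its halo; indices outside [lb_real, ub_real] wrap around periodically into the pw array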
716!$OMP PARALLEL DO DEFAULT(NONE) &
717!$OMP PRIVATE(i,im,j,jm,k,km) &
718!$OMP SHARED(pw,rs)
719 DO k = rs%lb_local(3), rs%ub_local(3)
720 IF (k < rs%lb_real(3)) THEN
721 km = k + rs%desc%npts(3)
722 ELSE IF (k > rs%ub_real(3)) THEN
723 km = k - rs%desc%npts(3)
724 ELSE
725 km = k
726 END IF
727 DO j = rs%lb_local(2), rs%ub_local(2)
728 IF (j < rs%lb_real(2)) THEN
729 jm = j + rs%desc%npts(2)
730 ELSE IF (j > rs%ub_real(2)) THEN
731 jm = j - rs%desc%npts(2)
732 ELSE
733 jm = j
734 END IF
735 DO i = rs%lb_local(1), rs%ub_local(1)
736 IF (i < rs%lb_real(1)) THEN
737 im = i + rs%desc%npts(1)
738 ELSE IF (i > rs%ub_real(1)) THEN
739 im = i - rs%desc%npts(1)
740 ELSE
741 im = i
742 END IF
743 rs%r(i, j, k) = pw%array(im, jm, km)
744 END DO
745 END DO
746 END DO
747!$OMP END PARALLEL DO
748 END IF
749 END IF
750
751 CALL timestop(handle)
752 CALL timestop(handle2)
753
754 END SUBROUTINE transfer_pw2rs
755
756! **************************************************************************************************
757!> \brief transfer from a realspace grid to a planewave grid
758!> \param rs ...
759!> \param pw ...
760! **************************************************************************************************
761 SUBROUTINE transfer_rs2pw_replicated(rs, pw)
762 TYPE(realspace_grid_type), INTENT(IN) :: rs
763 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
764
765 INTEGER :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
766 source
767 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
768 INTEGER, DIMENSION(3) :: lb, ub
769 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
770
771 associate(np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
772 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
773 grid => rs%r)
774 ALLOCATE (rcount(0:np - 1))
775 DO ip = 1, np
776 rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
777 END DO
778 nma = maxval(rcount(0:np - 1))
779 ALLOCATE (sendbuf(nma), recvbuf(nma))
780 sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind
781
782 !sample peak memory
783 CALL m_memory()
784
785 dest = modulo(mepos + 1, np)
786 source = modulo(mepos - 1, np)
787 sendbuf = 0.0_dp
788
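! ring algorithm: on each pass every rank adds its local contribution for the slab owned by rank (mepos-ip) to the circulating buffer,
! then shifts the buffer one rank up the ring; after np passes the buffer holds the fully summed slab owned by this rank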
789 DO ip = 1, np
790
791 lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
792 ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
793 ! this loop takes about the same time as the message passing call
794 ! notice that the range of ix is only a small fraction of the first index of grid
795 ! therefore it seems faster to have the second index as the innermost loop
796 ! if this runs on many cpus
797 ! tested on itanium, pentium4, opteron, ultrasparc...
798 s = ub - lb + 1
799 DO iz = lb(3), ub(3)
800 DO ix = lb(1), ub(1)
801 ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
802 DO iy = lb(2), ub(2)
803 sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
804 ii = ii + s(1)
805 END DO
806 END DO
807 END DO
808 IF (ip .EQ. np) EXIT
809 CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
810 CALL move_alloc(sendbuf, swaparray)
811 CALL move_alloc(recvbuf, sendbuf)
812 CALL move_alloc(swaparray, recvbuf)
813 END DO
814 nn = rcount(mepos)
815 END associate
816
817 CALL dcopy(nn, sendbuf, 1, pw%array, 1)
818
819 DEALLOCATE (rcount)
820 DEALLOCATE (sendbuf)
821 DEALLOCATE (recvbuf)
822
823 END SUBROUTINE transfer_rs2pw_replicated
824
825! **************************************************************************************************
826!> \brief transfer from a planewave grid to a realspace grid
827!> \param rs ...
828!> \param pw ...
829! **************************************************************************************************
830 SUBROUTINE transfer_pw2rs_replicated(rs, pw)
831 TYPE(realspace_grid_type), INTENT(IN) :: rs
832 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
833
834 INTEGER :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
835 k, km, nma, nn, source
836 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
837 INTEGER, DIMENSION(3) :: lb, ub
838 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
839 TYPE(mp_request_type), DIMENSION(2) :: req
840
841 associate(np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
842 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
843 grid => rs%r)
844 ALLOCATE (rcount(0:np - 1))
845 DO ip = 1, np
846 rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
847 END DO
848 nma = maxval(rcount(0:np - 1))
849 ALLOCATE (sendbuf(nma), recvbuf(nma))
850 sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind
851
852 !sample peak memory
853 CALL m_memory()
854
855 nn = rcount(mepos)
856 CALL dcopy(nn, pw%array, 1, sendbuf, 1)
857
858 dest = modulo(mepos + 1, np)
859 source = modulo(mepos - 1, np)
860
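! reverse ring: the locally owned pw slab circulates around the ring; on pass ip every rank unpacks the slab that originated
! on rank (mepos-ip) into the matching region of its replicated grid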
861 DO ip = 0, np - 1
862 ! we must shift the buffer only np-1 times around
863 IF (ip .NE. np - 1) THEN
864 CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
865 req(1), req(2), 13)
866 END IF
867 lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
868 ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
869 ii = 0
870 ! this loop takes about the same time as the message passing call
871 ! unpack the slab owned by rank modulo(mepos-ip, np) into the matching region of the replicated grid
872 DO iz = lb(3), ub(3)
873 DO iy = lb(2), ub(2)
874 DO ix = lb(1), ub(1)
875 ii = ii + 1
876 grid(ix, iy, iz) = sendbuf(ii)
877 END DO
878 END DO
879 END DO
880 IF (ip .NE. np - 1) THEN
881 CALL mp_waitall(req)
882 END IF
883 CALL move_alloc(sendbuf, swaparray)
884 CALL move_alloc(recvbuf, sendbuf)
885 CALL move_alloc(swaparray, recvbuf)
886 END DO
887 IF (rs%desc%border > 0) THEN
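! replicate the interior periodically into the local halo region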
888!$OMP PARALLEL DO DEFAULT(NONE) &
889!$OMP PRIVATE(i,im,j,jm,k,km) &
890!$OMP SHARED(rs)
891 DO k = rs%lb_local(3), rs%ub_local(3)
892 IF (k < rs%lb_real(3)) THEN
893 km = k + rs%desc%npts(3)
894 ELSE IF (k > rs%ub_real(3)) THEN
895 km = k - rs%desc%npts(3)
896 ELSE
897 km = k
898 END IF
899 DO j = rs%lb_local(2), rs%ub_local(2)
900 IF (j < rs%lb_real(2)) THEN
901 jm = j + rs%desc%npts(2)
902 ELSE IF (j > rs%ub_real(2)) THEN
903 jm = j - rs%desc%npts(2)
904 ELSE
905 jm = j
906 END IF
907 DO i = rs%lb_local(1), rs%ub_local(1)
908 IF (i < rs%lb_real(1)) THEN
909 im = i + rs%desc%npts(1)
910 ELSE IF (i > rs%ub_real(1)) THEN
911 im = i - rs%desc%npts(1)
912 ELSE
913 im = i
914 END IF
915 rs%r(i, j, k) = rs%r(im, jm, km)
916 END DO
917 END DO
918 END DO
919!$OMP END PARALLEL DO
920 END IF
921 END associate
922
923 DEALLOCATE (rcount)
924 DEALLOCATE (sendbuf)
925 DEALLOCATE (recvbuf)
926
927 END SUBROUTINE transfer_pw2rs_replicated
928
929! **************************************************************************************************
930!> \brief does the rs2pw transfer in the case where the rs grid is
931!> distributed (3D domain decomposition)
932!> \param rs ...
933!> \param pw ...
934!> \par History
935!> 12.2007 created [Matt Watkins]
936!> 9.2008 reduced amount of halo data sent [Iain Bethune]
937!> 10.2008 added non-blocking communication [Iain Bethune]
938!> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
939!> 12.2009 added OMP and sparse alltoall [Iain Bethune]
940!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
941!> \note
942!> the transfer is a two step procedure. For example, for the rs2pw transfer:
943!>
944!> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
945!> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
946!>
947 !> the halo exchange is the most expensive step on a large number of CPUs. A peculiarity of this halo
948 !> exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
949 !> with the central domain of several CPUs (i.e. next-nearest neighbours)
950! **************************************************************************************************
951 SUBROUTINE transfer_rs2pw_distributed(rs, pw)
952 TYPE(realspace_grid_type), INTENT(IN) :: rs
953 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
954
955 CHARACTER(LEN=200) :: error_string
956 INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
957 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
958 INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
959 send_disps, send_sizes, ushifts
960 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
961 INTEGER, DIMENSION(2) :: neighbours, pos
962 INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
963 lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
964 LOGICAL, DIMENSION(3) :: halo_swapped
965 REAL(kind=dp) :: pw_sum, rs_sum
966 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
967 send_buf_3d_down, send_buf_3d_up
968 TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
969 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
970 TYPE(mp_request_type), DIMENSION(4) :: req
971
972 num_threads = 1
973 my_id = 0
974
975 ! safety check, to be removed once we're absolutely sure the routine is correct
976 IF (debug_this_module) THEN
977 rs_sum = accurate_sum(rs%r)*abs(det_3x3(rs%desc%dh))
978 CALL rs%desc%group%sum(rs_sum)
979 END IF
980
981 halo_swapped = .false.
982 ! We don't need to send the 'edges' of the halos that have already been sent
983 ! Halos are contiguous in memory in z-direction only, so swap these first,
984 ! and send less data in the y and x directions which are more expensive
985
986 DO idir = 3, 1, -1
987
988 IF (rs%desc%perd(idir) .NE. 1) THEN
989
990 ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
991 ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
992
993 ushifts = 0
994 dshifts = 0
995
996 ! make sure we never try to send data to ourselves
997 DO n_shifts = 1, min(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
998
999 ! need to take into account the possibly varying widths of neighbouring cells
1000 ! ushifts and dshifts accumulate the actual widths of the cells n_shifts steps up and down
1001 position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1002 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1003 dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1004
1005 position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1006 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1007 ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1008
1009 ! The border data has to be sent to / received from the neighbours
1010 ! First we calculate the source and destination processes for the shift
1011 ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
1012
1013 CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1014
1015 lb_send_down(:) = rs%lb_local(:)
1016 lb_recv_down(:) = rs%lb_local(:)
1017 ub_recv_down(:) = rs%ub_local(:)
1018 ub_send_down(:) = rs%ub_local(:)
1019
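! slice of our lower halo that belongs to the interior of the neighbour n_shifts steps down; dshifts accounts for the part
! already delivered to closer neighbours, and an empty range (lb > ub) means our halo does not reach that far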
1020 IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1021 ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1022 lb_send_down(idir) = max(lb_send_down(idir), &
1023 lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
1024
1025 ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
1026 lb_recv_down(idir) = max(lb_recv_down(idir) + rs%desc%border, &
1027 ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1028 ELSE
1029 lb_send_down(idir) = 0
1030 ub_send_down(idir) = -1
1031 lb_recv_down(idir) = 0
1032 ub_recv_down(idir) = -1
1033 END IF
1034
1035 DO i = 1, 3
1036 IF (halo_swapped(i)) THEN
1037 lb_send_down(i) = rs%lb_real(i)
1038 ub_send_down(i) = rs%ub_real(i)
1039 lb_recv_down(i) = rs%lb_real(i)
1040 ub_recv_down(i) = rs%ub_real(i)
1041 END IF
1042 END DO
1043
1044 ! post the receive
1045 ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1046 lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1047 CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1048
1049 ! now allocate, pack and send the send buffer
1050 nn = product(ub_send_down - lb_send_down + 1)
1051 ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1052 lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1053
1054!$OMP PARALLEL DEFAULT(NONE), &
1055!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1056!$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1057!$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1058!$ my_id = omp_get_thread_num()
1059 IF (my_id < num_threads) THEN
1060 lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1061 ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1062
1063 send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1064 lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1065 lb_send_down(2):ub_send_down(2), lb:ub)
1066 END IF
1067!$OMP END PARALLEL
1068
1069 CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1070
1071 ! Now for the other direction
1072 CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1073
1074 lb_send_up(:) = rs%lb_local(:)
1075 lb_recv_up(:) = rs%lb_local(:)
1076 ub_recv_up(:) = rs%ub_local(:)
1077 ub_send_up(:) = rs%ub_local(:)
1078
1079 IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1080
1081 lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1082 ub_send_up(idir) = min(ub_send_up(idir), &
1083 ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
1084
1085 lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
1086 ub_recv_up(idir) = min(ub_recv_up(idir) - rs%desc%border, &
1087 lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1088 ELSE
1089 lb_send_up(idir) = 0
1090 ub_send_up(idir) = -1
1091 lb_recv_up(idir) = 0
1092 ub_recv_up(idir) = -1
1093 END IF
1094
1095 DO i = 1, 3
1096 IF (halo_swapped(i)) THEN
1097 lb_send_up(i) = rs%lb_real(i)
1098 ub_send_up(i) = rs%ub_real(i)
1099 lb_recv_up(i) = rs%lb_real(i)
1100 ub_recv_up(i) = rs%ub_real(i)
1101 END IF
1102 END DO
1103
1104 ! post the receive
1105 ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1106 lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1107 CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1108
1109 ! now allocate, pack and send the send buffer
1110 nn = product(ub_send_up - lb_send_up + 1)
1111 ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1112 lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1113
1114!$OMP PARALLEL DEFAULT(NONE), &
1115!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1116!$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1117!$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1118!$ my_id = omp_get_thread_num()
1119 IF (my_id < num_threads) THEN
1120 lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1121 ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1122
1123 send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1124 lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1125 lb_send_up(2):ub_send_up(2), lb:ub)
1126 END IF
1127!$OMP END PARALLEL
1128
1129 CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1130
1131 ! wait for a recv to complete, then we can unpack
1132
1133 DO i = 1, 2
1134
1135 CALL mp_waitany(req(1:2), completed)
1136
1137 IF (completed .EQ. 1) THEN
1138
1139 ! only some procs may need later shifts
1140 IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1141 ! Sum the data in the RS Grid
1142!$OMP PARALLEL DEFAULT(NONE), &
1143!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1144!$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1145!$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1146!$ my_id = omp_get_thread_num()
1147 IF (my_id < num_threads) THEN
1148 lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1149 ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1150
1151 rs%r(lb_recv_down(1):ub_recv_down(1), &
1152 lb_recv_down(2):ub_recv_down(2), lb:ub) = &
1153 rs%r(lb_recv_down(1):ub_recv_down(1), &
1154 lb_recv_down(2):ub_recv_down(2), lb:ub) + &
1155 recv_buf_3d_down(:, :, lb:ub)
1156 END IF
1157!$OMP END PARALLEL
1158 END IF
1159 DEALLOCATE (recv_buf_3d_down)
1160 ELSE
1161
1162 ! only some procs may need later shifts
1163 IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1164 ! Sum the data in the RS Grid
1165!$OMP PARALLEL DEFAULT(NONE), &
1166!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1167!$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1168!$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1169!$ my_id = omp_get_thread_num()
1170 IF (my_id < num_threads) THEN
1171 lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1172 ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1173
1174 rs%r(lb_recv_up(1):ub_recv_up(1), &
1175 lb_recv_up(2):ub_recv_up(2), lb:ub) = &
1176 rs%r(lb_recv_up(1):ub_recv_up(1), &
1177 lb_recv_up(2):ub_recv_up(2), lb:ub) + &
1178 recv_buf_3d_up(:, :, lb:ub)
1179 END IF
1180!$OMP END PARALLEL
1181 END IF
1182 DEALLOCATE (recv_buf_3d_up)
1183 END IF
1184
1185 END DO
1186
1187 ! make sure the sends have completed before we deallocate
1188
1189 CALL mp_waitall(req(3:4))
1190
1191 DEALLOCATE (send_buf_3d_down)
1192 DEALLOCATE (send_buf_3d_up)
1193 END DO
1194
1195 DEALLOCATE (dshifts)
1196 DEALLOCATE (ushifts)
1197
1198 END IF
1199
1200 halo_swapped(idir) = .true.
1201
1202 END DO
1203
1204 ! This is the real redistribution
1205 ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
1206
1207 ! work out the pw grid points each proc holds
1208 DO i = 0, pw%pw_grid%para%group%num_pe - 1
1209 bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1210 bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1211 bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1212 bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1213 END DO
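! bounds(i,:) now holds the (x,y) window of pw real-space data owned by rank i, shifted to the centred index convention of the rs grid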
1214
1215 ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1216 ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1217 ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
1218 ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1219 ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1220 ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
1221 send_tasks(:, 1) = 1
1222 send_tasks(:, 2) = 0
1223 send_tasks(:, 3) = 1
1224 send_tasks(:, 4) = 0
1225 send_tasks(:, 5) = 1
1226 send_tasks(:, 6) = 0
1227 send_sizes = 0
1228 recv_sizes = 0
1229
1230 my_rs_rank = rs%desc%my_pos
1231 my_pw_rank = pw%pw_grid%para%group%mepos
1232
1233 ! find the processors that should hold our data
1234 ! should be part of the rs grid type
1235 ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1236 ! do the recv and send tasks in two separate loops which will
1237 ! load balance better for OpenMP with large numbers of MPI tasks
1238
1239!$OMP PARALLEL DO DEFAULT(NONE), &
1240!$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1241!$OMP SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
1242 DO i = 0, rs%desc%group_size - 1
1243
1244 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1245 !calculate the rs grid points on each processor
1246 !coords is the part of the grid that rank i actually holds
1247 DO idir = 1, 3
1248 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1249 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1250 lb_send(idir) = pos(1)
1251 ub_send(idir) = pos(2)
1252 END DO
1253
1254 IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
1255 IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
1256 IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
1257 IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle
1258
1259 recv_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
1260 recv_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
1261 recv_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
1262 recv_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
1263 recv_tasks(i, 5) = lb_send(3)
1264 recv_tasks(i, 6) = ub_send(3)
1265 recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
1266 (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
1267
1268 END DO
1269!$OMP END PARALLEL DO
1270
1271 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1272 DO idir = 1, 3
1273 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1274 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1275 lb_send(idir) = pos(1)
1276 ub_send(idir) = pos(2)
1277 END DO
1278
1279 lb_recv(:) = lb_send(:)
1280 ub_recv(:) = ub_send(:)
1281!$OMP PARALLEL DO DEFAULT(NONE), &
1282!$OMP SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
1283 DO j = 0, pw%pw_grid%para%group%num_pe - 1
1284
1285 IF (lb_send(1) .GT. bounds(j, 2)) cycle
1286 IF (ub_send(1) .LT. bounds(j, 1)) cycle
1287 IF (lb_send(2) .GT. bounds(j, 4)) cycle
1288 IF (ub_send(2) .LT. bounds(j, 3)) cycle
1289
1290 send_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
1291 send_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
1292 send_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
1293 send_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
1294 send_tasks(j, 5) = lb_send(3)
1295 send_tasks(j, 6) = ub_send(3)
1296 send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
1297 (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
1298
1299 END DO
1300!$OMP END PARALLEL DO
1301
1302 send_disps(0) = 0
1303 recv_disps(0) = 0
1304 DO i = 1, pw%pw_grid%para%group%num_pe - 1
1305 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1306 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1307 END DO
1308
1309 cpassert(sum(send_sizes) == product(ub_recv - lb_recv + 1))
1310
1311 ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1312 ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1313
1314 DO i = 0, rs%desc%group_size - 1
1315 IF (send_sizes(i) .NE. 0) THEN
1316 ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1317 ELSE
1318 NULLIFY (send_bufs(i)%array)
1319 END IF
1320 IF (recv_sizes(i) .NE. 0) THEN
1321 ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1322 ELSE
1323 NULLIFY (recv_bufs(i)%array)
1324 END IF
1325 END DO
1326
1327 ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1328 recv_reqs = mp_request_null
1329
1330 DO i = 0, rs%desc%group_size - 1
1331 IF (recv_sizes(i) .NE. 0) THEN
1332 CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1333 END IF
1334 END DO
1335
1336 ! do packing
1337!$OMP PARALLEL DO DEFAULT(NONE), &
1338!$OMP PRIVATE(k,z,y,x), &
1339!$OMP SHARED(rs,send_tasks,send_bufs,send_disps)
1340 DO i = 0, rs%desc%group_size - 1
1341 k = 0
1342 DO z = send_tasks(i, 5), send_tasks(i, 6)
1343 DO y = send_tasks(i, 3), send_tasks(i, 4)
1344 DO x = send_tasks(i, 1), send_tasks(i, 2)
1345 k = k + 1
1346 send_bufs(i)%array(k) = rs%r(x, y, z)
1347 END DO
1348 END DO
1349 END DO
1350 END DO
1351!$OMP END PARALLEL DO
1352
1353 ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1354 send_reqs = mp_request_null
1355
1356 DO i = 0, rs%desc%group_size - 1
1357 IF (send_sizes(i) .NE. 0) THEN
1358 CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1359 END IF
1360 END DO
1361
1362 ! do unpacking
1363 ! no OMP here so we can unpack each message as it arrives
1364 DO i = 0, rs%desc%group_size - 1
1365 IF (recv_sizes(i) .EQ. 0) cycle
1366
1367 CALL mp_waitany(recv_reqs, completed)
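! completed is the 1-based index into recv_reqs, i.e. the message from rank completed-1 has arrived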
1368 k = 0
1369 DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1370 DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1371 DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1372 k = k + 1
1373 pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
1374 END DO
1375 END DO
1376 END DO
1377 END DO
1378
1379 CALL mp_waitall(send_reqs)
1380
1381 DEALLOCATE (recv_reqs)
1382 DEALLOCATE (send_reqs)
1383
1384 DO i = 0, rs%desc%group_size - 1
1385 IF (ASSOCIATED(send_bufs(i)%array)) THEN
1386 DEALLOCATE (send_bufs(i)%array)
1387 END IF
1388 IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1389 DEALLOCATE (recv_bufs(i)%array)
1390 END IF
1391 END DO
1392
1393 DEALLOCATE (send_bufs)
1394 DEALLOCATE (recv_bufs)
1395 DEALLOCATE (send_tasks)
1396 DEALLOCATE (send_sizes)
1397 DEALLOCATE (send_disps)
1398 DEALLOCATE (recv_tasks)
1399 DEALLOCATE (recv_sizes)
1400 DEALLOCATE (recv_disps)
1401
1402 IF (debug_this_module) THEN
1403 ! safety check, to be removed once we're absolutely sure the routine is correct
1404 pw_sum = pw_integrate_function(pw)
1405 IF (abs(pw_sum - rs_sum)/max(1.0_dp, abs(pw_sum), abs(rs_sum)) > epsilon(rs_sum)*1000) THEN
1406 WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
1407 rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, abs(pw_sum - rs_sum)
1408 CALL cp_abort(__location__, &
1409 error_string//" Please report this bug ... quick workaround: use "// &
1410 "DISTRIBUTION_TYPE REPLICATED")
1411 END IF
1412 END IF
1413
1414 END SUBROUTINE transfer_rs2pw_distributed
1415
1416! **************************************************************************************************
1417!> \brief does the pw2rs transfer in the case where the rs grid is
1418!> distributed (3D domain decomposition)
1419!> \param rs ...
1420!> \param pw ...
1421!> \par History
1422!> 12.2007 created [Matt Watkins]
1423!> 9.2008 reduced amount of halo data sent [Iain Bethune]
1424!> 10.2008 added non-blocking communication [Iain Bethune]
1425!> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
1426!> 12.2009 added OMP and sparse alltoall [Iain Bethune]
1427!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
1428!> \note
1429!> the transfer is a two step procedure. For example, for the rs2pw transfer:
1430!>
1431!> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
1432!> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
1433!>
1434 !> the halo exchange is the most expensive step on a large number of CPUs. A peculiarity of this halo
1435 !> exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
1436 !> with the central domain of several CPUs (i.e. next-nearest neighbours)
1437! **************************************************************************************************
1438 SUBROUTINE transfer_pw2rs_distributed(rs, pw)
1439 TYPE(realspace_grid_type), INTENT(IN) :: rs
1440 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
1441
1442 INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
1443 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
1444 INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
1445 send_disps, send_sizes, ushifts
1446 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
1447 INTEGER, DIMENSION(2) :: neighbours, pos
1448 INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
1449 lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
1450 LOGICAL, DIMENSION(3) :: halo_swapped
1451 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
1452 send_buf_3d_down, send_buf_3d_up
1453 TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
1454 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
1455 TYPE(mp_request_type), DIMENSION(4) :: req
1456
1457 num_threads = 1
1458 my_id = 0
1459
1460 CALL rs_grid_zero(rs)
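! clear the local grid (interior and halo); the interior is filled by the redistribution below, the halo by the wing exchange afterwards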
1461
1462 ! This is the real redistribution
1463
1464 ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
1465
1466 DO i = 0, pw%pw_grid%para%group%num_pe - 1
1467 bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1468 bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1469 bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1470 bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1471 END DO
1472
1473 ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1474 ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1475 ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
1476 ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1477 ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1478 ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
1479
1480 send_tasks = 0
1481 send_tasks(:, 1) = 1
1482 send_tasks(:, 2) = 0
1483 send_tasks(:, 3) = 1
1484 send_tasks(:, 4) = 0
1485 send_tasks(:, 5) = 1
1486 send_tasks(:, 6) = 0
1487 send_sizes = 0
1488
1489 recv_tasks = 0
1490 recv_tasks(:, 1) = 1
1491 recv_tasks(:, 2) = 0
1492 recv_tasks(:, 3) = 1
1493 recv_tasks(:, 4) = 0
1494 recv_tasks(:, 5) = 1
1495 recv_tasks(:, 6) = 0
1496 recv_sizes = 0
1497
1498 my_rs_rank = rs%desc%my_pos
1499 my_pw_rank = pw%pw_grid%para%group%mepos
1500
1501 ! find the processors that should hold our data
1502 ! should be part of the rs grid type
1503 ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1504 ! do the recv and send tasks in two separate loops which will
1505 ! load balance better for OpenMP with large numbers of MPI tasks
1506
1507 ! this is the reverse of rs2pw: what were the sends are now the recvs
1508
1509!$OMP PARALLEL DO DEFAULT(NONE), &
1510!$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1511!$OMP SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
1512 DO i = 0, pw%pw_grid%para%group%num_pe - 1
1513
1514 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1515 !calculate the real rs grid points on each processor
1516 !coords is the part of the grid that rank i actually holds
1517 DO idir = 1, 3
1518 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1519 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1520 lb_send(idir) = pos(1)
1521 ub_send(idir) = pos(2)
1522 END DO
1523
1524 IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
1525 IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
1526 IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle
1527 IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
1528
1529 send_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
1530 send_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
1531 send_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
1532 send_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
1533 send_tasks(i, 5) = lb_send(3)
1534 send_tasks(i, 6) = ub_send(3)
1535 send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
1536 (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
1537
1538 END DO
1539!$OMP END PARALLEL DO
1540
1541 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1542 DO idir = 1, 3
1543 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1544 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1545 lb_send(idir) = pos(1)
1546 ub_send(idir) = pos(2)
1547 END DO
1548
1549 lb_recv(:) = lb_send(:)
1550 ub_recv(:) = ub_send(:)
1551
1552!$OMP PARALLEL DO DEFAULT(NONE), &
1553!$OMP SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
1554 DO j = 0, pw%pw_grid%para%group%num_pe - 1
1555
1556 IF (ub_send(1) .LT. bounds(j, 1)) cycle
1557 IF (lb_send(1) .GT. bounds(j, 2)) cycle
1558 IF (ub_send(2) .LT. bounds(j, 3)) cycle
1559 IF (lb_send(2) .GT. bounds(j, 4)) cycle
1560
1561 recv_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
1562 recv_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
1563 recv_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
1564 recv_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
1565 recv_tasks(j, 5) = lb_send(3)
1566 recv_tasks(j, 6) = ub_send(3)
1567 recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
1568 (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
1569
1570 END DO
1571!$OMP END PARALLEL DO
1572
1573 send_disps(0) = 0
1574 recv_disps(0) = 0
1575 DO i = 1, pw%pw_grid%para%group%num_pe - 1
1576 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1577 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1578 END DO
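 ! The loop above is the standard exclusive prefix sum that turns per-rank counts into
 ! offsets into a single packed buffer (the layout an alltoallv-style exchange would use).
 ! A small worked example with assumed counts for three ranks:
 !
 !    send_sizes = (/ 3, 0, 5 /)   ->   send_disps = (/ 0, 3, 3 /)
 !    i.e. rank 0 occupies buffer elements 1..3, rank 1 sends nothing, rank 2 occupies 4..8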
1579
1580 cpassert(sum(recv_sizes) == product(ub_recv - lb_recv + 1))
1581
1582 ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1583 ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1584
1585 DO i = 0, rs%desc%group_size - 1
1586 IF (send_sizes(i) .NE. 0) THEN
1587 ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1588 ELSE
1589 NULLIFY (send_bufs(i)%array)
1590 END IF
1591 IF (recv_sizes(i) .NE. 0) THEN
1592 ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1593 ELSE
1594 NULLIFY (recv_bufs(i)%array)
1595 END IF
1596 END DO
1597
1598 ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1599 recv_reqs = mp_request_null
1600
1601 DO i = 0, rs%desc%group_size - 1
1602 IF (recv_sizes(i) .NE. 0) THEN
1603 CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1604 END IF
1605 END DO
1606
1607 ! do packing
1608!$OMP PARALLEL DO DEFAULT(NONE), &
1609!$OMP PRIVATE(k,z,y,x), &
1610!$OMP SHARED(pw,rs,send_tasks,send_bufs,send_disps)
1611 DO i = 0, rs%desc%group_size - 1
1612 k = 0
1613 DO z = send_tasks(i, 5), send_tasks(i, 6)
1614 DO y = send_tasks(i, 3), send_tasks(i, 4)
1615 DO x = send_tasks(i, 1), send_tasks(i, 2)
1616 k = k + 1
1617 send_bufs(i)%array(k) = pw%array(x, y, z)
1618 END DO
1619 END DO
1620 END DO
1621 END DO
1622!$OMP END PARALLEL DO
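 ! The packing loop runs x fastest, then y, then z, matching Fortran's column-major
 ! storage of pw%array, so the receiver can unpack with the same triple loop.
 ! A tiny worked example for a task covering x = 1..2, y = 1..2, z = 1 (assumed values):
 !
 !    buffer order: (1,1,1), (2,1,1), (1,2,1), (2,2,1)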
1623
1624 ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1625 send_reqs = mp_request_null
1626
1627 DO i = 0, rs%desc%group_size - 1
1628 IF (send_sizes(i) .NE. 0) THEN
1629 CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1630 END IF
1631 END DO
1632
1633 ! do unpacking
1634 ! no OMP here so we can unpack each message as it arrives
1635
1636 DO i = 0, rs%desc%group_size - 1
1637 IF (recv_sizes(i) .EQ. 0) cycle
1638
1639 CALL mp_waitany(recv_reqs, completed)
1640 k = 0
1641 DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1642 DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1643 DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1644 k = k + 1
1645 rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
1646 END DO
1647 END DO
1648 END DO
1649 END DO
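 ! The loop above is the usual "unpack messages as they arrive" pattern; schematically
 ! (unpack_from is a hypothetical placeholder, not a routine of this module):
 !
 !    DO i = 1, n_posted_receives
 !       CALL mp_waitany(recv_reqs, completed)        ! blocks until any request finishes
 !       CALL unpack_from(recv_bufs(completed - 1))   ! 1-based request index -> 0-based rank
 !    END DO
 !
 ! The CYCLE on empty messages keeps the number of mp_waitany calls equal to the number
 ! of receives actually posted.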
1650
1651 CALL mp_waitall(send_reqs)
1652
1653 DEALLOCATE (recv_reqs)
1654 DEALLOCATE (send_reqs)
1655
1656 DO i = 0, rs%desc%group_size - 1
1657 IF (ASSOCIATED(send_bufs(i)%array)) THEN
1658 DEALLOCATE (send_bufs(i)%array)
1659 END IF
1660 IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1661 DEALLOCATE (recv_bufs(i)%array)
1662 END IF
1663 END DO
1664
1665 DEALLOCATE (send_bufs)
1666 DEALLOCATE (recv_bufs)
1667 DEALLOCATE (send_tasks)
1668 DEALLOCATE (send_sizes)
1669 DEALLOCATE (send_disps)
1670 DEALLOCATE (recv_tasks)
1671 DEALLOCATE (recv_sizes)
1672 DEALLOCATE (recv_disps)
1673
1674 ! now pass wings around
1675 halo_swapped = .false.
1676
1677 DO idir = 1, 3
1678
1679 IF (rs%desc%perd(idir) /= 1) THEN
1680
1681 ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
1682 ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
1683 ushifts = 0
1684 dshifts = 0
1685
1686 DO n_shifts = 1, rs%desc%neighbours(idir)
1687
1688 ! need to take into account the possibly varying widths of neighbouring cells:
1689 ! ushifts and dshifts accumulate the actual widths of the cells above/below this one
1690
1691 position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1692 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1693 dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1694
1695 position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1696 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1697 ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
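 ! dshifts(n) / ushifts(n) are the accumulated widths of the n cells below / above this
 ! one, e.g. with assumed neighbour widths 6, 5, 5 going downwards:
 !
 !    dshifts(0:3) = (/ 0, 6, 11, 16 /)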
1698
1699 ! The border data has to be sent to / received from the neighbours
1700 ! First we calculate the source and destination processes for the shift
1701 ! The first shift is "downwards"
1702
1703 CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1704
1705 lb_send_down(:) = rs%lb_local(:)
1706 ub_send_down(:) = rs%ub_local(:)
1707 lb_recv_down(:) = rs%lb_local(:)
1708 ub_recv_down(:) = rs%ub_local(:)
1709
1710 IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1711 lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
1712 ub_send_down(idir) = min(ub_send_down(idir) - rs%desc%border, &
1713 lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1714
1715 lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1716 ub_recv_down(idir) = min(ub_recv_down(idir), &
1717 ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
1718 ELSE
1719 lb_send_down(idir) = 0
1720 ub_send_down(idir) = -1
1721 lb_recv_down(idir) = 0
1722 ub_recv_down(idir) = -1
1723 END IF
1724
1725 DO i = 1, 3
1726 IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1727 lb_send_down(i) = rs%lb_real(i)
1728 ub_send_down(i) = rs%ub_real(i)
1729 lb_recv_down(i) = rs%lb_real(i)
1730 ub_recv_down(i) = rs%ub_real(i)
1731 END IF
1732 END DO
1733
1734 ! allocate the recv buffer
1735 nn = product(ub_recv_down - lb_recv_down + 1)
1736 ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1737 lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1738
1739 ! recv buffer is now ready, so post the receive
1740 CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1741
1742 ! now allocate, pack and send the send buffer
1743 nn = product(ub_send_down - lb_send_down + 1)
1744 ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1745 lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1746
1747!$OMP PARALLEL DEFAULT(NONE), &
1748!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1749!$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1750!$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1751!$ my_id = omp_get_thread_num()
1752 IF (my_id < num_threads) THEN
1753 lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1754 ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1755
1756 send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1757 lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1758 lb_send_down(2):ub_send_down(2), lb:ub)
1759 END IF
1760!$OMP END PARALLEL
1761
1762 CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1763
1764 ! Now for the other direction
1765
1766 CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1767
1768 lb_send_up(:) = rs%lb_local(:)
1769 ub_send_up(:) = rs%ub_local(:)
1770 lb_recv_up(:) = rs%lb_local(:)
1771 ub_recv_up(:) = rs%ub_local(:)
1772
1773 IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1774 ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
1775 lb_send_up(idir) = max(lb_send_up(idir) + rs%desc%border, &
1776 ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1777
1778 ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1779 lb_recv_up(idir) = max(lb_recv_up(idir), &
1780 lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
1781 ELSE
1782 lb_send_up(idir) = 0
1783 ub_send_up(idir) = -1
1784 lb_recv_up(idir) = 0
1785 ub_recv_up(idir) = -1
1786 END IF
1787
1788 DO i = 1, 3
1789 IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1790 lb_send_up(i) = rs%lb_real(i)
1791 ub_send_up(i) = rs%ub_real(i)
1792 lb_recv_up(i) = rs%lb_real(i)
1793 ub_recv_up(i) = rs%ub_real(i)
1794 END IF
1795 END DO
1796
1797 ! allocate the recv buffer
1798 nn = product(ub_recv_up - lb_recv_up + 1)
1799 ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1800 lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1801
1802 ! recv buffer is now ready, so post the receive
1803
1804 CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1805
1806 ! now allocate, pack and send the send buffer
1807 nn = product(ub_send_up - lb_send_up + 1)
1808 ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1809 lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1810
1811!$OMP PARALLEL DEFAULT(NONE), &
1812!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1813!$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1814!$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1815!$ my_id = omp_get_thread_num()
1816 IF (my_id < num_threads) THEN
1817 lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1818 ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1819
1820 send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1821 lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1822 lb_send_up(2):ub_send_up(2), lb:ub)
1823 END IF
1824!$OMP END PARALLEL
1825
1826 CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1827
1828 ! wait for a recv to complete, then we can unpack
1829
1830 DO i = 1, 2
1831
1832 CALL mp_waitany(req(1:2), completed)
1833
1834 IF (completed .EQ. 1) THEN
1835
1836 ! only some procs may need later shifts
1837 IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1838
1839 ! Copy the data into the RS grid
1840!$OMP PARALLEL DEFAULT(NONE), &
1841!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1842!$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1843!$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1844!$ my_id = omp_get_thread_num()
1845 IF (my_id < num_threads) THEN
1846 lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1847 ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1848
1849 rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
1850 lb:ub) = recv_buf_3d_down(:, :, lb:ub)
1851 END IF
1852!$OMP END PARALLEL
1853 END IF
1854
1855 DEALLOCATE (recv_buf_3d_down)
1856 ELSE
1857
1858 ! only some procs may need later shifts
1859 IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1860
1861 ! Copy the data into the RS grid
1862!$OMP PARALLEL DEFAULT(NONE), &
1863!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1864!$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1865!$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1866!$ my_id = omp_get_thread_num()
1867 IF (my_id < num_threads) THEN
1868 lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1869 ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1870
1871 rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
1872 lb:ub) = recv_buf_3d_up(:, :, lb:ub)
1873 END IF
1874!$OMP END PARALLEL
1875 END IF
1876
1877 DEALLOCATE (recv_buf_3d_up)
1878 END IF
1879 END DO
1880
1881 CALL mp_waitall(req(3:4))
1882
1883 DEALLOCATE (send_buf_3d_down)
1884 DEALLOCATE (send_buf_3d_up)
1885 END DO
1886
1887 DEALLOCATE (ushifts)
1888 DEALLOCATE (dshifts)
1889 END IF
1890
1891 halo_swapped(idir) = .true.
1892
1893 END DO
1894
1895 END SUBROUTINE transfer_pw2rs_distributed
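! A minimal usage sketch of the public transfer pair, assuming rs and pw were set up on
! compatible grids elsewhere (rs_grid_create_descriptor, rs_grid_create and the usual
! pw_grid machinery):
!
!    CALL transfer_pw2rs(rs, pw)   ! bring the pw data onto the rs grid, halos filled
!    !  ... real-space work on rs%r ...
!    CALL transfer_rs2pw(rs, pw)   ! fold the rs data, halo contributions included, back onto pw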
1896
1897! **************************************************************************************************
1898!> \brief Initialize grid to zero
1899!> \param rs ...
1900!> \par History
1901!> none
1902!> \author JGH (23-Mar-2002)
1903! **************************************************************************************************
1904 SUBROUTINE rs_grid_zero(rs)
1905
1906 TYPE(realspace_grid_type), INTENT(IN) :: rs
1907
1908 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_zero'
1909
1910 INTEGER :: handle, i, j, k, l(3), u(3)
1911
1912 CALL timeset(routinen, handle)
1913 l(1) = lbound(rs%r, 1); l(2) = lbound(rs%r, 2); l(3) = lbound(rs%r, 3)
1914 u(1) = ubound(rs%r, 1); u(2) = ubound(rs%r, 2); u(3) = ubound(rs%r, 3)
1915!$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1916!$OMP PRIVATE(i,j,k) &
1917!$OMP SHARED(rs,l,u)
1918 DO k = l(3), u(3)
1919 DO j = l(2), u(2)
1920 DO i = l(1), u(1)
1921 rs%r(i, j, k) = 0.0_dp
1922 END DO
1923 END DO
1924 END DO
1925!$OMP END PARALLEL DO
1926 CALL timestop(handle)
1927
1928 END SUBROUTINE rs_grid_zero
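! In effect this is the array assignment rs%r(:, :, :) = 0.0_dp written as an explicit
! loop, presumably so the zeroing is threaded by the OpenMP directive above and the
! memory is first touched by the threads that later work on it.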
1929
1930! **************************************************************************************************
1931 !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
1932!> \param rs1 ...
1933!> \param rs2 ...
1934!> \param rs3 ...
1935!> \param scalar ...
1936!> \par History
1937!> none
1938!> \author
1939! **************************************************************************************************
1940 SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
1941
1942 TYPE(realspace_grid_type), INTENT(IN) :: rs1, rs2, rs3
1943 REAL(dp), INTENT(IN) :: scalar
1944
1945 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_mult_and_add'
1946
1947 INTEGER :: handle, i, j, k, l(3), u(3)
1948
1949!-----------------------------------------------------------------------------!
1950
1951 CALL timeset(routinen, handle)
1952 IF (scalar .NE. 0.0_dp) THEN
1953 l(1) = lbound(rs1%r, 1); l(2) = lbound(rs1%r, 2); l(3) = lbound(rs1%r, 3)
1954 u(1) = ubound(rs1%r, 1); u(2) = ubound(rs1%r, 2); u(3) = ubound(rs1%r, 3)
1955!$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1956!$OMP PRIVATE(i,j,k) &
1957!$OMP SHARED(rs1,rs2,rs3,scalar,l,u)
1958 DO k = l(3), u(3)
1959 DO j = l(2), u(2)
1960 DO i = l(1), u(1)
1961 rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
1962 END DO
1963 END DO
1964 END DO
1965!$OMP END PARALLEL DO
1966 END IF
1967 CALL timestop(handle)
1968 END SUBROUTINE rs_grid_mult_and_add
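! A minimal usage sketch with hypothetical grid names, accumulating
! rs_v <- rs_v + 0.5 * rs_rho * rs_pot on grids with matching local bounds:
!
!    CALL rs_grid_mult_and_add(rs_v, rs_rho, rs_pot, 0.5_dp)
!
! A scalar of exactly 0.0_dp returns without touching rs1.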
1969
1970! **************************************************************************************************
1971!> \brief Set box matrix info for real space grid
1972!> This is needed for variable cell simulations
1973!> \param pw_grid ...
1974!> \param rs ...
1975!> \par History
1976!> none
1977!> \author JGH (15-May-2007)
1978! **************************************************************************************************
1979 SUBROUTINE rs_grid_set_box(pw_grid, rs)
1980
1981 TYPE(pw_grid_type), INTENT(IN), TARGET :: pw_grid
1982 TYPE(realspace_grid_type), INTENT(IN) :: rs
1983
1984 cpassert(ASSOCIATED(rs%desc%pw, pw_grid))
1985 rs%desc%dh = pw_grid%dh
1986 rs%desc%dh_inv = pw_grid%dh_inv
1987
1988 END SUBROUTINE rs_grid_set_box
1989
1990! **************************************************************************************************
1991!> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
1992!> \param rs_desc the grid descriptor to retain
1993!> \par History
1994!> 04.2009 created [Iain Bethune]
1995!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
1996! **************************************************************************************************
1997 SUBROUTINE rs_grid_retain_descriptor(rs_desc)
1998 TYPE(realspace_grid_desc_type), INTENT(INOUT) :: rs_desc
1999
2000 cpassert(rs_desc%ref_count > 0)
2001 rs_desc%ref_count = rs_desc%ref_count + 1
2002 END SUBROUTINE rs_grid_retain_descriptor
2003
2004! **************************************************************************************************
2005!> \brief releases the given rs grid (see doc/ReferenceCounting.html)
2006!> \param rs_grid the rs grid to release
2007!> \par History
2008!> 03.2003 created [fawzi]
2009!> \author fawzi
2010! **************************************************************************************************
2011 SUBROUTINE rs_grid_release(rs_grid)
2012 TYPE(realspace_grid_type), INTENT(INOUT) :: rs_grid
2013
2014 CALL rs_grid_release_descriptor(rs_grid%desc)
2015
2016 CALL offload_free_buffer(rs_grid%buffer)
2017 NULLIFY (rs_grid%r)
2018
2019 IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
2020 IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
2021 IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
2022 END SUBROUTINE rs_grid_release
2023
2024! **************************************************************************************************
2025!> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
2026!> \param rs_desc the rs grid descriptor to release
2027!> \par History
2028!> 04.2009 created [Iain Bethune]
2029!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2030! **************************************************************************************************
2031 SUBROUTINE rs_grid_release_descriptor(rs_desc)
2032 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2033
2034 IF (ASSOCIATED(rs_desc)) THEN
2035 cpassert(rs_desc%ref_count > 0)
2036 rs_desc%ref_count = rs_desc%ref_count - 1
2037 IF (rs_desc%ref_count == 0) THEN
2038
2039 CALL pw_grid_release(rs_desc%pw)
2040
2041 IF (rs_desc%parallel) THEN
2042 ! release the group communicator
2043 CALL rs_desc%group%free()
2044
2045 DEALLOCATE (rs_desc%virtual2real)
2046 DEALLOCATE (rs_desc%real2virtual)
2047 END IF
2048
2049 IF (rs_desc%distributed) THEN
2050 DEALLOCATE (rs_desc%rank2coord)
2051 DEALLOCATE (rs_desc%coord2rank)
2052 DEALLOCATE (rs_desc%lb_global)
2053 DEALLOCATE (rs_desc%ub_global)
2054 DEALLOCATE (rs_desc%x2coord)
2055 DEALLOCATE (rs_desc%y2coord)
2056 DEALLOCATE (rs_desc%z2coord)
2057 END IF
2058
2059 DEALLOCATE (rs_desc)
2060 END IF
2061 END IF
2062 NULLIFY (rs_desc)
2063 END SUBROUTINE rs_grid_release_descriptor
2064
2065! **************************************************************************************************
2066!> \brief emulates the function of an MPI_cart_shift operation, but the shift is
2067!> done in virtual coordinates, and the corresponding real ranks are returned
2068!> \param rs_grid ...
2069!> \param dir ...
2070!> \param disp ...
2071!> \param source ...
2072!> \param dest ...
2073!> \par History
2074!> 04.2009 created [Iain Bethune]
2075!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2076! **************************************************************************************************
2077 PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
2078
2079 TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2080 INTEGER, INTENT(IN) :: dir, disp
2081 INTEGER, INTENT(OUT) :: source, dest
2082
2083 INTEGER, DIMENSION(3) :: shift_coords
2084
2085 shift_coords = rs_grid%desc%virtual_group_coor
2086 shift_coords(dir) = modulo(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
2087 dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2088 shift_coords = rs_grid%desc%virtual_group_coor
2089 shift_coords(dir) = modulo(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
2090 source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2091
2092 END SUBROUTINE
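! Worked example of the wrap-around arithmetic, with assumed values
! group_dim(dir) = 4, virtual_group_coor(dir) = 0 and disp = -1:
!
!    dest coordinate   = MODULO(0 + (-1), 4) = 3
!    source coordinate = MODULO(0 - (-1), 4) = 1
!
! Both coordinates are then translated to real ranks via coord2rank and virtual2real.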
2093
2094! **************************************************************************************************
2095!> \brief returns the maximum number of points in the local grid of any process
2096!> to account for the case where the grid may later be reordered
2097!> \param desc ...
2098!> \return ...
2099!> \par History
2100!> 10.2011 created [Iain Bethune]
2101! **************************************************************************************************
2102 FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
2103 TYPE(realspace_grid_desc_type), INTENT(IN) :: desc
2104 INTEGER :: max_ngpts
2105
2106 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_max_ngpts'
2107
2108 INTEGER :: handle, i
2109 INTEGER, DIMENSION(3) :: lb, ub
2110
2111 CALL timeset(routinen, handle)
2112
2113 max_ngpts = 0
2114 IF ((desc%pw%para%mode == pw_mode_local) .OR. &
2115 (all(desc%group_dim == 1))) THEN
2116 cpassert(product(int(desc%npts, kind=int_8)) < huge(1))
2117 max_ngpts = product(desc%npts)
2118 ELSE
2119 DO i = 0, desc%group_size - 1
2120 lb = desc%lb_global(:, i)
2121 ub = desc%ub_global(:, i)
2122 lb = lb - desc%border*(1 - desc%perd)
2123 ub = ub + desc%border*(1 - desc%perd)
2124 cpassert(product(int(ub - lb + 1, kind=int_8)) < huge(1))
2125 max_ngpts = max(max_ngpts, product(ub - lb + 1))
2126 END DO
2127 END IF
2128
2129 CALL timestop(handle)
2130
2131 END FUNCTION rs_grid_max_ngpts
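! The border expansion above only affects directions with perd /= 1, since
! border*(1 - perd) vanishes for perd = 1. Worked example with assumed values
! border = 10, lb_global = 1, ub_global = 40 and perd = 0 in one direction:
!
!    lb = 1 - 10 = -9,  ub = 40 + 10 = 50   ->   ub - lb + 1 = 60 points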
2132
2133! **************************************************************************************************
2134 !> \brief Decides whether this process should map a Gaussian centred at ra
2135!> \param rs_grid ...
2136!> \param h_inv ...
2137!> \param ra ...
2138!> \param offset ...
2139!> \param group_size ...
2140!> \param my_pos ...
2141!> \return ...
2142! **************************************************************************************************
2143 PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
2144 TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2145 REAL(kind=dp), DIMENSION(3, 3), INTENT(IN) :: h_inv
2146 REAL(kind=dp), DIMENSION(3), INTENT(IN) :: ra
2147 INTEGER, INTENT(IN), OPTIONAL :: offset, group_size, my_pos
2148
2149 INTEGER :: dir, lb(3), location(3), tp(3), ub(3)
2150
2151 res = .false.
2152
2153 IF (.NOT. all(rs_grid%desc%perd == 1)) THEN
2154 DO dir = 1, 3
2155 ! bounds of the local grid with the halo ('wings') removed; only directions with perd /= 1 carry a halo
2156 tp(dir) = floor(dot_product(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
2157 tp(dir) = modulo(tp(dir), rs_grid%desc%npts(dir))
2158 IF (rs_grid%desc%perd(dir) .NE. 1) THEN
2159 lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
2160 ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
2161 ELSE
2162 lb(dir) = rs_grid%lb_local(dir)
2163 ub(dir) = rs_grid%ub_local(dir)
2164 END IF
2165 ! distributed grid, only map if it is local to the grid
2166 location(dir) = tp(dir) + rs_grid%desc%lb(dir)
2167 END DO
2168 IF (all(lb(:) <= location(:)) .AND. all(location(:) <= ub(:))) THEN
2169 res = .true.
2170 END IF
2171 ELSE
2172 IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
2173 ! not distributed, just a round-robin distribution over the full set of CPUs
2174 IF (modulo(offset, group_size) == my_pos) res = .true.
2175 END IF
2176 END IF
2177
2178 END FUNCTION map_gaussian_here
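! A minimal usage sketch with hypothetical values (h_inv the inverse cell matrix, ra the
! Gaussian centre, iatom used as the round-robin offset):
!
!    IF (map_gaussian_here(rs, h_inv, ra, offset=iatom, &
!                          group_size=rs%desc%group_size, my_pos=rs%desc%my_pos)) THEN
!       ! ... collocate this Gaussian on rs%r ...
!    END IF
!
! On a grid distributed in any direction the decision is ownership of the grid cell
! containing ra; on a fully replicated grid it is a simple round-robin over offset.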
2179
2180END MODULE realspace_grid_types