realspace_grid_types.F (git:374b731)
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> Basic type for real space grid methods
11!> \par History
12!> JGH (22-May-2002) : New routine rs_grid_zero
13!> JGH (12-Jun-2002) : Bug fix for mpi groups
14!> JGH (19-Jun-2003) : Added routine for task distribution
15!> JGH (23-Nov-2003) : Added routine for task loop separation
16!> \author JGH (18-Mar-2001)
17! **************************************************************************************************
21 USE kahan_sum, ONLY: accurate_sum
22 USE kinds, ONLY: dp,&
23 int_8
24 USE machine, ONLY: m_memory
25 USE mathlib, ONLY: det_3x3
26 USE message_passing, ONLY: mp_comm_null,&
35 USE pw_grid_types, ONLY: pw_mode_local,&
37 USE pw_grids, ONLY: pw_grid_release,&
40 USE pw_types, ONLY: pw_r3d_rs_type
41 USE util, ONLY: get_limit
42
43!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
44
45#include "../base/base_uses.f90"
46
47 IMPLICIT NONE
48
49 PRIVATE
50 PUBLIC :: realspace_grid_type, &
55
56 PUBLIC :: transfer_rs2pw, &
71
72 INTEGER, PARAMETER, PUBLIC :: rsgrid_distributed = 0, &
75
76 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
77 CHARACTER(len=*), PARAMETER, PRIVATE :: modulen = 'realspace_grid_types'
78
79! **************************************************************************************************
81 INTEGER :: distribution_type = rsgrid_replicated
82 INTEGER :: distribution_layout(3) = -1
83 REAL(kind=dp) :: memory_factor = 0.0_dp
84 LOGICAL :: lock_distribution = .false.
85 INTEGER :: nsmax = -1
86 REAL(kind=dp) :: halo_reduction_factor = 1.0_dp
88
89! **************************************************************************************************
91 TYPE(pw_grid_type), POINTER :: pw => null() ! the pw grid
92
93 INTEGER :: ref_count = 0 ! reference count
94
95 INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
96 INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
97 INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
98 INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
99
100 INTEGER :: border = 0 ! border points
101
102 INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
103 REAL(kind=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
104 REAL(kind=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
105 LOGICAL :: orthorhombic = .true. ! grid symmetry
106
107 LOGICAL :: parallel = .true. ! whether the corresponding pw grid is distributed
108 LOGICAL :: distributed = .true. ! whether the rs grid is distributed
109 ! these MPI related quantities are only meaningful depending on how the grid has been laid out
110 ! they are most useful for fully distributed grids, where they reflect the topology of the grid
111 TYPE(mp_comm_type) :: group = mp_comm_null
112 INTEGER :: my_pos = -1
113 LOGICAL :: group_head = .false.
114 INTEGER :: group_size = 0
115 INTEGER, DIMENSION(3) :: group_dim = -1
116 INTEGER, DIMENSION(3) :: group_coor = -1
117 INTEGER, DIMENSION(3) :: neighbours = -1
118 ! only meaningful on distributed grids
119 ! a list of bounds for each CPU
120 INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
121 INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
122 ! a mapping from linear rank to 3d coord
123 INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
124 INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
125 ! a mapping from grid index to process coordinate (which makes it easy to figure out on which rank a given grid point resides)
126 INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
127 INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
128 INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
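      ! illustrative note (added, not part of the original code): combining the maps above,
      ! the rank owning grid point (ix, iy, iz) can be obtained as
      !    coord2rank(x2coord(ix), y2coord(iy), z2coord(iz))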
129
130 INTEGER :: my_virtual_pos = -1
131 INTEGER, DIMENSION(3) :: virtual_group_coor = -1
132
133 INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
134
136
138
139 TYPE(realspace_grid_desc_type), POINTER :: desc => null()
140
141 INTEGER :: ngpts_local = -1 ! local dimensions
142 INTEGER, DIMENSION(3) :: npts_local = -1
143 INTEGER, DIMENSION(3) :: lb_local = -1
144 INTEGER, DIMENSION(3) :: ub_local = -1
145 INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
146 INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
147
148 INTEGER, DIMENSION(:), ALLOCATABLE :: px, py, pz ! index translators
149 TYPE(offload_buffer_type) :: buffer = offload_buffer_type() ! owner of the grid's memory
150 REAL(kind=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => null() ! the grid (pointer to buffer%host_buffer)
151
152 END TYPE realspace_grid_type
153
154! **************************************************************************************************
156 TYPE(realspace_grid_type), POINTER :: rs_grid => null()
157 END TYPE realspace_grid_p_type
158
160 TYPE(realspace_grid_desc_type), POINTER :: rs_desc => null()
162
163CONTAINS
164
165! **************************************************************************************************
166!> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
167!> only possible if rs_grid is a distributed grid
168!> \param rs_desc ...
169!> \param rank_in ...
170!> \param shift ...
171!> \return ...
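!> \par Example
!>      illustrative sketch (not from the original source): the rank of the neighbour
!>      one step along +z of the calling process could be obtained as
!>        up = rs_grid_locate_rank(rs_desc, rs_desc%my_pos, (/0, 0, 1/))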
172! **************************************************************************************************
173 PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
174 TYPE(realspace_grid_desc_type), INTENT(IN) :: rs_desc
175 INTEGER, INTENT(IN) :: rank_in
176 INTEGER, DIMENSION(3), INTENT(IN) :: shift
177 INTEGER :: rank_out
178
179 INTEGER :: coord(3)
180
181 coord = modulo(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
182 rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
183 END FUNCTION rs_grid_locate_rank
184
185! **************************************************************************************************
186!> \brief Determine the setup of real space grids - this is divided up into the
187!> creation of a descriptor and the actual grid itself (see rs_grid_create)
188!> \param desc ...
189!> \param pw_grid ...
190!> \param input_settings ...
191!> \param border_points ...
192!> \par History
193!> JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
194!> Iain Bethune (05-Sep-2008) : modified cut heuristic
195!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
196!> - Create a descriptor for realspace grids with a number of border
197!> points as exactly given by the optional argument border_points.
198!> These grids are always distributed.
199!> (27.11.2013, Matthias Krack)
200!> \author JGH (18-Mar-2001)
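!> \par Typical usage
!>      a minimal sketch, assuming pw_grid and input_settings have been set up by the
!>      caller (not part of the original source):
!>        TYPE(realspace_grid_desc_type), POINTER :: desc
!>        TYPE(realspace_grid_type) :: rs
!>        NULLIFY (desc)
!>        CALL rs_grid_create_descriptor(desc, pw_grid, input_settings)
!>        CALL rs_grid_create(rs, desc)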
201! **************************************************************************************************
202 SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
203 TYPE(realspace_grid_desc_type), POINTER :: desc
204 TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
205 TYPE(realspace_grid_input_type), INTENT(IN) :: input_settings
206 INTEGER, INTENT(IN), OPTIONAL :: border_points
207
208 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_grid_create_descriptor'
209
210 INTEGER :: border_size, dir, handle, i, j, k, l, &
211 lb(2), min_npts_real, n_slices(3), &
212 n_slices_tmp(3), nmin
213 LOGICAL :: overlap
214 REAL(kind=dp) :: ratio, ratio_best, volume, volume_dist
215
216 CALL timeset(routinen, handle)
217
218 IF (PRESENT(border_points)) THEN
219 border_size = border_points
220 ELSE
221 border_size = 0
222 END IF
223
224 ALLOCATE (desc)
225
226 CALL pw_grid%para%group%sync()
227
228 desc%pw => pw_grid
229 CALL pw_grid_retain(desc%pw)
230
231 desc%dh = pw_grid%dh
232 desc%dh_inv = pw_grid%dh_inv
233 desc%orthorhombic = pw_grid%orthorhombic
234 desc%ref_count = 1
235
236 IF (pw_grid%para%mode == pw_mode_local) THEN
237 ! The corresponding group has dimension 1
238 ! All operations will be done locally
239 desc%npts = pw_grid%npts
240 desc%ngpts = product(int(desc%npts, kind=int_8))
241 desc%lb = pw_grid%bounds(1, :)
242 desc%ub = pw_grid%bounds(2, :)
243 desc%border = border_size
244 IF (border_size == 0) THEN
245 desc%perd = 1
246 ELSE
247 desc%perd = 0
248 END IF
249 desc%parallel = .false.
250 desc%distributed = .false.
251 desc%group = mp_comm_null
252 desc%group_size = 1
253 desc%group_head = .true.
254 desc%group_dim = 1
255 desc%group_coor = 0
256 desc%my_pos = 0
257 ELSE
258 ! group size of desc grid
259 ! global grid dimensions are still the same
260 desc%group_size = pw_grid%para%group_size
261 desc%npts = pw_grid%npts
262 desc%ngpts = product(int(desc%npts, kind=int_8))
263 desc%lb = pw_grid%bounds(1, :)
264 desc%ub = pw_grid%bounds(2, :)
265
266 ! this is the eventual border size
267 IF (border_size == 0) THEN
268 nmin = (input_settings%nsmax + 1)/2
269 nmin = max(0, nint(nmin*input_settings%halo_reduction_factor))
270 ELSE
271 ! Set explicitly the requested border size
272 nmin = border_size
273 END IF
274
275 IF (input_settings%distribution_type == rsgrid_replicated) THEN
276
277 n_slices = 1
278 IF (border_size > 0) THEN
279 CALL cp_abort(__location__, &
280 "An explicit border size > 0 is not yet working for "// &
281 "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
282 "distributed for RS_GRID explicitly.")
283 END IF
284
285 ELSE
286
287 n_slices = 1
288 ratio_best = -huge(ratio_best)
289
290 ! don't allow distributions with more processors than real grid points
291 DO k = 1, min(desc%npts(3), desc%group_size)
292 DO j = 1, min(desc%npts(2), desc%group_size)
293 i = min(desc%npts(1), desc%group_size/(j*k))
294 n_slices_tmp = (/i, j, k/)
295
296 ! skip if we don't match the actual number of CPUs
297 IF (product(n_slices_tmp) .NE. desc%group_size) cycle
298
299 ! we check whether there has been an input constraint
300 ! i.e. if the layout is not -1 we need to fulfil it
301 IF (.NOT. all(pack(n_slices_tmp == input_settings%distribution_layout, &
302 (/-1, -1, -1/) /= input_settings%distribution_layout) &
303 )) cycle
304
305 ! We cannot work with a grid that has more local than global grid points.
306 ! This can happen when a halo region wraps around and overlaps with the other halo.
307 overlap = .false.
308 DO dir = 1, 3
309 IF (n_slices_tmp(dir) > 1) THEN
310 DO l = 0, n_slices_tmp(dir) - 1
311 lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
312 IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .true.
313 END DO
314 END IF
315 END DO
316 IF (overlap) cycle
317
318 ! a heuristic optimisation to reduce the memory usage
319 ! we go for the smallest local to real volume
320 ! volume of the box without the wings / volume of the box with the wings
321 ! with prefactors to promote fewer cuts in the Z dimension
322 ratio = product(real(desc%npts, kind=dp)/n_slices_tmp)/ &
323 product(real(desc%npts, kind=dp)/n_slices_tmp + &
324 merge((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
325 IF (ratio > ratio_best) THEN
326 ratio_best = ratio
327 n_slices = n_slices_tmp
328 END IF
329
330 END DO
331 END DO
332
333 ! if automatic we can still decide this is a replicated grid
334 ! if the memory gain (or the gain in messages) is too small.
335 IF (input_settings%distribution_type == rsgrid_automatic) THEN
336 volume = product(real(desc%npts, kind=dp))
337 volume_dist = product(real(desc%npts, kind=dp)/n_slices + &
338 merge((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
339 IF (volume < volume_dist*input_settings%memory_factor) THEN
340 n_slices = 1
341 END IF
342 END IF
343
344 END IF
345
346 desc%group_dim(:) = n_slices(:)
347 CALL desc%group%from_dup(pw_grid%para%group)
348 desc%group_size = desc%group%num_pe
349 desc%my_pos = desc%group%mepos
350
351 IF (all(n_slices == 1)) THEN
352 ! CASE 1 : only one slice: we do not need overlapping regions and special
353 ! recombination of the total density
354 desc%border = border_size
355 IF (border_size == 0) THEN
356 desc%perd = 1
357 ELSE
358 desc%perd = 0
359 END IF
360 desc%distributed = .false.
361 desc%parallel = .true.
362 desc%group_head = pw_grid%para%group_head
363 desc%group_coor(:) = 0
364 desc%my_virtual_pos = 0
365
366 ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
367 ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
368 ! Start with no reordering
369 DO i = 0, desc%group_size - 1
370 desc%virtual2real(i) = i
371 desc%real2virtual(i) = i
372 END DO
373 ELSE
374 ! CASE 2 : general case
375 ! periodicity is no longer enforced in arbitrary directions
376 IF (border_size == 0) THEN
377 desc%perd = 1
378 DO dir = 1, 3
379 IF (n_slices(dir) > 1) desc%perd(dir) = 0
380 END DO
381 ELSE
382 desc%perd(:) = 0
383 END IF
384 ! we keep a border of nmin points
385 desc%border = nmin
386 ! we are going parallel on the real space grid
387 desc%parallel = .true.
388 desc%distributed = .true.
389 desc%group_head = (desc%my_pos == 0)
390
391 ! set up global info about the distribution
392 ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
393 ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
394 ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
395 ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
396 ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
397 ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
398 ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
399
400 DO i = 0, desc%group_size - 1
401 ! Calculate coordinates in a row-major order (to be SMP-friendly)
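 ! worked example (added note): for group_dim = (/2, 2, 2/) rank i = 5
 ! maps to coordinates (1, 0, 1) = (5/4, mod(5,4)/2, mod(5,2))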
402 desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
403 desc%rank2coord(2, i) = modulo(i, desc%group_dim(2)*desc%group_dim(3)) &
404 /desc%group_dim(3)
405 desc%rank2coord(3, i) = modulo(i, desc%group_dim(3))
406
407 IF (i == desc%my_pos) THEN
408 desc%group_coor = desc%rank2coord(:, i)
409 END IF
410
411 desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
412 ! the lb_global and ub_global correspond to lb_real and ub_real of each task
413 desc%lb_global(:, i) = desc%lb
414 desc%ub_global(:, i) = desc%ub
415 DO dir = 1, 3
416 IF (desc%group_dim(dir) .GT. 1) THEN
417 lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
418 desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
419 desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
420 END IF
421 END DO
422 END DO
423
424 ! map a grid point to a CPU coord
425 DO dir = 1, 3
426 DO l = 0, desc%group_dim(dir) - 1
427 IF (desc%group_dim(dir) .GT. 1) THEN
428 lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
429 lb = lb + desc%lb(dir) - 1
430 ELSE
431 lb(1) = desc%lb(dir)
432 lb(2) = desc%ub(dir)
433 END IF
434 SELECT CASE (dir)
435 CASE (1)
436 desc%x2coord(lb(1):lb(2)) = l
437 CASE (2)
438 desc%y2coord(lb(1):lb(2)) = l
439 CASE (3)
440 desc%z2coord(lb(1):lb(2)) = l
441 END SELECT
442 END DO
443 END DO
444
445 ! an upper bound for the number of neighbours the border is overlapping with
446 DO dir = 1, 3
447 desc%neighbours(dir) = 0
448 IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
449 min_npts_real = huge(0)
450 DO l = 0, n_slices(dir) - 1
451 lb = get_limit(desc%npts(dir), n_slices(dir), l)
452 min_npts_real = min(lb(2) - lb(1) + 1, min_npts_real)
453 END DO
454 desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
455 END IF
456 END DO
457
458 ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
459 ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
460 ! Start with no reordering
461 DO i = 0, desc%group_size - 1
462 desc%virtual2real(i) = i
463 desc%real2virtual(i) = i
464 END DO
465
466 desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
467 desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
468
469 END IF
470 END IF
471
472 CALL timestop(handle)
473
474 END SUBROUTINE rs_grid_create_descriptor
475
476! **************************************************************************************************
477!> \brief create a realspace grid for the given descriptor (allocate the data buffer and set the local bounds)
478!> \param rs ...
479!> \param desc ...
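!> \note illustrative remark (added, not part of the original source): after creation the
!>       local data can be addressed as rs%r(rs%lb_local(1):rs%ub_local(1), &
!>       rs%lb_local(2):rs%ub_local(2), rs%lb_local(3):rs%ub_local(3)), while the
!>       halo-free region is delimited by rs%lb_real and rs%ub_real.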
480! **************************************************************************************************
481 SUBROUTINE rs_grid_create(rs, desc)
482 TYPE(realspace_grid_type), INTENT(OUT) :: rs
483 TYPE(realspace_grid_desc_type), INTENT(INOUT), &
484 TARGET :: desc
485
486 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_grid_create'
487
488 INTEGER :: handle
489
490 CALL timeset(routinen, handle)
491
492 rs%desc => desc
493 CALL rs_grid_retain_descriptor(rs%desc)
494
495 IF (desc%pw%para%mode == pw_mode_local) THEN
496 ! The corresponding group has dimension 1
497 ! All operations will be done locally
498 rs%lb_real = desc%lb
499 rs%ub_real = desc%ub
500 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
501 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
502 rs%npts_local = rs%ub_local - rs%lb_local + 1
503 rs%ngpts_local = product(rs%npts_local)
504 END IF
505
506 IF (all(rs%desc%group_dim == 1)) THEN
507 ! CASE 1 : only one slice: we do not need overlapping regions and special
508 ! recombination of the total density
509 rs%lb_real = desc%lb
510 rs%ub_real = desc%ub
511 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
512 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
513 rs%npts_local = rs%ub_local - rs%lb_local + 1
514 rs%ngpts_local = product(rs%npts_local)
515 ELSE
516 ! CASE 2 : general case
517 ! extract some more derived quantities about the local grid
518 rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
519 rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
520 rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
521 rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
522 rs%npts_local = rs%ub_local - rs%lb_local + 1
523 rs%ngpts_local = product(rs%npts_local)
524 END IF
525
526 CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
527 rs%r(rs%lb_local(1):rs%ub_local(1), &
528 rs%lb_local(2):rs%ub_local(2), &
529 rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
530
531 ALLOCATE (rs%px(desc%npts(1)))
532 ALLOCATE (rs%py(desc%npts(2)))
533 ALLOCATE (rs%pz(desc%npts(3)))
534
535 CALL timestop(handle)
536
537 END SUBROUTINE rs_grid_create
538
539! **************************************************************************************************
540!> \brief Defines a new ordering of ranks on this realspace grid, recalculating
541!> the data bounds and reallocating the grid. As a result, each MPI process
542!> now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
543!> and a virtual rank (the rank of the process where the data now owned by this
544!> process would reside in an ordinary cartesian distribution).
545!> NB. Since the grid size required may change, the caller should be sure to release
546!> and recreate the corresponding rs_grids
547!> The desc%real2virtual and desc%virtual2real arrays can be used to map
548!> a physical rank to the 'rank' of data owned by that process and vice versa
549!> \param desc ...
550!> \param real2virtual ...
551!> \par History
552!> 04-2009 created [Iain Bethune]
553!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
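!> \par Example
!>      illustrative property (it follows from the code below): after the call the two maps
!>      are mutual inverses, i.e. desc%virtual2real(desc%real2virtual(i)) == i for every
!>      rank i in 0 .. desc%group_size - 1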
554! **************************************************************************************************
555 PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
556
557 TYPE(realspace_grid_desc_type), INTENT(INOUT) :: desc
558 INTEGER, DIMENSION(:), INTENT(IN) :: real2virtual
559
560 INTEGER :: i
561
562 desc%real2virtual(:) = real2virtual
563
564 DO i = 0, desc%group_size - 1
565 desc%virtual2real(desc%real2virtual(i)) = i
566 END DO
567
568 desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
569
570 IF (.NOT. all(desc%group_dim == 1)) THEN
571 desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
572 END IF
573
574 END SUBROUTINE
575
576! **************************************************************************************************
577!> \brief Print information on grids to output
578!> \param rs ...
579!> \param iounit ...
580!> \author JGH (17-May-2007)
581! **************************************************************************************************
582 SUBROUTINE rs_grid_print(rs, iounit)
583 TYPE(realspace_grid_type), INTENT(IN) :: rs
584 INTEGER, INTENT(in) :: iounit
585
586 INTEGER :: dir, i, nn
587 REAL(kind=dp) :: pp(3)
588
589 IF (rs%desc%parallel) THEN
590 IF (iounit > 0) THEN
591 WRITE (iounit, '(/,A,T71,I10)') &
592 " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
593 DO i = 1, 3
594 WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
595 i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
596 END DO
597 IF (.NOT. rs%desc%distributed) THEN
598 WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
599 WRITE (iounit, '(A,T71,I10)') &
600 " RS_GRID| Group size ", rs%desc%group_dim(2)
601 ELSE
602 DO dir = 1, 3
603 IF (rs%desc%perd(dir) /= 1) THEN
604 WRITE (iounit, '(A,T71,I3,A)') &
605 " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
606 WRITE (iounit, '(A,T71,I10)') &
607 " RS_GRID| Real space distribution along direction ", dir
608 WRITE (iounit, '(A,T71,I10)') &
609 " RS_GRID| Border size ", rs%desc%border
610 END IF
611 END DO
612 END IF
613 END IF
614 IF (rs%desc%distributed) THEN
615 DO dir = 1, 3
616 IF (rs%desc%perd(dir) /= 1) THEN
617 nn = rs%npts_local(dir)
618 CALL rs%desc%group%sum(nn)
619 pp(1) = real(nn, kind=dp)/real(product(rs%desc%group_dim), kind=dp)
620 nn = rs%npts_local(dir)
621 CALL rs%desc%group%max(nn)
622 pp(2) = real(nn, kind=dp)
623 nn = rs%npts_local(dir)
624 CALL rs%desc%group%min(nn)
625 pp(3) = real(nn, kind=dp)
626 IF (iounit > 0) THEN
627 WRITE (iounit, '(A,T48,A)') " RS_GRID| Distribution", &
628 " Average Max Min"
629 WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID| Planes ", &
630 pp(1), nint(pp(2)), nint(pp(3))
631 END IF
632 END IF
633 END DO
634! WRITE ( iounit, '(/)' )
635 END IF
636 ELSE
637 IF (iounit > 0) THEN
638 WRITE (iounit, '(/,A,T71,I10)') &
639 " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
640 DO i = 1, 3
641 WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
642 i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
643 END DO
644! WRITE ( iounit, '(/)' )
645 END IF
646 END IF
647
648 END SUBROUTINE rs_grid_print
649
650! **************************************************************************************************
651!> \brief transfer data from a realspace grid to a planewave grid (dispatches to the serial, replicated or distributed variant)
652!> \param rs ...
653!> \param pw ...
654! **************************************************************************************************
655 SUBROUTINE transfer_rs2pw(rs, pw)
656 TYPE(realspace_grid_type), INTENT(IN) :: rs
657 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
658
659 CHARACTER(len=*), PARAMETER :: routinen = 'transfer_rs2pw'
660
661 INTEGER :: handle, handle2, i
662
663 CALL timeset(routinen, handle2)
664 CALL timeset(routinen//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)
665
666 IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
667 cpabort("Different rs and pw identifiers")
668
669 IF (rs%desc%distributed) THEN
670 CALL transfer_rs2pw_distributed(rs, pw)
671 ELSE IF (rs%desc%parallel) THEN
672 CALL transfer_rs2pw_replicated(rs, pw)
673 ELSE ! treat simple serial case locally
674 IF (rs%desc%border == 0) THEN
675 CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
676 ELSE
677 cpassert(lbound(pw%array, 3) .EQ. rs%lb_real(3))
678!$OMP PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
679 DO i = rs%lb_real(3), rs%ub_real(3)
680 pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
681 rs%lb_real(2):rs%ub_real(2), i)
682 END DO
683!$OMP END PARALLEL DO
684 END IF
685 END IF
686
687 CALL timestop(handle)
688 CALL timestop(handle2)
689
690 END SUBROUTINE transfer_rs2pw
691
692! **************************************************************************************************
693!> \brief transfer data from a planewave grid to a realspace grid (dispatches to the serial, replicated or distributed variant)
694!> \param rs ...
695!> \param pw ...
696! **************************************************************************************************
697 SUBROUTINE transfer_pw2rs(rs, pw)
698
699 TYPE(realspace_grid_type), INTENT(IN) :: rs
700 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
701
702 CHARACTER(len=*), PARAMETER :: routinen = 'transfer_pw2rs'
703
704 INTEGER :: handle, handle2, i, im, j, jm, k, km
705
706 CALL timeset(routinen, handle2)
707 CALL timeset(routinen//"_"//trim(adjustl(cp_to_string(ceiling(pw%pw_grid%cutoff/10)*10))), handle)
708
709 IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
710 cpabort("Different rs and pw identifiers")
711
712 IF (rs%desc%distributed) THEN
713 CALL transfer_pw2rs_distributed(rs, pw)
714 ELSE IF (rs%desc%parallel) THEN
715 CALL transfer_pw2rs_replicated(rs, pw)
716 ELSE ! treat simple serial case locally
717 IF (rs%desc%border == 0) THEN
718 CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
719 ELSE
720!$OMP PARALLEL DO DEFAULT(NONE) &
721!$OMP PRIVATE(i,im,j,jm,k,km) &
722!$OMP SHARED(pw,rs)
723 DO k = rs%lb_local(3), rs%ub_local(3)
724 IF (k < rs%lb_real(3)) THEN
725 km = k + rs%desc%npts(3)
726 ELSE IF (k > rs%ub_real(3)) THEN
727 km = k - rs%desc%npts(3)
728 ELSE
729 km = k
730 END IF
731 DO j = rs%lb_local(2), rs%ub_local(2)
732 IF (j < rs%lb_real(2)) THEN
733 jm = j + rs%desc%npts(2)
734 ELSE IF (j > rs%ub_real(2)) THEN
735 jm = j - rs%desc%npts(2)
736 ELSE
737 jm = j
738 END IF
739 DO i = rs%lb_local(1), rs%ub_local(1)
740 IF (i < rs%lb_real(1)) THEN
741 im = i + rs%desc%npts(1)
742 ELSE IF (i > rs%ub_real(1)) THEN
743 im = i - rs%desc%npts(1)
744 ELSE
745 im = i
746 END IF
747 rs%r(i, j, k) = pw%array(im, jm, km)
748 END DO
749 END DO
750 END DO
751!$OMP END PARALLEL DO
752 END IF
753 END IF
754
755 CALL timestop(handle)
756 CALL timestop(handle2)
757
758 END SUBROUTINE transfer_pw2rs
759
760! **************************************************************************************************
761!> \brief transfer from a realspace grid to a planewave grid
762!> \param rs ...
763!> \param pw ...
764! **************************************************************************************************
765 SUBROUTINE transfer_rs2pw_replicated(rs, pw)
766 TYPE(realspace_grid_type), INTENT(IN) :: rs
767 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
768
769 INTEGER :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
770 source
771 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
772 INTEGER, DIMENSION(3) :: lb, ub
773 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
774
775 associate(np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
776 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
777 grid => rs%r)
778 ALLOCATE (rcount(0:np - 1))
779 DO ip = 1, np
780 rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
781 END DO
782 nma = maxval(rcount(0:np - 1))
783 ALLOCATE (sendbuf(nma), recvbuf(nma))
784 sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind
785
786 !sample peak memory
787 CALL m_memory()
788
789 dest = modulo(mepos + 1, np)
790 source = modulo(mepos - 1, np)
791 sendbuf = 0.0_dp
792
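 ! descriptive note (added, not part of the original source): the buffer is passed
 ! np-1 times around the ring of processes; every process it visits accumulates its
 ! local replicated-grid contribution for the slab of the process where the buffer
 ! finally ends up, so after the last pass sendbuf holds the fully summed data of
 ! this process' own pw slab, which is then copied into pw%array.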
793 DO ip = 1, np
794
795 lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
796 ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
797 ! this loop takes about the same time as the message passing call
798 ! notice that the range of ix is only a small fraction of the first index of grid
799 ! therefore it seems faster to have the second index as the innermost loop
800 ! if this runs on many cpus
801 ! tested on itanium, pentium4, opteron, ultrasparc...
802 s = ub - lb + 1
803 DO iz = lb(3), ub(3)
804 DO ix = lb(1), ub(1)
805 ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
806 DO iy = lb(2), ub(2)
807 sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
808 ii = ii + s(1)
809 END DO
810 END DO
811 END DO
812 IF (ip .EQ. np) EXIT
813 CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
814 CALL move_alloc(sendbuf, swaparray)
815 CALL move_alloc(recvbuf, sendbuf)
816 CALL move_alloc(swaparray, recvbuf)
817 END DO
818 nn = rcount(mepos)
819 END associate
820
821 CALL dcopy(nn, sendbuf, 1, pw%array, 1)
822
823 DEALLOCATE (rcount)
824 DEALLOCATE (sendbuf)
825 DEALLOCATE (recvbuf)
826
827 END SUBROUTINE transfer_rs2pw_replicated
828
829! **************************************************************************************************
830!> \brief transfer from a planewave grid to a realspace grid
831!> \param rs ...
832!> \param pw ...
833! **************************************************************************************************
834 SUBROUTINE transfer_pw2rs_replicated(rs, pw)
835 TYPE(realspace_grid_type), INTENT(IN) :: rs
836 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
837
838 INTEGER :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
839 k, km, nma, nn, source
840 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
841 INTEGER, DIMENSION(3) :: lb, ub
842 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
843 TYPE(mp_request_type), DIMENSION(2) :: req
844
845 associate(np => pw%pw_grid%para%group_size, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group_size - 1, 1), &
846 pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%rs_group, mepos => pw%pw_grid%para%rs_mpo, &
847 grid => rs%r)
848 ALLOCATE (rcount(0:np - 1))
849 DO ip = 1, np
850 rcount(ip - 1) = product(bo(2, :, ip) - bo(1, :, ip) + 1)
851 END DO
852 nma = maxval(rcount(0:np - 1))
853 ALLOCATE (sendbuf(nma), recvbuf(nma))
854 sendbuf = 1.0e99_dp; recvbuf = 1.0e99_dp ! init mpi'ed buffers to silence warnings under valgrind
855
856 !sample peak memory
857 CALL m_memory()
858
859 nn = rcount(mepos)
860 CALL dcopy(nn, pw%array, 1, sendbuf, 1)
861
862 dest = modulo(mepos + 1, np)
863 source = modulo(mepos - 1, np)
864
865 DO ip = 0, np - 1
866 ! we must shift the buffer only np-1 times around
867 IF (ip .NE. np - 1) THEN
868 CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
869 req(1), req(2), 13)
870 END IF
871 lb = pbo(1, :) + bo(1, :, modulo(mepos - ip, np) + 1) - 1
872 ub = pbo(1, :) + bo(2, :, modulo(mepos - ip, np) + 1) - 1
873 ii = 0
874 ! this loop takes about the same time as the message passing call
875 ! it unpacks the pw data of rank modulo(mepos - ip, np), currently held in sendbuf, into the replicated grid
876 DO iz = lb(3), ub(3)
877 DO iy = lb(2), ub(2)
878 DO ix = lb(1), ub(1)
879 ii = ii + 1
880 grid(ix, iy, iz) = sendbuf(ii)
881 END DO
882 END DO
883 END DO
884 IF (ip .NE. np - 1) THEN
885 CALL mp_waitall(req)
886 END IF
887 CALL move_alloc(sendbuf, swaparray)
888 CALL move_alloc(recvbuf, sendbuf)
889 CALL move_alloc(swaparray, recvbuf)
890 END DO
891 IF (rs%desc%border > 0) THEN
892!$OMP PARALLEL DO DEFAULT(NONE) &
893!$OMP PRIVATE(i,im,j,jm,k,km) &
894!$OMP SHARED(rs)
895 DO k = rs%lb_local(3), rs%ub_local(3)
896 IF (k < rs%lb_real(3)) THEN
897 km = k + rs%desc%npts(3)
898 ELSE IF (k > rs%ub_real(3)) THEN
899 km = k - rs%desc%npts(3)
900 ELSE
901 km = k
902 END IF
903 DO j = rs%lb_local(2), rs%ub_local(2)
904 IF (j < rs%lb_real(2)) THEN
905 jm = j + rs%desc%npts(2)
906 ELSE IF (j > rs%ub_real(2)) THEN
907 jm = j - rs%desc%npts(2)
908 ELSE
909 jm = j
910 END IF
911 DO i = rs%lb_local(1), rs%ub_local(1)
912 IF (i < rs%lb_real(1)) THEN
913 im = i + rs%desc%npts(1)
914 ELSE IF (i > rs%ub_real(1)) THEN
915 im = i - rs%desc%npts(1)
916 ELSE
917 im = i
918 END IF
919 rs%r(i, j, k) = rs%r(im, jm, km)
920 END DO
921 END DO
922 END DO
923!$OMP END PARALLEL DO
924 END IF
925 END associate
926
927 DEALLOCATE (rcount)
928 DEALLOCATE (sendbuf)
929 DEALLOCATE (recvbuf)
930
931 END SUBROUTINE transfer_pw2rs_replicated
932
933! **************************************************************************************************
934!> \brief does the rs2pw transfer in the case where the rs grid is
935!> distributed (3D domain decomposition)
936!> \param rs ...
937!> \param pw ...
938!> \par History
939!> 12.2007 created [Matt Watkins]
940!> 9.2008 reduced amount of halo data sent [Iain Bethune]
941!> 10.2008 added non-blocking communication [Iain Bethune]
942!> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
943!> 12.2009 added OMP and sparse alltoall [Iain Bethune]
944!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
945!> \note
946!> the transfer is a two step procedure. For example, for the rs2pw transfer:
947!>
948!> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
949!> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
950!>
951!> the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
952!> exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
953!> with the central domain of several CPUs (i.e. next nearest neighbors)
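!>      e.g. (illustrative numbers) with a border of 20 points and a smallest local extent of
!>      12 points along a direction, desc%neighbours for that direction becomes
!>      (20 + 12 - 1)/12 = 2, so the halo also has to be exchanged with the next-nearest
!>      neighbours in that direction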
954! **************************************************************************************************
955 SUBROUTINE transfer_rs2pw_distributed(rs, pw)
956 TYPE(realspace_grid_type), INTENT(IN) :: rs
957 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
958
959 CHARACTER(LEN=200) :: error_string
960 INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
961 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
962 INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
963 send_disps, send_sizes, ushifts
964 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
965 INTEGER, DIMENSION(2) :: neighbours, pos
966 INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
967 lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
968 LOGICAL, DIMENSION(3) :: halo_swapped
969 REAL(kind=dp) :: pw_sum, rs_sum
970 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
971 send_buf_3d_down, send_buf_3d_up
972 TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
973 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
974 TYPE(mp_request_type), DIMENSION(4) :: req
975
976 num_threads = 1
977 my_id = 0
978
979 ! safety check, to be removed once we're absolutely sure the routine is correct
980 IF (debug_this_module) THEN
981 rs_sum = accurate_sum(rs%r)*abs(det_3x3(rs%desc%dh))
982 CALL rs%desc%group%sum(rs_sum)
983 END IF
984
985 halo_swapped = .false.
986 ! We don't need to send the 'edges' of the halos that have already been sent
987 ! Halos are contiguous in memory in z-direction only, so swap these first,
988 ! and send less data in the y and x directions which are more expensive
989
990 DO idir = 3, 1, -1
991
992 IF (rs%desc%perd(idir) .NE. 1) THEN
993
994 ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
995 ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
996
997 ushifts = 0
998 dshifts = 0
999
1000 ! check that we don't try to send data to ourselves
1001 DO n_shifts = 1, min(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
1002
1003 ! need to take into account the possible varying widths of neighbouring cells
1004 ! ushifts and dshifts hold the accumulated widths of the neighbouring cells in the up and down directions
1005 position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1006 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1007 dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1008
1009 position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1010 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1011 ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1012
1013 ! The border data has to be send/received from the neighbours
1014 ! First we calculate the source and destination processes for the shift
1015 ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
1016
1017 CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1018
1019 lb_send_down(:) = rs%lb_local(:)
1020 lb_recv_down(:) = rs%lb_local(:)
1021 ub_recv_down(:) = rs%ub_local(:)
1022 ub_send_down(:) = rs%ub_local(:)
1023
1024 IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1025 ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1026 lb_send_down(idir) = max(lb_send_down(idir), &
1027 lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
1028
1029 ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
1030 lb_recv_down(idir) = max(lb_recv_down(idir) + rs%desc%border, &
1031 ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1032 ELSE
1033 lb_send_down(idir) = 0
1034 ub_send_down(idir) = -1
1035 lb_recv_down(idir) = 0
1036 ub_recv_down(idir) = -1
1037 END IF
1038
1039 DO i = 1, 3
1040 IF (halo_swapped(i)) THEN
1041 lb_send_down(i) = rs%lb_real(i)
1042 ub_send_down(i) = rs%ub_real(i)
1043 lb_recv_down(i) = rs%lb_real(i)
1044 ub_recv_down(i) = rs%ub_real(i)
1045 END IF
1046 END DO
1047
1048 ! post the receive
1049 ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1050 lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1051 CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1052
1053 ! now allocate, pack and send the send buffer
1054 nn = product(ub_send_down - lb_send_down + 1)
1055 ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1056 lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1057
1058!$OMP PARALLEL DEFAULT(NONE), &
1059!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1060!$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1061!$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1062!$ my_id = omp_get_thread_num()
1063 IF (my_id < num_threads) THEN
1064 lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1065 ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1066
1067 send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1068 lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1069 lb_send_down(2):ub_send_down(2), lb:ub)
1070 END IF
1071!$OMP END PARALLEL
1072
1073 CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1074
1075 ! Now for the other direction
1076 CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1077
1078 lb_send_up(:) = rs%lb_local(:)
1079 lb_recv_up(:) = rs%lb_local(:)
1080 ub_recv_up(:) = rs%ub_local(:)
1081 ub_send_up(:) = rs%ub_local(:)
1082
1083 IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1084
1085 lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1086 ub_send_up(idir) = min(ub_send_up(idir), &
1087 ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
1088
1089 lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
1090 ub_recv_up(idir) = min(ub_recv_up(idir) - rs%desc%border, &
1091 lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1092 ELSE
1093 lb_send_up(idir) = 0
1094 ub_send_up(idir) = -1
1095 lb_recv_up(idir) = 0
1096 ub_recv_up(idir) = -1
1097 END IF
1098
1099 DO i = 1, 3
1100 IF (halo_swapped(i)) THEN
1101 lb_send_up(i) = rs%lb_real(i)
1102 ub_send_up(i) = rs%ub_real(i)
1103 lb_recv_up(i) = rs%lb_real(i)
1104 ub_recv_up(i) = rs%ub_real(i)
1105 END IF
1106 END DO
1107
1108 ! post the receive
1109 ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1110 lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1111 CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1112
1113 ! now allocate,pack and send the send buffer
1114 nn = product(ub_send_up - lb_send_up + 1)
1115 ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1116 lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1117
1118!$OMP PARALLEL DEFAULT(NONE), &
1119!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1120!$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1121!$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1122!$ my_id = omp_get_thread_num()
1123 IF (my_id < num_threads) THEN
1124 lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1125 ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1126
1127 send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1128 lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1129 lb_send_up(2):ub_send_up(2), lb:ub)
1130 END IF
1131!$OMP END PARALLEL
1132
1133 CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1134
1135 ! wait for a recv to complete, then we can unpack
1136
1137 DO i = 1, 2
1138
1139 CALL mp_waitany(req(1:2), completed)
1140
1141 IF (completed .EQ. 1) THEN
1142
1143 ! only some procs may need later shifts
1144 IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1145 ! Sum the data in the RS Grid
1146!$OMP PARALLEL DEFAULT(NONE), &
1147!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1148!$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1149!$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1150!$ my_id = omp_get_thread_num()
1151 IF (my_id < num_threads) THEN
1152 lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1153 ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1154
1155 rs%r(lb_recv_down(1):ub_recv_down(1), &
1156 lb_recv_down(2):ub_recv_down(2), lb:ub) = &
1157 rs%r(lb_recv_down(1):ub_recv_down(1), &
1158 lb_recv_down(2):ub_recv_down(2), lb:ub) + &
1159 recv_buf_3d_down(:, :, lb:ub)
1160 END IF
1161!$OMP END PARALLEL
1162 END IF
1163 DEALLOCATE (recv_buf_3d_down)
1164 ELSE
1165
1166 ! only some procs may need later shifts
1167 IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1168 ! Sum the data in the RS Grid
1169!$OMP PARALLEL DEFAULT(NONE), &
1170!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1171!$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1172!$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1173!$ my_id = omp_get_thread_num()
1174 IF (my_id < num_threads) THEN
1175 lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1176 ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1177
1178 rs%r(lb_recv_up(1):ub_recv_up(1), &
1179 lb_recv_up(2):ub_recv_up(2), lb:ub) = &
1180 rs%r(lb_recv_up(1):ub_recv_up(1), &
1181 lb_recv_up(2):ub_recv_up(2), lb:ub) + &
1182 recv_buf_3d_up(:, :, lb:ub)
1183 END IF
1184!$OMP END PARALLEL
1185 END IF
1186 DEALLOCATE (recv_buf_3d_up)
1187 END IF
1188
1189 END DO
1190
1191 ! make sure the sends have completed before we deallocate
1192
1193 CALL mp_waitall(req(3:4))
1194
1195 DEALLOCATE (send_buf_3d_down)
1196 DEALLOCATE (send_buf_3d_up)
1197 END DO
1198
1199 DEALLOCATE (dshifts)
1200 DEALLOCATE (ushifts)
1201
1202 END IF
1203
1204 halo_swapped(idir) = .true.
1205
1206 END DO
1207
1208 ! This is the real redistribution
1209 ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
1210
1211 ! work out the pw grid points each proc holds
1212 DO i = 0, pw%pw_grid%para%group_size - 1
1213 bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1214 bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1215 bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1216 bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1217 END DO
1218
1219 ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1220 ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
1221 ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
1222 ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1223 ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
1224 ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
1225 send_tasks(:, 1) = 1
1226 send_tasks(:, 2) = 0
1227 send_tasks(:, 3) = 1
1228 send_tasks(:, 4) = 0
1229 send_tasks(:, 5) = 1
1230 send_tasks(:, 6) = 0
1231 send_sizes = 0
1232 recv_sizes = 0
1233
1234 my_rs_rank = rs%desc%my_pos
1235 my_pw_rank = pw%pw_grid%para%rs_mpo
1236
1237 ! find the processors that should hold our data
1238 ! should be part of the rs grid type
1239 ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1240 ! do the recv and send tasks in two separate loops which will
1241 ! load balance better for OpenMP with large numbers of MPI tasks
1242
1243!$OMP PARALLEL DO DEFAULT(NONE), &
1244!$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1245!$OMP SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
1246 DO i = 0, rs%desc%group_size - 1
1247
1248 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1249 !calculate the rs grid points on each processor
1250 !coords is the part of the grid that rank i actually holds
1251 DO idir = 1, 3
1252 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1253 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1254 lb_send(idir) = pos(1)
1255 ub_send(idir) = pos(2)
1256 END DO
1257
1258 IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
1259 IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
1260 IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
1261 IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle
1262
1263 recv_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
1264 recv_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
1265 recv_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
1266 recv_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
1267 recv_tasks(i, 5) = lb_send(3)
1268 recv_tasks(i, 6) = ub_send(3)
1269 recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
1270 (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
1271
1272 END DO
1273!$OMP END PARALLEL DO
1274
1275 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1276 DO idir = 1, 3
1277 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1278 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1279 lb_send(idir) = pos(1)
1280 ub_send(idir) = pos(2)
1281 END DO
1282
1283 lb_recv(:) = lb_send(:)
1284 ub_recv(:) = ub_send(:)
1285!$OMP PARALLEL DO DEFAULT(NONE), &
1286!$OMP SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
1287 DO j = 0, pw%pw_grid%para%group_size - 1
1288
1289 IF (lb_send(1) .GT. bounds(j, 2)) cycle
1290 IF (ub_send(1) .LT. bounds(j, 1)) cycle
1291 IF (lb_send(2) .GT. bounds(j, 4)) cycle
1292 IF (ub_send(2) .LT. bounds(j, 3)) cycle
1293
1294 send_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
1295 send_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
1296 send_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
1297 send_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
1298 send_tasks(j, 5) = lb_send(3)
1299 send_tasks(j, 6) = ub_send(3)
1300 send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
1301 (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
1302
1303 END DO
1304!$OMP END PARALLEL DO
1305
1306 send_disps(0) = 0
1307 recv_disps(0) = 0
1308 DO i = 1, pw%pw_grid%para%group_size - 1
1309 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1310 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1311 END DO
1312
1313 cpassert(sum(send_sizes) == product(ub_recv - lb_recv + 1))
1314
1315 ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1316 ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1317
1318 DO i = 0, rs%desc%group_size - 1
1319 IF (send_sizes(i) .NE. 0) THEN
1320 ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1321 ELSE
1322 NULLIFY (send_bufs(i)%array)
1323 END IF
1324 IF (recv_sizes(i) .NE. 0) THEN
1325 ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1326 ELSE
1327 NULLIFY (recv_bufs(i)%array)
1328 END IF
1329 END DO
1330
1331 ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1332 recv_reqs = mp_request_null
1333
1334 DO i = 0, rs%desc%group_size - 1
1335 IF (recv_sizes(i) .NE. 0) THEN
1336 CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1337 END IF
1338 END DO
1339
1340 ! do packing
1341!$OMP PARALLEL DO DEFAULT(NONE), &
1342!$OMP PRIVATE(k,z,y,x), &
1343!$OMP SHARED(rs,send_tasks,send_bufs,send_disps)
1344 DO i = 0, rs%desc%group_size - 1
1345 k = 0
1346 DO z = send_tasks(i, 5), send_tasks(i, 6)
1347 DO y = send_tasks(i, 3), send_tasks(i, 4)
1348 DO x = send_tasks(i, 1), send_tasks(i, 2)
1349 k = k + 1
1350 send_bufs(i)%array(k) = rs%r(x, y, z)
1351 END DO
1352 END DO
1353 END DO
1354 END DO
1355!$OMP END PARALLEL DO
1356
1357 ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1358 send_reqs = mp_request_null
1359
1360 DO i = 0, rs%desc%group_size - 1
1361 IF (send_sizes(i) .NE. 0) THEN
1362 CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1363 END IF
1364 END DO
1365
1366 ! do unpacking
1367 ! no OMP here so we can unpack each message as it arrives
1368 DO i = 0, rs%desc%group_size - 1
1369 IF (recv_sizes(i) .EQ. 0) cycle
1370
1371 CALL mp_waitany(recv_reqs, completed)
1372 k = 0
1373 DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1374 DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1375 DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1376 k = k + 1
1377 pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
1378 END DO
1379 END DO
1380 END DO
1381 END DO
1382
1383 CALL mp_waitall(send_reqs)
1384
1385 DEALLOCATE (recv_reqs)
1386 DEALLOCATE (send_reqs)
1387
1388 DO i = 0, rs%desc%group_size - 1
1389 IF (ASSOCIATED(send_bufs(i)%array)) THEN
1390 DEALLOCATE (send_bufs(i)%array)
1391 END IF
1392 IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1393 DEALLOCATE (recv_bufs(i)%array)
1394 END IF
1395 END DO
1396
1397 DEALLOCATE (send_bufs)
1398 DEALLOCATE (recv_bufs)
1399 DEALLOCATE (send_tasks)
1400 DEALLOCATE (send_sizes)
1401 DEALLOCATE (send_disps)
1402 DEALLOCATE (recv_tasks)
1403 DEALLOCATE (recv_sizes)
1404 DEALLOCATE (recv_disps)
1405
1406 IF (debug_this_module) THEN
1407 ! safety check, to be removed once we're absolutely sure the routine is correct
1408 pw_sum = pw_integrate_function(pw)
1409 IF (abs(pw_sum - rs_sum)/max(1.0_dp, abs(pw_sum), abs(rs_sum)) > epsilon(rs_sum)*1000) THEN
1410 WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
1411 rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, abs(pw_sum - rs_sum)
1412 CALL cp_abort(__location__, &
1413 error_string//" Please report this bug ... quick workaround: use "// &
1414 "DISTRIBUTION_TYPE REPLICATED")
1415 END IF
1416 END IF
1417
1418 END SUBROUTINE transfer_rs2pw_distributed
1419
1420! **************************************************************************************************
1421!> \brief does the pw2rs transfer in the case where the rs grid is
1422!> distributed (3D domain decomposition)
1423!> \param rs ...
1424!> \param pw ...
1425!> \par History
1426!> 12.2007 created [Matt Watkins]
1427!> 9.2008 reduced amount of halo data sent [Iain Bethune]
1428!> 10.2008 added non-blocking communication [Iain Bethune]
1429!> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
1430!> 12.2009 added OMP and sparse alltoall [Iain Bethune]
1431!> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
1432!> \note
1433!> the transfer is a two step procedure. For example, for the rs2pw transfer:
1434!>
1435!> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
1436!> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
1437!>
1438!> the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
1439!> exchange is that the border region is rather large (e.g. 20 points) and that it might overlap
1440!> with the central domain of several CPUs (i.e. next nearest neighbors)
1441! **************************************************************************************************
1442 SUBROUTINE transfer_pw2rs_distributed(rs, pw)
1443 TYPE(realspace_grid_type), INTENT(IN) :: rs
1444 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
1445
1446 INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
1447 n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
1448 INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
1449 send_disps, send_sizes, ushifts
1450 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
1451 INTEGER, DIMENSION(2) :: neighbours, pos
1452 INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
1453 lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
1454 LOGICAL, DIMENSION(3) :: halo_swapped
1455 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
1456 send_buf_3d_down, send_buf_3d_up
1457 TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
1458 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
1459 TYPE(mp_request_type), DIMENSION(4) :: req
1460
1461 num_threads = 1
1462 my_id = 0
1463
1464 CALL rs_grid_zero(rs)
1465
1466 ! This is the real redistribution
1467
1468 ALLOCATE (bounds(0:pw%pw_grid%para%group_size - 1, 1:4))
1469
1470 DO i = 0, pw%pw_grid%para%group_size - 1
1471 bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1472 bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1473 bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1474 bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1475 END DO
1476
1477 ALLOCATE (send_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1478 ALLOCATE (send_sizes(0:pw%pw_grid%para%group_size - 1))
1479 ALLOCATE (send_disps(0:pw%pw_grid%para%group_size - 1))
1480 ALLOCATE (recv_tasks(0:pw%pw_grid%para%group_size - 1, 1:6))
1481 ALLOCATE (recv_sizes(0:pw%pw_grid%para%group_size - 1))
1482 ALLOCATE (recv_disps(0:pw%pw_grid%para%group_size - 1))
1483
1484 send_tasks = 0
1485 send_tasks(:, 1) = 1
1486 send_tasks(:, 2) = 0
1487 send_tasks(:, 3) = 1
1488 send_tasks(:, 4) = 0
1489 send_tasks(:, 5) = 1
1490 send_tasks(:, 6) = 0
1491 send_sizes = 0
1492
1493 recv_tasks = 0
1494 recv_tasks(:, 1) = 1
1495 recv_tasks(:, 2) = 0
1496 send_tasks(:, 3) = 1
1497 send_tasks(:, 4) = 0
1498 send_tasks(:, 5) = 1
1499 send_tasks(:, 6) = 0
1500 recv_sizes = 0
1501
1502 my_rs_rank = rs%desc%my_pos
1503 my_pw_rank = pw%pw_grid%para%rs_mpo
1504
1505 ! find the processors that should hold our data
1506 ! should be part of the rs grid type
1507 ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1508 ! do the recv and send tasks in two separate loops which will
1509 ! load balance better for OpenMP with large numbers of MPI tasks
1510
1511 ! this is the reverse of rs2pw: what were the sends are now the recvs
1512
1513!$OMP PARALLEL DO DEFAULT(NONE), &
1514!$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1515!$OMP SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
1516 DO i = 0, pw%pw_grid%para%group_size - 1
1517
1518 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1519 !calculate the real rs grid points on each processor
1520 !coords is the part of the grid that rank i actually holds
1521 DO idir = 1, 3
1522 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1523 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1524 lb_send(idir) = pos(1)
1525 ub_send(idir) = pos(2)
1526 END DO
1527
1528 IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) cycle
1529 IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) cycle
1530 IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) cycle
1531 IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) cycle
1532
1533 send_tasks(i, 1) = max(lb_send(1), bounds(my_rs_rank, 1))
1534 send_tasks(i, 2) = min(ub_send(1), bounds(my_rs_rank, 2))
1535 send_tasks(i, 3) = max(lb_send(2), bounds(my_rs_rank, 3))
1536 send_tasks(i, 4) = min(ub_send(2), bounds(my_rs_rank, 4))
1537 send_tasks(i, 5) = lb_send(3)
1538 send_tasks(i, 6) = ub_send(3)
1539 send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
1540 (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
1541
1542 END DO
1543!$OMP END PARALLEL DO
1544
1545 coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1546 DO idir = 1, 3
1547 pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1548 pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1549 lb_send(idir) = pos(1)
1550 ub_send(idir) = pos(2)
1551 END DO
1552
1553 lb_recv(:) = lb_send(:)
1554 ub_recv(:) = ub_send(:)
1555
1556!$OMP PARALLEL DO DEFAULT(NONE), &
1557!$OMP SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
1558 DO j = 0, pw%pw_grid%para%group_size - 1
1559
1560 IF (ub_send(1) .LT. bounds(j, 1)) cycle
1561 IF (lb_send(1) .GT. bounds(j, 2)) cycle
1562 IF (ub_send(2) .LT. bounds(j, 3)) cycle
1563 IF (lb_send(2) .GT. bounds(j, 4)) cycle
1564
1565 recv_tasks(j, 1) = max(lb_send(1), bounds(j, 1))
1566 recv_tasks(j, 2) = min(ub_send(1), bounds(j, 2))
1567 recv_tasks(j, 3) = max(lb_send(2), bounds(j, 3))
1568 recv_tasks(j, 4) = min(ub_send(2), bounds(j, 4))
1569 recv_tasks(j, 5) = lb_send(3)
1570 recv_tasks(j, 6) = ub_send(3)
1571 recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
1572 (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
1573
1574 END DO
1575!$OMP END PARALLEL DO
1576
1577 send_disps(0) = 0
1578 recv_disps(0) = 0
1579 DO i = 1, pw%pw_grid%para%group_size - 1
1580 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1581 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1582 END DO
1583
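! send_disps/recv_disps are running offsets over the per-rank chunk sizes;
! the assertion below checks that the chunks received from all pw ranks
! exactly tile this rank's local real-space patch.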
1584 cpassert(sum(recv_sizes) == product(ub_recv - lb_recv + 1))
1585
1586 ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1587 ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1588
1589 DO i = 0, rs%desc%group_size - 1
1590 IF (send_sizes(i) .NE. 0) THEN
1591 ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1592 ELSE
1593 NULLIFY (send_bufs(i)%array)
1594 END IF
1595 IF (recv_sizes(i) .NE. 0) THEN
1596 ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1597 ELSE
1598 NULLIFY (recv_bufs(i)%array)
1599 END IF
1600 END DO
1601
1602 ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1603 recv_reqs = mp_request_null
1604
1605 DO i = 0, rs%desc%group_size - 1
1606 IF (recv_sizes(i) .NE. 0) THEN
1607 CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1608 END IF
1609 END DO
1610
1611 ! do packing
1612!$OMP PARALLEL DO DEFAULT(NONE), &
1613!$OMP PRIVATE(k,z,y,x), &
1614!$OMP SHARED(pw,rs,send_tasks,send_bufs,send_disps)
1615 DO i = 0, rs%desc%group_size - 1
1616 k = 0
1617 DO z = send_tasks(i, 5), send_tasks(i, 6)
1618 DO y = send_tasks(i, 3), send_tasks(i, 4)
1619 DO x = send_tasks(i, 1), send_tasks(i, 2)
1620 k = k + 1
1621 send_bufs(i)%array(k) = pw%array(x, y, z)
1622 END DO
1623 END DO
1624 END DO
1625 END DO
1626!$OMP END PARALLEL DO
1627
1628 ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1629 send_reqs = mp_request_null
1630
1631 DO i = 0, rs%desc%group_size - 1
1632 IF (send_sizes(i) .NE. 0) THEN
1633 CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1634 END IF
1635 END DO
1636
1637 ! do unpacking
1638 ! no OMP here so we can unpack each message as it arrives
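! mp_waitany returns the 1-based position of the completed request in
! recv_reqs, while the task and buffer arrays are indexed from 0, hence
! the "completed - 1" indexing below.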
1639
1640 DO i = 0, rs%desc%group_size - 1
1641 IF (recv_sizes(i) .EQ. 0) cycle
1642
1643 CALL mp_waitany(recv_reqs, completed)
1644 k = 0
1645 DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1646 DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1647 DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1648 k = k + 1
1649 rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
1650 END DO
1651 END DO
1652 END DO
1653 END DO
1654
1655 CALL mp_waitall(send_reqs)
1656
1657 DEALLOCATE (recv_reqs)
1658 DEALLOCATE (send_reqs)
1659
1660 DO i = 0, rs%desc%group_size - 1
1661 IF (ASSOCIATED(send_bufs(i)%array)) THEN
1662 DEALLOCATE (send_bufs(i)%array)
1663 END IF
1664 IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1665 DEALLOCATE (recv_bufs(i)%array)
1666 END IF
1667 END DO
1668
1669 DEALLOCATE (send_bufs)
1670 DEALLOCATE (recv_bufs)
1671 DEALLOCATE (send_tasks)
1672 DEALLOCATE (send_sizes)
1673 DEALLOCATE (send_disps)
1674 DEALLOCATE (recv_tasks)
1675 DEALLOCATE (recv_sizes)
1676 DEALLOCATE (recv_disps)
1677
1678 ! now pass wings around
1679 halo_swapped = .false.
1680
1681 DO idir = 1, 3
1682
1683 IF (rs%desc%perd(idir) /= 1) THEN
1684
1685 ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
1686 ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
1687 ushifts = 0
1688 dshifts = 0
1689
1690 DO n_shifts = 1, rs%desc%neighbours(idir)
1691
1692 ! need to take into account the possible varying widths of neighbouring cells
1693 ! ushifts and dshifts hold the real size of the neighbouring cells
1694
1695 position = modulo(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1696 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1697 dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1698
1699 position = modulo(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1700 neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1701 ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1702
1703 ! The border data has to be sent/received from the neighbours
1704 ! First we calculate the source and destination processes for the shift
1705 ! The first shift is "downwards"
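! dshifts(n)/ushifts(n) accumulate the widths of the n nearest lower/upper
! neighbour patches; the test below (cumulative width of the already
! skipped patches still within rs%desc%border) decides whether this shift
! still carries halo data, otherwise empty ranges (lb = 0, ub = -1) are
! used so nothing is packed or unpacked.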
1706
1707 CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1708
1709 lb_send_down(:) = rs%lb_local(:)
1710 ub_send_down(:) = rs%ub_local(:)
1711 lb_recv_down(:) = rs%lb_local(:)
1712 ub_recv_down(:) = rs%ub_local(:)
1713
1714 IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1715 lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
1716 ub_send_down(idir) = min(ub_send_down(idir) - rs%desc%border, &
1717 lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1718
1719 lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1720 ub_recv_down(idir) = min(ub_recv_down(idir), &
1721 ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
1722 ELSE
1723 lb_send_down(idir) = 0
1724 ub_send_down(idir) = -1
1725 lb_recv_down(idir) = 0
1726 ub_recv_down(idir) = -1
1727 END IF
1728
1729 DO i = 1, 3
1730 IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1731 lb_send_down(i) = rs%lb_real(i)
1732 ub_send_down(i) = rs%ub_real(i)
1733 lb_recv_down(i) = rs%lb_real(i)
1734 ub_recv_down(i) = rs%ub_real(i)
1735 END IF
1736 END DO
1737
1738 ! allocate the recv buffer
1739 nn = product(ub_recv_down - lb_recv_down + 1)
1740 ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1741 lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1742
1743 ! recv buffer is now ready, so post the receive
1744 CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1745
1746 ! now allocate, pack and send the send buffer
1747 nn = product(ub_send_down - lb_send_down + 1)
1748 ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1749 lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1750
1751!$OMP PARALLEL DEFAULT(NONE), &
1752!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1753!$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1754!$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1755!$ my_id = omp_get_thread_num()
1756 IF (my_id < num_threads) THEN
1757 lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1758 ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1759
1760 send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1761 lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1762 lb_send_down(2):ub_send_down(2), lb:ub)
1763 END IF
1764!$OMP END PARALLEL
1765
1766 CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1767
1768 ! Now for the other direction
1769
1770 CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1771
1772 lb_send_up(:) = rs%lb_local(:)
1773 ub_send_up(:) = rs%ub_local(:)
1774 lb_recv_up(:) = rs%lb_local(:)
1775 ub_recv_up(:) = rs%ub_local(:)
1776
1777 IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1778 ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
1779 lb_send_up(idir) = max(lb_send_up(idir) + rs%desc%border, &
1780 ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1781
1782 ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1783 lb_recv_up(idir) = max(lb_recv_up(idir), &
1784 lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
1785 ELSE
1786 lb_send_up(idir) = 0
1787 ub_send_up(idir) = -1
1788 lb_recv_up(idir) = 0
1789 ub_recv_up(idir) = -1
1790 END IF
1791
1792 DO i = 1, 3
1793 IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1794 lb_send_up(i) = rs%lb_real(i)
1795 ub_send_up(i) = rs%ub_real(i)
1796 lb_recv_up(i) = rs%lb_real(i)
1797 ub_recv_up(i) = rs%ub_real(i)
1798 END IF
1799 END DO
1800
1801 ! allocate the recv buffer
1802 nn = product(ub_recv_up - lb_recv_up + 1)
1803 ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1804 lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1805
1806 ! recv buffer is now ready, so post the receive
1807
1808 CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1809
1810 ! now allocate, pack and send the send buffer
1811 nn = product(ub_send_up - lb_send_up + 1)
1812 ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1813 lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1814
1815!$OMP PARALLEL DEFAULT(NONE), &
1816!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1817!$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1818!$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1819!$ my_id = omp_get_thread_num()
1820 IF (my_id < num_threads) THEN
1821 lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1822 ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1823
1824 send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1825 lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1826 lb_send_up(2):ub_send_up(2), lb:ub)
1827 END IF
1828!$OMP END PARALLEL
1829
1830 CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1831
1832 ! wait for a recv to complete, then we can unpack
1833
1834 DO i = 1, 2
1835
1836 CALL mp_waitany(req(1:2), completed)
1837
1838 IF (completed .EQ. 1) THEN
1839
1840 ! only some procs may need later shifts
1841 IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1842
1843 ! Copy the received halo data into the rs grid
1844!$OMP PARALLEL DEFAULT(NONE), &
1845!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1846!$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1847!$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1848!$ my_id = omp_get_thread_num()
1849 IF (my_id < num_threads) THEN
1850 lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1851 ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1852
1853 rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
1854 lb:ub) = recv_buf_3d_down(:, :, lb:ub)
1855 END IF
1856!$OMP END PARALLEL
1857 END IF
1858
1859 DEALLOCATE (recv_buf_3d_down)
1860 ELSE
1861
1862 ! only some procs may need later shifts
1863 IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1864
1865 ! Copy the received halo data into the rs grid
1866!$OMP PARALLEL DEFAULT(NONE), &
1867!$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1868!$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1869!$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1870!$ my_id = omp_get_thread_num()
1871 IF (my_id < num_threads) THEN
1872 lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1873 ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1874
1875 rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
1876 lb:ub) = recv_buf_3d_up(:, :, lb:ub)
1877 END IF
1878!$OMP END PARALLEL
1879 END IF
1880
1881 DEALLOCATE (recv_buf_3d_up)
1882 END IF
1883 END DO
1884
1885 CALL mp_waitall(req(3:4))
1886
1887 DEALLOCATE (send_buf_3d_down)
1888 DEALLOCATE (send_buf_3d_up)
1889 END DO
1890
1891 DEALLOCATE (ushifts)
1892 DEALLOCATE (dshifts)
1893 END IF
1894
1895 halo_swapped(idir) = .true.
1896
1897 END DO
1898
1899 END SUBROUTINE transfer_pw2rs_distributed
1900
1901! **************************************************************************************************
1902!> \brief Initialize grid to zero
1903!> \param rs ...
1904!> \par History
1905!> none
1906!> \author JGH (23-Mar-2002)
1907! **************************************************************************************************
1908 SUBROUTINE rs_grid_zero(rs)
1909
1910 TYPE(realspace_grid_type), INTENT(IN) :: rs
1911
1912 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_zero'
1913
1914 INTEGER :: handle, i, j, k, l(3), u(3)
1915
1916 CALL timeset(routinen, handle)
1917 l(1) = lbound(rs%r, 1); l(2) = lbound(rs%r, 2); l(3) = lbound(rs%r, 3)
1918 u(1) = ubound(rs%r, 1); u(2) = ubound(rs%r, 2); u(3) = ubound(rs%r, 3)
1919!$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1920!$OMP PRIVATE(i,j,k) &
1921!$OMP SHARED(rs,l,u)
1922 DO k = l(3), u(3)
1923 DO j = l(2), u(2)
1924 DO i = l(1), u(1)
1925 rs%r(i, j, k) = 0.0_dp
1926 END DO
1927 END DO
1928 END DO
1929!$OMP END PARALLEL DO
1930 CALL timestop(handle)
1931
1932 END SUBROUTINE rs_grid_zero
1933
1934! **************************************************************************************************
1935 !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
1936!> \param rs1 ...
1937!> \param rs2 ...
1938!> \param rs3 ...
1939!> \param scalar ...
1940!> \par History
1941!> none
1942!> \author
1943! **************************************************************************************************
1944 SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
1945
1946 TYPE(realspace_grid_type), INTENT(IN) :: rs1, rs2, rs3
1947 REAL(dp), INTENT(IN) :: scalar
1948
1949 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_mult_and_add'
1950
1951 INTEGER :: handle, i, j, k, l(3), u(3)
1952
1953!-----------------------------------------------------------------------------!
1954
1955 CALL timeset(routinen, handle)
1956 IF (scalar .NE. 0.0_dp) THEN
1957 l(1) = lbound(rs1%r, 1); l(2) = lbound(rs1%r, 2); l(3) = lbound(rs1%r, 3)
1958 u(1) = ubound(rs1%r, 1); u(2) = ubound(rs1%r, 2); u(3) = ubound(rs1%r, 3)
1959!$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1960!$OMP PRIVATE(i,j,k) &
1961!$OMP SHARED(rs1,rs2,rs3,scalar,l,u)
1962 DO k = l(3), u(3)
1963 DO j = l(2), u(2)
1964 DO i = l(1), u(1)
1965 rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
1966 END DO
1967 END DO
1968 END DO
1969!$OMP END PARALLEL DO
1970 END IF
1971 CALL timestop(handle)
1972 END SUBROUTINE rs_grid_mult_and_add
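! Illustrative usage (sketch; rho_tot, rho_spin and weight are hypothetical
! grids created on the same descriptor):
!    CALL rs_grid_mult_and_add(rho_tot, rho_spin, weight, 0.5_dp)
! accumulates rho_tot%r = rho_tot%r + 0.5_dp*rho_spin%r*weight%r point by
! point; a scalar of exactly 0.0_dp returns without touching rs1.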
1973
1974! **************************************************************************************************
1975!> \brief Set box matrix info for real space grid
1976!> This is needed for variable cell simulations
1977!> \param pw_grid ...
1978!> \param rs ...
1979!> \par History
1980!> none
1981!> \author JGH (15-May-2007)
1982! **************************************************************************************************
1983 SUBROUTINE rs_grid_set_box(pw_grid, rs)
1984
1985 TYPE(pw_grid_type), INTENT(IN), TARGET :: pw_grid
1986 TYPE(realspace_grid_type), INTENT(IN) :: rs
1987
1988 cpassert(ASSOCIATED(rs%desc%pw, pw_grid))
1989 rs%desc%dh = pw_grid%dh
1990 rs%desc%dh_inv = pw_grid%dh_inv
1991
1992 END SUBROUTINE rs_grid_set_box
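! Typical call site (sketch; rs_grid is a hypothetical variable name): after
! a cell change in a variable-cell run, once pw_grid%dh and pw_grid%dh_inv
! have been updated, resynchronise the descriptor with
!    CALL rs_grid_set_box(pw_grid, rs=rs_grid)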
1993
1994! **************************************************************************************************
1995!> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
1996!> \param rs_desc the grid descriptor to retain
1997!> \par History
1998!> 04.2009 created [Iain Bethune]
1999!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2000! **************************************************************************************************
2001 SUBROUTINE rs_grid_retain_descriptor(rs_desc)
2002 TYPE(realspace_grid_desc_type), INTENT(INOUT) :: rs_desc
2003
2004 cpassert(rs_desc%ref_count > 0)
2005 rs_desc%ref_count = rs_desc%ref_count + 1
2006 END SUBROUTINE rs_grid_retain_descriptor
2007
2008! **************************************************************************************************
2009!> \brief releases the given rs grid (see doc/ReferenceCounting.html)
2010!> \param rs_grid the rs grid to release
2011!> \par History
2012!> 03.2003 created [fawzi]
2013!> \author fawzi
2014! **************************************************************************************************
2015 SUBROUTINE rs_grid_release(rs_grid)
2016 TYPE(realspace_grid_type), INTENT(INOUT) :: rs_grid
2017
2018 CALL rs_grid_release_descriptor(rs_grid%desc)
2019
2020 CALL offload_free_buffer(rs_grid%buffer)
2021 NULLIFY (rs_grid%r)
2022
2023 IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
2024 IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
2025 IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
2026 END SUBROUTINE rs_grid_release
2027
2028! **************************************************************************************************
2029!> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
2030!> \param rs_desc the rs grid descriptor to release
2031!> \par History
2032!> 04.2009 created [Iain Bethune]
2033!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2034! **************************************************************************************************
2035 SUBROUTINE rs_grid_release_descriptor(rs_desc)
2036 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2037
2038 IF (ASSOCIATED(rs_desc)) THEN
2039 cpassert(rs_desc%ref_count > 0)
2040 rs_desc%ref_count = rs_desc%ref_count - 1
2041 IF (rs_desc%ref_count == 0) THEN
2042
2043 CALL pw_grid_release(rs_desc%pw)
2044
2045 IF (rs_desc%parallel) THEN
2046 ! release the group communicator
2047 CALL rs_desc%group%free()
2048
2049 DEALLOCATE (rs_desc%virtual2real)
2050 DEALLOCATE (rs_desc%real2virtual)
2051 END IF
2052
2053 IF (rs_desc%distributed) THEN
2054 DEALLOCATE (rs_desc%rank2coord)
2055 DEALLOCATE (rs_desc%coord2rank)
2056 DEALLOCATE (rs_desc%lb_global)
2057 DEALLOCATE (rs_desc%ub_global)
2058 DEALLOCATE (rs_desc%x2coord)
2059 DEALLOCATE (rs_desc%y2coord)
2060 DEALLOCATE (rs_desc%z2coord)
2061 END IF
2062
2063 DEALLOCATE (rs_desc)
2064 END IF
2065 END IF
2066 NULLIFY (rs_desc)
2067 END SUBROUTINE rs_grid_release_descriptor
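! Reference-counting sketch (my_desc is a hypothetical descriptor pointer):
! each additional holder pairs a retain with a release, and the storage is
! freed only when ref_count drops to zero:
!    CALL rs_grid_retain_descriptor(my_desc)
!    ! ... use my_desc ...
!    CALL rs_grid_release_descriptor(my_desc)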
2068
2069! **************************************************************************************************
2070!> \brief emulates the function of an MPI_cart_shift operation, but the shift is
2071!> done in virtual coordinates, and the corresponding real ranks are returned
2072!> \param rs_grid ...
2073!> \param dir ...
2074!> \param disp ...
2075!> \param source ...
2076!> \param dest ...
2077!> \par History
2078!> 04.2009 created [Iain Bethune]
2079!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2080! **************************************************************************************************
2081 PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
2082
2083 TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2084 INTEGER, INTENT(IN) :: dir, disp
2085 INTEGER, INTENT(OUT) :: source, dest
2086
2087 INTEGER, DIMENSION(3) :: shift_coords
2088
2089 shift_coords = rs_grid%desc%virtual_group_coor
2090 shift_coords(dir) = modulo(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
2091 dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2092 shift_coords = rs_grid%desc%virtual_group_coor
2093 shift_coords(dir) = modulo(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
2094 source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2095
2096 END SUBROUTINE
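! Example of the wrap-around (sketch): with group_dim(dir) = 4 and a local
! virtual coordinate of 0 in that direction, disp = +1 yields dest = the
! real rank sitting at virtual coordinate 1 and source = the real rank at
! virtual coordinate 3, mirroring MPI_Cart_shift on a periodic topology.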
2097
2098! **************************************************************************************************
2099!> \brief returns the maximum number of points in the local grid of any process
2100!> to account for the case where the grid may later be reordered
2101!> \param desc ...
2102!> \return ...
2103!> \par History
2104!> 10.2011 created [Iain Bethune]
2105! **************************************************************************************************
2106 FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
2107 TYPE(realspace_grid_desc_type), INTENT(IN) :: desc
2108 INTEGER :: max_ngpts
2109
2110 CHARACTER(len=*), PARAMETER :: routinen = 'rs_grid_max_ngpts'
2111
2112 INTEGER :: handle, i
2113 INTEGER, DIMENSION(3) :: lb, ub
2114
2115 CALL timeset(routinen, handle)
2116
2117 max_ngpts = 0
2118 IF ((desc%pw%para%mode == pw_mode_local) .OR. &
2119 (all(desc%group_dim == 1))) THEN
2120 cpassert(product(int(desc%npts, kind=int_8)) < huge(1))
2121 max_ngpts = product(desc%npts)
2122 ELSE
2123 DO i = 0, desc%group_size - 1
2124 lb = desc%lb_global(:, i)
2125 ub = desc%ub_global(:, i)
2126 lb = lb - desc%border*(1 - desc%perd)
2127 ub = ub + desc%border*(1 - desc%perd)
2128 cpassert(product(int(ub - lb + 1, kind=int_8)) < huge(1))
2129 max_ngpts = max(max_ngpts, product(ub - lb + 1))
2130 END DO
2131 END IF
2132
2133 CALL timestop(handle)
2134
2135 END FUNCTION rs_grid_max_ngpts
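! Illustrative use (sketch; scratch is a hypothetical work array): size a
! buffer once so it remains large enough even if rs_grid_reorder_ranks
! later changes which patch this process owns:
!    ALLOCATE (scratch(rs_grid_max_ngpts(rs%desc)))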
2136
2137! **************************************************************************************************
2138!> \brief ...
2139!> \param rs_grid ...
2140!> \param h_inv ...
2141!> \param ra ...
2142!> \param offset ...
2143!> \param group_size ...
2144!> \param my_pos ...
2145!> \return ...
2146! **************************************************************************************************
2147 PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
2148 TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2149 REAL(kind=dp), DIMENSION(3, 3), INTENT(IN) :: h_inv
2150 REAL(kind=dp), DIMENSION(3), INTENT(IN) :: ra
2151 INTEGER, INTENT(IN), OPTIONAL :: offset, group_size, my_pos
2152
2153 INTEGER :: dir, lb(3), location(3), tp(3), ub(3)
2154
2155 res = .false.
2156
2157 IF (.NOT. all(rs_grid%desc%perd == 1)) THEN
2158 DO dir = 1, 3
2159 ! bounds of the local grid, with the border ('wings') removed in directions that are not fully periodic
2160 tp(dir) = floor(dot_product(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
2161 tp(dir) = modulo(tp(dir), rs_grid%desc%npts(dir))
2162 IF (rs_grid%desc%perd(dir) .NE. 1) THEN
2163 lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
2164 ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
2165 ELSE
2166 lb(dir) = rs_grid%lb_local(dir)
2167 ub(dir) = rs_grid%ub_local(dir)
2168 END IF
2169 ! distributed grid, only map if it is local to the grid
2170 location(dir) = tp(dir) + rs_grid%desc%lb(dir)
2171 END DO
2172 IF (all(lb(:) <= location(:)) .AND. all(location(:) <= ub(:))) THEN
2173 res = .true.
2174 END IF
2175 ELSE
2176 IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
2177 ! not distributed, just a round-robin distribution over the full set of CPUs
2178 IF (modulo(offset, group_size) == my_pos) res = .true.
2179 END IF
2180 END IF
2181
2182 END FUNCTION map_gaussian_here
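! Typical call pattern (sketch; iatom, ra and h_inv belong to a hypothetical
! caller looping over atoms): skip centres this process does not map:
!    IF (.NOT. map_gaussian_here(rs, h_inv, ra, iatom, &
!                                rs%desc%group_size, rs%desc%my_pos)) CYCLE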
2183
2184END MODULE realspace_grid_types