(git:0de0cc2)
swarm_mpi.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief Handles the MPI communication of the swarm framework.
10 !> \author Ole Schuett
11 ! **************************************************************************************************
12 MODULE swarm_mpi
13  USE cp_files, ONLY: close_file,&
14  open_file
17  cp_iteration_info_type
22  cp_logger_type,&
24  USE input_section_types, ONLY: section_vals_type,&
26  USE kinds, ONLY: default_path_length,&
28  USE machine, ONLY: default_output_unit
29  USE message_passing, ONLY: mp_any_source,&
30  mp_comm_type,&
32  mp_para_env_type
33  USE swarm_message, ONLY: swarm_message_get,&
37  swarm_message_type
38 #include "../base/base_uses.f90"
39 
40  IMPLICIT NONE
41  PRIVATE
42 
43  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_mpi'
44 
45  PUBLIC :: swarm_mpi_type, swarm_mpi_init, swarm_mpi_finalize
48 
49  TYPE swarm_mpi_type
50  TYPE(mp_para_env_type), POINTER :: world => null()
51  TYPE(mp_para_env_type), POINTER :: worker => null()
52  TYPE(mp_para_env_type), POINTER :: master => null()
53  INTEGER, DIMENSION(:), ALLOCATABLE :: wid2group
54  CHARACTER(LEN=default_path_length) :: master_output_path = ""
55  END TYPE swarm_mpi_type
56 
57 CONTAINS
58 
59 ! **************************************************************************************************
60 !> \brief Initialize MPI communicators for a swarm run.
61 !> \param swarm_mpi ...
62 !> \param world_para_env ...
63 !> \param root_section ...
64 !> \param n_workers ...
65 !> \param worker_id ...
66 !> \param iw ...
67 !> \author Ole Schuett
68 ! **************************************************************************************************
69  SUBROUTINE swarm_mpi_init(swarm_mpi, world_para_env, root_section, n_workers, worker_id, iw)
70  TYPE(swarm_mpi_type) :: swarm_mpi
71  TYPE(mp_para_env_type), POINTER :: world_para_env
72  TYPE(section_vals_type), POINTER :: root_section
73  INTEGER, INTENT(IN) :: n_workers
74  INTEGER, INTENT(OUT) :: worker_id
75  INTEGER, INTENT(IN) :: iw
76 
77  INTEGER :: n_groups_created, pe_per_worker, &
78  subgroup_rank, subgroup_size
79  TYPE(mp_comm_type) :: subgroup
80  LOGICAL :: im_the_master
81  INTEGER, DIMENSION(:), POINTER :: group_distribution_p
82  INTEGER, DIMENSION(0:world_para_env%num_pe-2), &
83  TARGET :: group_distribution
84 
85 ! ====== Setup of MPI-Groups ======
86 
87  worker_id = -1
88  swarm_mpi%world => world_para_env
89 
90  IF (mod(swarm_mpi%world%num_pe - 1, n_workers) /= 0) THEN
91  cpabort("number of processors-1 is not divisible by n_workers.")
92  END IF
93  IF (swarm_mpi%world%num_pe < n_workers + 1) THEN
94  cpabort("There are not enough processes for n_workers + 1. Aborting.")
95  END IF
96 
97  pe_per_worker = (swarm_mpi%world%num_pe - 1)/n_workers
98 
99  IF (iw > 0) THEN
100  WRITE (iw, '(A,45X,I8)') " SWARM| Number of mpi ranks", swarm_mpi%world%num_pe
101  WRITE (iw, '(A,47X,I8)') " SWARM| Number of workers", n_workers
102  END IF
103 
104  ! the last task becomes the master. Preseves node-alignment of other tasks.
105  im_the_master = (swarm_mpi%world%mepos == swarm_mpi%world%num_pe - 1)
106 
107  ! First split split para_env into a master- and a workers-groups...
108  IF (im_the_master) THEN
109  ALLOCATE (swarm_mpi%master)
110  CALL swarm_mpi%master%from_split(swarm_mpi%world, 1)
111  subgroup_size = swarm_mpi%master%num_pe
112  subgroup_rank = swarm_mpi%master%mepos
113  IF (swarm_mpi%master%num_pe /= 1) cpabort("mp_comm_split_direct failed (master)")
114  ELSE
115  CALL subgroup%from_split(swarm_mpi%world, 2)
116  subgroup_size = subgroup%num_pe
117  subgroup_rank = subgroup%mepos
118  IF (subgroup_size /= swarm_mpi%world%num_pe - 1) cpabort("mp_comm_split_direct failed (worker)")
119  END IF
120 
121  ALLOCATE (swarm_mpi%wid2group(n_workers))
122  swarm_mpi%wid2group = 0
123 
124  IF (.NOT. im_the_master) THEN
125  ! ...then split workers-group into n_workers groups - one for each worker.
126  group_distribution_p => group_distribution
127  ALLOCATE (swarm_mpi%worker)
128  CALL swarm_mpi%worker%from_split(subgroup, n_groups_created, group_distribution_p, n_subgroups=n_workers)
129  worker_id = group_distribution(subgroup_rank) + 1 ! shall start by 1
130  IF (n_groups_created /= n_workers) cpabort("mp_comm_split failed.")
131  CALL subgroup%free()
132 
133  !WRITE (*,*) "this is worker ", worker_id, swarm_mpi%worker%mepos, swarm_mpi%worker%num_pe
134 
135  ! collect world-ranks of each worker groups rank-0 node
136  IF (swarm_mpi%worker%mepos == 0) &
137  swarm_mpi%wid2group(worker_id) = swarm_mpi%world%mepos
138 
139  END IF
140 
141  CALL swarm_mpi%world%sum(swarm_mpi%wid2group)
142  !WRITE (*,*), "wid2group table: ",swarm_mpi%wid2group
143 
144  CALL logger_init_master(swarm_mpi)
145  CALL logger_init_worker(swarm_mpi, root_section, worker_id)
146  END SUBROUTINE swarm_mpi_init
147 
148 ! **************************************************************************************************
149 !> \brief Helper routine for swarm_mpi_init, configures the master's logger.
150 !> \param swarm_mpi ...
151 !> \author Ole Schuett
152 ! **************************************************************************************************
153  SUBROUTINE logger_init_master(swarm_mpi)
154  TYPE(swarm_mpi_type) :: swarm_mpi
155 
156  INTEGER :: output_unit
157  TYPE(cp_logger_type), POINTER :: logger
158 
159 ! broadcast master_output_path to all ranks
160 
161  IF (swarm_mpi%world%is_source()) THEN
162  logger => cp_get_default_logger()
163  output_unit = logger%default_local_unit_nr
164  swarm_mpi%master_output_path = output_unit2path(output_unit)
165  IF (output_unit /= default_output_unit) &
166  CLOSE (output_unit)
167  END IF
168 
169  CALL swarm_mpi%world%bcast(swarm_mpi%master_output_path)
170 
171  IF (ASSOCIATED(swarm_mpi%master)) &
172  CALL error_add_new_logger(swarm_mpi%master, swarm_mpi%master_output_path)
173  END SUBROUTINE logger_init_master
174 
175 ! **************************************************************************************************
176 !> \brief Helper routine for logger_init_master, inquires filename for given unit.
177 !> \param output_unit ...
178 !> \return ...
179 !> \author Ole Schuett
180 ! **************************************************************************************************
181  FUNCTION output_unit2path(output_unit) RESULT(output_path)
182  INTEGER, INTENT(IN) :: output_unit
183  CHARACTER(LEN=default_path_length) :: output_path
184 
185  output_path = "__STD_OUT__"
186  IF (output_unit /= default_output_unit) &
187  INQUIRE (unit=output_unit, name=output_path)
188  END FUNCTION output_unit2path
189 
190 ! **************************************************************************************************
191 !> \brief Helper routine for swarm_mpi_init, configures the workers's logger.
192 !> \param swarm_mpi ...
193 !> \param root_section ...
194 !> \param worker_id ...
195 !> \author Ole Schuett
196 ! **************************************************************************************************
197  SUBROUTINE logger_init_worker(swarm_mpi, root_section, worker_id)
198  TYPE(swarm_mpi_type) :: swarm_mpi
199  TYPE(section_vals_type), POINTER :: root_section
200  INTEGER :: worker_id
201 
202  CHARACTER(LEN=default_path_length) :: output_path
203  CHARACTER(len=default_string_length) :: new_project_name, project_name, &
204  worker_name
205  TYPE(cp_iteration_info_type), POINTER :: new_iter_info
206  TYPE(cp_logger_type), POINTER :: old_logger
207 
208  NULLIFY (old_logger, new_iter_info)
209  IF (ASSOCIATED(swarm_mpi%worker)) THEN
210  old_logger => cp_get_default_logger()
211  project_name = old_logger%iter_info%project_name
212  IF (worker_id > 99999) THEN
213  cpabort("Did not expect so many workers.")
214  END IF
215  WRITE (worker_name, "(A,I5.5)") 'WORKER', worker_id
216  IF (len_trim(project_name) + 1 + len_trim(worker_name) > default_string_length) THEN
217  cpabort("project name too long")
218  END IF
219  output_path = trim(project_name)//"-"//trim(worker_name)//".out"
220  new_project_name = trim(project_name)//"-"//trim(worker_name)
221  CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", c_val=new_project_name)
222  CALL cp_iteration_info_create(new_iter_info, new_project_name)
223  CALL error_add_new_logger(swarm_mpi%worker, output_path, new_iter_info)
224  CALL cp_iteration_info_release(new_iter_info)
225  END IF
226  END SUBROUTINE logger_init_worker
227 
228 ! **************************************************************************************************
229 !> \brief Helper routine for logger_init_master and logger_init_worker
230 !> \param para_env ...
231 !> \param output_path ...
232 !> \param iter_info ...
233 !> \author Ole Schuett
234 ! **************************************************************************************************
235  SUBROUTINE error_add_new_logger(para_env, output_path, iter_info)
236  TYPE(mp_para_env_type), POINTER :: para_env
237  CHARACTER(LEN=default_path_length) :: output_path
238  TYPE(cp_iteration_info_type), OPTIONAL, POINTER :: iter_info
239 
240  INTEGER :: output_unit
241  TYPE(cp_logger_type), POINTER :: new_logger, old_logger
242 
243  NULLIFY (new_logger, old_logger)
244  output_unit = -1
245  IF (para_env%is_source()) THEN
246  ! open output_unit according to output_path
247  output_unit = default_output_unit
248  IF (output_path /= "__STD_OUT__") &
249  CALL open_file(file_name=output_path, file_status="UNKNOWN", &
250  file_action="WRITE", file_position="APPEND", unit_number=output_unit)
251  END IF
252 
253  old_logger => cp_get_default_logger()
254  CALL cp_logger_create(new_logger, para_env=para_env, &
255  default_global_unit_nr=output_unit, close_global_unit_on_dealloc=.false., &
256  template_logger=old_logger, iter_info=iter_info)
257 
258  CALL cp_add_default_logger(new_logger)
259  CALL cp_logger_release(new_logger)
260  END SUBROUTINE error_add_new_logger
261 
262 ! **************************************************************************************************
263 !> \brief Finalizes the MPI communicators of a swarm run.
264 !> \param swarm_mpi ...
265 !> \param root_section ...
266 !> \author Ole Schuett
267 ! **************************************************************************************************
268  SUBROUTINE swarm_mpi_finalize(swarm_mpi, root_section)
269  TYPE(swarm_mpi_type) :: swarm_mpi
270  TYPE(section_vals_type), POINTER :: root_section
271 
272  CALL swarm_mpi%world%sync()
273  CALL logger_finalize(swarm_mpi, root_section)
274 
275  IF (ASSOCIATED(swarm_mpi%worker)) CALL mp_para_env_release(swarm_mpi%worker)
276  IF (ASSOCIATED(swarm_mpi%master)) CALL mp_para_env_release(swarm_mpi%master)
277  NULLIFY (swarm_mpi%worker, swarm_mpi%master)
278  DEALLOCATE (swarm_mpi%wid2group)
279  END SUBROUTINE swarm_mpi_finalize
280 
281 ! **************************************************************************************************
282 !> \brief Helper routine for swarm_mpi_finalize, restores the original loggers
283 !> \param swarm_mpi ...
284 !> \param root_section ...
285 !> \author Ole Schuett
286 ! **************************************************************************************************
287  SUBROUTINE logger_finalize(swarm_mpi, root_section)
288  TYPE(swarm_mpi_type) :: swarm_mpi
289  TYPE(section_vals_type), POINTER :: root_section
290 
291  INTEGER :: output_unit
292  TYPE(cp_logger_type), POINTER :: logger, old_logger
293 
294  NULLIFY (logger, old_logger)
295  logger => cp_get_default_logger()
296  output_unit = logger%default_local_unit_nr
297  IF (output_unit > 0 .AND. output_unit /= default_output_unit) &
298  CALL close_file(output_unit)
299 
300  CALL cp_rm_default_logger() !pops the top-most logger
301  old_logger => cp_get_default_logger()
302 
303  ! restore GLOBAL%PROJECT_NAME
304  CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", &
305  c_val=old_logger%iter_info%project_name)
306 
307  CALL swarm_mpi%world%sync()
308 
309  ! do this only on master's rank 0
310  IF (swarm_mpi%world%is_source() .AND. output_unit /= default_output_unit) THEN
311  output_unit = old_logger%default_local_unit_nr
312  OPEN (unit=output_unit, file=swarm_mpi%master_output_path, &
313  status="UNKNOWN", action="WRITE", position="APPEND")
314  END IF
315  END SUBROUTINE logger_finalize
316 
317 ! **************************************************************************************************
318 !> \brief Sends a report via MPI
319 !> \param swarm_mpi ...
320 !> \param report ...
321 !> \author Ole Schuett
322 ! **************************************************************************************************
323  SUBROUTINE swarm_mpi_send_report(swarm_mpi, report)
324  TYPE(swarm_mpi_type) :: swarm_mpi
325  TYPE(swarm_message_type) :: report
326 
327  INTEGER :: dest, tag
328 
329 ! Only rank-0 of worker group sends its report
330 
331  IF (swarm_mpi%worker%is_source()) THEN
332  dest = swarm_mpi%world%num_pe - 1
333  tag = 42
334  CALL swarm_message_mpi_send(report, group=swarm_mpi%world, dest=dest, tag=tag)
335  END IF
336 
337  END SUBROUTINE swarm_mpi_send_report
338 
339 ! **************************************************************************************************
340 !> \brief Receives a report via MPI
341 !> \param swarm_mpi ...
342 !> \param report ...
343 !> \author Ole Schuett
344 ! **************************************************************************************************
345  SUBROUTINE swarm_mpi_recv_report(swarm_mpi, report)
346  TYPE(swarm_mpi_type) :: swarm_mpi
347  TYPE(swarm_message_type), INTENT(OUT) :: report
348 
349  INTEGER :: src, tag
350 
351  tag = 42
352  src = mp_any_source
353 
354  CALL swarm_message_mpi_recv(report, group=swarm_mpi%world, src=src, tag=tag)
355 
356  END SUBROUTINE swarm_mpi_recv_report
357 
358 ! **************************************************************************************************
359 !> \brief Sends a command via MPI
360 !> \param swarm_mpi ...
361 !> \param cmd ...
362 !> \author Ole Schuett
363 ! **************************************************************************************************
364  SUBROUTINE swarm_mpi_send_command(swarm_mpi, cmd)
365  TYPE(swarm_mpi_type) :: swarm_mpi
366  TYPE(swarm_message_type) :: cmd
367 
368  INTEGER :: dest, tag, worker_id
369 
370  CALL swarm_message_get(cmd, "worker_id", worker_id)
371  tag = 42
372  dest = swarm_mpi%wid2group(worker_id)
373 
374  CALL swarm_message_mpi_send(cmd, group=swarm_mpi%world, dest=dest, tag=tag)
375 
376  END SUBROUTINE swarm_mpi_send_command
377 
378 ! **************************************************************************************************
379 !> \brief Receives a command via MPI and broadcasts it within a worker.
380 !> \param swarm_mpi ...
381 !> \param cmd ...
382 !> \author Ole Schuett
383 ! **************************************************************************************************
384  SUBROUTINE swarm_mpi_recv_command(swarm_mpi, cmd)
385  TYPE(swarm_mpi_type) :: swarm_mpi
386  TYPE(swarm_message_type), INTENT(OUT) :: cmd
387 
388  INTEGER :: src, tag
389 
390 ! This is a two step communication schema.
391 ! First: The rank-0 of the worker groups receives the command from the master.
392 
393  IF (swarm_mpi%worker%is_source()) THEN
394  src = swarm_mpi%world%num_pe - 1 !
395  tag = 42
396  CALL swarm_message_mpi_recv(cmd, group=swarm_mpi%world, src=src, tag=tag)
397 
398  END IF
399 
400 ! ! Second: The command is broadcasted within the worker group.
401  CALL swarm_message_mpi_bcast(cmd, src=swarm_mpi%worker%source, group=swarm_mpi%worker)
402 
403  END SUBROUTINE swarm_mpi_recv_command
404 
405 END MODULE swarm_mpi
406 
Utility routines to open and close files. Tracking of preconnections.
Definition: cp_files.F:16
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition: cp_files.F:308
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition: cp_files.F:119
Collection of routines to handle the iteration info.
Definition: cp_iter_types.F:11
pure subroutine, public cp_iteration_info_create(iteration_info, project_name)
creates an output info object
Definition: cp_iter_types.F:94
subroutine, public cp_iteration_info_release(iteration_info)
releases the iteration_info (see doc/ReferenceCounting.html)
various routines to log and control the output. The idea is that decisions about where to log should ...
subroutine, public cp_rm_default_logger()
the cousin of cp_add_default_logger, decrements the stack, so that the default logger is what it has ...
subroutine, public cp_logger_release(logger)
releases this logger
subroutine, public cp_logger_create(logger, para_env, print_level, default_global_unit_nr, default_local_unit_nr, global_filename, local_filename, close_global_unit_on_dealloc, iter_info, close_local_unit_on_dealloc, suffix, template_logger)
initializes a logger
subroutine, public cp_add_default_logger(logger)
adds a default logger. MUST be called before logging occours
type(cp_logger_type) function, pointer, public cp_get_default_logger()
returns the default logger
objects that represent the structure of input sections and the data contained in an input section
subroutine, public section_vals_val_set(section_vals, keyword_name, i_rep_section, i_rep_val, val, l_val, i_val, r_val, c_val, l_vals_ptr, i_vals_ptr, r_vals_ptr, c_vals_ptr)
sets the requested value
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public default_string_length
Definition: kinds.F:57
integer, parameter, public default_path_length
Definition: kinds.F:58
Machine interface based on Fortran 2003 and POSIX.
Definition: machine.F:17
integer, parameter, public default_output_unit
Definition: machine.F:45
Interface to the message passing library MPI.
subroutine, public mp_para_env_release(para_env)
releases the para object (to be called when you don't want anymore the shared copy of this object)
integer, parameter, public mp_any_source
Swarm-message, a convenient data-container for with build-in serialization.
Definition: swarm_message.F:12
subroutine, public swarm_message_mpi_send(msg, group, dest, tag)
Sends a swarm message via MPI.
subroutine, public swarm_message_mpi_bcast(msg, src, group)
Broadcasts a swarm message via MPI.
subroutine, public swarm_message_mpi_recv(msg, group, src, tag)
Receives a swarm message via MPI.
Handles the MPI communication of the swarm framework.
Definition: swarm_mpi.F:12
subroutine, public swarm_mpi_init(swarm_mpi, world_para_env, root_section, n_workers, worker_id, iw)
Initialize MPI communicators for a swarm run.
Definition: swarm_mpi.F:70
subroutine, public swarm_mpi_send_command(swarm_mpi, cmd)
Sends a command via MPI.
Definition: swarm_mpi.F:365
subroutine, public swarm_mpi_recv_report(swarm_mpi, report)
Receives a report via MPI.
Definition: swarm_mpi.F:346
subroutine, public swarm_mpi_recv_command(swarm_mpi, cmd)
Receives a command via MPI and broadcasts it within a worker.
Definition: swarm_mpi.F:385
subroutine, public swarm_mpi_send_report(swarm_mpi, report)
Sends a report via MPI.
Definition: swarm_mpi.F:324
subroutine, public swarm_mpi_finalize(swarm_mpi, root_section)
Finalizes the MPI communicators of a swarm run.
Definition: swarm_mpi.F:269