(git:b279b6b)
dbcsr_vector.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief operations for skinny matrices/vectors expressed in dbcsr form
10 !> \par History
11 !> 2014.10 created [Florian Schiffmann]
12 !> \author Florian Schiffmann
13 ! **************************************************************************************************
14 
16  USE dbcsr_api, ONLY: dbcsr_copy, &
17  dbcsr_create, &
18  dbcsr_distribution_get, &
19  dbcsr_distribution_new, &
20  dbcsr_distribution_release, &
21  dbcsr_distribution_type, &
22  dbcsr_get_info, &
23  dbcsr_iterator_blocks_left, &
24  dbcsr_iterator_next_block, &
25  dbcsr_iterator_start, &
26  dbcsr_iterator_stop, &
27  dbcsr_iterator_type, &
28  dbcsr_release, &
29  dbcsr_reserve_all_blocks, &
30  dbcsr_set, dbcsr_get_data_p, &
31  dbcsr_type, &
32  dbcsr_type_antisymmetric, &
33  dbcsr_type_complex_8, &
34  dbcsr_type_complex_8, &
35  dbcsr_type_no_symmetry, &
36  dbcsr_type_real_8, &
37  dbcsr_type_real_8, &
38  dbcsr_type_symmetric
39  USE kinds, ONLY: dp, &
40  real_8
41  USE message_passing, ONLY: mp_comm_type
42 
43 #include "../base/base_uses.f90"
44 
45 !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
46 
47  IMPLICIT NONE
48 
49  PRIVATE
50 
51  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations'
52 
53 ! **************************************************************************************************
54 !> \brief Types needed for the hashtable.
55 ! **************************************************************************************************
56  TYPE ele_type
57  INTEGER :: c = 0
58  INTEGER :: p = 0
59  END TYPE ele_type
60 
61  TYPE hash_table_type
62  TYPE(ele_type), DIMENSION(:), POINTER :: table => null()
63  INTEGER :: nele = 0
64  INTEGER :: nmax = 0
65  INTEGER :: prime = 0
66  END TYPE hash_table_type
67 
68 ! **************************************************************************************************
69 !> \brief Types needed for fast access to distributed dbcsr vectors.
70 ! **************************************************************************************************
71  TYPE block_ptr_d
72  REAL(real_8), DIMENSION(:, :), POINTER :: ptr => null()
73  INTEGER :: assigned_thread = -1
74  END TYPE
75  TYPE block_ptr_z
76  COMPLEX(real_8), DIMENSION(:, :), POINTER :: ptr => null()
77  INTEGER :: assigned_thread = -1
78  END TYPE
79 
80  TYPE fast_vec_access_type
81  TYPE(hash_table_type) :: hash_table = hash_table_type()
82  TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d
83  TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z
84  END TYPE
85 
86  PUBLIC :: dbcsr_matrix_colvec_multiply, &
91 
92  INTERFACE dbcsr_matrix_colvec_multiply
93  MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d
94  MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z
95  END INTERFACE
96 
97 CONTAINS
98 
99 ! **************************************************************************************************
100 !> \brief creates a dbcsr col vector like object which lives on proc_col 0
101 !> and has the same row dist as the template matrix
102 !> the returned matrix is fully allocated and all blocks are set to 0
103 !> this is not a sparse object (and must never be)
104 !> \param dbcsr_vec the vector object to create must be allocated but not initialized
105 !> \param matrix a dbcsr matrix used as template
106 !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
107 ! **************************************************************************************************
108  SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
109  TYPE(dbcsr_type) :: dbcsr_vec, matrix
110  INTEGER :: ncol
111 
112  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_col_vec_from_matrix'
113 
114  INTEGER :: handle, npcols, data_type
115  INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
116  TYPE(dbcsr_distribution_type) :: dist_col_vec, dist
117 
118  CALL timeset(routinen, handle)
119 
120  CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
121  CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
122 
123  ALLOCATE (col_dist(1), col_blk_size(1))
124  col_dist(1) = 0
125  col_blk_size(1) = ncol
126  CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
127 
128  CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
129  matrix_type=dbcsr_type_no_symmetry, &
130  row_blk_size=row_blk_size, &
131  col_blk_size=col_blk_size, &
132  nze=0, data_type=data_type)
133  CALL dbcsr_reserve_all_blocks(dbcsr_vec)
134 
135  CALL dbcsr_distribution_release(dist_col_vec)
136  DEALLOCATE (col_dist, col_blk_size)
137  CALL timestop(handle)
138 
139  END SUBROUTINE create_col_vec_from_matrix
140 
141 ! **************************************************************************************************
142 !> \brief creates a dbcsr row vector like object which lives on proc_row 0
143 !> and has the same row dist as the template matrix
144 !> the returned matrix is fully allocated and all blocks are set to 0
145 !> this is not a sparse object (and must never be)
146 !> \param dbcsr_vec ...
147 !> \param matrix a dbcsr matrix used as template
148 !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
149 ! **************************************************************************************************
150  SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
151  TYPE(dbcsr_type) :: dbcsr_vec, matrix
152  INTEGER :: nrow
153 
154  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_row_vec_from_matrix'
155 
156  INTEGER :: handle, nprows, data_type
157  INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
158  TYPE(dbcsr_distribution_type) :: dist_row_vec, dist
159 
160  CALL timeset(routinen, handle)
161 
162  CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
163  CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
164 
165  ALLOCATE (row_dist(1), row_blk_size(1))
166  row_dist(1) = 0
167  row_blk_size(1) = nrow
168  CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
169 
170  CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
171  matrix_type=dbcsr_type_no_symmetry, &
172  row_blk_size=row_blk_size, &
173  col_blk_size=col_blk_size, &
174  nze=0, data_type=data_type)
175  CALL dbcsr_reserve_all_blocks(dbcsr_vec)
176 
177  CALL dbcsr_distribution_release(dist_row_vec)
178  DEALLOCATE (row_dist, row_blk_size)
179  CALL timestop(handle)
180 
181  END SUBROUTINE create_row_vec_from_matrix
182 
183 ! **************************************************************************************************
184 !> \brief creates a col vector like object whose blocks can be replicated
185 !> along the processor row and has the same row dist as the template matrix
186 !> the returned matrix is fully allocated and all blocks are set to 0
187 !> this is not a sparse object (and must never be)
188 !> \param dbcsr_vec the vector object to create must be allocated but not initialized
189 !> \param matrix a dbcsr matrix used as template
190 !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
191 ! **************************************************************************************************
192  SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
193  TYPE(dbcsr_type) :: dbcsr_vec, matrix
194  INTEGER :: ncol
195 
196  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_replicated_col_vec_from_matrix'
197 
198  INTEGER :: handle, npcols, data_type, i
199  INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
200  TYPE(dbcsr_distribution_type) :: dist_col_vec, dist
201  CALL timeset(routinen, handle)
202 
203  CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
204  CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
205 
206  ALLOCATE (col_dist(npcols), col_blk_size(npcols))
207  col_blk_size(:) = ncol
208  DO i = 0, npcols - 1
209  col_dist(i + 1) = i
210  END DO
211  CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
212 
213  CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
214  matrix_type=dbcsr_type_no_symmetry, &
215  row_blk_size=row_blk_size, &
216  col_blk_size=col_blk_size, &
217  nze=0, data_type=data_type)
218  CALL dbcsr_reserve_all_blocks(dbcsr_vec)
219 
220  CALL dbcsr_distribution_release(dist_col_vec)
221  DEALLOCATE (col_dist, col_blk_size)
222  CALL timestop(handle)
223 
225 
226 ! **************************************************************************************************
227 !> \brief creates a row vector like object whose blocks can be replicated
228 !> along the processor col and has the same col dist as the template matrix
229 !> the returned matrix is fully allocated and all blocks are set to 0
230 !> this is not a sparse object (and must never be)
231 !> \param dbcsr_vec the vector object to create must be allocated but not initialized
232 !> \param matrix a dbcsr matrix used as template
233 !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
234 ! **************************************************************************************************
235  SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
236  TYPE(dbcsr_type) :: dbcsr_vec
237  TYPE(dbcsr_type) :: matrix
238  INTEGER :: nrow
239 
240  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_replicated_row_vec_from_matrix'
241 
242  INTEGER :: handle, i, nprows, data_type
243  INTEGER, DIMENSION(:), POINTER :: row_dist, col_dist, row_blk_size, col_blk_size
244  TYPE(dbcsr_distribution_type) :: dist_row_vec, dist
245 
246  CALL timeset(routinen, handle)
247 
248  CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
249  CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
250 
251  ALLOCATE (row_dist(nprows), row_blk_size(nprows))
252  row_blk_size(:) = nrow
253  DO i = 0, nprows - 1
254  row_dist(i + 1) = i
255  END DO
256  CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
257 
258  CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
259  row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
260  nze=0, data_type=data_type)
261  CALL dbcsr_reserve_all_blocks(dbcsr_vec)
262 
263  CALL dbcsr_distribution_release(dist_row_vec)
264  DEALLOCATE (row_dist, row_blk_size)
265  CALL timestop(handle)
266 
268 
269 ! **************************************************************************************************
270 !> \brief given a column vector, prepare the fast_vec_access container
271 !> \param vec ...
272 !> \param fast_vec_access ...
273 ! **************************************************************************************************
274  SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
275  TYPE(dbcsr_type) :: vec
276  TYPE(fast_vec_access_type) :: fast_vec_access
277 
278  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access'
279 
280  INTEGER :: handle, data_type
281 
282  CALL timeset(routinen, handle)
283 
284  CALL dbcsr_get_info(vec, data_type=data_type)
285 
286  SELECT CASE (data_type)
287  CASE (dbcsr_type_real_8)
288  CALL create_fast_col_vec_access_d(vec, fast_vec_access)
289  CASE (dbcsr_type_complex_8)
290  CALL create_fast_col_vec_access_z(vec, fast_vec_access)
291  END SELECT
292 
293  CALL timestop(handle)
294 
295  END SUBROUTINE create_fast_col_vec_access
296 
297 ! **************************************************************************************************
298 !> \brief given a row vector, prepare the fast_vec_access_container
299 !> \param vec ...
300 !> \param fast_vec_access ...
301 ! **************************************************************************************************
302  SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
303  TYPE(dbcsr_type) :: vec
304  TYPE(fast_vec_access_type) :: fast_vec_access
305 
306  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access'
307 
308  INTEGER :: handle, data_type
309 
310  CALL timeset(routinen, handle)
311 
312  CALL dbcsr_get_info(vec, data_type=data_type)
313 
314  SELECT CASE (data_type)
315  CASE (dbcsr_type_real_8)
316  CALL create_fast_row_vec_access_d(vec, fast_vec_access)
317  CASE (dbcsr_type_complex_8)
318  CALL create_fast_row_vec_access_z(vec, fast_vec_access)
319  END SELECT
320 
321  CALL timestop(handle)
322 
323  END SUBROUTINE create_fast_row_vec_access
324 
325 ! **************************************************************************************************
326 !> \brief release all memory associated with the fast_vec_access type
327 !> \param fast_vec_access ...
328 ! **************************************************************************************************
329  SUBROUTINE release_fast_vec_access(fast_vec_access)
330  TYPE(fast_vec_access_type) :: fast_vec_access
331 
332  CHARACTER(LEN=*), PARAMETER :: routinen = 'release_fast_vec_access'
333 
334  INTEGER :: handle
335 
336  CALL timeset(routinen, handle)
337 
338  CALL hash_table_release(fast_vec_access%hash_table)
339  IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d)
340  IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z)
341 
342  CALL timestop(handle)
343 
344  END SUBROUTINE release_fast_vec_access
345 
346 ! --------------------------------------------------------------------------------------------------
347 ! Beginning of hashtable.
348 ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
349 ! part of the module to guarantee inlining
350 ! hashes (c,p) pairs, where p is assumed to be >0
351 ! on return (0 is used as a flag for not present)
352 !
353 !
354 ! **************************************************************************************************
355 !> \brief finds a prime equal or larger than i, needed at creation
356 !> \param i ...
357 !> \return ...
358 ! **************************************************************************************************
359  FUNCTION matching_prime(i) RESULT(res)
360  INTEGER, INTENT(IN) :: i
361  INTEGER :: res
362 
363  INTEGER :: j
364 
365  res = i
366  j = 0
367  DO WHILE (j < res)
368  DO j = 2, res - 1
369  IF (mod(res, j) == 0) THEN
370  res = res + 1
371  EXIT
372  END IF
373  END DO
374  END DO
375  END FUNCTION
376 
377 ! **************************************************************************************************
378 !> \brief create a hash_table of given initial size.
379 !> the hash table will expand as needed (but this requires rehashing)
380 !> \param hash_table ...
381 !> \param table_size ...
382 ! **************************************************************************************************
383  SUBROUTINE hash_table_create(hash_table, table_size)
384  TYPE(hash_table_type) :: hash_table
385  INTEGER, INTENT(IN) :: table_size
386 
387  INTEGER :: j
388 
389  ! guarantee a minimal hash table size (8), so that expansion works
390 
391  j = 3
392  DO WHILE (2**j - 1 < table_size)
393  j = j + 1
394  END DO
395  hash_table%nmax = 2**j - 1
396  hash_table%prime = matching_prime(hash_table%nmax)
397  hash_table%nele = 0
398  ALLOCATE (hash_table%table(0:hash_table%nmax))
399  END SUBROUTINE hash_table_create
400 
401 ! **************************************************************************************************
402 !> \brief ...
403 !> \param hash_table ...
404 ! **************************************************************************************************
405  SUBROUTINE hash_table_release(hash_table)
406  TYPE(hash_table_type) :: hash_table
407 
408  hash_table%nmax = 0
409  hash_table%nele = 0
410  DEALLOCATE (hash_table%table)
411 
412  END SUBROUTINE hash_table_release
413 
414 ! **************************************************************************************************
415 !> \brief add a pair (c,p) to the hash table
416 !> \param hash_table ...
417 !> \param c this value is being hashed
418 !> \param p this is being stored
419 ! **************************************************************************************************
420  RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
421  TYPE(hash_table_type), INTENT(INOUT) :: hash_table
422  INTEGER, INTENT(IN) :: c, p
423 
424  REAL(kind=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
425  inv_hash_table_fill = 2.5_real_8
426 
427  INTEGER :: i, j
428  TYPE(ele_type), ALLOCATABLE, &
429  DIMENSION(:) :: tmp_hash
430 
431 ! if too small, make a copy and rehash in a larger table
432 
433  IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
434  ALLOCATE (tmp_hash(lbound(hash_table%table, 1):ubound(hash_table%table, 1)))
435  tmp_hash(:) = hash_table%table
436  CALL hash_table_release(hash_table)
437  CALL hash_table_create(hash_table, int((ubound(tmp_hash, 1) + 8)*hash_table_expand))
438  DO i = lbound(tmp_hash, 1), ubound(tmp_hash, 1)
439  IF (tmp_hash(i)%c .NE. 0) THEN
440  CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
441  END IF
442  END DO
443  DEALLOCATE (tmp_hash)
444  END IF
445 
446  hash_table%nele = hash_table%nele + 1
447  i = iand(c*hash_table%prime, hash_table%nmax)
448 
449  DO j = i, hash_table%nmax
450  IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
451  hash_table%table(j)%c = c
452  hash_table%table(j)%p = p
453  RETURN
454  END IF
455  END DO
456  DO j = 0, i - 1
457  IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
458  hash_table%table(j)%c = c
459  hash_table%table(j)%p = p
460  RETURN
461  END IF
462  END DO
463 
464  END SUBROUTINE hash_table_add
465 
466 ! **************************************************************************************************
467 !> \brief ...
468 !> \param hash_table ...
469 !> \param c ...
470 !> \return ...
471 ! **************************************************************************************************
472  PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
473  TYPE(hash_table_type), INTENT(IN) :: hash_table
474  INTEGER, INTENT(IN) :: c
475  INTEGER :: p
476 
477  INTEGER :: i, j
478 
479  i = iand(c*hash_table%prime, hash_table%nmax)
480 
481  ! catch the likely case first
482  IF (hash_table%table(i)%c == c) THEN
483  p = hash_table%table(i)%p
484  RETURN
485  END IF
486 
487  DO j = i, hash_table%nmax
488  IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
489  p = hash_table%table(j)%p
490  RETURN
491  END IF
492  END DO
493  DO j = 0, i - 1
494  IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
495  p = hash_table%table(j)%p
496  RETURN
497  END IF
498  END DO
499 
500  ! we should never reach this point.
501  p = huge(p)
502 
503  END FUNCTION hash_table_get
504 
505 ! End of hashtable
506 ! --------------------------------------------------------------------------------------------------
507 
508 
509 
510 ! **************************************************************************************************
511 !> \brief the real driver routine for the multiply, not all symmetries implemented yet
512 !> \param matrix ...
513 !> \param vec_in ...
514 !> \param vec_out ...
515 !> \param alpha ...
516 !> \param beta ...
517 !> \param work_row ...
518 !> \param work_col ...
519 ! **************************************************************************************************
520  SUBROUTINE dbcsr_matrix_colvec_multiply_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
521  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
522  REAL(kind=real_8) :: alpha, beta
523  TYPE(dbcsr_type) :: work_row, work_col
524 
525  CHARACTER :: matrix_type
526 
527  CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
528 
529  SELECT CASE (matrix_type)
530  CASE (dbcsr_type_no_symmetry)
531  CALL dbcsr_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
532  CASE (dbcsr_type_symmetric)
533  CALL dbcsr_sym_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
534  CASE (dbcsr_type_antisymmetric)
535  ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
536  cpabort("NYI, antisymmetric matrix not permitted")
537  CASE DEFAULT
538  cpabort("Unknown matrix type, ...")
539  END SELECT
540 
541  END SUBROUTINE dbcsr_matrix_colvec_multiply_d
542 
543 ! **************************************************************************************************
544 !> \brief low level routines for matrix vector multiplies
545 !> \param matrix ...
546 !> \param vec_in ...
547 !> \param vec_out ...
548 !> \param alpha ...
549 !> \param beta ...
550 !> \param work_row ...
551 !> \param work_col ...
552 ! **************************************************************************************************
553  SUBROUTINE dbcsr_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
554  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
555  REAL(kind=real_8) :: alpha, beta
556  TYPE(dbcsr_type) :: work_row, work_col
557 
558  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrix_vector_mult'
559 
560  INTEGER :: col, mypcol, &
561  myprow, prow_handle, &
562  ncols, nrows, &
563  row, &
564  handle, handle1, ithread
565  TYPE(mp_comm_type) :: prow_group
566  REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
567  REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
568  TYPE(dbcsr_distribution_type) :: dist
569  TYPE(dbcsr_iterator_type) :: iter
570  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
571  INTEGER :: prow, pcol
572 
573  CALL timeset(routinen, handle)
574  ithread = 0
575 
576 ! Collect some data about the parallel environment. We will use them later to move the vector around
577  CALL dbcsr_get_info(matrix, distribution=dist)
578  CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
579 
580  CALL prow_group%set_handle(prow_handle)
581 
582  CALL create_fast_row_vec_access(work_row, fast_vec_row)
583  CALL create_fast_col_vec_access(work_col, fast_vec_col)
584 
585 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
586  CALL dbcsr_col_vec_to_rep_row_d (vec_in, work_col, work_row, fast_vec_col)
587 
588 ! Set the work vector for the results to 0
589  CALL dbcsr_set(work_col, 0.0_real_8)
590 
591 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
592 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
593  CALL timeset(routinen//"_local_mm", handle1)
594 
595 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
596 !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
597 !$ ithread = omp_get_thread_num()
598  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
599  DO WHILE (dbcsr_iterator_blocks_left(iter))
600  CALL dbcsr_iterator_next_block(iter, row, col, data_d)
601  prow = hash_table_get(fast_vec_col%hash_table, row)
602  IF (fast_vec_col%blk_map_d (prow)%assigned_thread .NE. ithread) cycle
603  pcol = hash_table_get(fast_vec_row%hash_table, col)
604  IF (SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1) .EQ. 0 .OR. &
605  SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2) .EQ. 0 .OR. &
606  SIZE(data_d, 2) .EQ. 0) cycle
607  CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1), &
608  SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2), &
609  SIZE(data_d, 2), &
610  1.0_dp, &
611  data_d, &
612  SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1), &
613  fast_vec_row%blk_map_d (pcol)%ptr, &
614  SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2), &
615  1.0_dp, &
616  fast_vec_col%blk_map_d (prow)%ptr, &
617  SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1))
618  END DO
619  CALL dbcsr_iterator_stop(iter)
620 !$OMP END PARALLEL
621 
622  CALL timestop(handle1)
623 
624 ! sum all the data onto the first processor col where the original vector is stored
625  data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
626  CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
627  CALL prow_group%sum(data_vec(1:nrows*ncols))
628 
629 ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
630 ! from the replicated to the original vector. Let's play it safe and use the iterator
631  CALL dbcsr_iterator_start(iter, vec_out)
632  DO WHILE (dbcsr_iterator_blocks_left(iter))
633  CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
634  prow = hash_table_get(fast_vec_col%hash_table, row)
635  IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
636  vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d (prow)%ptr(:, :)
637  ELSE
638  vec_res(:, :) = beta*vec_res(:, :)
639  END IF
640  END DO
641  CALL dbcsr_iterator_stop(iter)
642 
643  CALL release_fast_vec_access(fast_vec_row)
644  CALL release_fast_vec_access(fast_vec_col)
645 
646  CALL timestop(handle)
647 
648  END SUBROUTINE dbcsr_matrix_vector_mult_d
649 
650 ! **************************************************************************************************
651 !> \brief ...
652 !> \param matrix ...
653 !> \param vec_in ...
654 !> \param vec_out ...
655 !> \param alpha ...
656 !> \param beta ...
657 !> \param work_row ...
658 !> \param work_col ...
659 !> \param skip_diag ...
660 ! **************************************************************************************************
661  SUBROUTINE dbcsr_matrixt_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
662  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
663  REAL(kind=real_8) :: alpha, beta
664  TYPE(dbcsr_type) :: work_row, work_col
665  LOGICAL :: skip_diag
666 
667  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrixT_vector_mult'
668 
669  INTEGER :: col, col_size, mypcol, &
670  myprow, pcol_handle, prow_handle, &
671  ncols, nrows, &
672  row, row_size, &
673  handle, handle1, ithread
674  TYPE(mp_comm_type) :: pcol_group, prow_group
675  REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
676  REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
677  TYPE(dbcsr_distribution_type) :: dist
678  TYPE(dbcsr_iterator_type) :: iter
679 
680  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
681  INTEGER :: prow, pcol
682 
683  CALL timeset(routinen, handle)
684  ithread = 0
685 
686 ! Collect some data about the parallel environment. We will use them later to move the vector around
687  CALL dbcsr_get_info(matrix, distribution=dist)
688  CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
689 
690  CALL prow_group%set_handle(prow_handle)
691  CALL pcol_group%set_handle(pcol_handle)
692 
693  CALL create_fast_row_vec_access(work_row, fast_vec_row)
694  CALL create_fast_col_vec_access(work_col, fast_vec_col)
695 
696 ! Set the work vector for the results to 0
697  CALL dbcsr_set(work_row, 0.0_real_8)
698 
699 ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
700  CALL dbcsr_iterator_start(iter, vec_in)
701  DO WHILE (dbcsr_iterator_blocks_left(iter))
702  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
703  prow = hash_table_get(fast_vec_col%hash_table, row)
704  fast_vec_col%blk_map_d (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
705  END DO
706  CALL dbcsr_iterator_stop(iter)
707 ! Replicate the data on all processore in the row
708  data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
709  CALL prow_group%bcast(data_vec, 0)
710 
711 ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
712  CALL timeset(routinen//"local_mm", handle1)
713  CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
714 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
715 !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
716 !$ ithread = omp_get_thread_num()
717  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
718  DO WHILE (dbcsr_iterator_blocks_left(iter))
719  CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
720  IF (skip_diag .AND. col == row) cycle
721  prow = hash_table_get(fast_vec_col%hash_table, row)
722  pcol = hash_table_get(fast_vec_row%hash_table, col)
723  IF (ASSOCIATED(fast_vec_row%blk_map_d (pcol)%ptr) .AND. &
724  ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
725  IF (fast_vec_row%blk_map_d (pcol)%assigned_thread .NE. ithread) cycle
726  fast_vec_row%blk_map_d (pcol)%ptr = fast_vec_row%blk_map_d (pcol)%ptr + &
727  matmul(transpose(fast_vec_col%blk_map_d (prow)%ptr), data_d)
728  ELSE
729  prow = hash_table_get(fast_vec_row%hash_table, row)
730  pcol = hash_table_get(fast_vec_col%hash_table, col)
731  IF (fast_vec_row%blk_map_d (prow)%assigned_thread .NE. ithread) cycle
732  fast_vec_row%blk_map_d (prow)%ptr = fast_vec_row%blk_map_d (prow)%ptr + &
733  matmul(transpose(fast_vec_col%blk_map_d (pcol)%ptr), transpose(data_d))
734  END IF
735  END DO
736  CALL dbcsr_iterator_stop(iter)
737 !$OMP END PARALLEL
738 
739  CALL timestop(handle1)
740 
741 ! sum all the data within a processor column to obtain the replicated result
742  data_vec => dbcsr_get_data_p(work_row, select_data_type=0.0_real_8)
743 ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
744  CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
745  CALL pcol_group%sum(data_vec(1:nrows*ncols))
746 
747 ! Convert the result to a column wise distribution
748  CALL dbcsr_rep_row_to_rep_col_vec_d (work_col, work_row, fast_vec_row)
749 
750 ! Create_the final vector by summing it to the result vector which lives on proc_col 0
751  CALL dbcsr_iterator_start(iter, vec_out)
752  DO WHILE (dbcsr_iterator_blocks_left(iter))
753  CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
754  prow = hash_table_get(fast_vec_col%hash_table, row)
755  IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
756  vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d (prow)%ptr(:, :)
757  ELSE
758  vec_res(:, :) = beta*vec_res(:, :)
759  END IF
760  END DO
761  CALL dbcsr_iterator_stop(iter)
762 
763  CALL timestop(handle)
764 
765  END SUBROUTINE dbcsr_matrixt_vector_mult_d
766 
767 ! **************************************************************************************************
768 !> \brief ...
769 !> \param vec_in ...
770 !> \param rep_col_vec ...
771 !> \param rep_row_vec ...
772 !> \param fast_vec_col ...
773 ! **************************************************************************************************
774  SUBROUTINE dbcsr_col_vec_to_rep_row_d (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
775  TYPE(dbcsr_type) :: vec_in, rep_col_vec, &
776  rep_row_vec
777  TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
778 
779  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_col_vec_to_rep_row'
780 
781  INTEGER :: col, mypcol, myprow, ncols, &
782  nrows, pcol_handle, prow_handle, &
783  row, handle
784  TYPE(mp_comm_type) :: pcol_group, prow_group
785  INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
786  REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
787  REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
788  TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
789  TYPE(dbcsr_iterator_type) :: iter
790 
791  CALL timeset(routinen, handle)
792 
793 ! get information about the parallel environment
794  CALL dbcsr_get_info(vec_in, distribution=dist_in)
795  CALL dbcsr_distribution_get(dist_in, &
796  prow_group=prow_handle, &
797  pcol_group=pcol_handle, &
798  myprow=myprow, &
799  mypcol=mypcol)
800 
801  CALL prow_group%set_handle(prow_handle)
802  CALL pcol_group%set_handle(pcol_handle)
803 
804 ! Get the vector which tells us which blocks are local to which processor row in the col vec
805  CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
806  CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
807 
808 ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
809  CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
810  data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
811  data_vec => dbcsr_get_data_p(vec_in, select_data_type=0.0_real_8)
812  IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
813 ! Replicate the data along the row
814  CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
815 
816 ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
817 ! The rep_col_vec contains all blocks local to the row distribution of the vector.
818 ! The rep_row_vec only needs the fraction which is local to the col distribution.
819 ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
820 ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
821 ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
822 ! Hope this clarifies the idea
823  CALL dbcsr_set(rep_row_vec, 0.0_real_8)
824  CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
825  CALL dbcsr_iterator_start(iter, rep_row_vec)
826  DO WHILE (dbcsr_iterator_blocks_left(iter))
827  CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
828  IF (row_dist(col) == myprow) THEN
829  vec_row = transpose(fast_vec_col%blk_map_d (hash_table_get(fast_vec_col%hash_table, col))%ptr)
830  END IF
831  END DO
832  CALL dbcsr_iterator_stop(iter)
833  CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
834  data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=0.0_real_8)
835  CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
836 
837  CALL timestop(handle)
838 
839  END SUBROUTINE dbcsr_col_vec_to_rep_row_d
840 
841 ! **************************************************************************************************
842 !> \brief ...
843 !> \param rep_col_vec ...
844 !> \param rep_row_vec ...
845 !> \param fast_vec_row ...
846 !> \param fast_vec_col_add ...
847 ! **************************************************************************************************
848  SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
849  TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
850  TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
851  TYPE(fast_vec_access_type) :: fast_vec_row
852 
853  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_rep_row_to_rep_col_vec'
854 
855  INTEGER :: col, mypcol, myprow, ncols, &
856  nrows, prow_handle, &
857  row, handle
858  INTEGER, DIMENSION(:), POINTER :: col_dist
859  TYPE(mp_comm_type) :: prow_group
860  REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
861  REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
862  TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col
863  TYPE(dbcsr_iterator_type) :: iter
864 
865  CALL timeset(routinen, handle)
866 
867 ! get information about the parallel environment
868  CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
869  CALL dbcsr_distribution_get(dist_rep_col, &
870  prow_group=prow_handle, &
871  myprow=myprow, &
872  mypcol=mypcol)
873 
874  CALL prow_group%set_handle(prow_handle)
875 
876 ! Get the vector which tells us which blocks are local to which processor col in the row vec
877  CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
878  CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
879 
880 ! The same trick as described above with opposite direction
881  CALL dbcsr_set(rep_col_vec, 0.0_real_8)
882  CALL dbcsr_iterator_start(iter, rep_col_vec)
883  DO WHILE (dbcsr_iterator_blocks_left(iter))
884  CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
885  IF (col_dist(row) == mypcol) THEN
886  vec_col = transpose(fast_vec_row%blk_map_d (hash_table_get(fast_vec_row%hash_table, row))%ptr)
887  END IF
888  ! this one is special and allows to add the elements of a not yet summed replicated
889  ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
890  IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
891  fast_vec_col_add%blk_map_d (hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
892  END DO
893  CALL dbcsr_iterator_stop(iter)
894  CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
895  data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
896  CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
897 
898  CALL timestop(handle)
899 
900  END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d
901 
902 ! **************************************************************************************************
903 !> \brief given a column vector, prepare the fast_vec_access container
904 !> \param vec ...
905 !> \param fast_vec_access ...
906 ! **************************************************************************************************
907  SUBROUTINE create_fast_col_vec_access_d (vec, fast_vec_access)
908  TYPE(dbcsr_type) :: vec
909  TYPE(fast_vec_access_type) :: fast_vec_access
910 
911  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access_d'
912 
913  INTEGER :: handle, nblk_local
914  INTEGER :: col, row, iblock, nthreads
915  REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
916  TYPE(dbcsr_iterator_type) :: iter
917 
918  CALL timeset(routinen, handle)
919 
920  ! figure out the number of threads
921  nthreads = 1
922 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
923 !$OMP MASTER
924 !$ nthreads = OMP_GET_NUM_THREADS()
925 !$OMP END MASTER
926 !$OMP END PARALLEL
927 
928  CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
929  ! 4 times makes sure the table is big enough to limit collisions.
930  CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
931  ! include zero for effective dealing with values not in the hash table (will return 0)
932  ALLOCATE (fast_vec_access%blk_map_d (0:nblk_local))
933 
934  CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
935  IF (col .GT. 1) cpabort("BUG")
936 
937  ! go through the blocks of the vector
938  iblock = 0
939  CALL dbcsr_iterator_start(iter, vec)
940  DO WHILE (dbcsr_iterator_blocks_left(iter))
941  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
942  iblock = iblock + 1
943  CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
944  fast_vec_access%blk_map_d (iblock)%ptr => vec_bl
945  fast_vec_access%blk_map_d (iblock)%assigned_thread = mod(iblock, nthreads)
946  END DO
947  CALL dbcsr_iterator_stop(iter)
948 
949  CALL timestop(handle)
950 
951  END SUBROUTINE create_fast_col_vec_access_d
952 
953 ! **************************************************************************************************
954 !> \brief given a row vector, prepare the fast_vec_access_container
955 !> \param vec ...
956 !> \param fast_vec_access ...
957 ! **************************************************************************************************
958  SUBROUTINE create_fast_row_vec_access_d (vec, fast_vec_access)
959  TYPE(dbcsr_type) :: vec
960  TYPE(fast_vec_access_type) :: fast_vec_access
961 
962  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access_d'
963 
964  INTEGER :: handle, nblk_local
965  INTEGER :: col, row, iblock, nthreads
966  REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
967  TYPE(dbcsr_iterator_type) :: iter
968 
969  CALL timeset(routinen, handle)
970 
971  ! figure out the number of threads
972  nthreads = 1
973 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
974 !$OMP MASTER
975 !$ nthreads = OMP_GET_NUM_THREADS()
976 !$OMP END MASTER
977 !$OMP END PARALLEL
978 
979  CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
980  ! 4 times makes sure the table is big enough to limit collisions.
981  CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
982  ! include zero for effective dealing with values not in the hash table (will return 0)
983  ALLOCATE (fast_vec_access%blk_map_d (0:nblk_local))
984 
985  ! sanity check
986  CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
987  IF (row .GT. 1) cpabort("BUG")
988 
989  ! go through the blocks of the vector
990  iblock = 0
991  CALL dbcsr_iterator_start(iter, vec)
992  DO WHILE (dbcsr_iterator_blocks_left(iter))
993  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
994  iblock = iblock + 1
995  CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
996  fast_vec_access%blk_map_d (iblock)%ptr => vec_bl
997  fast_vec_access%blk_map_d (iblock)%assigned_thread = mod(iblock, nthreads)
998  END DO
999  CALL dbcsr_iterator_stop(iter)
1000 
1001  CALL timestop(handle)
1002 
1003  END SUBROUTINE create_fast_row_vec_access_d
1004 
1005 ! **************************************************************************************************
1006 !> \brief ...
1007 !> \param matrix ...
1008 !> \param vec_in ...
1009 !> \param vec_out ...
1010 !> \param alpha ...
1011 !> \param beta ...
1012 !> \param work_row ...
1013 !> \param work_col ...
1014 ! **************************************************************************************************
1015  SUBROUTINE dbcsr_sym_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1016  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1017  REAL(kind=real_8) :: alpha, beta
1018  TYPE(dbcsr_type) :: work_row, work_col
1019 
1020  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_sym_m_v_mult'
1021 
1022  INTEGER :: col, mypcol, &
1023  myprow, &
1024  nrows, ncols, &
1025  row, pcol_handle, &
1026  handle, handle1, ithread, vec_dim
1027  REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
1028  REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
1029  TYPE(dbcsr_distribution_type) :: dist
1030  TYPE(dbcsr_iterator_type) :: iter
1031  TYPE(dbcsr_type) :: result_row, result_col
1032  TYPE(mp_comm_type) :: pcol_group
1033 
1034  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
1035  INTEGER :: prow, pcol, rprow, rpcol
1036 
1037  CALL timeset(routinen, handle)
1038  ithread = 0
1039 ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
1040  CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
1041 ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
1042  CALL dbcsr_set(work_col, 0.0_real_8)
1043  CALL dbcsr_copy(result_col, work_col)
1044  CALL dbcsr_set(work_row, 0.0_real_8)
1045  CALL dbcsr_copy(result_row, work_row)
1046 
1047 ! Collect some data about the parallel environment. We will use them later to move the vector around
1048  CALL dbcsr_get_info(matrix=matrix, distribution=dist)
1049  CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1050 
1051  CALL pcol_group%set_handle(pcol_handle)
1052 
1053  CALL create_fast_row_vec_access(work_row, fast_vec_row)
1054  CALL create_fast_col_vec_access(work_col, fast_vec_col)
1055  CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
1056  CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
1057 
1058 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
1059  CALL dbcsr_col_vec_to_rep_row_d (vec_in, work_col, work_row, fast_vec_col)
1060 
1061 ! Probably I should rename the routine above as it delivers both the replicated row and column vector
1062 
1063 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
1064 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
1065  CALL timeset(routinen//"_local_mm", handle1)
1066 
1067 !------ perform the multiplication, we have to take car to take the correct blocks ----------
1068 
1069 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
1070 !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
1071 !$ ithread = omp_get_thread_num()
1072  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1073  DO WHILE (dbcsr_iterator_blocks_left(iter))
1074  CALL dbcsr_iterator_next_block(iter, row, col, data_d)
1075  pcol = hash_table_get(fast_vec_row%hash_table, col)
1076  rprow = hash_table_get(res_fast_vec_col%hash_table, row)
1077  IF (ASSOCIATED(fast_vec_row%blk_map_d (pcol)%ptr) .AND. &
1078  ASSOCIATED(res_fast_vec_col%blk_map_d (rprow)%ptr)) THEN
1079  IF (res_fast_vec_col%blk_map_d (rprow)%assigned_thread .EQ. ithread) THEN
1080  res_fast_vec_col%blk_map_d (rprow)%ptr = res_fast_vec_col%blk_map_d (rprow)%ptr + &
1081  matmul(data_d, transpose(fast_vec_row%blk_map_d (pcol)%ptr))
1082  END IF
1083  prow = hash_table_get(fast_vec_col%hash_table, row)
1084  rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
1085  IF (res_fast_vec_row%blk_map_d (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1086  res_fast_vec_row%blk_map_d (rpcol)%ptr = res_fast_vec_row%blk_map_d (rpcol)%ptr + &
1087  matmul(transpose(fast_vec_col%blk_map_d (prow)%ptr), data_d)
1088  END IF
1089  ELSE
1090  rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
1091  prow = hash_table_get(fast_vec_row%hash_table, row)
1092  IF (res_fast_vec_col%blk_map_d (rpcol)%assigned_thread .EQ. ithread) THEN
1093  res_fast_vec_col%blk_map_d (rpcol)%ptr = res_fast_vec_col%blk_map_d (rpcol)%ptr + &
1094  transpose(matmul(fast_vec_row%blk_map_d (prow)%ptr, data_d))
1095  END IF
1096  rprow = hash_table_get(res_fast_vec_row%hash_table, row)
1097  pcol = hash_table_get(fast_vec_col%hash_table, col)
1098  IF (res_fast_vec_row%blk_map_d (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1099  res_fast_vec_row%blk_map_d (rprow)%ptr = res_fast_vec_row%blk_map_d (rprow)%ptr + &
1100  transpose(matmul(data_d, fast_vec_col%blk_map_d (pcol)%ptr))
1101  END IF
1102  END IF
1103  END DO
1104  CALL dbcsr_iterator_stop(iter)
1105 !$OMP END PARALLEL
1106 
1107  CALL timestop(handle1)
1108 
1109  ! sum all the data within a processor column to obtain the replicated result from lower
1110  data_vec => dbcsr_get_data_p(result_row, select_data_type=0.0_real_8)
1111  CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1112 
1113  CALL pcol_group%sum(data_vec(1:nrows*ncols))
1114 !
1115 !! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1116 !! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1117 !! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1118 !! parallel bits are locally added. The sum magically creates the correct vector
1119  CALL dbcsr_rep_row_to_rep_col_vec_d (work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1120 
1121 ! ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1122  CALL dbcsr_iterator_start(iter, vec_out)
1123  DO WHILE (dbcsr_iterator_blocks_left(iter))
1124  CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1125  prow = hash_table_get(fast_vec_col%hash_table, row)
1126  IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
1127  vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_d (prow)%ptr(:, :))
1128  ELSE
1129  vec_res(:, :) = beta*vec_res(:, :)
1130  END IF
1131  END DO
1132  CALL dbcsr_iterator_stop(iter)
1133 
1134  CALL release_fast_vec_access(fast_vec_row)
1135  CALL release_fast_vec_access(fast_vec_col)
1136  CALL release_fast_vec_access(res_fast_vec_row)
1137  CALL release_fast_vec_access(res_fast_vec_col)
1138 
1139  CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1140 
1141  CALL timestop(handle)
1142 
1143  END SUBROUTINE dbcsr_sym_matrix_vector_mult_d
1144 
1145 
1146 ! **************************************************************************************************
1147 !> \brief the real driver routine for the multiply, not all symmetries implemented yet
1148 !> \param matrix ...
1149 !> \param vec_in ...
1150 !> \param vec_out ...
1151 !> \param alpha ...
1152 !> \param beta ...
1153 !> \param work_row ...
1154 !> \param work_col ...
1155 ! **************************************************************************************************
1156  SUBROUTINE dbcsr_matrix_colvec_multiply_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1157  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1158  COMPLEX(kind=real_8) :: alpha, beta
1159  TYPE(dbcsr_type) :: work_row, work_col
1160 
1161  CHARACTER :: matrix_type
1162 
1163  CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
1164 
1165  SELECT CASE (matrix_type)
1166  CASE (dbcsr_type_no_symmetry)
1167  CALL dbcsr_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1168  CASE (dbcsr_type_symmetric)
1169  CALL dbcsr_sym_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1170  CASE (dbcsr_type_antisymmetric)
1171  ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
1172  cpabort("NYI, antisymmetric matrix not permitted")
1173  CASE DEFAULT
1174  cpabort("Unknown matrix type, ...")
1175  END SELECT
1176 
1177  END SUBROUTINE dbcsr_matrix_colvec_multiply_z
1178 
1179 ! **************************************************************************************************
1180 !> \brief low level routines for matrix vector multiplies
1181 !> \param matrix ...
1182 !> \param vec_in ...
1183 !> \param vec_out ...
1184 !> \param alpha ...
1185 !> \param beta ...
1186 !> \param work_row ...
1187 !> \param work_col ...
1188 ! **************************************************************************************************
1189  SUBROUTINE dbcsr_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1190  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1191  COMPLEX(kind=real_8) :: alpha, beta
1192  TYPE(dbcsr_type) :: work_row, work_col
1193 
1194  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrix_vector_mult'
1195 
1196  INTEGER :: col, mypcol, &
1197  myprow, prow_handle, &
1198  ncols, nrows, &
1199  row, &
1200  handle, handle1, ithread
1201  TYPE(mp_comm_type) :: prow_group
1202  COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
1203  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
1204  TYPE(dbcsr_distribution_type) :: dist
1205  TYPE(dbcsr_iterator_type) :: iter
1206  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
1207  INTEGER :: prow, pcol
1208 
1209  CALL timeset(routinen, handle)
1210  ithread = 0
1211 
1212 ! Collect some data about the parallel environment. We will use them later to move the vector around
1213  CALL dbcsr_get_info(matrix, distribution=dist)
1214  CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
1215 
1216  CALL prow_group%set_handle(prow_handle)
1217 
1218  CALL create_fast_row_vec_access(work_row, fast_vec_row)
1219  CALL create_fast_col_vec_access(work_col, fast_vec_col)
1220 
1221 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
1222  CALL dbcsr_col_vec_to_rep_row_z (vec_in, work_col, work_row, fast_vec_col)
1223 
1224 ! Set the work vector for the results to 0
1225  CALL dbcsr_set(work_col, cmplx(0.0, 0.0, real_8))
1226 
1227 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
1228 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
1229  CALL timeset(routinen//"_local_mm", handle1)
1230 
1231 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
1232 !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
1233 !$ ithread = omp_get_thread_num()
1234  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1235  DO WHILE (dbcsr_iterator_blocks_left(iter))
1236  CALL dbcsr_iterator_next_block(iter, row, col, data_d)
1237  prow = hash_table_get(fast_vec_col%hash_table, row)
1238  IF (fast_vec_col%blk_map_z (prow)%assigned_thread .NE. ithread) cycle
1239  pcol = hash_table_get(fast_vec_row%hash_table, col)
1240  fast_vec_col%blk_map_z (prow)%ptr = fast_vec_col%blk_map_z (prow)%ptr + &
1241  matmul(data_d, transpose(fast_vec_row%blk_map_z (pcol)%ptr))
1242  END DO
1243  CALL dbcsr_iterator_stop(iter)
1244 !$OMP END PARALLEL
1245 
1246  CALL timestop(handle1)
1247 
1248 ! sum all the data onto the first processor col where the original vector is stored
1249  data_vec => dbcsr_get_data_p(work_col, select_data_type=cmplx(0.0, 0.0, real_8))
1250  CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
1251  CALL prow_group%sum(data_vec(1:nrows*ncols))
1252 
1253 ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
1254 ! from the replicated to the original vector. Let's play it safe and use the iterator
1255  CALL dbcsr_iterator_start(iter, vec_out)
1256  DO WHILE (dbcsr_iterator_blocks_left(iter))
1257  CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1258  prow = hash_table_get(fast_vec_col%hash_table, row)
1259  IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1260  vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z (prow)%ptr(:, :)
1261  ELSE
1262  vec_res(:, :) = beta*vec_res(:, :)
1263  END IF
1264  END DO
1265  CALL dbcsr_iterator_stop(iter)
1266 
1267  CALL release_fast_vec_access(fast_vec_row)
1268  CALL release_fast_vec_access(fast_vec_col)
1269 
1270  CALL timestop(handle)
1271 
1272  END SUBROUTINE dbcsr_matrix_vector_mult_z
1273 
1274 ! **************************************************************************************************
1275 !> \brief ...
1276 !> \param matrix ...
1277 !> \param vec_in ...
1278 !> \param vec_out ...
1279 !> \param alpha ...
1280 !> \param beta ...
1281 !> \param work_row ...
1282 !> \param work_col ...
1283 !> \param skip_diag ...
1284 ! **************************************************************************************************
1285  SUBROUTINE dbcsr_matrixt_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
1286  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1287  COMPLEX(kind=real_8) :: alpha, beta
1288  TYPE(dbcsr_type) :: work_row, work_col
1289  LOGICAL :: skip_diag
1290 
1291  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrixT_vector_mult'
1292 
1293  INTEGER :: col, col_size, mypcol, &
1294  myprow, pcol_handle, prow_handle, &
1295  ncols, nrows, &
1296  row, row_size, &
1297  handle, handle1, ithread
1298  TYPE(mp_comm_type) :: pcol_group, prow_group
1299  COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
1300  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
1301  TYPE(dbcsr_distribution_type) :: dist
1302  TYPE(dbcsr_iterator_type) :: iter
1303 
1304  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
1305  INTEGER :: prow, pcol
1306 
1307  CALL timeset(routinen, handle)
1308  ithread = 0
1309 
1310 ! Collect some data about the parallel environment. We will use them later to move the vector around
1311  CALL dbcsr_get_info(matrix, distribution=dist)
1312  CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1313 
1314  CALL prow_group%set_handle(prow_handle)
1315  CALL pcol_group%set_handle(pcol_handle)
1316 
1317  CALL create_fast_row_vec_access(work_row, fast_vec_row)
1318  CALL create_fast_col_vec_access(work_col, fast_vec_col)
1319 
1320 ! Set the work vector for the results to 0
1321  CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))
1322 
1323 ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
1324  CALL dbcsr_iterator_start(iter, vec_in)
1325  DO WHILE (dbcsr_iterator_blocks_left(iter))
1326  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
1327  prow = hash_table_get(fast_vec_col%hash_table, row)
1328  fast_vec_col%blk_map_z (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
1329  END DO
1330  CALL dbcsr_iterator_stop(iter)
1331 ! Replicate the data on all processore in the row
1332  data_vec => dbcsr_get_data_p(work_col, select_data_type=cmplx(0.0, 0.0, real_8))
1333  CALL prow_group%bcast(data_vec, 0)
1334 
1335 ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
1336  CALL timeset(routinen//"local_mm", handle1)
1337  CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
1338 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
1339 !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
1340 !$ ithread = omp_get_thread_num()
1341  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1342  DO WHILE (dbcsr_iterator_blocks_left(iter))
1343  CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
1344  IF (skip_diag .AND. col == row) cycle
1345  prow = hash_table_get(fast_vec_col%hash_table, row)
1346  pcol = hash_table_get(fast_vec_row%hash_table, col)
1347  IF (ASSOCIATED(fast_vec_row%blk_map_z (pcol)%ptr) .AND. &
1348  ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1349  IF (fast_vec_row%blk_map_z (pcol)%assigned_thread .NE. ithread) cycle
1350  fast_vec_row%blk_map_z (pcol)%ptr = fast_vec_row%blk_map_z (pcol)%ptr + &
1351  matmul(transpose(fast_vec_col%blk_map_z (prow)%ptr), data_d)
1352  ELSE
1353  prow = hash_table_get(fast_vec_row%hash_table, row)
1354  pcol = hash_table_get(fast_vec_col%hash_table, col)
1355  IF (fast_vec_row%blk_map_z (prow)%assigned_thread .NE. ithread) cycle
1356  fast_vec_row%blk_map_z (prow)%ptr = fast_vec_row%blk_map_z (prow)%ptr + &
1357  matmul(transpose(fast_vec_col%blk_map_z (pcol)%ptr), transpose(data_d))
1358  END IF
1359  END DO
1360  CALL dbcsr_iterator_stop(iter)
1361 !$OMP END PARALLEL
1362 
1363  CALL timestop(handle1)
1364 
1365 ! sum all the data within a processor column to obtain the replicated result
1366  data_vec => dbcsr_get_data_p(work_row, select_data_type=cmplx(0.0, 0.0, real_8))
1367 ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
1368  CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
1369  CALL pcol_group%sum(data_vec(1:nrows*ncols))
1370 
1371 ! Convert the result to a column wise distribution
1372  CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, work_row, fast_vec_row)
1373 
1374 ! Create_the final vector by summing it to the result vector which lives on proc_col 0
1375  CALL dbcsr_iterator_start(iter, vec_out)
1376  DO WHILE (dbcsr_iterator_blocks_left(iter))
1377  CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
1378  prow = hash_table_get(fast_vec_col%hash_table, row)
1379  IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1380  vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z (prow)%ptr(:, :)
1381  ELSE
1382  vec_res(:, :) = beta*vec_res(:, :)
1383  END IF
1384  END DO
1385  CALL dbcsr_iterator_stop(iter)
1386 
1387  CALL timestop(handle)
1388 
1389  END SUBROUTINE dbcsr_matrixt_vector_mult_z
1390 
1391 ! **************************************************************************************************
1392 !> \brief ...
1393 !> \param vec_in ...
1394 !> \param rep_col_vec ...
1395 !> \param rep_row_vec ...
1396 !> \param fast_vec_col ...
1397 ! **************************************************************************************************
1398  SUBROUTINE dbcsr_col_vec_to_rep_row_z (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
1399  TYPE(dbcsr_type) :: vec_in, rep_col_vec, &
1400  rep_row_vec
1401  TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
1402 
1403  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_col_vec_to_rep_row'
1404 
1405  INTEGER :: col, mypcol, myprow, ncols, &
1406  nrows, pcol_handle, prow_handle, &
1407  row, handle
1408  TYPE(mp_comm_type) :: pcol_group, prow_group
1409  INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
1410  COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
1411  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
1412  TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
1413  TYPE(dbcsr_iterator_type) :: iter
1414 
1415  CALL timeset(routinen, handle)
1416 
1417 ! get information about the parallel environment
1418  CALL dbcsr_get_info(vec_in, distribution=dist_in)
1419  CALL dbcsr_distribution_get(dist_in, &
1420  prow_group=prow_handle, &
1421  pcol_group=pcol_handle, &
1422  myprow=myprow, &
1423  mypcol=mypcol)
1424 
1425  CALL prow_group%set_handle(prow_handle)
1426  CALL pcol_group%set_handle(pcol_handle)
1427 
1428 ! Get the vector which tells us which blocks are local to which processor row in the col vec
1429  CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
1430  CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
1431 
1432 ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
1433  CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1434  data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1435  data_vec => dbcsr_get_data_p(vec_in, select_data_type=cmplx(0.0, 0.0, real_8))
1436  IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
1437 ! Replicate the data along the row
1438  CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
1439 
1440 ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
1441 ! The rep_col_vec contains all blocks local to the row distribution of the vector.
1442 ! The rep_row_vec only needs the fraction which is local to the col distribution.
1443 ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
1444 ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
1445 ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
1446 ! Hope this clarifies the idea
1447  CALL dbcsr_set(rep_row_vec, cmplx(0.0, 0.0, real_8))
1448  CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
1449  CALL dbcsr_iterator_start(iter, rep_row_vec)
1450  DO WHILE (dbcsr_iterator_blocks_left(iter))
1451  CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
1452  IF (row_dist(col) == myprow) THEN
1453  vec_row = transpose(fast_vec_col%blk_map_z (hash_table_get(fast_vec_col%hash_table, col))%ptr)
1454  END IF
1455  END DO
1456  CALL dbcsr_iterator_stop(iter)
1457  CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1458  data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1459  CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
1460 
1461  CALL timestop(handle)
1462 
1463  END SUBROUTINE dbcsr_col_vec_to_rep_row_z
1464 
1465 ! **************************************************************************************************
1466 !> \brief ...
1467 !> \param rep_col_vec ...
1468 !> \param rep_row_vec ...
1469 !> \param fast_vec_row ...
1470 !> \param fast_vec_col_add ...
1471 ! **************************************************************************************************
1472  SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
1473  TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
1474  TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
1475  TYPE(fast_vec_access_type) :: fast_vec_row
1476 
1477  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_rep_row_to_rep_col_vec'
1478 
1479  INTEGER :: col, mypcol, myprow, ncols, &
1480  nrows, prow_handle, &
1481  row, handle
1482  INTEGER, DIMENSION(:), POINTER :: col_dist
1483  TYPE(mp_comm_type) :: prow_group
1484  COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
1485  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
1486  TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col
1487  TYPE(dbcsr_iterator_type) :: iter
1488 
1489  CALL timeset(routinen, handle)
1490 
1491 ! get information about the parallel environment
1492  CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
1493  CALL dbcsr_distribution_get(dist_rep_col, &
1494  prow_group=prow_handle, &
1495  myprow=myprow, &
1496  mypcol=mypcol)
1497 
1498  CALL prow_group%set_handle(prow_handle)
1499 
1500 ! Get the vector which tells us which blocks are local to which processor col in the row vec
1501  CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
1502  CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
1503 
1504 ! The same trick as described above with opposite direction
1505  CALL dbcsr_set(rep_col_vec, cmplx(0.0, 0.0, real_8))
1506  CALL dbcsr_iterator_start(iter, rep_col_vec)
1507  DO WHILE (dbcsr_iterator_blocks_left(iter))
1508  CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
1509  IF (col_dist(row) == mypcol) THEN
1510  vec_col = transpose(fast_vec_row%blk_map_z (hash_table_get(fast_vec_row%hash_table, row))%ptr)
1511  END IF
1512  ! this one is special and allows to add the elements of a not yet summed replicated
1513  ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
1514  IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
1515  fast_vec_col_add%blk_map_z (hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
1516  END DO
1517  CALL dbcsr_iterator_stop(iter)
1518  CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1519  data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1520  CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
1521 
1522  CALL timestop(handle)
1523 
1524  END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z
1525 
1526 ! **************************************************************************************************
1527 !> \brief given a column vector, prepare the fast_vec_access container
1528 !> \param vec ...
1529 !> \param fast_vec_access ...
1530 ! **************************************************************************************************
1531  SUBROUTINE create_fast_col_vec_access_z (vec, fast_vec_access)
1532  TYPE(dbcsr_type) :: vec
1533  TYPE(fast_vec_access_type) :: fast_vec_access
1534 
1535  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access_z'
1536 
1537  INTEGER :: handle, nblk_local
1538  INTEGER :: col, row, iblock, nthreads
1539  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
1540  TYPE(dbcsr_iterator_type) :: iter
1541 
1542  CALL timeset(routinen, handle)
1543 
1544  ! figure out the number of threads
1545  nthreads = 1
1546 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
1547 !$OMP MASTER
1548 !$ nthreads = OMP_GET_NUM_THREADS()
1549 !$OMP END MASTER
1550 !$OMP END PARALLEL
1551 
1552  CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
1553  ! 4 times makes sure the table is big enough to limit collisions.
1554  CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
1555  ! include zero for effective dealing with values not in the hash table (will return 0)
1556  ALLOCATE (fast_vec_access%blk_map_z (0:nblk_local))
1557 
1558  CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
1559  IF (col .GT. 1) cpabort("BUG")
1560 
1561  ! go through the blocks of the vector
1562  iblock = 0
1563  CALL dbcsr_iterator_start(iter, vec)
1564  DO WHILE (dbcsr_iterator_blocks_left(iter))
1565  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
1566  iblock = iblock + 1
1567  CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
1568  fast_vec_access%blk_map_z (iblock)%ptr => vec_bl
1569  fast_vec_access%blk_map_z (iblock)%assigned_thread = mod(iblock, nthreads)
1570  END DO
1571  CALL dbcsr_iterator_stop(iter)
1572 
1573  CALL timestop(handle)
1574 
1575  END SUBROUTINE create_fast_col_vec_access_z
1576 
1577 ! **************************************************************************************************
1578 !> \brief given a row vector, prepare the fast_vec_access_container
1579 !> \param vec ...
1580 !> \param fast_vec_access ...
1581 ! **************************************************************************************************
1582  SUBROUTINE create_fast_row_vec_access_z (vec, fast_vec_access)
1583  TYPE(dbcsr_type) :: vec
1584  TYPE(fast_vec_access_type) :: fast_vec_access
1585 
1586  CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access_z'
1587 
1588  INTEGER :: handle, nblk_local
1589  INTEGER :: col, row, iblock, nthreads
1590  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
1591  TYPE(dbcsr_iterator_type) :: iter
1592 
1593  CALL timeset(routinen, handle)
1594 
1595  ! figure out the number of threads
1596  nthreads = 1
1597 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
1598 !$OMP MASTER
1599 !$ nthreads = OMP_GET_NUM_THREADS()
1600 !$OMP END MASTER
1601 !$OMP END PARALLEL
1602 
1603  CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
1604  ! 4 times makes sure the table is big enough to limit collisions.
1605  CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
1606  ! include zero for effective dealing with values not in the hash table (will return 0)
1607  ALLOCATE (fast_vec_access%blk_map_z (0:nblk_local))
1608 
1609  ! sanity check
1610  CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
1611  IF (row .GT. 1) cpabort("BUG")
1612 
1613  ! go through the blocks of the vector
1614  iblock = 0
1615  CALL dbcsr_iterator_start(iter, vec)
1616  DO WHILE (dbcsr_iterator_blocks_left(iter))
1617  CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
1618  iblock = iblock + 1
1619  CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
1620  fast_vec_access%blk_map_z (iblock)%ptr => vec_bl
1621  fast_vec_access%blk_map_z (iblock)%assigned_thread = mod(iblock, nthreads)
1622  END DO
1623  CALL dbcsr_iterator_stop(iter)
1624 
1625  CALL timestop(handle)
1626 
1627  END SUBROUTINE create_fast_row_vec_access_z
1628 
1629 ! **************************************************************************************************
1630 !> \brief ...
1631 !> \param matrix ...
1632 !> \param vec_in ...
1633 !> \param vec_out ...
1634 !> \param alpha ...
1635 !> \param beta ...
1636 !> \param work_row ...
1637 !> \param work_col ...
1638 ! **************************************************************************************************
1639  SUBROUTINE dbcsr_sym_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1640  TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1641  COMPLEX(kind=real_8) :: alpha, beta
1642  TYPE(dbcsr_type) :: work_row, work_col
1643 
1644  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_sym_m_v_mult'
1645 
1646  INTEGER :: col, mypcol, &
1647  myprow, &
1648  nrows, ncols, &
1649  row, pcol_handle, &
1650  handle, handle1, ithread, vec_dim
1651  COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
1652  COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
1653  TYPE(dbcsr_distribution_type) :: dist
1654  TYPE(dbcsr_iterator_type) :: iter
1655  TYPE(dbcsr_type) :: result_row, result_col
1656  TYPE(mp_comm_type) :: pcol_group
1657 
1658  TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
1659  INTEGER :: prow, pcol, rprow, rpcol
1660 
1661  CALL timeset(routinen, handle)
1662  ithread = 0
1663 ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
1664  CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
1665 ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
1666  CALL dbcsr_set(work_col, cmplx(0.0, 0.0, real_8))
1667  CALL dbcsr_copy(result_col, work_col)
1668  CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))
1669  CALL dbcsr_copy(result_row, work_row)
1670 
1671 ! Collect some data about the parallel environment. We will use them later to move the vector around
1672  CALL dbcsr_get_info(matrix=matrix, distribution=dist)
1673  CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1674 
1675  CALL pcol_group%set_handle(pcol_handle)
1676 
1677  CALL create_fast_row_vec_access(work_row, fast_vec_row)
1678  CALL create_fast_col_vec_access(work_col, fast_vec_col)
1679  CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
1680  CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
1681 
1682 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
1683  CALL dbcsr_col_vec_to_rep_row_z (vec_in, work_col, work_row, fast_vec_col)
1684 
1685 ! Probably I should rename the routine above as it delivers both the replicated row and column vector
1686 
1687 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
1688 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
1689  CALL timeset(routinen//"_local_mm", handle1)
1690 
1691 !------ perform the multiplication, we have to take car to take the correct blocks ----------
1692 
1693 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
1694 !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
1695 !$ ithread = omp_get_thread_num()
1696  CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1697  DO WHILE (dbcsr_iterator_blocks_left(iter))
1698  CALL dbcsr_iterator_next_block(iter, row, col, data_d)
1699  pcol = hash_table_get(fast_vec_row%hash_table, col)
1700  rprow = hash_table_get(res_fast_vec_col%hash_table, row)
1701  IF (ASSOCIATED(fast_vec_row%blk_map_z (pcol)%ptr) .AND. &
1702  ASSOCIATED(res_fast_vec_col%blk_map_z (rprow)%ptr)) THEN
1703  IF (res_fast_vec_col%blk_map_z (rprow)%assigned_thread .EQ. ithread) THEN
1704  res_fast_vec_col%blk_map_z (rprow)%ptr = res_fast_vec_col%blk_map_z (rprow)%ptr + &
1705  matmul(data_d, transpose(fast_vec_row%blk_map_z (pcol)%ptr))
1706  END IF
1707  prow = hash_table_get(fast_vec_col%hash_table, row)
1708  rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
1709  IF (res_fast_vec_row%blk_map_z (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1710  res_fast_vec_row%blk_map_z (rpcol)%ptr = res_fast_vec_row%blk_map_z (rpcol)%ptr + &
1711  matmul(transpose(fast_vec_col%blk_map_z (prow)%ptr), data_d)
1712  END IF
1713  ELSE
1714  rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
1715  prow = hash_table_get(fast_vec_row%hash_table, row)
1716  IF (res_fast_vec_col%blk_map_z (rpcol)%assigned_thread .EQ. ithread) THEN
1717  res_fast_vec_col%blk_map_z (rpcol)%ptr = res_fast_vec_col%blk_map_z (rpcol)%ptr + &
1718  transpose(matmul(fast_vec_row%blk_map_z (prow)%ptr, data_d))
1719  END IF
1720  rprow = hash_table_get(res_fast_vec_row%hash_table, row)
1721  pcol = hash_table_get(fast_vec_col%hash_table, col)
1722  IF (res_fast_vec_row%blk_map_z (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1723  res_fast_vec_row%blk_map_z (rprow)%ptr = res_fast_vec_row%blk_map_z (rprow)%ptr + &
1724  transpose(matmul(data_d, fast_vec_col%blk_map_z (pcol)%ptr))
1725  END IF
1726  END IF
1727  END DO
1728  CALL dbcsr_iterator_stop(iter)
1729 !$OMP END PARALLEL
1730 
1731  CALL timestop(handle1)
1732 
1733  ! sum all the data within a processor column to obtain the replicated result from lower
1734  data_vec => dbcsr_get_data_p(result_row, select_data_type=cmplx(0.0, 0.0, real_8))
1735  CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1736 
1737  CALL pcol_group%sum(data_vec(1:nrows*ncols))
1738 !
1739 !! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1740 !! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1741 !! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1742 !! parallel bits are locally added. The sum magically creates the correct vector
1743  CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1744 
1745 ! ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1746  CALL dbcsr_iterator_start(iter, vec_out)
1747  DO WHILE (dbcsr_iterator_blocks_left(iter))
1748  CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1749  prow = hash_table_get(fast_vec_col%hash_table, row)
1750  IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1751  vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_z (prow)%ptr(:, :))
1752  ELSE
1753  vec_res(:, :) = beta*vec_res(:, :)
1754  END IF
1755  END DO
1756  CALL dbcsr_iterator_stop(iter)
1757 
1758  CALL release_fast_vec_access(fast_vec_row)
1759  CALL release_fast_vec_access(fast_vec_col)
1760  CALL release_fast_vec_access(res_fast_vec_row)
1761  CALL release_fast_vec_access(res_fast_vec_col)
1762 
1763  CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1764 
1765  CALL timestop(handle)
1766 
1767  END SUBROUTINE dbcsr_sym_matrix_vector_mult_z
1768 
1769 
1770 END MODULE dbcsr_vector
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
operations for skinny matrices/vectors expressed in dbcsr form
Definition: dbcsr_vector.F:15
subroutine, public create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
creates a row vector like object whose blocks can be replicated along the processor col and has the s...
Definition: dbcsr_vector.F:236
subroutine, public create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
creates a dbcsr col vector like object which lives on proc_col 0 and has the same row dist as the tem...
Definition: dbcsr_vector.F:109
subroutine, public create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
creates a col vector like object whose blocks can be replicated along the processor row and has the s...
Definition: dbcsr_vector.F:193
subroutine, public create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
creates a dbcsr row vector like object which lives on proc_row 0 and has the same row dist as the tem...
Definition: dbcsr_vector.F:151
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public dp
Definition: kinds.F:34
integer, parameter, public real_8
Definition: kinds.F:41
Interface to the message passing library MPI.