dbcsr_vector.F
!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                              !
!   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
!                                                                                                  !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief operations for skinny matrices/vectors expressed in dbcsr form
!> \par History
!>      2014.10 created [Florian Schiffmann]
!> \author Florian Schiffmann
! **************************************************************************************************

MODULE dbcsr_vector
   USE dbcsr_api, ONLY: dbcsr_copy, &
                        dbcsr_create, &
                        dbcsr_distribution_get, &
                        dbcsr_distribution_new, &
                        dbcsr_distribution_release, &
                        dbcsr_distribution_type, &
                        dbcsr_get_info, &
                        dbcsr_iterator_blocks_left, &
                        dbcsr_iterator_next_block, &
                        dbcsr_iterator_start, &
                        dbcsr_iterator_stop, &
                        dbcsr_iterator_type, &
                        dbcsr_release, &
                        dbcsr_reserve_all_blocks, &
                        dbcsr_set, dbcsr_get_data_p, &
                        dbcsr_type, &
                        dbcsr_type_antisymmetric, &
                        dbcsr_type_complex_8, &
                        dbcsr_type_no_symmetry, &
                        dbcsr_type_real_8, &
                        dbcsr_type_symmetric
   USE kinds, ONLY: dp, &
                    real_8
   USE message_passing, ONLY: mp_comm_type

#include "../base/base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations'

! **************************************************************************************************
!> \brief Types needed for the hashtable.
! **************************************************************************************************
   TYPE ele_type
      INTEGER :: c = 0
      INTEGER :: p = 0
   END TYPE ele_type

   TYPE hash_table_type
      TYPE(ele_type), DIMENSION(:), POINTER :: table => null()
      INTEGER :: nele = 0
      INTEGER :: nmax = 0
      INTEGER :: prime = 0
   END TYPE hash_table_type

! **************************************************************************************************
!> \brief Types needed for fast access to distributed dbcsr vectors.
! **************************************************************************************************
   TYPE block_ptr_d
      REAL(real_8), DIMENSION(:, :), POINTER :: ptr => null()
      INTEGER :: assigned_thread = -1
   END TYPE
   TYPE block_ptr_z
      COMPLEX(real_8), DIMENSION(:, :), POINTER :: ptr => null()
      INTEGER :: assigned_thread = -1
   END TYPE

   TYPE fast_vec_access_type
      TYPE(hash_table_type) :: hash_table = hash_table_type()
      TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d
      TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z
   END TYPE

   PUBLIC :: dbcsr_matrix_colvec_multiply, &
             create_col_vec_from_matrix, &
             create_row_vec_from_matrix, &
             create_replicated_col_vec_from_matrix, &
             create_replicated_row_vec_from_matrix

   INTERFACE dbcsr_matrix_colvec_multiply
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z
   END INTERFACE

CONTAINS

! **************************************************************************************************
!> \brief creates a dbcsr col vector like object which lives on proc_col 0
!>        and has the same row dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the dbcsr object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'

      INTEGER :: handle, npcols, data_type
      INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type) :: dist_col_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(1), col_blk_size(1))
      col_dist(1) = 0
      col_blk_size(1) = ncol
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_col_vec_from_matrix

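! A minimal usage sketch for the routine above, assuming an existing dbcsr matrix mat
! (the names mat and vec are illustrative only):
!
!    TYPE(dbcsr_type) :: vec
!    CALL create_col_vec_from_matrix(vec, mat, 1)
!    ! ... use vec as a dense column vector with the row distribution of mat ...
!    CALL dbcsr_release(vec)
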
! **************************************************************************************************
!> \brief creates a dbcsr row vector like object which lives on proc_row 0
!>        and has the same col dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec ...
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the dbcsr object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'

      INTEGER :: handle, nprows, data_type
      INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type) :: dist_row_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(1), row_blk_size(1))
      row_dist(1) = 0
      row_blk_size(1) = nrow
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_row_vec_from_matrix

! **************************************************************************************************
!> \brief creates a col vector like object whose blocks can be replicated
!>        along the processor row and has the same row dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the dbcsr object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'

      INTEGER :: handle, npcols, data_type, i
      INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type) :: dist_col_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(npcols), col_blk_size(npcols))
      col_blk_size(:) = ncol
      DO i = 0, npcols - 1
         col_dist(i + 1) = i
      END DO
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_col_vec_from_matrix

! **************************************************************************************************
!> \brief creates a row vector like object whose blocks can be replicated
!>        along the processor col and has the same col dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the dbcsr object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type) :: dbcsr_vec
      TYPE(dbcsr_type) :: matrix
      INTEGER :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'

      INTEGER :: handle, i, nprows, data_type
      INTEGER, DIMENSION(:), POINTER :: row_dist, col_dist, row_blk_size, col_blk_size
      TYPE(dbcsr_distribution_type) :: dist_row_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(nprows), row_blk_size(nprows))
      row_blk_size(:) = nrow
      DO i = 0, nprows - 1
         row_dist(i + 1) = i
      END DO
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_row_vec_from_matrix

! **************************************************************************************************
!> \brief given a column vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'

      INTEGER :: handle, data_type

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(vec, data_type=data_type)

      SELECT CASE (data_type)
      CASE (dbcsr_type_real_8)
         CALL create_fast_col_vec_access_d(vec, fast_vec_access)
      CASE (dbcsr_type_complex_8)
         CALL create_fast_col_vec_access_z(vec, fast_vec_access)
      END SELECT

      CALL timestop(handle)

   END SUBROUTINE create_fast_col_vec_access

! **************************************************************************************************
!> \brief given a row vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'

      INTEGER :: handle, data_type

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(vec, data_type=data_type)

      SELECT CASE (data_type)
      CASE (dbcsr_type_real_8)
         CALL create_fast_row_vec_access_d(vec, fast_vec_access)
      CASE (dbcsr_type_complex_8)
         CALL create_fast_row_vec_access_z(vec, fast_vec_access)
      END SELECT

      CALL timestop(handle)

   END SUBROUTINE create_fast_row_vec_access

! **************************************************************************************************
!> \brief release all memory associated with the fast_vec_access type
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE release_fast_vec_access(fast_vec_access)
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'

      INTEGER :: handle

      CALL timeset(routineN, handle)

      CALL hash_table_release(fast_vec_access%hash_table)
      IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d)
      IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z)

      CALL timestop(handle)

   END SUBROUTINE release_fast_vec_access

! --------------------------------------------------------------------------------------------------
! Beginning of hashtable.
! this file can be 'INCLUDE'ed verbatim in various places, where it needs to be
! part of the module to guarantee inlining
! hashes (c,p) pairs, where p is assumed to be >0
! (on return, 0 is used as a flag for 'not present')
!
!
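! A minimal usage sketch of the hashtable (the names are illustrative):
!
!    TYPE(hash_table_type) :: ht
!    CALL hash_table_create(ht, 16)
!    CALL hash_table_add(ht, 42, 7)   ! store p=7 under key c=42
!    ! hash_table_get(ht, 42) now returns 7; a key that was never added returns 0
!    CALL hash_table_release(ht)
!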
! **************************************************************************************************
!> \brief finds the smallest prime equal to or larger than i, needed at hash table creation
!> \param i ...
!> \return ...
! **************************************************************************************************
   FUNCTION matching_prime(i) RESULT(res)
      INTEGER, INTENT(IN) :: i
      INTEGER :: res

      INTEGER :: j

      res = i
      j = 0
      DO WHILE (j < res)
         DO j = 2, res - 1
            IF (mod(res, j) == 0) THEN
               res = res + 1
               EXIT
            END IF
         END DO
      END DO
   END FUNCTION
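! Note: the loop above restarts the trial division from 2 whenever a divisor is
! found, so the result is the smallest prime >= i; e.g. matching_prime(8) tests
! 8, 9 and 10 before returning 11.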

! **************************************************************************************************
!> \brief create a hash_table of given initial size.
!>        the hash table will expand as needed (but this requires rehashing)
!> \param hash_table ...
!> \param table_size ...
! **************************************************************************************************
   SUBROUTINE hash_table_create(hash_table, table_size)
      TYPE(hash_table_type) :: hash_table
      INTEGER, INTENT(IN) :: table_size

      INTEGER :: j

      ! guarantee a minimal hash table size (8), so that expansion works
      j = 3
      DO WHILE (2**j - 1 < table_size)
         j = j + 1
      END DO
      hash_table%nmax = 2**j - 1
      hash_table%prime = matching_prime(hash_table%nmax)
      hash_table%nele = 0
      ALLOCATE (hash_table%table(0:hash_table%nmax))
   END SUBROUTINE hash_table_create

! **************************************************************************************************
!> \brief releases the hash table
!> \param hash_table ...
! **************************************************************************************************
   SUBROUTINE hash_table_release(hash_table)
      TYPE(hash_table_type) :: hash_table

      hash_table%nmax = 0
      hash_table%nele = 0
      DEALLOCATE (hash_table%table)

   END SUBROUTINE hash_table_release

! **************************************************************************************************
!> \brief add a pair (c,p) to the hash table
!> \param hash_table ...
!> \param c this value is being hashed
!> \param p this is being stored
! **************************************************************************************************
   RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
      TYPE(hash_table_type), INTENT(INOUT) :: hash_table
      INTEGER, INTENT(IN) :: c, p

      REAL(kind=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
                                      inv_hash_table_fill = 2.5_real_8

      INTEGER :: i, j
      TYPE(ele_type), ALLOCATABLE, DIMENSION(:) :: tmp_hash

      ! if too small, make a copy and rehash in a larger table
      IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
         ALLOCATE (tmp_hash(lbound(hash_table%table, 1):ubound(hash_table%table, 1)))
         tmp_hash(:) = hash_table%table
         CALL hash_table_release(hash_table)
         CALL hash_table_create(hash_table, int((ubound(tmp_hash, 1) + 8)*hash_table_expand))
         DO i = lbound(tmp_hash, 1), ubound(tmp_hash, 1)
            IF (tmp_hash(i)%c .NE. 0) THEN
               CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
            END IF
         END DO
         DEALLOCATE (tmp_hash)
      END IF

      hash_table%nele = hash_table%nele + 1
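      ! nmax is of the form 2**j - 1, so iand(c*hash_table%prime, nmax) is just
      ! c*prime modulo 2**j: a cheap multiplicative hash into the table range.
      ! Collisions are resolved by the linear probing (with wrap-around) below.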
      i = iand(c*hash_table%prime, hash_table%nmax)

      DO j = i, hash_table%nmax
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            hash_table%table(j)%c = c
            hash_table%table(j)%p = p
            RETURN
         END IF
      END DO
      DO j = 0, i - 1
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            hash_table%table(j)%c = c
            hash_table%table(j)%p = p
            RETURN
         END IF
      END DO

   END SUBROUTINE hash_table_add

! **************************************************************************************************
!> \brief get the value p stored for key c (returns 0 if c is not present)
!> \param hash_table ...
!> \param c ...
!> \return ...
! **************************************************************************************************
   PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
      TYPE(hash_table_type), INTENT(IN) :: hash_table
      INTEGER, INTENT(IN) :: c
      INTEGER :: p

      INTEGER :: i, j

      i = iand(c*hash_table%prime, hash_table%nmax)

      ! catch the likely case first
      IF (hash_table%table(i)%c == c) THEN
         p = hash_table%table(i)%p
         RETURN
      END IF

      DO j = i, hash_table%nmax
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            p = hash_table%table(j)%p
            RETURN
         END IF
      END DO
      DO j = 0, i - 1
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            p = hash_table%table(j)%p
            RETURN
         END IF
      END DO

      ! we should never reach this point.
      p = huge(p)

   END FUNCTION hash_table_get

! End of hashtable
! --------------------------------------------------------------------------------------------------

! **************************************************************************************************
!> \brief the real driver routine for the multiply, not all symmetries implemented yet
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_colvec_multiply_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER :: matrix_type

      CALL dbcsr_get_info(matrix, matrix_type=matrix_type)

      SELECT CASE (matrix_type)
      CASE (dbcsr_type_no_symmetry)
         CALL dbcsr_matrix_vector_mult_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_symmetric)
         CALL dbcsr_sym_matrix_vector_mult_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_antisymmetric)
         ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
         cpabort("NYI, antisymmetric matrix not permitted")
      CASE DEFAULT
         cpabort("Unknown matrix type, ...")
      END SELECT

   END SUBROUTINE dbcsr_matrix_colvec_multiply_d

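! A minimal usage sketch for the generic dbcsr_matrix_colvec_multiply, assuming an
! existing square dbcsr matrix mat (the names x, y and the work vectors are illustrative):
!
!    TYPE(dbcsr_type) :: x, y, work_row, work_col
!    CALL create_col_vec_from_matrix(x, mat, 1)
!    CALL dbcsr_copy(y, x)
!    CALL create_replicated_row_vec_from_matrix(work_row, mat, 1)
!    CALL create_replicated_col_vec_from_matrix(work_col, mat, 1)
!    ! ... fill x ..., then compute y = 1.0*mat*x + 0.0*y
!    CALL dbcsr_matrix_colvec_multiply(mat, x, y, 1.0_dp, 0.0_dp, work_row, work_col)
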
! **************************************************************************************************
!> \brief low level routines for matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_vector_mult_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'

      INTEGER :: col, mypcol, myprow, prow_handle, ncols, nrows, row, &
                 handle, handle1, ithread
      TYPE(mp_comm_type) :: prow_group
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
      INTEGER :: prow, pcol

      CALL timeset(routineN, handle)
      ithread = 0

! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)

! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
      CALL dbcsr_col_vec_to_rep_row_d(vec_in, work_col, work_row, fast_vec_col)

! Set the work vector for the results to 0
      CALL dbcsr_set(work_col, 0.0_real_8)

! Perform the local multiply. Here we exploit that the blocks are replicated on the MPI processes.
! It is important to note that the input and result vectors are distributed differently (row wise and col wise, respectively)
      CALL timeset(routineN//"_local_mm", handle1)

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
!$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
!$    ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (fast_vec_col%blk_map_d(prow)%assigned_thread .NE. ithread) cycle
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         IF (SIZE(fast_vec_col%blk_map_d(prow)%ptr, 1) .EQ. 0 .OR. &
             SIZE(fast_vec_col%blk_map_d(prow)%ptr, 2) .EQ. 0 .OR. &
             SIZE(data_d, 2) .EQ. 0) cycle
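         ! accumulate the local product: the work_col block for this block row gets
         ! data_d * transpose(work_row block), i.e. dgemm computes C = 1.0*A*B**T + 1.0*C
         ! with C stored in fast_vec_col%blk_map_d(prow)%ptr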
         CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map_d(prow)%ptr, 1), &
                    SIZE(fast_vec_col%blk_map_d(prow)%ptr, 2), &
                    SIZE(data_d, 2), &
                    1.0_dp, &
                    data_d, &
                    SIZE(fast_vec_col%blk_map_d(prow)%ptr, 1), &
                    fast_vec_row%blk_map_d(pcol)%ptr, &
                    SIZE(fast_vec_col%blk_map_d(prow)%ptr, 2), &
                    1.0_dp, &
                    fast_vec_col%blk_map_d(prow)%ptr, &
                    SIZE(fast_vec_col%blk_map_d(prow)%ptr, 1))
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

! sum all the data onto the first processor col where the original vector is stored
      data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
      CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
      CALL prow_group%sum(data_vec(1:nrows*ncols))

! Local copy on the first mpi col (as this is the location of the vec_res blocks) of the result vector
! from the replicated to the original vector. Let's play it safe and use the iterator
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map_d(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d(prow)%ptr(:, :)
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_matrix_vector_mult_d

! **************************************************************************************************
!> \brief ...
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
!> \param skip_diag ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrixt_vector_mult_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col
      LOGICAL :: skip_diag

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'

      INTEGER :: col, col_size, mypcol, myprow, pcol_handle, prow_handle, &
                 ncols, nrows, row, row_size, handle, handle1, ithread
      TYPE(mp_comm_type) :: pcol_group, prow_group
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter

      TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
      INTEGER :: prow, pcol

      CALL timeset(routineN, handle)
      ithread = 0

! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)
      CALL pcol_group%set_handle(pcol_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)

! Set the work vector for the results to 0
      CALL dbcsr_set(work_row, 0.0_real_8)

! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_in)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         fast_vec_col%blk_map_d(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
      END DO
      CALL dbcsr_iterator_stop(iter)
! Replicate the data on all processors in the row
      data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
      CALL prow_group%bcast(data_vec, 0)

! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
      CALL timeset(routineN//"local_mm", handle1)
      CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
!$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
!$    ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
         IF (skip_diag .AND. col == row) cycle
         prow = hash_table_get(fast_vec_col%hash_table, row)
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         IF (ASSOCIATED(fast_vec_row%blk_map_d(pcol)%ptr) .AND. &
             ASSOCIATED(fast_vec_col%blk_map_d(prow)%ptr)) THEN
            IF (fast_vec_row%blk_map_d(pcol)%assigned_thread .NE. ithread) cycle
            fast_vec_row%blk_map_d(pcol)%ptr = fast_vec_row%blk_map_d(pcol)%ptr + &
                                               matmul(transpose(fast_vec_col%blk_map_d(prow)%ptr), data_d)
         ELSE
            prow = hash_table_get(fast_vec_row%hash_table, row)
            pcol = hash_table_get(fast_vec_col%hash_table, col)
            IF (fast_vec_row%blk_map_d(prow)%assigned_thread .NE. ithread) cycle
            fast_vec_row%blk_map_d(prow)%ptr = fast_vec_row%blk_map_d(prow)%ptr + &
                                               matmul(transpose(fast_vec_col%blk_map_d(pcol)%ptr), transpose(data_d))
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

! sum all the data within a processor column to obtain the replicated result
      data_vec => dbcsr_get_data_p(work_row, select_data_type=0.0_real_8)
! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
      CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
      CALL pcol_group%sum(data_vec(1:nrows*ncols))

! Convert the result to a column wise distribution
      CALL dbcsr_rep_row_to_rep_col_vec_d(work_col, work_row, fast_vec_row)

! Create the final vector by summing it into the result vector which lives on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map_d(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d(prow)%ptr(:, :)
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_matrixt_vector_mult_d

! **************************************************************************************************
!> \brief ...
!> \param vec_in ...
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_col_vec_to_rep_row_d(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
      TYPE(dbcsr_type) :: vec_in, rep_col_vec, rep_row_vec
      TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'

      INTEGER :: col, mypcol, myprow, ncols, nrows, pcol_handle, prow_handle, row, handle
      TYPE(mp_comm_type) :: pcol_group, prow_group
      INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
      TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

! get information about the parallel environment
      CALL dbcsr_get_info(vec_in, distribution=dist_in)
      CALL dbcsr_distribution_get(dist_in, &
                                  prow_group=prow_handle, &
                                  pcol_group=pcol_handle, &
                                  myprow=myprow, &
                                  mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)
      CALL pcol_group%set_handle(pcol_handle)

! Get the vector which tells us which blocks are local to which processor row in the col vec
      CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
      CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)

! Copy the local vector to the replicated one on the first processor column (this is where vec_in lives)
      CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
      data_vec => dbcsr_get_data_p(vec_in, select_data_type=0.0_real_8)
      IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
! Replicate the data along the row
      CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)

! Here it gets a bit tricky as we are dealing with two different parallel layouts:
! The rep_col_vec contains all blocks local to the row distribution of the vector.
! The rep_row_vec only needs the fraction which is local to the col distribution.
! However, in most cases this won't be the complete set of blocks which can be obtained from col_vector on p_row i.
! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
! on every replica in the processor column; by summing along the column we end up with the complete vector everywhere.
! Hope this clarifies the idea
      CALL dbcsr_set(rep_row_vec, 0.0_real_8)
      CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
      CALL dbcsr_iterator_start(iter, rep_row_vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
         IF (row_dist(col) == myprow) THEN
            vec_row = transpose(fast_vec_col%blk_map_d(hash_table_get(fast_vec_col%hash_table, col))%ptr)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=0.0_real_8)
      CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))

      CALL timestop(handle)

   END SUBROUTINE dbcsr_col_vec_to_rep_row_d

! **************************************************************************************************
!> \brief ...
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_row ...
!> \param fast_vec_col_add ...
! **************************************************************************************************
   SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
      TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
      TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
      TYPE(fast_vec_access_type) :: fast_vec_row

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'

      INTEGER :: col, mypcol, myprow, ncols, nrows, prow_handle, row, handle
      INTEGER, DIMENSION(:), POINTER :: col_dist
      TYPE(mp_comm_type) :: prow_group
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
      TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

! get information about the parallel environment
      CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
      CALL dbcsr_distribution_get(dist_rep_col, &
                                  prow_group=prow_handle, &
                                  myprow=myprow, &
                                  mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)

! Get the vector which tells us which blocks are local to which processor col in the row vec
      CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
      CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)

! The same trick as described above, with opposite direction
      CALL dbcsr_set(rep_col_vec, 0.0_real_8)
      CALL dbcsr_iterator_start(iter, rep_col_vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
         IF (col_dist(row) == mypcol) THEN
            vec_col = transpose(fast_vec_row%blk_map_d(hash_table_get(fast_vec_row%hash_table, row))%ptr)
         END IF
         ! this one is special: it allows adding the elements of a not yet summed replicated
         ! column vector as it appears as the result of M*V(row_rep). This saves a parallel summation in the symmetric case
         IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
                                                  fast_vec_col_add%blk_map_d(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
      CALL prow_group%sum(data_vec_rep(1:nrows*ncols))

      CALL timestop(handle)

   END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d

! **************************************************************************************************
!> \brief given a column vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_col_vec_access_d(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_d'

      INTEGER :: handle, nblk_local
      INTEGER :: col, row, iblock, nthreads
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

      ! figure out the number of threads
      nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
!$    nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

      CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
      ! 4 times makes sure the table is big enough to limit collisions.
      CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
      ! include zero for dealing with values not in the hash table (a lookup will return 0)
      ALLOCATE (fast_vec_access%blk_map_d(0:nblk_local))

      CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
      IF (col .GT. 1) cpabort("BUG")

      ! go through the blocks of the vector
      iblock = 0
      CALL dbcsr_iterator_start(iter, vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
         iblock = iblock + 1
         CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
         fast_vec_access%blk_map_d(iblock)%ptr => vec_bl
         fast_vec_access%blk_map_d(iblock)%assigned_thread = mod(iblock, nthreads)
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL timestop(handle)

   END SUBROUTINE create_fast_col_vec_access_d

! **************************************************************************************************
!> \brief given a row vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_row_vec_access_d(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_d'

      INTEGER :: handle, nblk_local
      INTEGER :: col, row, iblock, nthreads
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

      ! figure out the number of threads
      nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
!$    nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

      CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
      ! 4 times makes sure the table is big enough to limit collisions.
      CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
      ! include zero for dealing with values not in the hash table (a lookup will return 0)
      ALLOCATE (fast_vec_access%blk_map_d(0:nblk_local))

      ! sanity check
      CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
      IF (row .GT. 1) cpabort("BUG")

      ! go through the blocks of the vector
      iblock = 0
      CALL dbcsr_iterator_start(iter, vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
         iblock = iblock + 1
         CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
         fast_vec_access%blk_map_d(iblock)%ptr => vec_bl
         fast_vec_access%blk_map_d(iblock)%assigned_thread = mod(iblock, nthreads)
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL timestop(handle)

   END SUBROUTINE create_fast_row_vec_access_d

! **************************************************************************************************
!> \brief ...
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_sym_matrix_vector_mult_d(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult'

      INTEGER :: col, mypcol, myprow, nrows, ncols, row, pcol_handle, &
                 handle, handle1, ithread, vec_dim
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(dbcsr_type) :: result_row, result_col
      TYPE(mp_comm_type) :: pcol_group

      TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
      INTEGER :: prow, pcol, rprow, rpcol

      CALL timeset(routineN, handle)
      ithread = 0
! We need some work matrices as we try to exploit operations on the replicated vectors, which are duplicated otherwise
      CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
      CALL dbcsr_set(work_col, 0.0_real_8)
      CALL dbcsr_copy(result_col, work_col)
      CALL dbcsr_set(work_row, 0.0_real_8)
      CALL dbcsr_copy(result_row, work_row)

! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix=matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

      CALL pcol_group%set_handle(pcol_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)
      CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
      CALL create_fast_col_vec_access(result_col, res_fast_vec_col)

! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
      CALL dbcsr_col_vec_to_rep_row_d(vec_in, work_col, work_row, fast_vec_col)

! Probably I should rename the routine above as it delivers both the replicated row and column vector

! Perform the local multiply. Here we exploit that the blocks are replicated on the mpi processes.
! It is important to note that the input and result vectors are distributed differently (row wise and col wise, respectively)
      CALL timeset(routineN//"_local_mm", handle1)

!------ perform the multiplication; we have to take care to use the correct blocks ----------

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
!$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
!$    ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d)
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         rprow = hash_table_get(res_fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_row%blk_map_d(pcol)%ptr) .AND. &
             ASSOCIATED(res_fast_vec_col%blk_map_d(rprow)%ptr)) THEN
            IF (res_fast_vec_col%blk_map_d(rprow)%assigned_thread .EQ. ithread) THEN
               res_fast_vec_col%blk_map_d(rprow)%ptr = res_fast_vec_col%blk_map_d(rprow)%ptr + &
                                                       matmul(data_d, transpose(fast_vec_row%blk_map_d(pcol)%ptr))
            END IF
            prow = hash_table_get(fast_vec_col%hash_table, row)
            rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
            IF (res_fast_vec_row%blk_map_d(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
               res_fast_vec_row%blk_map_d(rpcol)%ptr = res_fast_vec_row%blk_map_d(rpcol)%ptr + &
                                                       matmul(transpose(fast_vec_col%blk_map_d(prow)%ptr), data_d)
            END IF
         ELSE
            rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
            prow = hash_table_get(fast_vec_row%hash_table, row)
            IF (res_fast_vec_col%blk_map_d(rpcol)%assigned_thread .EQ. ithread) THEN
               res_fast_vec_col%blk_map_d(rpcol)%ptr = res_fast_vec_col%blk_map_d(rpcol)%ptr + &
                                                       transpose(matmul(fast_vec_row%blk_map_d(prow)%ptr, data_d))
            END IF
            rprow = hash_table_get(res_fast_vec_row%hash_table, row)
            pcol = hash_table_get(fast_vec_col%hash_table, col)
            IF (res_fast_vec_row%blk_map_d(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
               res_fast_vec_row%blk_map_d(rprow)%ptr = res_fast_vec_row%blk_map_d(rprow)%ptr + &
                                                       transpose(matmul(data_d, fast_vec_col%blk_map_d(pcol)%ptr))
            END IF
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

      ! sum all the data within a processor column to obtain the replicated result from the lower triangle
      data_vec => dbcsr_get_data_p(result_row, select_data_type=0.0_real_8)
      CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)

      CALL pcol_group%sum(data_vec(1:nrows*ncols))

! Convert the results to a column wise distribution. This is a bit involved as the result_row is fully replicated
! while the result_col still has the partial results in parallel. The routine below takes care of that and saves a
! parallel summation: the res_row vectors are created taking only the appropriate element (0 otherwise) while the res_col
! parallel bits are added locally. The sum then creates the correct vector
      CALL dbcsr_rep_row_to_rep_col_vec_d(work_col, result_row, res_fast_vec_row, res_fast_vec_col)

! Create the final vector by summing it into the result vector which lives on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map_d(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_d(prow)%ptr(:, :))
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)
      CALL release_fast_vec_access(res_fast_vec_row)
      CALL release_fast_vec_access(res_fast_vec_col)

      CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_sym_matrix_vector_mult_d

! **************************************************************************************************
!> \brief the real driver routine for the multiply, not all symmetries implemented yet
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_colvec_multiply_z(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      COMPLEX(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER :: matrix_type

      CALL dbcsr_get_info(matrix, matrix_type=matrix_type)

      SELECT CASE (matrix_type)
      CASE (dbcsr_type_no_symmetry)
         CALL dbcsr_matrix_vector_mult_z(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_symmetric)
         CALL dbcsr_sym_matrix_vector_mult_z(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_antisymmetric)
         ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
         cpabort("NYI, antisymmetric matrix not permitted")
      CASE DEFAULT
         cpabort("Unknown matrix type, ...")
      END SELECT

   END SUBROUTINE dbcsr_matrix_colvec_multiply_z

! **************************************************************************************************
!> \brief low level routines for matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_vector_mult_z(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      COMPLEX(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'

      INTEGER :: col, mypcol, myprow, prow_handle, ncols, nrows, row, &
                 handle, handle1, ithread
      TYPE(mp_comm_type) :: prow_group
      COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
      COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
      INTEGER :: prow, pcol

      CALL timeset(routineN, handle)
      ithread = 0

! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)

! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
      CALL dbcsr_col_vec_to_rep_row_z(vec_in, work_col, work_row, fast_vec_col)

! Set the work vector for the results to 0
      CALL dbcsr_set(work_col, cmplx(0.0, 0.0, real_8))

! Perform the local multiply. Here we exploit that the blocks are replicated on the MPI processes.
! It is important to note that the input and result vectors are distributed differently (row wise and col wise, respectively)
      CALL timeset(routineN//"_local_mm", handle1)

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
!$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
!$    ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (fast_vec_col%blk_map_z(prow)%assigned_thread .NE. ithread) cycle
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         fast_vec_col%blk_map_z(prow)%ptr = fast_vec_col%blk_map_z(prow)%ptr + &
                                            matmul(data_d, transpose(fast_vec_row%blk_map_z(pcol)%ptr))
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

! sum all the data onto the first processor col where the original vector is stored
      data_vec => dbcsr_get_data_p(work_col, select_data_type=cmplx(0.0, 0.0, real_8))
      CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
      CALL prow_group%sum(data_vec(1:nrows*ncols))

! Local copy on the first mpi col (as this is the location of the vec_res blocks) of the result vector
! from the replicated to the original vector. Let's play it safe and use the iterator
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map_z(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z(prow)%ptr(:, :)
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_matrix_vector_mult_z
1273
1274! **************************************************************************************************
1275!> \brief ...
1276!> \param matrix ...
1277!> \param vec_in ...
1278!> \param vec_out ...
1279!> \param alpha ...
1280!> \param beta ...
1281!> \param work_row ...
1282!> \param work_col ...
1283!> \param skip_diag ...
1284! **************************************************************************************************
1285 SUBROUTINE dbcsr_matrixt_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
1286 TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1287 COMPLEX(kind=real_8) :: alpha, beta
1288 TYPE(dbcsr_type) :: work_row, work_col
1289 LOGICAL :: skip_diag
1290
1291 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
1292
1293 INTEGER :: col, col_size, mypcol, &
1294 myprow, pcol_handle, prow_handle, &
1295 ncols, nrows, &
1296 row, row_size, &
1297 handle, handle1, ithread
1298 TYPE(mp_comm_type) :: pcol_group, prow_group
1299 COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
1300 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
1301 TYPE(dbcsr_distribution_type) :: dist
1302 TYPE(dbcsr_iterator_type) :: iter
1303
1304 TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
1305 INTEGER :: prow, pcol
1306
1307 CALL timeset(routinen, handle)
1308 ithread = 0
1309
1310! Collect some data about the parallel environment. We will use them later to move the vector around
1311 CALL dbcsr_get_info(matrix, distribution=dist)
1312 CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1313
1314 CALL prow_group%set_handle(prow_handle)
1315 CALL pcol_group%set_handle(pcol_handle)
1316
1317 CALL create_fast_row_vec_access(work_row, fast_vec_row)
1318 CALL create_fast_col_vec_access(work_col, fast_vec_col)
1319
1320! Set the work vector for the results to 0
1321 CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))
1322
1323! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
1324 CALL dbcsr_iterator_start(iter, vec_in)
1325 DO WHILE (dbcsr_iterator_blocks_left(iter))
1326 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
1327 prow = hash_table_get(fast_vec_col%hash_table, row)
1328 fast_vec_col%blk_map_z (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
1329 END DO
1330 CALL dbcsr_iterator_stop(iter)
1331! Replicate the data on all processore in the row
1332 data_vec => dbcsr_get_data_p(work_col, select_data_type=cmplx(0.0, 0.0, real_8))
1333 CALL prow_group%bcast(data_vec, 0)
1334
1335! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
1336 CALL timeset(routinen//"local_mm", handle1)
1337 CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
1338!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
1339!$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
1340!$ ithread = omp_get_thread_num()
1341 CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1342 DO WHILE (dbcsr_iterator_blocks_left(iter))
1343 CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
1344 IF (skip_diag .AND. col == row) cycle
1345 prow = hash_table_get(fast_vec_col%hash_table, row)
1346 pcol = hash_table_get(fast_vec_row%hash_table, col)
1347 IF (ASSOCIATED(fast_vec_row%blk_map_z (pcol)%ptr) .AND. &
1348 ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1349 IF (fast_vec_row%blk_map_z (pcol)%assigned_thread .NE. ithread) cycle
1350 fast_vec_row%blk_map_z (pcol)%ptr = fast_vec_row%blk_map_z (pcol)%ptr + &
1351 matmul(transpose(fast_vec_col%blk_map_z (prow)%ptr), data_d)
1352 ELSE
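            ! only one triangle of a symmetric matrix is stored: the stored block
            ! stands in for the missing (col,row) block, so the row/col lookups are
            ! swapped and data_d enters the product transposed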
1353 prow = hash_table_get(fast_vec_row%hash_table, row)
1354 pcol = hash_table_get(fast_vec_col%hash_table, col)
1355 IF (fast_vec_row%blk_map_z (prow)%assigned_thread .NE. ithread) cycle
1356 fast_vec_row%blk_map_z (prow)%ptr = fast_vec_row%blk_map_z (prow)%ptr + &
1357 matmul(transpose(fast_vec_col%blk_map_z (pcol)%ptr), transpose(data_d))
1358 END IF
1359 END DO
1360 CALL dbcsr_iterator_stop(iter)
1361!$OMP END PARALLEL
1362
1363 CALL timestop(handle1)
1364
1365! sum all the data within a processor column to obtain the replicated result
1366 data_vec => dbcsr_get_data_p(work_row, select_data_type=cmplx(0.0, 0.0, real_8))
1367! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
1368 CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
1369 CALL pcol_group%sum(data_vec(1:nrows*ncols))
1370
1371! Convert the result to a column-wise distribution
1372 CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, work_row, fast_vec_row)
1373
1374! Create the final vector by summing it into the result vector which lives on proc_col 0
1375 CALL dbcsr_iterator_start(iter, vec_out)
1376 DO WHILE (dbcsr_iterator_blocks_left(iter))
1377 CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
1378 prow = hash_table_get(fast_vec_col%hash_table, row)
1379 IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1380 vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z (prow)%ptr(:, :)
1381 ELSE
1382 vec_res(:, :) = beta*vec_res(:, :)
1383 END IF
1384 END DO
1385 CALL dbcsr_iterator_stop(iter)
1386
! release the fast-access containers, mirroring the other multiply routines
      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)

1387 CALL timestop(handle)
1388
1389 END SUBROUTINE dbcsr_matrixt_vector_mult_z
1390
1391! **************************************************************************************************
1392!> \brief replicates a column vector along the processor rows and converts it into the corresponding replicated row vector
1393!> \param vec_in ...
1394!> \param rep_col_vec ...
1395!> \param rep_row_vec ...
1396!> \param fast_vec_col ...
1397! **************************************************************************************************
1398 SUBROUTINE dbcsr_col_vec_to_rep_row_z (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
1399 TYPE(dbcsr_type) :: vec_in, rep_col_vec, &
1400 rep_row_vec
1401 TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
1402
1403 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
1404
1405 INTEGER :: col, mypcol, myprow, ncols, &
1406 nrows, pcol_handle, prow_handle, &
1407 row, handle
1408 TYPE(mp_comm_type) :: pcol_group, prow_group
1409 INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
1410 COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
1411 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
1412 TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
1413 TYPE(dbcsr_iterator_type) :: iter
1414
1415 CALL timeset(routinen, handle)
1416
1417! get information about the parallel environment
1418 CALL dbcsr_get_info(vec_in, distribution=dist_in)
1419 CALL dbcsr_distribution_get(dist_in, &
1420 prow_group=prow_handle, &
1421 pcol_group=pcol_handle, &
1422 myprow=myprow, &
1423 mypcol=mypcol)
1424
1425 CALL prow_group%set_handle(prow_handle)
1426 CALL pcol_group%set_handle(pcol_handle)
1427
1428! Get the vector which tells us which blocks are local to which processor row in the col vec
1429 CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
1430 CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
1431
1432! Copy the local vector to the replicated one on the first processor column (this is where vec_in lives)
1433 CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1434 data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1435 data_vec => dbcsr_get_data_p(vec_in, select_data_type=cmplx(0.0, 0.0, real_8))
1436 IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
1437! Replicate the data along the row
1438 CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
1439
1440! Here it gets a bit tricky as we are dealing with two different parallel layouts:
1441! The rep_col_vec contains all blocks local to the row distribution of the vector.
1442! The rep_row_vec only needs the fraction which is local to the col distribution.
1443! However, in most cases this won't be the complete set of blocks that can be obtained from the col_vector on p_row i.
1444! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
1445! on every replica in the processor column; by summing along the column we end up with the complete vector everywhere.
1446! Hope this clarifies the idea
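! As a small illustration, assume a hypothetical 2x2 processor grid and four
! blocks with row_dist = (/0, 1, 0, 1/): the col_vec replicas on p_row 0 own
! blocks 1 and 3, those on p_row 1 own blocks 2 and 4. Each replica copies only
! its own blocks into the row vec, so summing along a processor column
! assembles the complete vector (blocks 1..4) on every process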
1447 CALL dbcsr_set(rep_row_vec, cmplx(0.0, 0.0, real_8))
1448 CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
1449 CALL dbcsr_iterator_start(iter, rep_row_vec)
1450 DO WHILE (dbcsr_iterator_blocks_left(iter))
1451 CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
1452 IF (row_dist(col) == myprow) THEN
1453 vec_row = transpose(fast_vec_col%blk_map_z (hash_table_get(fast_vec_col%hash_table, col))%ptr)
1454 END IF
1455 END DO
1456 CALL dbcsr_iterator_stop(iter)
1457 CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1458 data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1459 CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
1460
1461 CALL timestop(handle)
1462
1463 END SUBROUTINE dbcsr_col_vec_to_rep_row_z
1464
1465! **************************************************************************************************
1466!> \brief converts a replicated row vector into a replicated column vector; optionally adds the not yet summed partial results of a replicated column vector
1467!> \param rep_col_vec ...
1468!> \param rep_row_vec ...
1469!> \param fast_vec_row ...
1470!> \param fast_vec_col_add ...
1471! **************************************************************************************************
1472 SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
1473 TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
1474 TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
1475 TYPE(fast_vec_access_type) :: fast_vec_row
1476
1477 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
1478
1479 INTEGER :: col, mypcol, myprow, ncols, &
1480 nrows, prow_handle, &
1481 row, handle
1482 INTEGER, DIMENSION(:), POINTER :: col_dist
1483 TYPE(mp_comm_type) :: prow_group
1484 COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
1485 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
1486 TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col
1487 TYPE(dbcsr_iterator_type) :: iter
1488
1489 CALL timeset(routinen, handle)
1490
1491! get information about the parallel environment
1492 CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
1493 CALL dbcsr_distribution_get(dist_rep_col, &
1494 prow_group=prow_handle, &
1495 myprow=myprow, &
1496 mypcol=mypcol)
1497
1498 CALL prow_group%set_handle(prow_handle)
1499
1500! Get the vector which tells us which blocks are local to which processor col in the row vec
1501 CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
1502 CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
1503
1504! The same trick as described above, just in the opposite direction
1505 CALL dbcsr_set(rep_col_vec, cmplx(0.0, 0.0, real_8))
1506 CALL dbcsr_iterator_start(iter, rep_col_vec)
1507 DO WHILE (dbcsr_iterator_blocks_left(iter))
1508 CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
1509 IF (col_dist(row) == mypcol) THEN
1510 vec_col = transpose(fast_vec_row%blk_map_z (hash_table_get(fast_vec_row%hash_table, row))%ptr)
1511 END IF
1512         ! this one is special and allows us to add the elements of a not yet summed replicated
1513         ! column vector as it appears as the result of M*V(row_rep). This saves a parallel summation in the symmetric case
1514 IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
1515 fast_vec_col_add%blk_map_z (hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
1516 END DO
1517 CALL dbcsr_iterator_stop(iter)
1518 CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
1519 data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
1520 CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
1521
1522 CALL timestop(handle)
1523
1524 END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z
1525
1526! **************************************************************************************************
1527!> \brief given a column vector, prepare the fast_vec_access container
1528!> \param vec ...
1529!> \param fast_vec_access ...
1530! **************************************************************************************************
1531 SUBROUTINE create_fast_col_vec_access_z (vec, fast_vec_access)
1532 TYPE(dbcsr_type) :: vec
1533 TYPE(fast_vec_access_type) :: fast_vec_access
1534
1535 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_z'
1536
1537 INTEGER :: handle, nblk_local
1538 INTEGER :: col, row, iblock, nthreads
1539 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
1540 TYPE(dbcsr_iterator_type) :: iter
1541
1542 CALL timeset(routinen, handle)
1543
1544 ! figure out the number of threads
1545 nthreads = 1
1546!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
1547!$OMP MASTER
1548!$ nthreads = OMP_GET_NUM_THREADS()
1549!$OMP END MASTER
1550!$OMP END PARALLEL
1551
1552 CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
1553 ! 4 times makes sure the table is big enough to limit collisions.
1554 CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
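      ! e.g. for nblk_local = 25 the table gets at least 100 slots, i.e. a load
      ! factor of at most 0.25 once all local blocks are inserted (the table
      ! implementation may round the size up further)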
1555      ! include index zero to deal effectively with values not in the hash table (lookups will return 0)
1556 ALLOCATE (fast_vec_access%blk_map_z (0:nblk_local))
1557
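      ! sanity check: a column vector must consist of exactly one block column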
1558 CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
1559 IF (col .GT. 1) cpabort("BUG")
1560
1561 ! go through the blocks of the vector
1562 iblock = 0
1563 CALL dbcsr_iterator_start(iter, vec)
1564 DO WHILE (dbcsr_iterator_blocks_left(iter))
1565 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
1566 iblock = iblock + 1
1567 CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
1568 fast_vec_access%blk_map_z (iblock)%ptr => vec_bl
1569 fast_vec_access%blk_map_z (iblock)%assigned_thread = mod(iblock, nthreads)
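         ! blocks are assigned to threads round-robin; this static ownership is
         ! what the OpenMP multiply loops rely on to avoid write conflicts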
1570 END DO
1571 CALL dbcsr_iterator_stop(iter)
1572
1573 CALL timestop(handle)
1574
1575 END SUBROUTINE create_fast_col_vec_access_z
1576
1577! **************************************************************************************************
1578!> \brief given a row vector, prepare the fast_vec_access_container
1579!> \param vec ...
1580!> \param fast_vec_access ...
1581! **************************************************************************************************
1582 SUBROUTINE create_fast_row_vec_access_z (vec, fast_vec_access)
1583 TYPE(dbcsr_type) :: vec
1584 TYPE(fast_vec_access_type) :: fast_vec_access
1585
1586 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_z'
1587
1588 INTEGER :: handle, nblk_local
1589 INTEGER :: col, row, iblock, nthreads
1590 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
1591 TYPE(dbcsr_iterator_type) :: iter
1592
1593 CALL timeset(routinen, handle)
1594
1595 ! figure out the number of threads
1596 nthreads = 1
1597!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
1598!$OMP MASTER
1599!$ nthreads = OMP_GET_NUM_THREADS()
1600!$OMP END MASTER
1601!$OMP END PARALLEL
1602
1603 CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
1604 ! 4 times makes sure the table is big enough to limit collisions.
1605 CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
1606      ! include index zero to deal effectively with values not in the hash table (lookups will return 0)
1607 ALLOCATE (fast_vec_access%blk_map_z (0:nblk_local))
1608
1609 ! sanity check
1610 CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
1611 IF (row .GT. 1) cpabort("BUG")
1612
1613 ! go through the blocks of the vector
1614 iblock = 0
1615 CALL dbcsr_iterator_start(iter, vec)
1616 DO WHILE (dbcsr_iterator_blocks_left(iter))
1617 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
1618 iblock = iblock + 1
1619 CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
1620 fast_vec_access%blk_map_z (iblock)%ptr => vec_bl
1621 fast_vec_access%blk_map_z (iblock)%assigned_thread = mod(iblock, nthreads)
1622 END DO
1623 CALL dbcsr_iterator_stop(iter)
1624
1625 CALL timestop(handle)
1626
1627 END SUBROUTINE create_fast_row_vec_access_z
1628
1629! **************************************************************************************************
1630!> \brief multiplies a symmetric dbcsr matrix with a column vector: vec_out = beta*vec_out + alpha*matrix*vec_in, exploiting that only one triangle is stored
1631!> \param matrix ...
1632!> \param vec_in ...
1633!> \param vec_out ...
1634!> \param alpha ...
1635!> \param beta ...
1636!> \param work_row ...
1637!> \param work_col ...
1638! **************************************************************************************************
1639 SUBROUTINE dbcsr_sym_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1640 TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1641 COMPLEX(kind=real_8) :: alpha, beta
1642 TYPE(dbcsr_type) :: work_row, work_col
1643
1644 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult'
1645
1646 INTEGER :: col, mypcol, &
1647 myprow, &
1648 nrows, ncols, &
1649 row, pcol_handle, &
1650 handle, handle1, ithread, vec_dim
1651 COMPLEX(kind=real_8), DIMENSION(:), POINTER :: data_vec
1652 COMPLEX(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
1653 TYPE(dbcsr_distribution_type) :: dist
1654 TYPE(dbcsr_iterator_type) :: iter
1655 TYPE(dbcsr_type) :: result_row, result_col
1656 TYPE(mp_comm_type) :: pcol_group
1657
1658 TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
1659 INTEGER :: prow, pcol, rprow, rpcol
1660
1661 CALL timeset(routinen, handle)
1662 ithread = 0
1663! We need some work matrices, as we try to exploit operations on the replicated vectors which would otherwise be duplicated
1664 CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
1665! This is a performance hack, as creating a replicated vector from scratch is a fair bit more expensive
1666 CALL dbcsr_set(work_col, cmplx(0.0, 0.0, real_8))
1667 CALL dbcsr_copy(result_col, work_col)
1668 CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))
1669 CALL dbcsr_copy(result_row, work_row)
1670
1671! Collect some data about the parallel environment. We will use them later to move the vector around
1672 CALL dbcsr_get_info(matrix=matrix, distribution=dist)
1673 CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1674
1675 CALL pcol_group%set_handle(pcol_handle)
1676
1677 CALL create_fast_row_vec_access(work_row, fast_vec_row)
1678 CALL create_fast_col_vec_access(work_col, fast_vec_col)
1679 CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
1680 CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
1681
1682! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
1683 CALL dbcsr_col_vec_to_rep_row_z (vec_in, work_col, work_row, fast_vec_col)
1684
1685! Probably I should rename the routine above as it delivers both the replicated row and column vector
1686
1687! Perform the local multiply. Here we exploit that we have the blocks replicated on the MPI processes.
1688! It is important to note that the input and result vectors are distributed differently (row-wise and col-wise, respectively)
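! Per stored block matrix(row,col) the loop below performs (in matrix notation):
!    result_col(row) = result_col(row) + matrix(row,col) * vec(col)
!    result_row(col) = result_row(col) + vec(row)^T * matrix(row,col)   (skipped for row == col)
! with the ELSE branch handling blocks whose mirror image lives in the non-stored triangle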
1689 CALL timeset(routinen//"_local_mm", handle1)
1690
1691!------ perform the multiplication, we have to take care to pick the correct blocks ----------
1692
1693!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
1694!$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
1695!$ ithread = omp_get_thread_num()
1696 CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
1697 DO WHILE (dbcsr_iterator_blocks_left(iter))
1698 CALL dbcsr_iterator_next_block(iter, row, col, data_d)
1699 pcol = hash_table_get(fast_vec_row%hash_table, col)
1700 rprow = hash_table_get(res_fast_vec_col%hash_table, row)
1701 IF (ASSOCIATED(fast_vec_row%blk_map_z (pcol)%ptr) .AND. &
1702 ASSOCIATED(res_fast_vec_col%blk_map_z (rprow)%ptr)) THEN
1703 IF (res_fast_vec_col%blk_map_z (rprow)%assigned_thread .EQ. ithread) THEN
1704 res_fast_vec_col%blk_map_z (rprow)%ptr = res_fast_vec_col%blk_map_z (rprow)%ptr + &
1705 matmul(data_d, transpose(fast_vec_row%blk_map_z (pcol)%ptr))
1706 END IF
1707 prow = hash_table_get(fast_vec_col%hash_table, row)
1708 rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
1709 IF (res_fast_vec_row%blk_map_z (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1710 res_fast_vec_row%blk_map_z (rpcol)%ptr = res_fast_vec_row%blk_map_z (rpcol)%ptr + &
1711 matmul(transpose(fast_vec_col%blk_map_z (prow)%ptr), data_d)
1712 END IF
1713 ELSE
1714 rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
1715 prow = hash_table_get(fast_vec_row%hash_table, row)
1716 IF (res_fast_vec_col%blk_map_z (rpcol)%assigned_thread .EQ. ithread) THEN
1717 res_fast_vec_col%blk_map_z (rpcol)%ptr = res_fast_vec_col%blk_map_z (rpcol)%ptr + &
1718 transpose(matmul(fast_vec_row%blk_map_z (prow)%ptr, data_d))
1719 END IF
1720 rprow = hash_table_get(res_fast_vec_row%hash_table, row)
1721 pcol = hash_table_get(fast_vec_col%hash_table, col)
1722 IF (res_fast_vec_row%blk_map_z (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1723 res_fast_vec_row%blk_map_z (rprow)%ptr = res_fast_vec_row%blk_map_z (rprow)%ptr + &
1724 transpose(matmul(data_d, fast_vec_col%blk_map_z (pcol)%ptr))
1725 END IF
1726 END IF
1727 END DO
1728 CALL dbcsr_iterator_stop(iter)
1729!$OMP END PARALLEL
1730
1731 CALL timestop(handle1)
1732
1733         ! sum all the data within a processor column to obtain the replicated result from the lower triangle
1734 data_vec => dbcsr_get_data_p(result_row, select_data_type=cmplx(0.0, 0.0, real_8))
1735 CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1736
1737 CALL pcol_group%sum(data_vec(1:nrows*ncols))
1738!
1739!! Convert the results to a column-wise distribution. This is a bit involved, as the result_row is fully replicated
1740!! while the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1741!! parallel summation: the res_row vectors are created taking only the appropriate elements (0 otherwise), while the
1742!! res_col parallel bits are added locally. The sum then yields the correct vector
1743 CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1744
1745! Create the final vector by summing it into the result vector which lives on proc_col 0
1746 CALL dbcsr_iterator_start(iter, vec_out)
1747 DO WHILE (dbcsr_iterator_blocks_left(iter))
1748 CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1749 prow = hash_table_get(fast_vec_col%hash_table, row)
1750 IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
1751 vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_z (prow)%ptr(:, :))
1752 ELSE
1753 vec_res(:, :) = beta*vec_res(:, :)
1754 END IF
1755 END DO
1756 CALL dbcsr_iterator_stop(iter)
1757
1758 CALL release_fast_vec_access(fast_vec_row)
1759 CALL release_fast_vec_access(fast_vec_col)
1760 CALL release_fast_vec_access(res_fast_vec_row)
1761 CALL release_fast_vec_access(res_fast_vec_col)
1762
1763 CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1764
1765 CALL timestop(handle)
1766
1767 END SUBROUTINE dbcsr_sym_matrix_vector_mult_z
1768
1769
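! A minimal usage sketch, kept in comments (mat, x and y are placeholder names;
! the creation routines are the public helpers of this module, and the argument
! list of the generic dbcsr_matrix_colvec_multiply is assumed to follow the
! *_d/*_z procedures above):
!
!    TYPE(dbcsr_type) :: x, y, work_row, work_col
!    CALL create_col_vec_from_matrix(x, mat, 1)
!    CALL dbcsr_copy(y, x)
!    CALL create_replicated_row_vec_from_matrix(work_row, mat, 1)
!    CALL create_replicated_col_vec_from_matrix(work_col, mat, 1)
!    ! ... fill x, then compute y = alpha*mat*x + beta*y
!    CALL dbcsr_matrix_colvec_multiply(mat, x, y, alpha, beta, work_row, work_col)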
1770END MODULE dbcsr_vector