(git:8dea62c)
Loading...
Searching...
No Matches
cp_fm_cusolver.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7
8#if defined(__CUSOLVERMP)
9
10#include "../offload/offload_library.h"
11#include <assert.h>
12#include <cuComplex.h>
13#include <cuda_runtime.h>
14#include <cusolverMp.h>
15#include <math.h>
16#include <mpi.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20
21#if defined(__CUSOLVERMP_NCCL)
22#include <nccl.h>
23#else
24#include <cal.h>
25#endif
26
27/*******************************************************************************
28 * \brief Run given CUDA command and upon failure abort with a nice message.
29 * \author Ole Schuett
30 ******************************************************************************/
31#define CUDA_CHECK(cmd) \
32 do { \
33 cudaError_t status = cmd; \
34 if (status != cudaSuccess) { \
35 fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), \
36 __FILE__, __LINE__); \
37 abort(); \
38 } \
39 } while (0)
40
41#if defined(__CUSOLVERMP_NCCL)
42/*******************************************************************************
43 * \brief Run given NCCL command and upon failure abort with a nice message.
44 * \author Jiri Vyskocil
45 ******************************************************************************/
46#define NCCL_CHECK(cmd) \
47 do { \
48 ncclResult_t status = cmd; \
49 if (status != ncclSuccess) { \
50 fprintf(stderr, "ERROR: %s %s %d\n", ncclGetErrorString(status), \
51 __FILE__, __LINE__); \
52 abort(); \
53 } \
54 } while (0)
55
56/*******************************************************************************
57 * \brief Check that the selected CUDA device is visible to this MPI rank.
58 ******************************************************************************/
59static void check_nccl_device_visibility(MPI_Comm comm,
60 const int local_device) {
61 int device_count = 0;
62 CUDA_CHECK(cudaGetDeviceCount(&device_count));
63
64 int rank;
65 MPI_Comm_rank(comm, &rank);
66
67 if (device_count <= 0) {
68 fprintf(stderr,
69 "ERROR: cuSOLVERMp with NCCL requires at least one visible GPU. "
70 "No CUDA device is visible to MPI rank %d.\n",
71 rank);
72 MPI_Abort(comm, EXIT_FAILURE);
73 }
74
75 if (local_device < 0 || local_device >= device_count) {
76 fprintf(stderr,
77 "ERROR: cuSOLVERMp with NCCL selected invalid CUDA device %d on "
78 "MPI rank %d; %d CUDA device(s) are visible to this rank.\n",
79 local_device, rank, device_count);
80 MPI_Abort(comm, EXIT_FAILURE);
81 }
82}
83
84#else
85/*******************************************************************************
86 * \brief Decode given cal error.
87 * \author Ole Schuett
88 ******************************************************************************/
89static char *calGetErrorString(calError_t status) {
90 switch (status) {
91 case CAL_OK:
92 return "CAL_OK";
93 case CAL_ERROR:
94 return "CAL_ERROR";
95 case CAL_ERROR_INVALID_PARAMETER:
96 return "CAL_ERROR_INVALID_PARAMETER";
97 case CAL_ERROR_INTERNAL:
98 return "CAL_ERROR_INTERNAL";
99 case CAL_ERROR_CUDA:
100 return "CAL_ERROR_CUDA";
101 case CAL_ERROR_UCC:
102 return "CAL_ERROR_UCC";
103 case CAL_ERROR_NOT_SUPPORTED:
104 return "CAL_ERROR_NOT_SUPPORTED";
105 case CAL_ERROR_INPROGRESS:
106 return "CAL_ERROR_INPROGRESS";
107 default:
108 return "CAL UNKNOWN ERROR";
109 }
110}
111
112/*******************************************************************************
113 * \brief Run given cal command and upon failure abort with a nice message.
114 * \author Ole Schuett
115 ******************************************************************************/
116#define CAL_CHECK(cmd) \
117 do { \
118 calError_t status = cmd; \
119 if (status != CAL_OK) { \
120 fprintf(stderr, "ERROR: %s %s %d\n", calGetErrorString(status), \
121 __FILE__, __LINE__); \
122 abort(); \
123 } \
124 } while (0)
125
126/*******************************************************************************
127 * \brief Callback for cal library to initiate an allgather operation.
128 * \author Ole Schuett
129 ******************************************************************************/
130static calError_t allgather(void *src_buf, void *recv_buf, size_t size,
131 void *data, void **req) {
132 const MPI_Comm comm = *(MPI_Comm *)data;
133 MPI_Request *request = malloc(sizeof(MPI_Request));
134 *req = request;
135 const int status = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size,
136 MPI_BYTE, comm, request);
137 return (status == MPI_SUCCESS) ? CAL_OK : CAL_ERROR;
138}
139
140/*******************************************************************************
141 * \brief Callback for cal library to test if a request has completed.
142 * \author Ole Schuett
143 ******************************************************************************/
144static calError_t req_test(void *req) {
145 MPI_Request *request = (MPI_Request *)(req);
146 int completed;
147 const int status = MPI_Test(request, &completed, MPI_STATUS_IGNORE);
148 if (status != MPI_SUCCESS) {
149 return CAL_ERROR;
150 }
151 return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
152}
153
154/*******************************************************************************
155 * \brief Callback for cal library to free a request.
156 * \author Ole Schuett
157 ******************************************************************************/
158static calError_t req_free(void *req) {
159 free(req);
160 return CAL_OK;
161}
162#endif /* __CUSOLVERMP_NCCL */
163
164/*******************************************************************************
165 * \brief Decode given cusolver error.
166 * \author Ole Schuett
167 ******************************************************************************/
168static char *cusolverGetErrorString(cusolverStatus_t status) {
169 switch (status) {
170 case CUSOLVER_STATUS_SUCCESS:
171 return "CUSOLVER_STATUS_SUCCESS";
172 case CUSOLVER_STATUS_NOT_INITIALIZED:
173 return "CUSOLVER_STATUS_NOT_INITIALIZED";
174 case CUSOLVER_STATUS_ALLOC_FAILED:
175 return "CUSOLVER_STATUS_ALLOC_FAILED";
176 case CUSOLVER_STATUS_INVALID_VALUE:
177 return "CUSOLVER_STATUS_INVALID_VALUE";
178 case CUSOLVER_STATUS_ARCH_MISMATCH:
179 return "CUSOLVER_STATUS_ARCH_MISMATCH";
180 case CUSOLVER_STATUS_MAPPING_ERROR:
181 return "CUSOLVER_STATUS_MAPPING_ERROR";
182 case CUSOLVER_STATUS_EXECUTION_FAILED:
183 return "CUSOLVER_STATUS_EXECUTION_FAILED";
184 case CUSOLVER_STATUS_INTERNAL_ERROR:
185 return "CUSOLVER_STATUS_INTERNAL_ERROR";
186 case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
187 return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
188 case CUSOLVER_STATUS_NOT_SUPPORTED:
189 return "CUSOLVER_STATUS_NOT_SUPPORTED";
190 case CUSOLVER_STATUS_ZERO_PIVOT:
191 return "CUSOLVER_STATUS_ZERO_PIVOT";
192 case CUSOLVER_STATUS_INVALID_LICENSE:
193 return "CUSOLVER_STATUS_INVALID_LICENSE";
194 case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
195 return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
196 case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
197 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
198 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
199 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
200 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
201 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
202 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
203 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
204 case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
205 return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
206 case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
207 return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
208 case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
209 return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
210 case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
211 return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
212 case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
213 return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
214 case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
215 return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
216 case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
217 return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
218 case CUSOLVER_STATUS_INVALID_WORKSPACE:
219 return "CUSOLVER_STATUS_INVALID_WORKSPACE";
220 default:
221 return "CUSOLVER UNKNOWN ERROR";
222 }
223}
224
225/*******************************************************************************
226 * \brief Run given cusolver command and upon failure abort with a nice message.
227 * \author Ole Schuett
228 ******************************************************************************/
229#define CUSOLVER_CHECK(cmd) \
230 do { \
231 cusolverStatus_t status = cmd; \
232 if (status != CUSOLVER_STATUS_SUCCESS) { \
233 fprintf(stderr, "ERROR: %s %s %d\n", cusolverGetErrorString(status), \
234 __FILE__, __LINE__); \
235 abort(); \
236 } \
237 } while (0)
238
239/*******************************************************************************
240 * \brief Driver routine to diagonalize a matrix with the cuSOLVERMp library.
241 * \author Ole Schuett
242 ******************************************************************************/
243void cp_fm_diag_cusolver(const int fortran_comm, const int matrix_desc[9],
244 const int nprow, const int npcol, const int myprow,
245 const int mypcol, const int n, const double *matrix,
246 double *eigenvectors, double *eigenvalues) {
247
248 const int local_device = offload_get_chosen_device();
249
250 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
251 int rank, nranks;
252 MPI_Comm_rank(comm, &rank);
253 MPI_Comm_size(comm, &nranks);
254
255#if defined(__CUSOLVERMP_NCCL)
256 check_nccl_device_visibility(comm, local_device);
257#endif
259
260#if defined(__CUSOLVERMP_NCCL)
261 // Create NCCL communicator.
262 ncclUniqueId nccl_id;
263 if (rank == 0) {
264 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
265 }
266 MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, comm);
267
268 ncclComm_t nccl_comm;
269 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
270#else
271 // Create CAL communicator.
272 cal_comm_t cal_comm = NULL;
273 cal_comm_create_params_t params;
274 params.allgather = &allgather;
275 params.req_test = &req_test;
276 params.req_free = &req_free;
277 params.data = &comm;
278 params.rank = rank;
279 params.nranks = nranks;
280 params.local_device = local_device;
281 CAL_CHECK(cal_comm_create(params, &cal_comm));
282#endif
283
284 // Create various handles.
285 cudaStream_t stream = NULL;
286 CUDA_CHECK(cudaStreamCreate(&stream));
287
288 cusolverMpHandle_t cusolvermp_handle = NULL;
289 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
290
291 cusolverMpGrid_t grid = NULL;
292#if defined(__CUSOLVERMP_NCCL)
293 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, nccl_comm,
294 nprow, npcol,
295 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
296#else
297 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
298 nprow, npcol,
299 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
300#endif
301 const int mb = matrix_desc[4];
302 const int nb = matrix_desc[5];
303 const int rsrc = matrix_desc[6];
304 const int csrc = matrix_desc[7];
305 const int ldA = matrix_desc[8];
306 assert(rsrc == csrc);
307 assert(ldA >= 1);
308
309 const int np = cusolverMpNUMROC(n, mb, myprow, rsrc, nprow);
310 const int nq = cusolverMpNUMROC(n, nb, mypcol, csrc, npcol);
311 assert(np == ldA);
312
313 const cublasFillMode_t fill_mode = CUBLAS_FILL_MODE_UPPER;
314 const cudaDataType_t data_type = CUDA_R_64F; // double
315
316 cusolverMpMatrixDescriptor_t cusolvermp_matrix_desc = NULL;
317 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(
318 &cusolvermp_matrix_desc, grid, data_type, n, n, mb, nb, rsrc, csrc, np));
319
320 // Allocate workspaces.
321 size_t work_dev_size, work_host_size;
322 void *DUMMY = (void *)1; // Workaround to avoid crash when passing NULL.
323 CUSOLVER_CHECK(cusolverMpSyevd_bufferSize(
324 cusolvermp_handle, "V", fill_mode, n, DUMMY, 1, 1, cusolvermp_matrix_desc,
325 NULL, NULL, 1, 1, cusolvermp_matrix_desc, data_type, &work_dev_size,
326 &work_host_size));
327
328 double *work_dev = NULL;
329 CUDA_CHECK(cudaMalloc((void **)&work_dev, work_dev_size));
330
331 double *work_host = NULL;
332 CUDA_CHECK(cudaMallocHost((void **)&work_host, work_host_size));
333
334 // Upload input matrix.
335 const size_t matrix_local_size = ldA * nq * sizeof(double);
336 double *matrix_dev = NULL;
337 CUDA_CHECK(cudaMalloc((void **)&matrix_dev, matrix_local_size));
338 CUDA_CHECK(cudaMemcpyAsync(matrix_dev, matrix, matrix_local_size,
339 cudaMemcpyHostToDevice, stream));
340
341 // Allocate result buffers.
342 int *info_dev = NULL;
343 CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));
344
345 double *eigenvectors_dev = NULL;
346 CUDA_CHECK(cudaMalloc((void **)&eigenvectors_dev, matrix_local_size));
347
348 double *eigenvalues_dev = NULL;
349 CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
350
351 // Call solver.
352 CUSOLVER_CHECK(
353 cusolverMpSyevd(cusolvermp_handle, "V", fill_mode, n, matrix_dev, 1, 1,
354 cusolvermp_matrix_desc, eigenvalues_dev, eigenvectors_dev,
355 1, 1, cusolvermp_matrix_desc, data_type, work_dev,
356 work_dev_size, work_host, work_host_size, info_dev));
357
358 // Wait for solver to finish.
359 CUDA_CHECK(cudaStreamSynchronize(stream));
360#if !defined(__CUSOLVERMP_NCCL)
361 CAL_CHECK(cal_stream_sync(cal_comm, stream));
362#endif
363
364 // Check info.
365 int info = -1;
366 CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
367 assert(info == 0);
368
369 // Download results.
370 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, eigenvectors_dev, matrix_local_size,
371 cudaMemcpyDeviceToHost, stream));
372 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
373 cudaMemcpyDeviceToHost, stream));
374
375 // Wait for download to finish.
376 CUDA_CHECK(cudaStreamSynchronize(stream));
377
378 // Free buffers.
379 CUDA_CHECK(cudaFree(matrix_dev));
380 CUDA_CHECK(cudaFree(info_dev));
381 CUDA_CHECK(cudaFree(eigenvectors_dev));
382 CUDA_CHECK(cudaFree(eigenvalues_dev));
383 CUDA_CHECK(cudaFree(work_dev));
384 CUDA_CHECK(cudaFreeHost(work_host));
385
386 // Destroy handles.
387 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(cusolvermp_matrix_desc));
388 CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
389 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
390 CUDA_CHECK(cudaStreamDestroy(stream));
391#if defined(__CUSOLVERMP_NCCL)
392 NCCL_CHECK(ncclCommDestroy(nccl_comm));
393#else
394 CAL_CHECK(cal_comm_destroy(cal_comm));
395#endif
396
397 // Sync MPI ranks to include load imbalance in total timings.
398 MPI_Barrier(comm);
399}
400
401/*******************************************************************************
402 * \brief Driver routine to solve A*x = lambda*B*x with cuSOLVERMp sygvd.
403 * \author Jiri Vyskocil
404 ******************************************************************************/
405void cp_fm_diag_cusolver_sygvd(const int fortran_comm,
406 const int a_matrix_desc[9],
407 const int b_matrix_desc[9], const int nprow,
408 const int npcol, const int myprow,
409 const int mypcol, const int n,
410 const double *aMatrix, const double *bMatrix,
411 double *eigenvectors, double *eigenvalues) {
412
413 const int local_device = offload_get_chosen_device();
414
415 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
416 int rank, nranks;
417 MPI_Comm_rank(comm, &rank);
418 MPI_Comm_size(comm, &nranks);
419
420#if defined(__CUSOLVERMP_NCCL)
421 check_nccl_device_visibility(comm, local_device);
422#endif
424
425#if defined(__CUSOLVERMP_NCCL)
426 // Create NCCL communicator.
427 ncclUniqueId nccl_id;
428 if (rank == 0) {
429 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
430 }
431 MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, comm);
432
433 ncclComm_t nccl_comm;
434 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
435#else
436 // Create CAL communicator
437 cal_comm_t cal_comm = NULL;
438 cal_comm_create_params_t params;
439 params.allgather = &allgather;
440 params.req_test = &req_test;
441 params.req_free = &req_free;
442 params.data = &comm;
443 params.rank = rank;
444 params.nranks = nranks;
445 params.local_device = local_device;
446 CAL_CHECK(cal_comm_create(params, &cal_comm));
447#endif
448
449 // Create CUDA stream and cuSOLVER handle
450 cudaStream_t stream = NULL;
451 CUDA_CHECK(cudaStreamCreate(&stream));
452
453 cusolverMpHandle_t cusolvermp_handle = NULL;
454 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
455
456 // Define grid for device computation
457 cusolverMpGrid_t grid = NULL;
458#if defined(__CUSOLVERMP_NCCL)
459 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, nccl_comm,
460 nprow, npcol,
461 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
462#else
463 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
464 nprow, npcol,
465 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
466#endif
467
468 // Matrix descriptors for A, B, and Z
469 const int mb_a = a_matrix_desc[4];
470 const int nb_a = a_matrix_desc[5];
471 const int rsrc_a = a_matrix_desc[6];
472 const int csrc_a = a_matrix_desc[7];
473 const int ldA = a_matrix_desc[8];
474
475 const int mb_b = b_matrix_desc[4];
476 const int nb_b = b_matrix_desc[5];
477 const int rsrc_b = b_matrix_desc[6];
478 const int csrc_b = b_matrix_desc[7];
479 const int ldB = b_matrix_desc[8];
480
481 // Ensure consistency in block sizes, sources, and leading dimensions
482 assert(mb_a == mb_b && nb_a == nb_b);
483 assert(rsrc_a == rsrc_b && csrc_a == csrc_b);
484 assert(ldA == ldB);
485
486 const int np_a = cusolverMpNUMROC(n, mb_a, myprow, rsrc_a, nprow);
487 const int nq_a = cusolverMpNUMROC(n, nb_a, mypcol, csrc_a, npcol);
488
489 const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
490 const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;
491 const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
492 const cudaDataType_t data_type = CUDA_R_64F;
493
494 // Create matrix descriptors
495 cusolverMpMatrixDescriptor_t descrA = NULL;
496 cusolverMpMatrixDescriptor_t descrB = NULL;
497 cusolverMpMatrixDescriptor_t descrZ = NULL;
498
499 // Create matrix descriptors using ldA as local leading dimension (LLD)
500 // Note: We use ldA for all matrices. The assertion above verifies ldA == ldB.
501 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrA, grid, data_type, n, n,
502 mb_a, nb_a, rsrc_a, csrc_a, ldA));
503 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrB, grid, data_type, n, n,
504 mb_b, nb_b, rsrc_b, csrc_b, ldA));
505 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrZ, grid, data_type, n, n,
506 mb_a, nb_a, rsrc_a, csrc_a, ldA));
507
508 // Allocate device memory for matrices
509 double *dev_A = NULL, *dev_B = NULL;
510 size_t matrix_local_size = ldA * nq_a * sizeof(double);
511 CUDA_CHECK(cudaMalloc((void **)&dev_A, matrix_local_size));
512 CUDA_CHECK(cudaMalloc((void **)&dev_B, matrix_local_size));
513
514 // Copy matrices from host to device
515 CUDA_CHECK(cudaMemcpyAsync(dev_A, aMatrix, matrix_local_size,
516 cudaMemcpyHostToDevice, stream));
517 CUDA_CHECK(cudaMemcpyAsync(dev_B, bMatrix, matrix_local_size,
518 cudaMemcpyHostToDevice, stream));
519
520 // Allocate device memory for eigenvalues and eigenvectors
521 double *dev_Z = NULL, *eigenvalues_dev = NULL;
522 CUDA_CHECK(cudaMalloc((void **)&dev_Z, matrix_local_size));
523 CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
524
525 // Query workspace size
526 size_t work_dev_size = 0, work_host_size = 0;
527 const int64_t ia = 1, ja = 1, ib = 1, jb = 1, iz = 1, jz = 1;
528 const int64_t m = (int64_t)n;
529
530 cusolverStatus_t status_bufsize = cusolverMpSygvd_bufferSize(
531 cusolvermp_handle, itype, jobz, uplo, m, ia, ja, descrA, ib, jb, descrB,
532 iz, jz, descrZ, data_type, &work_dev_size, &work_host_size);
533 if (status_bufsize != CUSOLVER_STATUS_SUCCESS) {
534 fprintf(stderr, "ERROR: cusolverMpSygvd_bufferSize failed with status=%d\n",
535 (int)status_bufsize);
536 abort();
537 }
538
539 void *work_dev = NULL, *work_host = NULL;
540 CUDA_CHECK(cudaMalloc(&work_dev, work_dev_size));
541 CUDA_CHECK(cudaMallocHost(&work_host, work_host_size));
542
543 // Allocate and initialize device memory for info
544 int *info_dev = NULL;
545 CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));
546 CUDA_CHECK(cudaMemset(info_dev, 0, sizeof(int)));
547
548 // Call cusolverMpSygvd
549 cusolverStatus_t status_sygvd = cusolverMpSygvd(
550 cusolvermp_handle, itype, jobz, uplo, m, dev_A, ia, ja, descrA, dev_B, ib,
551 jb, descrB, eigenvalues_dev, dev_Z, iz, jz, descrZ, data_type, work_dev,
552 work_dev_size, work_host, work_host_size, info_dev);
553 if (status_sygvd != CUSOLVER_STATUS_SUCCESS) {
554 fprintf(stderr, "ERROR: cusolverMpSygvd failed with status=%d\n",
555 (int)status_sygvd);
556 abort();
557 }
558
559 // Wait for computation to finish
560 CUDA_CHECK(cudaStreamSynchronize(stream));
561#if !defined(__CUSOLVERMP_NCCL)
562 CAL_CHECK(cal_stream_sync(cal_comm, stream));
563#endif
564
565 // Check info
566 int info;
567 CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
568 if (info != 0) {
569 fprintf(stderr, "ERROR: cusolverMpSygvd failed with info = %d\n", info);
570 abort();
571 }
572
573 // Copy results back to host
574 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, dev_Z, matrix_local_size,
575 cudaMemcpyDeviceToHost, stream));
576 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
577 cudaMemcpyDeviceToHost, stream));
578
579 // Wait for copy to finish
580 CUDA_CHECK(cudaStreamSynchronize(stream));
581
582 // Clean up resources
583 CUDA_CHECK(cudaFree(dev_A));
584 CUDA_CHECK(cudaFree(dev_B));
585 CUDA_CHECK(cudaFree(dev_Z));
586 CUDA_CHECK(cudaFree(eigenvalues_dev));
587 CUDA_CHECK(cudaFree(info_dev));
588 CUDA_CHECK(cudaFree(work_dev));
589 CUDA_CHECK(cudaFreeHost(work_host));
590 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrA));
591 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrB));
592 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrZ));
593 CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
594 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
595 CUDA_CHECK(cudaStreamDestroy(stream));
596#if defined(__CUSOLVERMP_NCCL)
597 NCCL_CHECK(ncclCommDestroy(nccl_comm));
598#else
599 CAL_CHECK(cal_comm_destroy(cal_comm));
600#endif
601
602 MPI_Barrier(comm); // Synchronize MPI ranks
603}
604
605/*******************************************************************************
606 * \brief Driver routine to solve A*x = lambda*B*x with cuSOLVERMp hegvd.
607 ******************************************************************************/
608void cp_cfm_diag_cusolver_hegvd(
609 const int fortran_comm, const int a_matrix_desc[9],
610 const int b_matrix_desc[9], const int nprow, const int npcol,
611 const int myprow, const int mypcol, const int n,
612 const cuDoubleComplex *aMatrix, const cuDoubleComplex *bMatrix,
613 cuDoubleComplex *eigenvectors, double *eigenvalues) {
614
615 const int local_device = offload_get_chosen_device();
616
617 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
618 int rank, nranks;
619 MPI_Comm_rank(comm, &rank);
620 MPI_Comm_size(comm, &nranks);
621
622#if defined(__CUSOLVERMP_NCCL)
623 check_nccl_device_visibility(comm, local_device);
624#endif
626
627#if defined(__CUSOLVERMP_NCCL)
628 ncclUniqueId nccl_id;
629 if (rank == 0) {
630 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
631 }
632 MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, comm);
633
634 ncclComm_t nccl_comm;
635 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
636#else
637 cal_comm_t cal_comm = NULL;
638 cal_comm_create_params_t params;
639 params.allgather = &allgather;
640 params.req_test = &req_test;
641 params.req_free = &req_free;
642 params.data = &comm;
643 params.rank = rank;
644 params.nranks = nranks;
645 params.local_device = local_device;
646 CAL_CHECK(cal_comm_create(params, &cal_comm));
647#endif
648
649 cudaStream_t stream = NULL;
650 CUDA_CHECK(cudaStreamCreate(&stream));
651
652 cusolverMpHandle_t cusolvermp_handle = NULL;
653 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
654
655 cusolverMpGrid_t grid = NULL;
656#if defined(__CUSOLVERMP_NCCL)
657 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, nccl_comm,
658 nprow, npcol,
659 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
660#else
661 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
662 nprow, npcol,
663 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
664#endif
665
666 const int mb_a = a_matrix_desc[4];
667 const int nb_a = a_matrix_desc[5];
668 const int rsrc_a = a_matrix_desc[6];
669 const int csrc_a = a_matrix_desc[7];
670 const int ldA = a_matrix_desc[8];
671
672 const int mb_b = b_matrix_desc[4];
673 const int nb_b = b_matrix_desc[5];
674 const int rsrc_b = b_matrix_desc[6];
675 const int csrc_b = b_matrix_desc[7];
676 const int ldB = b_matrix_desc[8];
677
678 assert(mb_a == mb_b && nb_a == nb_b);
679 assert(rsrc_a == rsrc_b && csrc_a == csrc_b);
680 assert(ldA == ldB);
681
682 const int np_a = cusolverMpNUMROC(n, mb_a, myprow, rsrc_a, nprow);
683 const int nq_a = cusolverMpNUMROC(n, nb_a, mypcol, csrc_a, npcol);
684 assert(np_a == ldA);
685
686 const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
687 const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;
688 const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
689 const cudaDataType_t data_type = CUDA_C_64F;
690
691 cusolverMpMatrixDescriptor_t descrA = NULL;
692 cusolverMpMatrixDescriptor_t descrB = NULL;
693 cusolverMpMatrixDescriptor_t descrZ = NULL;
694
695 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrA, grid, data_type, n, n,
696 mb_a, nb_a, rsrc_a, csrc_a, ldA));
697 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrB, grid, data_type, n, n,
698 mb_b, nb_b, rsrc_b, csrc_b, ldA));
699 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrZ, grid, data_type, n, n,
700 mb_a, nb_a, rsrc_a, csrc_a, ldA));
701
702 cuDoubleComplex *dev_A = NULL, *dev_B = NULL;
703 size_t matrix_local_size = ldA * nq_a * sizeof(cuDoubleComplex);
704 CUDA_CHECK(cudaMalloc((void **)&dev_A, matrix_local_size));
705 CUDA_CHECK(cudaMalloc((void **)&dev_B, matrix_local_size));
706
707 CUDA_CHECK(cudaMemcpyAsync(dev_A, aMatrix, matrix_local_size,
708 cudaMemcpyHostToDevice, stream));
709 CUDA_CHECK(cudaMemcpyAsync(dev_B, bMatrix, matrix_local_size,
710 cudaMemcpyHostToDevice, stream));
711
712 cuDoubleComplex *dev_Z = NULL;
713 double *eigenvalues_dev = NULL;
714 CUDA_CHECK(cudaMalloc((void **)&dev_Z, matrix_local_size));
715 CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
716
717 size_t work_dev_size = 0, work_host_size = 0;
718 const int64_t ia = 1, ja = 1, ib = 1, jb = 1, iz = 1, jz = 1;
719 const int64_t m = (int64_t)n;
720
721 cusolverStatus_t status_bufsize = cusolverMpSygvd_bufferSize(
722 cusolvermp_handle, itype, jobz, uplo, m, ia, ja, descrA, ib, jb, descrB,
723 iz, jz, descrZ, data_type, &work_dev_size, &work_host_size);
724 if (status_bufsize != CUSOLVER_STATUS_SUCCESS) {
725 fprintf(stderr, "ERROR: cusolverMpSygvd_bufferSize failed with status=%d\n",
726 (int)status_bufsize);
727 abort();
728 }
729
730 void *work_dev = NULL, *work_host = NULL;
731 CUDA_CHECK(cudaMalloc(&work_dev, work_dev_size));
732 CUDA_CHECK(cudaMallocHost(&work_host, work_host_size));
733
734 int *info_dev = NULL;
735 CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));
736 CUDA_CHECK(cudaMemset(info_dev, 0, sizeof(int)));
737
738 cusolverStatus_t status_sygvd = cusolverMpSygvd(
739 cusolvermp_handle, itype, jobz, uplo, m, dev_A, ia, ja, descrA, dev_B, ib,
740 jb, descrB, eigenvalues_dev, dev_Z, iz, jz, descrZ, data_type, work_dev,
741 work_dev_size, work_host, work_host_size, info_dev);
742 if (status_sygvd != CUSOLVER_STATUS_SUCCESS) {
743 fprintf(stderr, "ERROR: cusolverMpSygvd failed with status=%d\n",
744 (int)status_sygvd);
745 abort();
746 }
747
748 CUDA_CHECK(cudaStreamSynchronize(stream));
749#if !defined(__CUSOLVERMP_NCCL)
750 CAL_CHECK(cal_stream_sync(cal_comm, stream));
751#endif
752
753 int info;
754 CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
755 if (info != 0) {
756 fprintf(stderr, "ERROR: cusolverMpSygvd failed with info = %d\n", info);
757 abort();
758 }
759
760 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, dev_Z, matrix_local_size,
761 cudaMemcpyDeviceToHost, stream));
762 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
763 cudaMemcpyDeviceToHost, stream));
764
765 CUDA_CHECK(cudaStreamSynchronize(stream));
766
767 CUDA_CHECK(cudaFree(dev_A));
768 CUDA_CHECK(cudaFree(dev_B));
769 CUDA_CHECK(cudaFree(dev_Z));
770 CUDA_CHECK(cudaFree(eigenvalues_dev));
771 CUDA_CHECK(cudaFree(info_dev));
772 CUDA_CHECK(cudaFree(work_dev));
773 CUDA_CHECK(cudaFreeHost(work_host));
774 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrA));
775 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrB));
776 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrZ));
777 CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
778 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
779 CUDA_CHECK(cudaStreamDestroy(stream));
780#if defined(__CUSOLVERMP_NCCL)
781 NCCL_CHECK(ncclCommDestroy(nccl_comm));
782#else
783 CAL_CHECK(cal_comm_destroy(cal_comm));
784#endif
785
786 MPI_Barrier(comm);
787}
788#endif
789
790// EOF
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
subroutine, public cp_fm_diag_cusolver(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the cuSOLVERMp library.
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
integer function, public offload_get_chosen_device()
Returns the chosen device.