8#if defined(__CUSOLVERMP)
10#include "../offload/offload_library.h"
13#include <cuda_runtime.h>
14#include <cusolverMp.h>
21#if defined(__CUSOLVERMP_NCCL)
31#define CUDA_CHECK(cmd) \
33 cudaError_t status = cmd; \
34 if (status != cudaSuccess) { \
35 fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), \
36 __FILE__, __LINE__); \
41#if defined(__CUSOLVERMP_NCCL)
46#define NCCL_CHECK(cmd) \
48 ncclResult_t status = cmd; \
49 if (status != ncclSuccess) { \
50 fprintf(stderr, "ERROR: %s %s %d\n", ncclGetErrorString(status), \
51 __FILE__, __LINE__); \
59static void check_nccl_device_visibility(MPI_Comm comm,
60 const int local_device) {
62 CUDA_CHECK(cudaGetDeviceCount(&device_count));
65 MPI_Comm_rank(comm, &rank);
67 if (device_count <= 0) {
69 "ERROR: cuSOLVERMp with NCCL requires at least one visible GPU. "
70 "No CUDA device is visible to MPI rank %d.\n",
72 MPI_Abort(comm, EXIT_FAILURE);
75 if (local_device < 0 || local_device >= device_count) {
77 "ERROR: cuSOLVERMp with NCCL selected invalid CUDA device %d on "
78 "MPI rank %d; %d CUDA device(s) are visible to this rank.\n",
79 local_device, rank, device_count);
80 MPI_Abort(comm, EXIT_FAILURE);
89static char *calGetErrorString(calError_t status) {
95 case CAL_ERROR_INVALID_PARAMETER:
96 return "CAL_ERROR_INVALID_PARAMETER";
97 case CAL_ERROR_INTERNAL:
98 return "CAL_ERROR_INTERNAL";
100 return "CAL_ERROR_CUDA";
102 return "CAL_ERROR_UCC";
103 case CAL_ERROR_NOT_SUPPORTED:
104 return "CAL_ERROR_NOT_SUPPORTED";
105 case CAL_ERROR_INPROGRESS:
106 return "CAL_ERROR_INPROGRESS";
108 return "CAL UNKNOWN ERROR";
116#define CAL_CHECK(cmd) \
118 calError_t status = cmd; \
119 if (status != CAL_OK) { \
120 fprintf(stderr, "ERROR: %s %s %d\n", calGetErrorString(status), \
121 __FILE__, __LINE__); \
130static calError_t allgather(
void *src_buf,
void *recv_buf,
size_t size,
131 void *data,
void **req) {
132 const MPI_Comm comm = *(MPI_Comm *)data;
133 MPI_Request *request = malloc(
sizeof(MPI_Request));
135 const int status = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size,
136 MPI_BYTE, comm, request);
137 return (status == MPI_SUCCESS) ? CAL_OK : CAL_ERROR;
144static calError_t req_test(
void *req) {
145 MPI_Request *request = (MPI_Request *)(req);
147 const int status = MPI_Test(request, &completed, MPI_STATUS_IGNORE);
148 if (status != MPI_SUCCESS) {
151 return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
158static calError_t req_free(
void *req) {
168static char *cusolverGetErrorString(cusolverStatus_t status) {
170 case CUSOLVER_STATUS_SUCCESS:
171 return "CUSOLVER_STATUS_SUCCESS";
172 case CUSOLVER_STATUS_NOT_INITIALIZED:
173 return "CUSOLVER_STATUS_NOT_INITIALIZED";
174 case CUSOLVER_STATUS_ALLOC_FAILED:
175 return "CUSOLVER_STATUS_ALLOC_FAILED";
176 case CUSOLVER_STATUS_INVALID_VALUE:
177 return "CUSOLVER_STATUS_INVALID_VALUE";
178 case CUSOLVER_STATUS_ARCH_MISMATCH:
179 return "CUSOLVER_STATUS_ARCH_MISMATCH";
180 case CUSOLVER_STATUS_MAPPING_ERROR:
181 return "CUSOLVER_STATUS_MAPPING_ERROR";
182 case CUSOLVER_STATUS_EXECUTION_FAILED:
183 return "CUSOLVER_STATUS_EXECUTION_FAILED";
184 case CUSOLVER_STATUS_INTERNAL_ERROR:
185 return "CUSOLVER_STATUS_INTERNAL_ERROR";
186 case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
187 return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
188 case CUSOLVER_STATUS_NOT_SUPPORTED:
189 return "CUSOLVER_STATUS_NOT_SUPPORTED";
190 case CUSOLVER_STATUS_ZERO_PIVOT:
191 return "CUSOLVER_STATUS_ZERO_PIVOT";
192 case CUSOLVER_STATUS_INVALID_LICENSE:
193 return "CUSOLVER_STATUS_INVALID_LICENSE";
194 case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
195 return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
196 case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
197 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
198 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
199 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
200 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
201 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
202 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
203 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
204 case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
205 return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
206 case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
207 return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
208 case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
209 return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
210 case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
211 return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
212 case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
213 return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
214 case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
215 return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
216 case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
217 return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
218 case CUSOLVER_STATUS_INVALID_WORKSPACE:
219 return "CUSOLVER_STATUS_INVALID_WORKSPACE";
221 return "CUSOLVER UNKNOWN ERROR";
229#define CUSOLVER_CHECK(cmd) \
231 cusolverStatus_t status = cmd; \
232 if (status != CUSOLVER_STATUS_SUCCESS) { \
233 fprintf(stderr, "ERROR: %s %s %d\n", cusolverGetErrorString(status), \
234 __FILE__, __LINE__); \
244 const int nprow,
const int npcol,
const int myprow,
245 const int mypcol,
const int n,
const double *matrix,
246 double *eigenvectors,
double *eigenvalues) {
250 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
252 MPI_Comm_rank(comm, &rank);
253 MPI_Comm_size(comm, &nranks);
255#if defined(__CUSOLVERMP_NCCL)
256 check_nccl_device_visibility(comm, local_device);
260#if defined(__CUSOLVERMP_NCCL)
262 ncclUniqueId nccl_id;
264 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
266 MPI_Bcast(&nccl_id,
sizeof(nccl_id), MPI_BYTE, 0, comm);
268 ncclComm_t nccl_comm;
269 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
272 cal_comm_t cal_comm = NULL;
273 cal_comm_create_params_t params;
274 params.allgather = &allgather;
275 params.req_test = &req_test;
276 params.req_free = &req_free;
279 params.nranks = nranks;
280 params.local_device = local_device;
281 CAL_CHECK(cal_comm_create(params, &cal_comm));
285 cudaStream_t stream = NULL;
286 CUDA_CHECK(cudaStreamCreate(&stream));
288 cusolverMpHandle_t cusolvermp_handle = NULL;
289 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
291 cusolverMpGrid_t
grid = NULL;
292#if defined(__CUSOLVERMP_NCCL)
293 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, nccl_comm,
295 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
297 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, cal_comm,
299 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
301 const int mb = matrix_desc[4];
302 const int nb = matrix_desc[5];
303 const int rsrc = matrix_desc[6];
304 const int csrc = matrix_desc[7];
305 const int ldA = matrix_desc[8];
306 assert(rsrc == csrc);
309 const int np = cusolverMpNUMROC(n, mb, myprow, rsrc, nprow);
310 const int nq = cusolverMpNUMROC(n, nb, mypcol, csrc, npcol);
313 const cublasFillMode_t fill_mode = CUBLAS_FILL_MODE_UPPER;
314 const cudaDataType_t data_type = CUDA_R_64F;
316 cusolverMpMatrixDescriptor_t cusolvermp_matrix_desc = NULL;
317 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(
318 &cusolvermp_matrix_desc,
grid, data_type, n, n, mb, nb, rsrc, csrc, np));
321 size_t work_dev_size, work_host_size;
322 void *DUMMY = (
void *)1;
323 CUSOLVER_CHECK(cusolverMpSyevd_bufferSize(
324 cusolvermp_handle,
"V", fill_mode, n, DUMMY, 1, 1, cusolvermp_matrix_desc,
325 NULL, NULL, 1, 1, cusolvermp_matrix_desc, data_type, &work_dev_size,
328 double *work_dev = NULL;
329 CUDA_CHECK(cudaMalloc((
void **)&work_dev, work_dev_size));
331 double *work_host = NULL;
332 CUDA_CHECK(cudaMallocHost((
void **)&work_host, work_host_size));
335 const size_t matrix_local_size = ldA * nq *
sizeof(double);
336 double *matrix_dev = NULL;
337 CUDA_CHECK(cudaMalloc((
void **)&matrix_dev, matrix_local_size));
338 CUDA_CHECK(cudaMemcpyAsync(matrix_dev, matrix, matrix_local_size,
339 cudaMemcpyHostToDevice, stream));
342 int *info_dev = NULL;
343 CUDA_CHECK(cudaMalloc((
void **)&info_dev,
sizeof(
int)));
345 double *eigenvectors_dev = NULL;
346 CUDA_CHECK(cudaMalloc((
void **)&eigenvectors_dev, matrix_local_size));
348 double *eigenvalues_dev = NULL;
349 CUDA_CHECK(cudaMalloc((
void **)&eigenvalues_dev, n *
sizeof(
double)));
353 cusolverMpSyevd(cusolvermp_handle,
"V", fill_mode, n, matrix_dev, 1, 1,
354 cusolvermp_matrix_desc, eigenvalues_dev, eigenvectors_dev,
355 1, 1, cusolvermp_matrix_desc, data_type, work_dev,
356 work_dev_size, work_host, work_host_size, info_dev));
359 CUDA_CHECK(cudaStreamSynchronize(stream));
360#if !defined(__CUSOLVERMP_NCCL)
361 CAL_CHECK(cal_stream_sync(cal_comm, stream));
366 CUDA_CHECK(cudaMemcpy(&info, info_dev,
sizeof(
int), cudaMemcpyDeviceToHost));
370 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, eigenvectors_dev, matrix_local_size,
371 cudaMemcpyDeviceToHost, stream));
372 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n *
sizeof(
double),
373 cudaMemcpyDeviceToHost, stream));
376 CUDA_CHECK(cudaStreamSynchronize(stream));
379 CUDA_CHECK(cudaFree(matrix_dev));
380 CUDA_CHECK(cudaFree(info_dev));
381 CUDA_CHECK(cudaFree(eigenvectors_dev));
382 CUDA_CHECK(cudaFree(eigenvalues_dev));
383 CUDA_CHECK(cudaFree(work_dev));
384 CUDA_CHECK(cudaFreeHost(work_host));
387 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(cusolvermp_matrix_desc));
388 CUSOLVER_CHECK(cusolverMpDestroyGrid(
grid));
389 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
390 CUDA_CHECK(cudaStreamDestroy(stream));
391#if defined(__CUSOLVERMP_NCCL)
392 NCCL_CHECK(ncclCommDestroy(nccl_comm));
394 CAL_CHECK(cal_comm_destroy(cal_comm));
405void cp_fm_diag_cusolver_sygvd(
const int fortran_comm,
406 const int a_matrix_desc[9],
407 const int b_matrix_desc[9],
const int nprow,
408 const int npcol,
const int myprow,
409 const int mypcol,
const int n,
410 const double *aMatrix,
const double *bMatrix,
411 double *eigenvectors,
double *eigenvalues) {
415 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
417 MPI_Comm_rank(comm, &rank);
418 MPI_Comm_size(comm, &nranks);
420#if defined(__CUSOLVERMP_NCCL)
421 check_nccl_device_visibility(comm, local_device);
425#if defined(__CUSOLVERMP_NCCL)
427 ncclUniqueId nccl_id;
429 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
431 MPI_Bcast(&nccl_id,
sizeof(nccl_id), MPI_BYTE, 0, comm);
433 ncclComm_t nccl_comm;
434 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
437 cal_comm_t cal_comm = NULL;
438 cal_comm_create_params_t params;
439 params.allgather = &allgather;
440 params.req_test = &req_test;
441 params.req_free = &req_free;
444 params.nranks = nranks;
445 params.local_device = local_device;
446 CAL_CHECK(cal_comm_create(params, &cal_comm));
450 cudaStream_t stream = NULL;
451 CUDA_CHECK(cudaStreamCreate(&stream));
453 cusolverMpHandle_t cusolvermp_handle = NULL;
454 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
457 cusolverMpGrid_t
grid = NULL;
458#if defined(__CUSOLVERMP_NCCL)
459 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, nccl_comm,
461 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
463 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, cal_comm,
465 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
469 const int mb_a = a_matrix_desc[4];
470 const int nb_a = a_matrix_desc[5];
471 const int rsrc_a = a_matrix_desc[6];
472 const int csrc_a = a_matrix_desc[7];
473 const int ldA = a_matrix_desc[8];
475 const int mb_b = b_matrix_desc[4];
476 const int nb_b = b_matrix_desc[5];
477 const int rsrc_b = b_matrix_desc[6];
478 const int csrc_b = b_matrix_desc[7];
479 const int ldB = b_matrix_desc[8];
482 assert(mb_a == mb_b && nb_a == nb_b);
483 assert(rsrc_a == rsrc_b && csrc_a == csrc_b);
486 const int np_a = cusolverMpNUMROC(n, mb_a, myprow, rsrc_a, nprow);
487 const int nq_a = cusolverMpNUMROC(n, nb_a, mypcol, csrc_a, npcol);
489 const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
490 const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;
491 const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
492 const cudaDataType_t data_type = CUDA_R_64F;
495 cusolverMpMatrixDescriptor_t descrA = NULL;
496 cusolverMpMatrixDescriptor_t descrB = NULL;
497 cusolverMpMatrixDescriptor_t descrZ = NULL;
501 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrA,
grid, data_type, n, n,
502 mb_a, nb_a, rsrc_a, csrc_a, ldA));
503 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrB,
grid, data_type, n, n,
504 mb_b, nb_b, rsrc_b, csrc_b, ldA));
505 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrZ,
grid, data_type, n, n,
506 mb_a, nb_a, rsrc_a, csrc_a, ldA));
509 double *dev_A = NULL, *dev_B = NULL;
510 size_t matrix_local_size = ldA * nq_a *
sizeof(double);
511 CUDA_CHECK(cudaMalloc((
void **)&dev_A, matrix_local_size));
512 CUDA_CHECK(cudaMalloc((
void **)&dev_B, matrix_local_size));
515 CUDA_CHECK(cudaMemcpyAsync(dev_A, aMatrix, matrix_local_size,
516 cudaMemcpyHostToDevice, stream));
517 CUDA_CHECK(cudaMemcpyAsync(dev_B, bMatrix, matrix_local_size,
518 cudaMemcpyHostToDevice, stream));
521 double *dev_Z = NULL, *eigenvalues_dev = NULL;
522 CUDA_CHECK(cudaMalloc((
void **)&dev_Z, matrix_local_size));
523 CUDA_CHECK(cudaMalloc((
void **)&eigenvalues_dev, n *
sizeof(
double)));
526 size_t work_dev_size = 0, work_host_size = 0;
527 const int64_t ia = 1, ja = 1, ib = 1, jb = 1, iz = 1, jz = 1;
528 const int64_t m = (int64_t)n;
530 cusolverStatus_t status_bufsize = cusolverMpSygvd_bufferSize(
531 cusolvermp_handle, itype, jobz, uplo, m, ia, ja, descrA, ib, jb, descrB,
532 iz, jz, descrZ, data_type, &work_dev_size, &work_host_size);
533 if (status_bufsize != CUSOLVER_STATUS_SUCCESS) {
534 fprintf(stderr,
"ERROR: cusolverMpSygvd_bufferSize failed with status=%d\n",
535 (
int)status_bufsize);
539 void *work_dev = NULL, *work_host = NULL;
540 CUDA_CHECK(cudaMalloc(&work_dev, work_dev_size));
541 CUDA_CHECK(cudaMallocHost(&work_host, work_host_size));
544 int *info_dev = NULL;
545 CUDA_CHECK(cudaMalloc((
void **)&info_dev,
sizeof(
int)));
546 CUDA_CHECK(cudaMemset(info_dev, 0,
sizeof(
int)));
549 cusolverStatus_t status_sygvd = cusolverMpSygvd(
550 cusolvermp_handle, itype, jobz, uplo, m, dev_A, ia, ja, descrA, dev_B, ib,
551 jb, descrB, eigenvalues_dev, dev_Z, iz, jz, descrZ, data_type, work_dev,
552 work_dev_size, work_host, work_host_size, info_dev);
553 if (status_sygvd != CUSOLVER_STATUS_SUCCESS) {
554 fprintf(stderr,
"ERROR: cusolverMpSygvd failed with status=%d\n",
560 CUDA_CHECK(cudaStreamSynchronize(stream));
561#if !defined(__CUSOLVERMP_NCCL)
562 CAL_CHECK(cal_stream_sync(cal_comm, stream));
567 CUDA_CHECK(cudaMemcpy(&info, info_dev,
sizeof(
int), cudaMemcpyDeviceToHost));
569 fprintf(stderr,
"ERROR: cusolverMpSygvd failed with info = %d\n", info);
574 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, dev_Z, matrix_local_size,
575 cudaMemcpyDeviceToHost, stream));
576 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n *
sizeof(
double),
577 cudaMemcpyDeviceToHost, stream));
580 CUDA_CHECK(cudaStreamSynchronize(stream));
583 CUDA_CHECK(cudaFree(dev_A));
584 CUDA_CHECK(cudaFree(dev_B));
585 CUDA_CHECK(cudaFree(dev_Z));
586 CUDA_CHECK(cudaFree(eigenvalues_dev));
587 CUDA_CHECK(cudaFree(info_dev));
588 CUDA_CHECK(cudaFree(work_dev));
589 CUDA_CHECK(cudaFreeHost(work_host));
590 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrA));
591 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrB));
592 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrZ));
593 CUSOLVER_CHECK(cusolverMpDestroyGrid(
grid));
594 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
595 CUDA_CHECK(cudaStreamDestroy(stream));
596#if defined(__CUSOLVERMP_NCCL)
597 NCCL_CHECK(ncclCommDestroy(nccl_comm));
599 CAL_CHECK(cal_comm_destroy(cal_comm));
608void cp_cfm_diag_cusolver_hegvd(
609 const int fortran_comm,
const int a_matrix_desc[9],
610 const int b_matrix_desc[9],
const int nprow,
const int npcol,
611 const int myprow,
const int mypcol,
const int n,
612 const cuDoubleComplex *aMatrix,
const cuDoubleComplex *bMatrix,
613 cuDoubleComplex *eigenvectors,
double *eigenvalues) {
617 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
619 MPI_Comm_rank(comm, &rank);
620 MPI_Comm_size(comm, &nranks);
622#if defined(__CUSOLVERMP_NCCL)
623 check_nccl_device_visibility(comm, local_device);
627#if defined(__CUSOLVERMP_NCCL)
628 ncclUniqueId nccl_id;
630 NCCL_CHECK(ncclGetUniqueId(&nccl_id));
632 MPI_Bcast(&nccl_id,
sizeof(nccl_id), MPI_BYTE, 0, comm);
634 ncclComm_t nccl_comm;
635 NCCL_CHECK(ncclCommInitRank(&nccl_comm, nranks, nccl_id, rank));
637 cal_comm_t cal_comm = NULL;
638 cal_comm_create_params_t params;
639 params.allgather = &allgather;
640 params.req_test = &req_test;
641 params.req_free = &req_free;
644 params.nranks = nranks;
645 params.local_device = local_device;
646 CAL_CHECK(cal_comm_create(params, &cal_comm));
649 cudaStream_t stream = NULL;
650 CUDA_CHECK(cudaStreamCreate(&stream));
652 cusolverMpHandle_t cusolvermp_handle = NULL;
653 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
655 cusolverMpGrid_t
grid = NULL;
656#if defined(__CUSOLVERMP_NCCL)
657 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, nccl_comm,
659 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
661 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &
grid, cal_comm,
663 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
666 const int mb_a = a_matrix_desc[4];
667 const int nb_a = a_matrix_desc[5];
668 const int rsrc_a = a_matrix_desc[6];
669 const int csrc_a = a_matrix_desc[7];
670 const int ldA = a_matrix_desc[8];
672 const int mb_b = b_matrix_desc[4];
673 const int nb_b = b_matrix_desc[5];
674 const int rsrc_b = b_matrix_desc[6];
675 const int csrc_b = b_matrix_desc[7];
676 const int ldB = b_matrix_desc[8];
678 assert(mb_a == mb_b && nb_a == nb_b);
679 assert(rsrc_a == rsrc_b && csrc_a == csrc_b);
682 const int np_a = cusolverMpNUMROC(n, mb_a, myprow, rsrc_a, nprow);
683 const int nq_a = cusolverMpNUMROC(n, nb_a, mypcol, csrc_a, npcol);
686 const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
687 const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;
688 const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
689 const cudaDataType_t data_type = CUDA_C_64F;
691 cusolverMpMatrixDescriptor_t descrA = NULL;
692 cusolverMpMatrixDescriptor_t descrB = NULL;
693 cusolverMpMatrixDescriptor_t descrZ = NULL;
695 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrA,
grid, data_type, n, n,
696 mb_a, nb_a, rsrc_a, csrc_a, ldA));
697 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrB,
grid, data_type, n, n,
698 mb_b, nb_b, rsrc_b, csrc_b, ldA));
699 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrZ,
grid, data_type, n, n,
700 mb_a, nb_a, rsrc_a, csrc_a, ldA));
702 cuDoubleComplex *dev_A = NULL, *dev_B = NULL;
703 size_t matrix_local_size = ldA * nq_a *
sizeof(cuDoubleComplex);
704 CUDA_CHECK(cudaMalloc((
void **)&dev_A, matrix_local_size));
705 CUDA_CHECK(cudaMalloc((
void **)&dev_B, matrix_local_size));
707 CUDA_CHECK(cudaMemcpyAsync(dev_A, aMatrix, matrix_local_size,
708 cudaMemcpyHostToDevice, stream));
709 CUDA_CHECK(cudaMemcpyAsync(dev_B, bMatrix, matrix_local_size,
710 cudaMemcpyHostToDevice, stream));
712 cuDoubleComplex *dev_Z = NULL;
713 double *eigenvalues_dev = NULL;
714 CUDA_CHECK(cudaMalloc((
void **)&dev_Z, matrix_local_size));
715 CUDA_CHECK(cudaMalloc((
void **)&eigenvalues_dev, n *
sizeof(
double)));
717 size_t work_dev_size = 0, work_host_size = 0;
718 const int64_t ia = 1, ja = 1, ib = 1, jb = 1, iz = 1, jz = 1;
719 const int64_t m = (int64_t)n;
721 cusolverStatus_t status_bufsize = cusolverMpSygvd_bufferSize(
722 cusolvermp_handle, itype, jobz, uplo, m, ia, ja, descrA, ib, jb, descrB,
723 iz, jz, descrZ, data_type, &work_dev_size, &work_host_size);
724 if (status_bufsize != CUSOLVER_STATUS_SUCCESS) {
725 fprintf(stderr,
"ERROR: cusolverMpSygvd_bufferSize failed with status=%d\n",
726 (
int)status_bufsize);
730 void *work_dev = NULL, *work_host = NULL;
731 CUDA_CHECK(cudaMalloc(&work_dev, work_dev_size));
732 CUDA_CHECK(cudaMallocHost(&work_host, work_host_size));
734 int *info_dev = NULL;
735 CUDA_CHECK(cudaMalloc((
void **)&info_dev,
sizeof(
int)));
736 CUDA_CHECK(cudaMemset(info_dev, 0,
sizeof(
int)));
738 cusolverStatus_t status_sygvd = cusolverMpSygvd(
739 cusolvermp_handle, itype, jobz, uplo, m, dev_A, ia, ja, descrA, dev_B, ib,
740 jb, descrB, eigenvalues_dev, dev_Z, iz, jz, descrZ, data_type, work_dev,
741 work_dev_size, work_host, work_host_size, info_dev);
742 if (status_sygvd != CUSOLVER_STATUS_SUCCESS) {
743 fprintf(stderr,
"ERROR: cusolverMpSygvd failed with status=%d\n",
748 CUDA_CHECK(cudaStreamSynchronize(stream));
749#if !defined(__CUSOLVERMP_NCCL)
750 CAL_CHECK(cal_stream_sync(cal_comm, stream));
754 CUDA_CHECK(cudaMemcpy(&info, info_dev,
sizeof(
int), cudaMemcpyDeviceToHost));
756 fprintf(stderr,
"ERROR: cusolverMpSygvd failed with info = %d\n", info);
760 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, dev_Z, matrix_local_size,
761 cudaMemcpyDeviceToHost, stream));
762 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n *
sizeof(
double),
763 cudaMemcpyDeviceToHost, stream));
765 CUDA_CHECK(cudaStreamSynchronize(stream));
767 CUDA_CHECK(cudaFree(dev_A));
768 CUDA_CHECK(cudaFree(dev_B));
769 CUDA_CHECK(cudaFree(dev_Z));
770 CUDA_CHECK(cudaFree(eigenvalues_dev));
771 CUDA_CHECK(cudaFree(info_dev));
772 CUDA_CHECK(cudaFree(work_dev));
773 CUDA_CHECK(cudaFreeHost(work_host));
774 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrA));
775 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrB));
776 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrZ));
777 CUSOLVER_CHECK(cusolverMpDestroyGrid(
grid));
778 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
779 CUDA_CHECK(cudaStreamDestroy(stream));
780#if defined(__CUSOLVERMP_NCCL)
781 NCCL_CHECK(ncclCommDestroy(nccl_comm));
783 CAL_CHECK(cal_comm_destroy(cal_comm));
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
subroutine, public cp_fm_diag_cusolver(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the cuSOLVERMp library.
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
integer function, public offload_get_chosen_device()
Returns the chosen device.