#if defined(__CUSOLVERMP)
#include "../offload/offload_library.h"
#include <assert.h>
#include <cal.h>
#include <cuda_runtime.h>
#include <cusolverMp.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
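
/*******************************************************************************
 * \brief Run given CUDA command and upon failure abort with a nice message.
 ******************************************************************************/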
#define CUDA_CHECK(cmd)                                                        \
  do {                                                                         \
    cudaError_t status = cmd;                                                  \
    if (status != cudaSuccess) {                                               \
      fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status),         \
              __FILE__, __LINE__);                                             \
      abort();                                                                 \
    }                                                                          \
  } while (0)
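
/*******************************************************************************
 * \brief Decode given CAL error into a human-readable string.
 ******************************************************************************/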
static char *calGetErrorString(calError_t status) {
  switch (status) {
  case CAL_OK:
    return "CAL_OK";
  case CAL_ERROR:
    return "CAL_ERROR";
  case CAL_ERROR_INVALID_PARAMETER:
    return "CAL_ERROR_INVALID_PARAMETER";
  case CAL_ERROR_INTERNAL:
    return "CAL_ERROR_INTERNAL";
  case CAL_ERROR_CUDA:
    return "CAL_ERROR_CUDA";
  case CAL_ERROR_UCC:
    return "CAL_ERROR_UCC";
  case CAL_ERROR_NOT_SUPPORTED:
    return "CAL_ERROR_NOT_SUPPORTED";
  case CAL_ERROR_INPROGRESS:
    return "CAL_ERROR_INPROGRESS";
  default:
    return "CAL UNKNOWN ERROR";
  }
}
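
/*******************************************************************************
 * \brief Run given CAL command and upon failure abort with a nice message.
 ******************************************************************************/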
#define CAL_CHECK(cmd)                                                         \
  do {                                                                         \
    calError_t status = cmd;                                                   \
    if (status != CAL_OK) {                                                    \
      fprintf(stderr, "ERROR: %s %s %d\n", calGetErrorString(status),          \
              __FILE__, __LINE__);                                             \
      abort();                                                                 \
    }                                                                          \
  } while (0)
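
/*******************************************************************************
 * \brief Decode given cuSOLVER error into a human-readable string.
 ******************************************************************************/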
static char *cusolverGetErrorString(cusolverStatus_t status) {
  switch (status) {
  case CUSOLVER_STATUS_SUCCESS:
    return "CUSOLVER_STATUS_SUCCESS";
  case CUSOLVER_STATUS_NOT_INITIALIZED:
    return "CUSOLVER_STATUS_NOT_INITIALIZED";
  case CUSOLVER_STATUS_ALLOC_FAILED:
    return "CUSOLVER_STATUS_ALLOC_FAILED";
  case CUSOLVER_STATUS_INVALID_VALUE:
    return "CUSOLVER_STATUS_INVALID_VALUE";
  case CUSOLVER_STATUS_ARCH_MISMATCH:
    return "CUSOLVER_STATUS_ARCH_MISMATCH";
  case CUSOLVER_STATUS_MAPPING_ERROR:
    return "CUSOLVER_STATUS_MAPPING_ERROR";
  case CUSOLVER_STATUS_EXECUTION_FAILED:
    return "CUSOLVER_STATUS_EXECUTION_FAILED";
  case CUSOLVER_STATUS_INTERNAL_ERROR:
    return "CUSOLVER_STATUS_INTERNAL_ERROR";
  case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
    return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
  case CUSOLVER_STATUS_NOT_SUPPORTED:
    return "CUSOLVER_STATUS_NOT_SUPPORTED";
  case CUSOLVER_STATUS_ZERO_PIVOT:
    return "CUSOLVER_STATUS_ZERO_PIVOT";
  case CUSOLVER_STATUS_INVALID_LICENSE:
    return "CUSOLVER_STATUS_INVALID_LICENSE";
  case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
    return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
  case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
    return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
  case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
    return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
  case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
    return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
  case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
    return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
  case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
    return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
  case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
    return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
  case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
    return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
  case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
    return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
  case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
    return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
  case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
    return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
  case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
    return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
  case CUSOLVER_STATUS_INVALID_WORKSPACE:
    return "CUSOLVER_STATUS_INVALID_WORKSPACE";
  default:
    return "CUSOLVER UNKNOWN ERROR";
  }
}
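
/*******************************************************************************
 * \brief Run given cuSOLVER command and upon failure abort with a nice message.
 ******************************************************************************/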
#define CUSOLVER_CHECK(cmd)                                                    \
  do {                                                                         \
    cusolverStatus_t status = cmd;                                             \
    if (status != CUSOLVER_STATUS_SUCCESS) {                                   \
      fprintf(stderr, "ERROR: %s %s %d\n", cusolverGetErrorString(status),     \
              __FILE__, __LINE__);                                             \
      abort();                                                                 \
    }                                                                          \
  } while (0)
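
/*******************************************************************************
 * \brief Callback for cal_comm_create: non-blocking allgather via MPI.
 ******************************************************************************/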
static calError_t allgather(void *src_buf, void *recv_buf, size_t size,
                            void *data, void **req) {
  const MPI_Comm comm = *(MPI_Comm *)data;
  MPI_Request *request = malloc(sizeof(MPI_Request));
  *req = request;
  const int status = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size,
                                    MPI_BYTE, comm, request);
  return (status == MPI_SUCCESS) ? CAL_OK : CAL_ERROR;
}
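
/*******************************************************************************
 * \brief Callback for cal_comm_create: test a pending request via MPI_Test.
 ******************************************************************************/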
static calError_t req_test(void *req) {
  MPI_Request *request = (MPI_Request *)(req);
  int completed;
  const int status = MPI_Test(request, &completed, MPI_STATUS_IGNORE);
  if (status != MPI_SUCCESS) {
    return CAL_ERROR;
  }
  return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
}
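
/*******************************************************************************
 * \brief Callback for cal_comm_create: free a completed request.
 ******************************************************************************/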
static calError_t req_free(void *req) {
  free(req);
  return CAL_OK;
}
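
/*******************************************************************************
 * \brief Driver routine to diagonalize a FM matrix with the cuSOLVERMp library.
 ******************************************************************************/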
void cp_fm_diag_cusolver(const int fortran_comm, const int matrix_desc[9],
                         const int nprow, const int npcol, const int myprow,
                         const int mypcol, const int n, const double *matrix,
                         double *eigenvectors, double *eigenvalues) {
  MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
  int rank, nranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nranks);

  // Activate the GPU that was selected via offload_set_chosen_device().
  offload_activate_chosen_device();
  const int local_device = offload_get_chosen_device();
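
  // Create a CAL communicator on top of the given MPI communicator.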
  cal_comm_t cal_comm = NULL;
  cal_comm_create_params_t params;
  params.allgather = &allgather;
  params.req_test = &req_test;
  params.req_free = &req_free;
  params.data = (void *)&comm;
  params.rank = rank;
  params.nranks = nranks;
  params.local_device = local_device;
  CAL_CHECK(cal_comm_create(params, &cal_comm));
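
  // Create the CUDA stream, the cuSOLVERMp handle, and the 2D process grid.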
  cudaStream_t stream = NULL;
  CUDA_CHECK(cudaStreamCreate(&stream));

  cusolverMpHandle_t cusolvermp_handle = NULL;
  CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));

  cusolverMpGrid_t grid = NULL;
  CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
                                            nprow, npcol,
                                            CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
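
  // Decode the ScaLAPACK array descriptor: blocking factors, source process,
  // and local leading dimension.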
  const int mb = matrix_desc[4];
  const int nb = matrix_desc[5];
  const int rsrc = matrix_desc[6];
  const int csrc = matrix_desc[7];
  const int ldA = matrix_desc[8];
  assert(rsrc == csrc);

  const int np = cusolverMpNUMROC(n, mb, myprow, rsrc, nprow);
  const int nq = cusolverMpNUMROC(n, nb, mypcol, csrc, npcol);
  const cublasFillMode_t fill_mode = CUBLAS_FILL_MODE_UPPER;
  const cudaDataType_t data_type = CUDA_R_64F;

  cusolverMpMatrixDescriptor_t cusolvermp_matrix_desc = NULL;
  CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&cusolvermp_matrix_desc, grid,
                                            data_type, n, n, mb, nb, rsrc,
                                            csrc, np));
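
  // Query the required device- and host-side workspace sizes.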
  size_t work_dev_size, work_host_size;
  void *DUMMY = (void *)1; // Matrix is not accessed during the size query,
                           // but the pointer must not be NULL.
  CUSOLVER_CHECK(cusolverMpSyevd_bufferSize(
      cusolvermp_handle, "V", fill_mode, n, DUMMY, 1, 1, cusolvermp_matrix_desc,
      NULL, NULL, 1, 1, cusolvermp_matrix_desc, data_type, &work_dev_size,
      &work_host_size));
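
  // Allocate device and pinned host workspaces of the queried sizes.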
  double *work_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&work_dev, work_dev_size));

  double *work_host = NULL;
  CUDA_CHECK(cudaMallocHost((void **)&work_host, work_host_size));
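
  // Upload the local part of the input matrix to the device.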
  const size_t matrix_local_size = ldA * nq * sizeof(double);
  double *matrix_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&matrix_dev, matrix_local_size));
  CUDA_CHECK(cudaMemcpyAsync(matrix_dev, matrix, matrix_local_size,
                             cudaMemcpyHostToDevice, stream));
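
  // Allocate device buffers for the results and the solver's info flag.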
  int *info_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));

  double *eigenvectors_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&eigenvectors_dev, matrix_local_size));

  double *eigenvalues_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
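
  // Compute eigenvalues and eigenvectors of the symmetric matrix.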
  CUSOLVER_CHECK(cusolverMpSyevd(
      cusolvermp_handle, "V", fill_mode, n, matrix_dev, 1, 1,
      cusolvermp_matrix_desc, eigenvalues_dev, eigenvectors_dev, 1, 1,
      cusolvermp_matrix_desc, data_type, work_dev, work_dev_size, work_host,
      work_host_size, info_dev));

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CAL_CHECK(cal_stream_sync(cal_comm, stream));
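
  // Fetch the solver's info flag from the device and check it.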
  int info = -1;
  CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
  assert(info == 0);
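
  // Download eigenvectors and eigenvalues back to the host.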
  CUDA_CHECK(cudaMemcpyAsync(eigenvectors, eigenvectors_dev, matrix_local_size,
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));
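
  // Release device buffers and destroy library handles.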
  CUDA_CHECK(cudaFree(matrix_dev));
  CUDA_CHECK(cudaFree(info_dev));
  CUDA_CHECK(cudaFree(eigenvectors_dev));
  CUDA_CHECK(cudaFree(eigenvalues_dev));
  CUDA_CHECK(cudaFree(work_dev));
  CUDA_CHECK(cudaFreeHost(work_host));

  CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(cusolvermp_matrix_desc));
  CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
  CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CAL_CHECK(cal_comm_destroy(cal_comm));
}
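
/*******************************************************************************
 * \brief Driver routine for the generalized symmetric-definite eigenproblem
 *        via cusolverMpSygvd.
 ******************************************************************************/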
void cp_fm_diag_cusolver_sygvd(const int fortran_comm,
                               const int a_matrix_desc[9],
                               const int b_matrix_desc[9], const int nprow,
                               const int npcol, const int myprow,
                               const int mypcol, const int n,
                               const double *aMatrix, const double *bMatrix,
                               double *eigenvectors, double *eigenvalues) {
  MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
  int rank, nranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nranks);

  // Activate the GPU that was selected via offload_set_chosen_device().
  offload_activate_chosen_device();
  const int local_device = offload_get_chosen_device();
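
  // Create a CAL communicator on top of the given MPI communicator.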
  cal_comm_t cal_comm = NULL;
  cal_comm_create_params_t params;
  params.allgather = &allgather;
  params.req_test = &req_test;
  params.req_free = &req_free;
  params.data = (void *)&comm;
  params.rank = rank;
  params.nranks = nranks;
  params.local_device = local_device;
  CAL_CHECK(cal_comm_create(params, &cal_comm));
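
  // Create the CUDA stream, the cuSOLVERMp handle, and the 2D process grid.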
  cudaStream_t stream = NULL;
  CUDA_CHECK(cudaStreamCreate(&stream));

  cusolverMpHandle_t cusolvermp_handle = NULL;
  CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));

  cusolverMpGrid_t grid = NULL;
  CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
                                            nprow, npcol,
                                            CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
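
  // Decode the ScaLAPACK array descriptors of A and B. Both matrices must
  // be distributed identically.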
  const int mb_a = a_matrix_desc[4];
  const int nb_a = a_matrix_desc[5];
  const int rsrc_a = a_matrix_desc[6];
  const int csrc_a = a_matrix_desc[7];
  const int ldA = a_matrix_desc[8];

  const int mb_b = b_matrix_desc[4];
  const int nb_b = b_matrix_desc[5];
  const int rsrc_b = b_matrix_desc[6];
  const int csrc_b = b_matrix_desc[7];
  const int ldB = b_matrix_desc[8];

  assert(mb_a == mb_b && nb_a == nb_b);
  assert(rsrc_a == rsrc_b && csrc_a == csrc_b);
  assert(ldA == ldB); // The upload below assumes equal leading dimensions.
  const int np_a = cusolverMpNUMROC(n, mb_a, myprow, rsrc_a, nprow);
  const int nq_a = cusolverMpNUMROC(n, nb_a, mypcol, csrc_a, npcol);

  const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
  const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
  const cudaDataType_t data_type = CUDA_R_64F;
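
  // Create cuSOLVERMp matrix descriptors for A, B, and the eigenvectors Z.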
  cusolverMpMatrixDescriptor_t descrA = NULL;
  cusolverMpMatrixDescriptor_t descrB = NULL;
  cusolverMpMatrixDescriptor_t descrZ = NULL;

  CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrA, grid, data_type, n, n,
                                            mb_a, nb_a, rsrc_a, csrc_a, np_a));
  CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrB, grid, data_type, n, n,
                                            mb_b, nb_b, rsrc_b, csrc_b, np_a));
  CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(&descrZ, grid, data_type, n, n,
                                            mb_a, nb_a, rsrc_a, csrc_a, np_a));
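
  // Allocate device matrices and upload the local parts of A and B.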
  double *dev_A = NULL, *dev_B = NULL;
  size_t matrix_local_size = ldA * nq_a * sizeof(double);
  CUDA_CHECK(cudaMalloc((void **)&dev_A, matrix_local_size));
  CUDA_CHECK(cudaMalloc((void **)&dev_B, matrix_local_size));

  CUDA_CHECK(cudaMemcpyAsync(dev_A, aMatrix, matrix_local_size,
                             cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(dev_B, bMatrix, matrix_local_size,
                             cudaMemcpyHostToDevice, stream));
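
  // Allocate device buffers for the eigenvectors and eigenvalues.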
  double *dev_Z = NULL, *eigenvalues_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&dev_Z, matrix_local_size));
  CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
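
  // Query the required device- and host-side workspace sizes.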
  size_t work_dev_size = 0, work_host_size = 0;
  CUSOLVER_CHECK(cusolverMpSygvd_bufferSize(
      cusolvermp_handle, itype, jobz, uplo, n, 1, 1, descrA, 1, 1, descrB, 1, 1,
      descrZ, data_type, &work_dev_size, &work_host_size));
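
  // Allocate device and pinned host workspaces of the queried sizes.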
  void *work_dev = NULL, *work_host = NULL;
  CUDA_CHECK(cudaMalloc(&work_dev, work_dev_size));
  CUDA_CHECK(cudaMallocHost(&work_host, work_host_size));

  int *info_dev = NULL;
  CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));
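
  // Solve the generalized eigenproblem A*x = lambda*B*x.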
  CUSOLVER_CHECK(cusolverMpSygvd(
      cusolvermp_handle, itype, jobz, uplo, n, dev_A, 1, 1, descrA, dev_B, 1, 1,
      descrB, eigenvalues_dev, dev_Z, 1, 1, descrZ, data_type, work_dev,
      work_dev_size, work_host, work_host_size, info_dev));

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CAL_CHECK(cal_stream_sync(cal_comm, stream));
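
  // Fetch the solver's info flag from the device and check it.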
  int info = -1;
  CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
  if (info != 0) {
    fprintf(stderr, "ERROR: cusolverMpSygvd failed with info = %d\n", info);
    abort();
  }
  CUDA_CHECK(cudaMemcpyAsync(eigenvectors, dev_Z, matrix_local_size,
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));
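
  // Release device buffers and destroy library handles.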
  CUDA_CHECK(cudaFree(dev_A));
  CUDA_CHECK(cudaFree(dev_B));
  CUDA_CHECK(cudaFree(dev_Z));
  CUDA_CHECK(cudaFree(eigenvalues_dev));
  CUDA_CHECK(cudaFree(info_dev));
  CUDA_CHECK(cudaFree(work_dev));
  CUDA_CHECK(cudaFreeHost(work_host));
  CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrA));
  CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrB));
  CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(descrZ));
  CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
  CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CAL_CHECK(cal_comm_destroy(cal_comm));
}

#endif // defined(__CUSOLVERMP)