(git:aeba166)
Loading...
Searching...
No Matches
cp_fm_cusolver.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7
8#if defined(__CUSOLVERMP)
9
10#include "../offload/offload_library.h"
11#include <assert.h>
12#include <cal.h>
13#include <cuda_runtime.h>
14#include <cusolverMp.h>
15#include <mpi.h>
16#include <stdlib.h>
17
18/*******************************************************************************
19 * \brief Run given CUDA command and upon failure abort with a nice message.
20 * \author Ole Schuett
21 ******************************************************************************/
22#define CUDA_CHECK(cmd) \
23 do { \
24 cudaError_t status = cmd; \
25 if (status != cudaSuccess) { \
26 fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), \
27 __FILE__, __LINE__); \
28 abort(); \
29 } \
30 } while (0)
31
32/*******************************************************************************
33 * \brief Decode given cal error.
34 * \author Ole Schuett
35 ******************************************************************************/
36static char *calGetErrorString(calError_t status) {
37 switch (status) {
38 case CAL_OK:
39 return "CAL_OK";
40 case CAL_ERROR:
41 return "CAL_ERROR";
42 case CAL_ERROR_INVALID_PARAMETER:
43 return "CAL_ERROR_INVALID_PARAMETER";
44 case CAL_ERROR_INTERNAL:
45 return "CAL_ERROR_INTERNAL";
46 case CAL_ERROR_CUDA:
47 return "CAL_ERROR_CUDA";
48 case CAL_ERROR_UCC:
49 return "CAL_ERROR_UCC";
50 case CAL_ERROR_NOT_SUPPORTED:
51 return "CAL_ERROR_NOT_SUPPORTED";
52 case CAL_ERROR_INPROGRESS:
53 return "CAL_ERROR_INPROGRESS";
54 default:
55 return "CAL UNKNOWN ERROR";
56 }
57}
58
59/*******************************************************************************
60 * \brief Run given cal command and upon failure abort with a nice message.
61 * \author Ole Schuett
62 ******************************************************************************/
63#define CAL_CHECK(cmd) \
64 do { \
65 calError_t status = cmd; \
66 if (status != CAL_OK) { \
67 fprintf(stderr, "ERROR: %s %s %d\n", calGetErrorString(status), \
68 __FILE__, __LINE__); \
69 abort(); \
70 } \
71 } while (0)
72
73/*******************************************************************************
74 * \brief Decode given cusolver error.
75 * \author Ole Schuett
76 ******************************************************************************/
77static char *cusolverGetErrorString(cusolverStatus_t status) {
78 switch (status) {
79 case CUSOLVER_STATUS_SUCCESS:
80 return "CUSOLVER_STATUS_SUCCESS";
81 case CUSOLVER_STATUS_NOT_INITIALIZED:
82 return "CUSOLVER_STATUS_NOT_INITIALIZED";
83 case CUSOLVER_STATUS_ALLOC_FAILED:
84 return "CUSOLVER_STATUS_ALLOC_FAILED";
85 case CUSOLVER_STATUS_INVALID_VALUE:
86 return "CUSOLVER_STATUS_INVALID_VALUE";
87 case CUSOLVER_STATUS_ARCH_MISMATCH:
88 return "CUSOLVER_STATUS_ARCH_MISMATCH";
89 case CUSOLVER_STATUS_MAPPING_ERROR:
90 return "CUSOLVER_STATUS_MAPPING_ERROR";
91 case CUSOLVER_STATUS_EXECUTION_FAILED:
92 return "CUSOLVER_STATUS_EXECUTION_FAILED";
93 case CUSOLVER_STATUS_INTERNAL_ERROR:
94 return "CUSOLVER_STATUS_INTERNAL_ERROR";
95 case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
96 return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
97 case CUSOLVER_STATUS_NOT_SUPPORTED:
98 return "CUSOLVER_STATUS_NOT_SUPPORTED";
99 case CUSOLVER_STATUS_ZERO_PIVOT:
100 return "CUSOLVER_STATUS_ZERO_PIVOT";
101 case CUSOLVER_STATUS_INVALID_LICENSE:
102 return "CUSOLVER_STATUS_INVALID_LICENSE";
103 case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
104 return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
105 case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
106 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
107 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
108 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
109 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
110 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
111 case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
112 return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
113 case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
114 return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
115 case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
116 return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
117 case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
118 return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
119 case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
120 return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
121 case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
122 return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
123 case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
124 return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
125 case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
126 return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
127 case CUSOLVER_STATUS_INVALID_WORKSPACE:
128 return "CUSOLVER_STATUS_INVALID_WORKSPACE";
129 default:
130 return "CUSOLVER UNKNOWN ERROR";
131 }
132}
133
134/*******************************************************************************
135 * \brief Run given cusolver command and upon failure abort with a nice message.
136 * \author Ole Schuett
137 ******************************************************************************/
138#define CUSOLVER_CHECK(cmd) \
139 do { \
140 cusolverStatus_t status = cmd; \
141 if (status != CUSOLVER_STATUS_SUCCESS) { \
142 fprintf(stderr, "ERROR: %s %s %d\n", cusolverGetErrorString(status), \
143 __FILE__, __LINE__); \
144 abort(); \
145 } \
146 } while (0)
147
148/*******************************************************************************
149 * \brief Callback for cal library to initiate an allgather operation.
150 * \author Ole Schuett
151 ******************************************************************************/
152static calError_t allgather(void *src_buf, void *recv_buf, size_t size,
153 void *data, void **req) {
154 const MPI_Comm comm = *(MPI_Comm *)data;
155 MPI_Request *request = malloc(sizeof(MPI_Request));
156 *req = request;
157 const int status = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size,
158 MPI_BYTE, comm, request);
159 return (status == MPI_SUCCESS) ? CAL_OK : CAL_ERROR;
160}
161
162/*******************************************************************************
163 * \brief Callback for cal library to test if a request has completed.
164 * \author Ole Schuett
165 ******************************************************************************/
166static calError_t req_test(void *req) {
167 MPI_Request *request = (MPI_Request *)(req);
168 int completed;
169 const int status = MPI_Test(request, &completed, MPI_STATUS_IGNORE);
170 if (status != MPI_SUCCESS) {
171 return CAL_ERROR;
172 }
173 return completed ? CAL_OK : CAL_ERROR_INPROGRESS;
174}
175
176/*******************************************************************************
177 * \brief Callback for cal library to free a request.
178 * \author Ole Schuett
179 ******************************************************************************/
180static calError_t req_free(void *req) {
181 free(req);
182 return CAL_OK;
183}
184
185/*******************************************************************************
186 * \brief Driver routine to diagonalize a matrix with the cuSOLVERMp library.
187 * \author Ole Schuett
188 ******************************************************************************/
189void cp_fm_diag_cusolver(const int fortran_comm, const int matrix_desc[9],
190 const int nprow, const int npcol, const int myprow,
191 const int mypcol, const int n, const double *matrix,
192 double *eigenvectors, double *eigenvalues) {
193
195 const int local_device = offload_get_chosen_device();
196
197 MPI_Comm comm = MPI_Comm_f2c(fortran_comm);
198 int rank, nranks;
199 MPI_Comm_rank(comm, &rank);
200 MPI_Comm_size(comm, &nranks);
201
202 // Create CAL communicator.
203 cal_comm_t cal_comm = NULL;
204 cal_comm_create_params_t params;
205 params.allgather = &allgather;
206 params.req_test = &req_test;
207 params.req_free = &req_free;
208 params.data = &comm;
209 params.rank = rank;
210 params.nranks = nranks;
211 params.local_device = local_device;
212 CAL_CHECK(cal_comm_create(params, &cal_comm));
213
214 // Create various handles.
215 cudaStream_t stream = NULL;
216 CUDA_CHECK(cudaStreamCreate(&stream));
217
218 cusolverMpHandle_t cusolvermp_handle = NULL;
219 CUSOLVER_CHECK(cusolverMpCreate(&cusolvermp_handle, local_device, stream));
220
221 cusolverMpGrid_t grid = NULL;
222 CUSOLVER_CHECK(cusolverMpCreateDeviceGrid(cusolvermp_handle, &grid, cal_comm,
223 nprow, npcol,
224 CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
225 const int mb = matrix_desc[4];
226 const int nb = matrix_desc[5];
227 const int rsrc = matrix_desc[6];
228 const int csrc = matrix_desc[7];
229 const int ldA = matrix_desc[8];
230 assert(rsrc == csrc);
231 assert(ldA >= 1);
232
233 const int np = cusolverMpNUMROC(n, mb, myprow, rsrc, nprow);
234 const int nq = cusolverMpNUMROC(n, nb, mypcol, csrc, npcol);
235 assert(np == ldA);
236
237 const cublasFillMode_t fill_mode = CUBLAS_FILL_MODE_UPPER;
238 const cudaDataType_t data_type = CUDA_R_64F; // double
239
240 cusolverMpMatrixDescriptor_t cusolvermp_matrix_desc = NULL;
241 CUSOLVER_CHECK(cusolverMpCreateMatrixDesc(
242 &cusolvermp_matrix_desc, grid, data_type, n, n, mb, nb, rsrc, csrc, np));
243
244 // Allocate workspaces.
245 size_t work_dev_size, work_host_size;
246 void *DUMMY = (void *)1; // Workaround to avoid crash when passing NULL.
247 CUSOLVER_CHECK(cusolverMpSyevd_bufferSize(
248 cusolvermp_handle, "V", fill_mode, n, DUMMY, 1, 1, cusolvermp_matrix_desc,
249 NULL, NULL, 1, 1, cusolvermp_matrix_desc, data_type, &work_dev_size,
250 &work_host_size));
251
252 double *work_dev = NULL;
253 CUDA_CHECK(cudaMalloc((void **)&work_dev, work_dev_size));
254
255 double *work_host = NULL;
256 CUDA_CHECK(cudaMallocHost((void **)&work_host, work_host_size));
257
258 // Upload input matrix.
259 const size_t matrix_local_size = ldA * nq * sizeof(double);
260 double *matrix_dev = NULL;
261 CUDA_CHECK(cudaMalloc((void **)&matrix_dev, matrix_local_size));
262 CUDA_CHECK(cudaMemcpyAsync(matrix_dev, matrix, matrix_local_size,
263 cudaMemcpyHostToDevice, stream));
264
265 // Allocate result buffers.
266 int *info_dev = NULL;
267 CUDA_CHECK(cudaMalloc((void **)&info_dev, sizeof(int)));
268
269 double *eigenvectors_dev = NULL;
270 CUDA_CHECK(cudaMalloc((void **)&eigenvectors_dev, matrix_local_size));
271
272 double *eigenvalues_dev = NULL;
273 CUDA_CHECK(cudaMalloc((void **)&eigenvalues_dev, n * sizeof(double)));
274
275 // Call solver.
276 CUSOLVER_CHECK(
277 cusolverMpSyevd(cusolvermp_handle, "V", fill_mode, n, matrix_dev, 1, 1,
278 cusolvermp_matrix_desc, eigenvalues_dev, eigenvectors_dev,
279 1, 1, cusolvermp_matrix_desc, data_type, work_dev,
280 work_dev_size, work_host, work_host_size, info_dev));
281
282 // Wait for solver to finish.
283 CUDA_CHECK(cudaStreamSynchronize(stream));
284 CAL_CHECK(cal_stream_sync(cal_comm, stream));
285
286 // Check info.
287 int info = -1;
288 CUDA_CHECK(cudaMemcpy(&info, info_dev, sizeof(int), cudaMemcpyDeviceToHost));
289 assert(info == 0);
290
291 // Download results.
292 CUDA_CHECK(cudaMemcpyAsync(eigenvectors, eigenvectors_dev, matrix_local_size,
293 cudaMemcpyDeviceToHost, stream));
294 CUDA_CHECK(cudaMemcpyAsync(eigenvalues, eigenvalues_dev, n * sizeof(double),
295 cudaMemcpyDeviceToHost, stream));
296
297 // Wait for download to finish.
298 CUDA_CHECK(cudaStreamSynchronize(stream));
299
300 // Free buffers.
301 CUDA_CHECK(cudaFree(matrix_dev));
302 CUDA_CHECK(cudaFree(info_dev));
303 CUDA_CHECK(cudaFree(eigenvectors_dev));
304 CUDA_CHECK(cudaFree(eigenvalues_dev));
305 CUDA_CHECK(cudaFree(work_dev));
306 CUDA_CHECK(cudaFreeHost(work_host));
307
308 // Destroy handles.
309 CUSOLVER_CHECK(cusolverMpDestroyMatrixDesc(cusolvermp_matrix_desc));
310 CUSOLVER_CHECK(cusolverMpDestroyGrid(grid));
311 CUSOLVER_CHECK(cusolverMpDestroy(cusolvermp_handle));
312 CUDA_CHECK(cudaStreamDestroy(stream));
313 CAL_CHECK(cal_comm_destroy(cal_comm));
314
315 // Sync MPI ranks to include load imbalance in total timings.
316 MPI_Barrier(comm);
317}
318
319#endif
320
321// EOF
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
subroutine, public cp_fm_diag_cusolver(matrix, eigenvectors, eigenvalues)
Driver routine to diagonalize a FM matrix with the cuSOLVERMp library.
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
integer function, public offload_get_chosen_device()
Returns the chosen device.