dbm_multiply_gpu.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "../offload/offload_library.h"
#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"

#include <assert.h>
#include <stdio.h>

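/*
 * Illustrative sketch only (not part of the documented API): a typical call
 * sequence for this backend. The names max_batch_size, nshards, shards_c_host,
 * pack_a, pack_b, ntasks, batch, mnk_range, alpha and kshard stand for values
 * supplied by the caller.
 *
 *   dbm_multiply_gpu_context_t ctx;
 *   dbm_multiply_gpu_start(max_batch_size, nshards, shards_c_host, &ctx);
 *   dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
 *   dbm_multiply_gpu_process_batch(ntasks, batch, mnk_range, alpha, kshard, &ctx);
 *   dbm_multiply_gpu_download_results(&ctx);
 *   dbm_multiply_gpu_stop(&ctx);
 */
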
/*******************************************************************************
 * \brief Internal routine for initializing the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);
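  // Shard i uses the contiguous slice starting at
  // batches_dev[i * max_batch_size], see dbm_multiply_gpu_process_batch().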

  // Allocate and upload shards of result matrix C.
  ctx->shards_c_dev =
      (dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    shard_c_dev->data_allocated = shard_c_host->data_allocated;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 * \author Ole Schuett
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
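  // Reallocate the device buffer only if the new pack does not fit. Its old
  // content need not be preserved because the full pack is uploaded below.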
  if (pack_dev->data_size < pack_host->data_size) {
    dbm_mempool_free(pack_dev->data);
    pack_dev->data = (double *)dbm_mempool_device_malloc(size);
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting old packs.
  offloadEvent_t event;
  offloadEventCreate(&event);
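  // The same event is reused for all shards: each record/wait pair is issued
  // before the event is recorded again for the next shard's stream.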
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all c-streams wait until new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}

/*******************************************************************************
 * \brief Internal routine for executing the tasks in given batch on the GPU.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const int mnk_range[3][2],
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {

  if (ntasks == 0) {
    return; // Nothing to do.
  }

  // Select GPU device.
  offload_activate_chosen_device();

  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

  // Upload new batch.
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t batch_uploaded;
  offloadEventCreate(&batch_uploaded);
  offloadEventRecord(batch_uploaded, shard_c_dev->stream);
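  // The event is awaited at the end of this routine so that the caller may
  // safely refill the host-side batch buffer after we return.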

  // Reallocate shard_c_dev->data if necessary.
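  // Growing to ALLOCATION_FACTOR * data_promised over-allocates slightly to
  // amortize the cost of repeated reallocations.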
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    double *old_data_dev = shard_c_dev->data;
    shard_c_dev->data_allocated =
        ALLOCATION_FACTOR * shard_c_host->data_promised;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
    // Wait for copy to complete before freeing old buffer.
    offloadStreamSynchronize(shard_c_dev->stream);
    dbm_mempool_free(old_data_dev);
  }

  // Zero new blocks if necessary.
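  // This mirrors the host side, where dbm_shard_allocate_promised_blocks()
  // zeroes the data of newly promised blocks.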
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  // Launch kernel.
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, mnk_range, alpha, ntasks,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait for batch to be uploaded before refilling it.
  offloadEventSynchronize(batch_uploaded);
  offloadEventDestroy(batch_uploaded);
}

/*******************************************************************************
 * \brief Internal routine for downloading results from the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow host buffer if necessary.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download results from device.
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
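    // After allocating the promised blocks, the host shard must hold exactly
    // as much data as its device counterpart.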
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Internal routine for shutting down the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for completion, then free gpu resources.
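  // Each iteration handles one independent shard, so the loop may run in
  // parallel.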
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  dbm_mempool_free(ctx->pack_a_dev.data);
  dbm_mempool_free(ctx->pack_b_dev.data);
  dbm_mempool_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF