(git:1f285aa)
grid_gpu_task_list.cu
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 #include "../../offload/offload_runtime.h"
9 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
10 
11 #include <assert.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include "../../offload/offload_library.h"
18 #include "../common/grid_common.h"
19 #include "../common/grid_constants.h"
20 #include "../common/grid_library.h"
21 #include "grid_gpu_collocate.h"
22 #include "grid_gpu_integrate.h"
23 #include "grid_gpu_task_list.h"
24 
25 #if defined(_OMP_H)
26 #error "OpenMP should not be used in .cu files to accommodate HIP."
27 #endif
28 
29 /*******************************************************************************
30  * \brief Create all tasks and precompute as much as possible.
31  * \author Ole Schuett
32  ******************************************************************************/
33 static void
34 create_tasks(const bool orthorhombic, const int ntasks,
35  const int block_offsets[], const double atom_positions[][3],
36  const int atom_kinds[], const grid_basis_set *basis_sets[],
37  const int level_list[], const int iatom_list[],
38  const int jatom_list[], const int iset_list[],
39  const int jset_list[], const int ipgf_list[],
40  const int jpgf_list[], const int border_mask_list[],
41  const int block_num_list[], const double radius_list[],
42  const double rab_list[][3], const int npts_local[][3],
43  const int shift_local[][3], const int border_width[][3],
44  const double dh[][3][3], const double dh_inv[][3][3],
45  const double *sphis_dev[], grid_gpu_task tasks[]) {
46 
47  for (int itask = 0; itask < ntasks; itask++) {
48  grid_gpu_task *task = &tasks[itask];
49 
50  task->iatom = iatom_list[itask] - 1;
51  task->jatom = jatom_list[itask] - 1;
52  const int iset = iset_list[itask] - 1;
53  const int jset = jset_list[itask] - 1;
54  const int ipgf = ipgf_list[itask] - 1;
55  const int jpgf = jpgf_list[itask] - 1;
56  const int ikind = atom_kinds[task->iatom] - 1;
57  const int jkind = atom_kinds[task->jatom] - 1;
58  const grid_basis_set *ibasis = basis_sets[ikind];
59  const grid_basis_set *jbasis = basis_sets[jkind];
60 
61  const int level = level_list[itask] - 1;
62  const int border_mask = border_mask_list[itask];
63 
64  task->use_orthorhombic_kernel = (orthorhombic && border_mask == 0);
65 
66  task->zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
67  task->zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
68  task->zetp = task->zeta + task->zetb;
69  const double f = task->zetb / task->zetp;
70 
71  task->rab2 = 0.0;
72  for (int i = 0; i < 3; i++) {
73  task->rab[i] = rab_list[itask][i];
74  task->rab2 += task->rab[i] * task->rab[i];
75  task->ra[i] = atom_positions[task->iatom][i];
76  task->rb[i] = task->ra[i] + task->rab[i];
77  task->rp[i] = task->ra[i] + task->rab[i] * f;
78  }
79 
80  // center in grid coords, gp = MATMUL(dh_inv, rp)
81  for (int i = 0; i < 3; i++) {
82  task->gp[i] = dh_inv[level][0][i] * task->rp[0] +
83  dh_inv[level][1][i] * task->rp[1] +
84  dh_inv[level][2][i] * task->rp[2];
85  }
86 
87  // resolution of the grid
88  task->dh_max = 0.0;
89  for (int i = 0; i < 3; i++) {
90  for (int j = 0; j < 3; j++) {
91  task->dh_max = fmax(task->dh_max, fabs(dh[level][i][j]));
92  }
93  }
94 
95  task->radius = radius_list[itask];
96  task->radius2 = task->radius * task->radius;
97  task->prefactor = exp(-task->zeta * f * task->rab2);
98  task->off_diag_twice = (task->iatom == task->jatom) ? 1.0 : 2.0;
99 
100  // angular momentum range of basis set
101  task->la_max_basis = ibasis->lmax[iset];
102  task->lb_max_basis = jbasis->lmax[jset];
103  task->la_min_basis = ibasis->lmin[iset];
104  task->lb_min_basis = jbasis->lmin[jset];
105 
106  // size of entire spherical basis
107  task->nsgfa = ibasis->nsgf;
108  task->nsgfb = jbasis->nsgf;
109 
110  // size of spherical set
111  task->nsgf_seta = ibasis->nsgf_set[iset];
112  task->nsgf_setb = jbasis->nsgf_set[jset];
113 
114  // strides of the sphi transformation matrices
115  task->maxcoa = ibasis->maxco;
116  task->maxcob = jbasis->maxco;
117 
118  // start of spherical set within the basis
119  const int sgfa = ibasis->first_sgf[iset] - 1;
120  const int sgfb = jbasis->first_sgf[jset] - 1;
121 
122  // start of exponent within the cartesian set
123  const int o1 = ipgf * ncoset(task->la_max_basis);
124  const int o2 = jpgf * ncoset(task->lb_max_basis);
125 
126  // transformations from contracted spherical to primitive cartesian basis
127  task->sphia = &sphis_dev[ikind][sgfa * task->maxcoa + o1];
128  task->sphib = &sphis_dev[jkind][sgfb * task->maxcob + o2];
129 
130  // Locate current matrix block within the buffer.
131  const int block_num = block_num_list[itask] - 1;
132  task->block_transposed = (task->iatom > task->jatom);
133  const int block_offset = block_offsets[block_num];
134  const int subblock_offset = (task->block_transposed)
135  ? sgfa * task->nsgfb + sgfb
136  : sgfb * task->nsgfa + sgfa;
137  task->ab_block_offset = block_offset + subblock_offset;
138 
139  // Stuff for the ortho kernel ----------------------------------------------
140  if (orthorhombic) {
141  // Discretize the radius.
142  const double drmin =
143  fmin(dh[level][0][0], fmin(dh[level][1][1], dh[level][2][2]));
144  const int imr = imax(1, (int)ceil(task->radius / drmin));
145  task->disr_radius = drmin * imr;
146 
147  // Position of the gaussian product:
148  // this is the actual definition of the position on the grid
149  // i.e. a point rp(:) is assigned the grid coordinates
150  // MODULO(rp(:)/dr(:),npts_global(:))+1,
151  // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
152  // and (1,1,1) on the grid here in C.
153  //
154  // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
155  for (int i = 0; i < 3; i++) {
156  const int cubecenter = floor(task->gp[i]);
157  task->cube_center_shifted[i] = cubecenter - shift_local[level][i];
158  task->cube_offset[i] = cubecenter * dh[level][i][i] - task->rp[i];
159  }
160  }
161 
162  // Stuff for the general kernel --------------------------------------------
163  //
164  // Get the min/max indices that contain at least the cube enclosing a
165  // sphere of the given radius around rp. If the cell is very non-orthogonal,
166  // this includes many useless points; the estimate could be improved
167  // (i.e. a sphere rather than a box should be used).
168  for (int i = 0; i < 3; i++) {
169  task->index_min[i] = INT_MAX;
170  task->index_max[i] = INT_MIN;
171  }
172  for (int i = -1; i <= 1; i++) {
173  for (int j = -1; j <= 1; j++) {
174  for (int k = -1; k <= 1; k++) {
175  const double x = task->rp[0] + i * task->radius;
176  const double y = task->rp[1] + j * task->radius;
177  const double z = task->rp[2] + k * task->radius;
178  for (int idir = 0; idir < 3; idir++) {
179  const double resc = dh_inv[level][0][idir] * x +
180  dh_inv[level][1][idir] * y +
181  dh_inv[level][2][idir] * z;
182  task->index_min[idir] =
183  imin(task->index_min[idir], (int)floor(resc));
184  task->index_max[idir] =
185  imax(task->index_max[idir], (int)ceil(resc));
186  }
187  }
188  }
189  }
190 
191  // Defaults for border_mask == 0.
192  task->bounds_i[0] = 0;
193  task->bounds_i[1] = npts_local[level][0] - 1;
194  task->bounds_j[0] = 0;
195  task->bounds_j[1] = npts_local[level][1] - 1;
196  task->bounds_k[0] = 0;
197  task->bounds_k[1] = npts_local[level][2] - 1;
198 
199  // See also rs_find_node() in task_list_methods.F.
200  // If the bit is set then we need to exclude the border in that direction.
201  if (border_mask & (1 << 0))
202  task->bounds_i[0] += border_width[level][0];
203  if (border_mask & (1 << 1))
204  task->bounds_i[1] -= border_width[level][0];
205  if (border_mask & (1 << 2))
206  task->bounds_j[0] += border_width[level][1];
207  if (border_mask & (1 << 3))
208  task->bounds_j[1] -= border_width[level][1];
209  if (border_mask & (1 << 4))
210  task->bounds_k[0] += border_width[level][2];
211  if (border_mask & (1 << 5))
212  task->bounds_k[1] -= border_width[level][2];
213  }
214 }
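The per-task geometry above follows the Gaussian product theorem: with f = zetb/(zeta+zetb), the product center is rp = ra + f*rab and the prefactor is exp(-zeta*f*|rab|^2). A minimal, self-contained sketch of that arithmetic (a hypothetical helper, not part of this file; needs <math.h> when built standalone):

/* Sketch: center and prefactor of the product of two primitive Gaussians with
 * exponents zeta (at ra) and zetb (at rb = ra + rab), mirroring the per-task
 * precomputation in create_tasks(). */
static void gaussian_product_sketch(const double zeta, const double zetb,
                                    const double ra[3], const double rab[3],
                                    double rp[3], double *prefactor) {
  const double zetp = zeta + zetb; // combined exponent
  const double f = zetb / zetp;    // weight of the second center
  double rab2 = 0.0;               // squared distance |rab|^2
  for (int i = 0; i < 3; i++) {
    rab2 += rab[i] * rab[i];
    rp[i] = ra[i] + f * rab[i];    // product center between ra and rb
  }
  *prefactor = exp(-zeta * f * rab2); // = exp(-zeta*zetb/(zeta+zetb)*|rab|^2)
}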
215 
216 /*******************************************************************************
217  * \brief Allocates a task list for the GPU backend.
218  * See grid_task_list.h for details.
219  * \author Ole Schuett
220  ******************************************************************************/
221 void grid_gpu_create_task_list(
222  const bool orthorhombic, const int ntasks, const int nlevels,
223  const int natoms, const int nkinds, const int nblocks,
224  const int block_offsets[], const double atom_positions[][3],
225  const int atom_kinds[], const grid_basis_set *basis_sets[],
226  const int level_list[], const int iatom_list[], const int jatom_list[],
227  const int iset_list[], const int jset_list[], const int ipgf_list[],
228  const int jpgf_list[], const int border_mask_list[],
229  const int block_num_list[], const double radius_list[],
230  const double rab_list[][3], const int npts_global[][3],
231  const int npts_local[][3], const int shift_local[][3],
232  const int border_width[][3], const double dh[][3][3],
233  const double dh_inv[][3][3], grid_gpu_task_list **task_list_out) {
234 
235  // Select GPU device.
236  offload_activate_chosen_device();
237 
238  if (*task_list_out != NULL) {
239  // This is actually an opportunity to reuse some buffers.
240  grid_gpu_free_task_list(*task_list_out);
241  }
242 
243  grid_gpu_task_list *task_list =
244  (grid_gpu_task_list *)malloc(sizeof(grid_gpu_task_list));
245 
246  task_list->orthorhombic = orthorhombic;
247  task_list->ntasks = ntasks;
248  task_list->nlevels = nlevels;
249  task_list->natoms = natoms;
250  task_list->nkinds = nkinds;
251  task_list->nblocks = nblocks;
252 
253  // Store grid layouts.
254  size_t size = nlevels * sizeof(grid_gpu_layout);
255  task_list->layouts = (grid_gpu_layout *)malloc(size);
256  for (int level = 0; level < nlevels; level++) {
257  for (int i = 0; i < 3; i++) {
258  task_list->layouts[level].npts_global[i] = npts_global[level][i];
259  task_list->layouts[level].npts_local[i] = npts_local[level][i];
260  task_list->layouts[level].shift_local[i] = shift_local[level][i];
261  task_list->layouts[level].border_width[i] = border_width[level][i];
262  for (int j = 0; j < 3; j++) {
263  task_list->layouts[level].dh[i][j] = dh[level][i][j];
264  task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
265  }
266  }
267  }
268 
269  // Upload basis set sphi matrices to device.
270  task_list->sphis_dev = (double **)malloc(nkinds * sizeof(double *));
271  for (int i = 0; i < nkinds; i++) {
272  size = basis_sets[i]->nsgf * basis_sets[i]->maxco * sizeof(double);
273  offloadMalloc((void **)&task_list->sphis_dev[i], size);
274  offloadMemcpyHtoD(task_list->sphis_dev[i], basis_sets[i]->sphi, size);
275  }
276 
277  size = ntasks * sizeof(grid_gpu_task);
278  grid_gpu_task *tasks_host = (grid_gpu_task *)malloc(size);
279  create_tasks(orthorhombic, ntasks, block_offsets, atom_positions, atom_kinds,
280  basis_sets, level_list, iatom_list, jatom_list, iset_list,
281  jset_list, ipgf_list, jpgf_list, border_mask_list,
282  block_num_list, radius_list, rab_list, npts_local, shift_local,
283  border_width, dh, dh_inv, (const double **)task_list->sphis_dev,
284  tasks_host);
285  offloadMalloc((void **)&task_list->tasks_dev, size);
286  offloadMemcpyHtoD(task_list->tasks_dev, tasks_host, size);
287 
288  free(tasks_host);
289 
290  // Count tasks per level.
291  size = nlevels * sizeof(int);
292  task_list->tasks_per_level = (int *)malloc(size);
293  memset(task_list->tasks_per_level, 0, size);
294  for (int i = 0; i < ntasks; i++) {
295  task_list->tasks_per_level[level_list[i] - 1]++;
296  assert(i == 0 || level_list[i] >= level_list[i - 1]); // expect ordered list
297  }
298 
299  // Find largest angular momentum.
300  task_list->lmax = 0;
301  for (int ikind = 0; ikind < nkinds; ikind++) {
302  for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
303  task_list->lmax = imax(task_list->lmax, basis_sets[ikind]->lmax[iset]);
304  }
305  }
306 
307  // collect stats
308  memset(task_list->stats, 0, 2 * 20 * sizeof(int));
309  for (int itask = 0; itask < ntasks; itask++) {
310  const int iatom = iatom_list[itask] - 1;
311  const int jatom = jatom_list[itask] - 1;
312  const int ikind = atom_kinds[iatom] - 1;
313  const int jkind = atom_kinds[jatom] - 1;
314  const int iset = iset_list[itask] - 1;
315  const int jset = jset_list[itask] - 1;
316  const int la_max = basis_sets[ikind]->lmax[iset];
317  const int lb_max = basis_sets[jkind]->lmax[jset];
318  const int lp = imin(la_max + lb_max, 19);
319  const bool has_border_mask = (border_mask_list[itask] != 0);
320  task_list->stats[has_border_mask][lp]++;
321  }
322 
323  // allocate main cuda stream
324  offloadStreamCreate(&task_list->main_stream);
325 
326  // allocate one cuda stream per grid level
327  size = nlevels * sizeof(offloadStream_t);
328  task_list->level_streams = (offloadStream_t *)malloc(size);
329  for (int i = 0; i < nlevels; i++) {
330  offloadStreamCreate(&task_list->level_streams[i]);
331  }
332 
333  // return newly created task list
334  *task_list_out = task_list;
335 }
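A minimal usage sketch of this routine together with grid_gpu_free_task_list() below; the input arrays are assumed to be prepared by the caller and their names are placeholders:

/* Sketch: typical lifecycle of a GPU task list. The output pointer must be
 * NULL on the first call; on later calls the previous list is freed and
 * replaced, as handled at the top of grid_gpu_create_task_list(). */
grid_gpu_task_list *task_list = NULL;
grid_gpu_create_task_list(orthorhombic, ntasks, nlevels, natoms, nkinds,
                          nblocks, block_offsets, atom_positions, atom_kinds,
                          basis_sets, level_list, iatom_list, jatom_list,
                          iset_list, jset_list, ipgf_list, jpgf_list,
                          border_mask_list, block_num_list, radius_list,
                          rab_list, npts_global, npts_local, shift_local,
                          border_width, dh, dh_inv, &task_list);
/* ... collocate and/or integrate ... */
grid_gpu_free_task_list(task_list);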
336 
337 /*******************************************************************************
338  * \brief Deallocates the given task list; the basis_sets have to be freed separately.
339  * \author Ole Schuett
340  ******************************************************************************/
341 void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {
342 
343  offloadFree(task_list->tasks_dev);
344 
345  offloadStreamDestroy(task_list->main_stream);
346 
347  for (int i = 0; i < task_list->nlevels; i++) {
348  offloadStreamDestroy(task_list->level_streams[i]);
349  }
350  free(task_list->level_streams);
351 
352  for (int i = 0; i < task_list->nkinds; i++) {
353  offloadFree(task_list->sphis_dev[i]);
354  }
355  free(task_list->sphis_dev);
356 
357  free(task_list->tasks_per_level);
358  free(task_list->layouts);
359  free(task_list);
360 }
361 
362 /*******************************************************************************
363  * \brief Collocate all tasks of the given list onto the given grids.
364  * See grid_task_list.h for details.
365  * \author Ole Schuett
366  ******************************************************************************/
367 void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
368  const enum grid_func func, const int nlevels,
369  const offload_buffer *pab_blocks,
370  offload_buffer *grids[]) {
371 
372  // Select GPU device.
373  offload_activate_chosen_device();
374 
375  // Upload blocks buffer using the main stream
376  offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
377  pab_blocks->size, task_list->main_stream);
378 
379  // record an event so the level streams can wait for the blocks to be uploaded
380  offloadEvent_t input_ready_event;
381  offloadEventCreate(&input_ready_event);
382  offloadEventRecord(input_ready_event, task_list->main_stream);
383 
384  int lp_diff;
385  int first_task = 0;
386  assert(task_list->nlevels == nlevels);
387  for (int level = 0; level < task_list->nlevels; level++) {
388  const int last_task = first_task + task_list->tasks_per_level[level] - 1;
389  const offloadStream_t level_stream = task_list->level_streams[level];
390  const grid_gpu_layout *layout = &task_list->layouts[level];
391  offload_buffer *grid = grids[level];
392 
393  // zero grid device buffer
394  offloadMemsetAsync(grid->device_buffer, 0, grid->size, level_stream);
395 
396  // launch kernel, but only after blocks have arrived
397  offloadStreamWaitEvent(level_stream, input_ready_event, 0);
398  grid_gpu_collocate_one_grid_level(
399  task_list, first_task, last_task, func, layout, level_stream,
400  pab_blocks->device_buffer, grid->device_buffer, &lp_diff);
401 
402  first_task = last_task + 1;
403  }
404 
405  // update counters while we wait for kernels to finish
406  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
407  for (int lp = 0; lp < 20; lp++) {
408  const int count = task_list->stats[has_border_mask][lp];
409  if (task_list->orthorhombic && !has_border_mask) {
410  grid_library_counter_add(lp, GRID_BACKEND_GPU,
411  GRID_COLLOCATE_ORTHO, count);
412  } else {
413  grid_library_counter_add(lp, GRID_BACKEND_GPU,
414  GRID_COLLOCATE_GENERAL, count);
415  }
416  }
417  }
418 
419  // download result from device to host.
420  for (int level = 0; level < task_list->nlevels; level++) {
421  offload_buffer *grid = grids[level];
422  offloadMemcpyAsyncDtoH(grid->host_buffer, grid->device_buffer, grid->size,
423  task_list->level_streams[level]);
424  }
425 
426  // clean up
427  offloadEventDestroy(input_ready_event);
428 
429  // wait for all the streams to finish
430  offloadDeviceSynchronize();
431 }
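The upload/compute overlap above follows a common fan-out pattern: one producer stream uploads the shared input, records an event, and each per-level consumer stream waits on that event before launching its kernels. A reduced sketch of the same pattern in plain CUDA runtime calls (the offload* wrappers are assumed to map to the corresponding CUDA or HIP calls; all buffers, sizes and streams below are placeholders):

/* Sketch: fan-out synchronization between one upload stream and per-level
 * compute streams, mirroring grid_gpu_collocate_task_list(). */
cudaEvent_t input_ready;
cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
cudaMemcpyAsync(dev_blocks, host_blocks, blocks_size,
                cudaMemcpyHostToDevice, main_stream); // producer: upload input
cudaEventRecord(input_ready, main_stream);            // mark upload complete
for (int level = 0; level < nlevels; level++) {
  cudaMemsetAsync(dev_grid[level], 0, grid_size[level], level_stream[level]);
  cudaStreamWaitEvent(level_stream[level], input_ready, 0); // wait for upload
  /* launch the collocation kernel for this level on level_stream[level] */
}
cudaEventDestroy(input_ready);
cudaDeviceSynchronize(); // join all streams before touching results on host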
432 
433 /*******************************************************************************
434  * \brief Integrate all tasks of the given list from the given grids.
435  * See grid_task_list.h for details.
436  * \author Ole Schuett
437  ******************************************************************************/
438 void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
439  const bool compute_tau, const int natoms,
440  const int nlevels,
441  const offload_buffer *pab_blocks,
442  const offload_buffer *grids[],
443  offload_buffer *hab_blocks,
444  double forces[][3], double virial[3][3]) {
445 
446  // Select GPU device.
447  offload_activate_chosen_device();
448 
449  // Prepare shared buffers using the main stream
450  double *forces_dev = NULL;
451  double *virial_dev = NULL;
452  double *pab_blocks_dev = NULL;
453  const size_t forces_size = 3 * natoms * sizeof(double);
454  const size_t virial_size = 9 * sizeof(double);
455  if (forces != NULL || virial != NULL) {
456  offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
457  pab_blocks->size, task_list->main_stream);
458  pab_blocks_dev = pab_blocks->device_buffer;
459  }
460  if (forces != NULL) {
461  offloadMalloc((void **)&forces_dev, forces_size);
462  offloadMemsetAsync(forces_dev, 0, forces_size, task_list->main_stream);
463  }
464  if (virial != NULL) {
465  offloadMalloc((void **)&virial_dev, virial_size);
466  offloadMemsetAsync(virial_dev, 0, virial_size, task_list->main_stream);
467  }
468 
469  // zero device hab blocks buffers
470  offloadMemsetAsync(hab_blocks->device_buffer, 0, hab_blocks->size,
471  task_list->main_stream);
472 
473  // record event so other streams can wait for hab, pab, virial etc to be ready
474  offloadEvent_t input_ready_event;
475  offloadEventCreate(&input_ready_event);
476  offloadEventRecord(input_ready_event, task_list->main_stream);
477 
478  int lp_diff;
479  int first_task = 0;
480  assert(task_list->nlevels == nlevels);
481  for (int level = 0; level < task_list->nlevels; level++) {
482  const int last_task = first_task + task_list->tasks_per_level[level] - 1;
483  const offloadStream_t level_stream = task_list->level_streams[level];
484  const grid_gpu_layout *layout = &task_list->layouts[level];
485  const offload_buffer *grid = grids[level];
486 
487  // upload grid
488  offloadMemcpyAsyncHtoD(grid->device_buffer, grid->host_buffer, grid->size,
489  level_stream);
490 
491  // launch kernel, but only after hab, pab, virial, etc are ready
492  offloadStreamWaitEvent(level_stream, input_ready_event, 0);
493  grid_gpu_integrate_one_grid_level(
494  task_list, first_task, last_task, compute_tau, layout, level_stream,
495  pab_blocks_dev, grid->device_buffer, hab_blocks->device_buffer,
496  forces_dev, virial_dev, &lp_diff);
497 
498  // Have main stream wait for level to complete before downloading results.
499  offloadEvent_t level_done_event;
500  offloadEventCreate(&level_done_event);
501  offloadEventRecord(level_done_event, level_stream);
502  offloadStreamWaitEvent(task_list->main_stream, level_done_event, 0);
503  offloadEventDestroy(level_done_event);
504 
505  first_task = last_task + 1;
506  }
507 
508  // update counters while we wait for kernels to finish
509  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
510  for (int lp = 0; lp < 20; lp++) {
511  const int count = task_list->stats[has_border_mask][lp];
512  if (task_list->orthorhombic && !has_border_mask) {
513  grid_library_counter_add(lp, GRID_BACKEND_GPU,
514  GRID_INTEGRATE_ORTHO, count);
515  } else {
516  grid_library_counter_add(lp, GRID_BACKEND_GPU,
517  GRID_INTEGRATE_GENERAL, count);
518  }
519  }
520  }
521 
522  // download result from device to host using main stream.
523  offloadMemcpyAsyncDtoH(hab_blocks->host_buffer, hab_blocks->device_buffer,
524  hab_blocks->size, task_list->main_stream);
525  if (forces != NULL) {
526  offloadMemcpyAsyncDtoH(forces, forces_dev, forces_size,
527  task_list->main_stream);
528  }
529  if (virial != NULL) {
530  offloadMemcpyAsyncDtoH(virial, virial_dev, virial_size,
531  task_list->main_stream);
532  }
533 
534  // wait for all the streams to finish
535  offloadDeviceSynchronize();
536 
537  // clean up
538  offloadEventDestroy(input_ready_event);
539  if (forces != NULL) {
540  offloadFree(forces_dev);
541  }
542  if (virial != NULL) {
543  offloadFree(virial_dev);
544  }
545 }
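A minimal call sketch for this integration entry point; passing NULL for forces and/or virial skips the corresponding device buffers and transfers, and pab_blocks is only uploaded when forces or a virial are requested, as in the code above (the buffer names are hypothetical):

/* Sketch: hab-only integration first, then a variant that also accumulates
 * forces and the virial. */
grid_gpu_integrate_task_list(task_list, /*compute_tau=*/false, natoms, nlevels,
                             pab_blocks, grids, hab_blocks,
                             /*forces=*/NULL, /*virial=*/NULL);

double(*forces)[3] = (double(*)[3])calloc(natoms, sizeof(double[3]));
double virial[3][3] = {{0.0}};
grid_gpu_integrate_task_list(task_list, /*compute_tau=*/false, natoms, nlevels,
                             pab_blocks, grids, hab_blocks, forces, virial);
free(forces);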
546 
547 #endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
548 // EOF