/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#include <stdlib.h> /* getenv() */
#include <assert.h>
#include <math.h>   /* NAN */
#include <limits>

#include "types/simple.h"
#include "types/nbnxn_pairlist.h"
#include "types/nb_verlet.h"
#include "types/ishift.h"
#include "types/force_flags.h"
#include "../nbnxn_consts.h"

#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif

#include "nbnxn_cuda_types.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
#include "nbnxn_cuda.h"
#include "nbnxn_cuda_data_mgmt.h"
/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> tex_nbfp;

/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;

/* Convenience defines */
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE         (NBNXN_GPU_CLUSTER_SIZE)
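/* Illustrative note (the concrete values are assumptions; the actual constants
   live in nbnxn_consts.h): with the usual CL_SIZE == 8 and NCL_PER_SUPERCL == 8,
   one super-cluster holds 8 i-clusters of 8 atoms each, and the kernels below
   are launched with one CL_SIZE x CL_SIZE thread block per super-cluster:

       dim3 dim_block(CL_SIZE, CL_SIZE, 1);   // 64 threads
       dim3 dim_grid(nblock, 1, 1);           // nblock == #super-clusters

   (see the launch configuration in nbnxn_cuda_launch_kernel below). */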
/***** The kernels come here *****/
#include "nbnxn_cuda_kernel_utils.cuh"

/* Generate all combinations of kernels through multiple inclusion:
   F, F + E, F + prune, F + E + prune. */
/** Force only **/
#include "nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES

/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
#include "nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
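/* Each inclusion of nbnxn_cuda_kernels.cuh compiles one flavor of every
   electrostatics kernel; the CALC_ENERGIES and PRUNE_NBL macros select the
   energy and pair-list-pruning variants. This produces the four name families
   referenced in the function-pointer tables below, e.g. for reaction-field
   electrostatics:

       k_nbnxn_rf              // F
       k_nbnxn_rf_ener         // F + E
       k_nbnxn_rf_prune        // F + prune
       k_nbnxn_rf_ener_prune   // F + E + prune
*/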
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                     const cu_nbparam_t,
                                     const cu_plist_t,
                                     bool /* bCalcFshift */);

/*********************************/
/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
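/* For example (debugging/benchmarking only, as noted above):

       GMX_GPU_ALWAYS_ENER=1  forces the energy kernel flavor on every step,
       GMX_GPU_NEVER_ENER=1   suppresses it even when energies are requested,
       GMX_GPU_ALWAYS_PRUNE=1 runs the pruning flavor on every step.

   These feed into the bCalcEner/bDoPrune kernel selection logic further down. */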
/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 * and corresponds to having the exponent set to the maximum (127 -- single
 * precision) and the mantissa to 0.
 */
static unsigned int poll_wait_pattern = (0x7FU << 23);
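/* Layout reminder for IEEE-754 binary32 (1 sign, 8 exponent, 23 mantissa bits):

       (0x7FU << 23) == 0x3F800000

   i.e. sign bit 0, exponent field 0x7F, mantissa 0. The same pattern is both
   written into the last 4 bytes of the force output before the D2H copy (in
   nbnxn_cuda_launch_cpyback) and polled for in nbnxn_cuda_wait_gpu. */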
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
{
    int max_grid_x_size;

    assert(dinfo);

    max_grid_x_size = dinfo->prop.maxGridSize[0];

    /* do we exceed the grid x dimension limit? */
    if (nwork_units > max_grid_x_size)
    {
        gmx_fatal(FARGS, "Watch out, the system is too large to simulate!\n"
                  "The number of nonbonded work units (=number of super-clusters) exceeds the "
                  "maximum grid size in the x dimension (%d > %d)!", nwork_units, max_grid_x_size);
    }

    return nwork_units;
}
/* Constant arrays listing all kernel function pointers and enabling selection
   of a kernel in an elegant manner. */

static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */

/* Default kernels */
static const nbnxn_cu_kfunc_ptr_t
nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
    { { k_nbnxn_ewald,           k_nbnxn_ewald_prune },
      { k_nbnxn_ewald_ener,      k_nbnxn_ewald_ener_prune } },
    { { k_nbnxn_ewald_twin,      k_nbnxn_ewald_twin_prune },
      { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
    { { k_nbnxn_rf,              k_nbnxn_rf_prune },
      { k_nbnxn_rf_ener,         k_nbnxn_rf_ener_prune } },
    { { k_nbnxn_cutoff,          k_nbnxn_cutoff_prune },
      { k_nbnxn_cutoff_ener,     k_nbnxn_cutoff_ener_prune } },
};
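/* The tables are indexed as [eeltype][energy][prune], matching the lookup in
   select_nbnxn_kernel() below. For instance, a reaction-field step that needs
   energies but no pruning resolves to:

       nb_default_kfunc_ptr[eeltype][1][0]   ->  k_nbnxn_rf_ener

   (with eeltype being the eelCu* index of the RF row). */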
/* Legacy kernels */
static const nbnxn_cu_kfunc_ptr_t
nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
};
/*! Return a pointer to the kernel version to be executed at the current step. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
                                                       bool bDoEne, bool bDoPrune)
{
    assert(kver < eNbnxnCuKNR);
    assert(eeltype < eelCuNR);

    if (NBNXN_KVER_LEGACY(kver))
    {
        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
    }
    else
    {
        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
    }
}
/*! Calculates the amount of shared memory required for the kernel version in use. */
static inline int calc_shmem_required(int kver)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    if (NBNXN_KVER_LEGACY(kver))
    {
        /* i-atom x+q in shared memory */
        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
        /* force reduction buffers in shared memory */
        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
    }
    else
    {
        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
        /* i-atom x+q in shared memory */
        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
        /* i-atom types in shared memory */
        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
#if __CUDA_ARCH__ < 300
        /* force reduction buffers in shared memory */
        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
#endif
    }

    return shmem;
}
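/* Worked example (assuming the usual CL_SIZE == 8 and NCL_PER_SUPERCL == 8):

       legacy kernel:           8*8*16 + 8*8*3*4         = 1024 + 768       = 1792 B
       default kernel, sm_2.x:  8*8*16 + 8*8*4 + 8*8*3*4 = 1024 + 256 + 768 = 2048 B
       default kernel, sm_3.x:  8*8*16 + 8*8*4           = 1024 + 256       = 1280 B

   per thread block, passed as the dynamic shared-memory size at kernel launch. */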
/*! As we execute nonbonded workload in separate streams, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local stream at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local on the same device/context, so this is
   inherently scheduled after the operations in the local stream (including the
   above tasks).

   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
*/
void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                              const nbnxn_atomdata_t *nbatom,
                              int flags,
                              int iloc)
{
    cudaError_t stat;
    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* CUDA kernel launch-related stuff */
    int nblock, shmem;
    dim3 dim_block, dim_grid;
    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cu_atomdata_t *adat = cu_nb->atdat;
    cu_nbparam_t *nbp = cu_nb->nbparam;
    cu_plist_t *plist = cu_nb->plist[iloc];
    cu_timers_t *t = cu_nb->timers;
    cudaStream_t stream = cu_nb->stream[iloc];

    bool bCalcEner = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool bDoTime = cu_nb->bDoTime;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* don't launch the kernel if there is no work to do */
    if (plist->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* When we get here all misc operations issued in the local stream are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (cu_nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
        }
        else
        {
            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
        }
    }

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* HtoD x, q */
    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
                      adat_len * sizeof(*adat->xq), stream);
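    /* The host coordinate buffer is assumed to store 4 floats per atom here
       (the offset is adat_begin * 4 and the copy size is counted in float4 xq
       elements), i.e. an x,y,z,q layout matching the device-side xq array. The
       exact host layout is defined by the nbnxn_atomdata_t format, not here. */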
    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
                                    plist->bDoPrune || always_prune);

    /* kernel launch config */
    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
    dim_grid  = dim3(nblock, 1, 1);
    shmem     = calc_shmem_required(cu_nb->kernel_ver);

    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                dim_block.x, dim_block.y, dim_block.z,
                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
                NCL_PER_SUPERCL, plist->na_c);
    }

    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
    CU_LAUNCH_ERR("k_calc_nb");

    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
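/* Typical call sequence from the integration loop (an illustrative sketch;
   the real call sites live elsewhere in mdrun, and the flags/energy variables
   shown are placeholders):

       nbnxn_cuda_launch_kernel (cu_nb, nbatom, flags, eintLocal);
       nbnxn_cuda_launch_cpyback(cu_nb, nbatom, flags, eatLocal);
       ... overlapping CPU work ...
       nbnxn_cuda_wait_gpu(cu_nb, nbatom, flags, eatLocal, &e_lj, &e_el, fshift);

   All three are asynchronous with respect to the GPU except the final wait. */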
void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
                               const nbnxn_atomdata_t *nbatom,
                               int flags,
                               int aloc)
{
    cudaError_t stat;
    int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
    int iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }

    cu_atomdata_t *adat = cu_nb->atdat;
    cu_timers_t *t = cu_nb->timers;
    bool bDoTime = cu_nb->bDoTime;
    cudaStream_t stream = cu_nb->stream[iloc];

    bool bCalcEner = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
    /* don't launch copy-back if there was no work to do */
    if (cu_nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
        adat_end   = cu_nb->atdat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
        adat_end   = cu_nb->atdat->natoms;
    }

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    if (!cu_nb->bUseStreamSync)
    {
        /* For safety reasons set a few (5%) forces to NaN. This way even if the
           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
        {
#ifdef NAN
            nbatom->out[0].f[i] = NAN;
#else
            if (std::numeric_limits<float>::has_quiet_NaN)
            {
                nbatom->out[0].f[i] = std::numeric_limits<float>::quiet_NaN();
            }
            else
            {
                nbatom->out[0].f[i] = GMX_REAL_MAX;
            }
#endif
        }

        /* Set the last four bytes of the force array to a bit pattern
           which can't be the result of the force calculation:
           max exponent (127) and zero mantissa. */
        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
    }
    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
    {
        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
    }

    /* DtoH f */
    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
                      adat_len * sizeof(*adat->f), stream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
                              sizeof(*cu_nb->nbst.e_lj), stream);
            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
                              sizeof(*cu_nb->nbst.e_el), stream);
        }
    }

    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
/* Atomic compare-exchange operation on unsigned values. It is used in
 * polling wait for the GPU.
 */
static inline bool atomic_cas(volatile unsigned int *ptr,
                              unsigned int oldval,
                              unsigned int newval)
{
#ifdef TMPI_ATOMICS
    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
#else
    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
    return true;
#endif
}
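/* tMPI_Atomic_cas() reports whether the swap took place, i.e. whether *ptr
   still held oldval. With oldval == newval the swap never changes the value,
   so the call acts as an atomic read, which is exactly how the polling loop
   in nbnxn_cuda_wait_gpu below uses it:

       while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}

   spins while the sentinel pattern is still in memory and exits as soon as
   the D2H force transfer overwrites it. */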
void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
                         const nbnxn_atomdata_t *nbatom,
                         int flags, int aloc,
                         float *e_lj, float *e_el, rvec *fshift)
{
    cudaError_t stat;
    int i, adat_end, iloc = -1;
    volatile unsigned int *poll_word;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }

    cu_plist_t *plist = cu_nb->plist[iloc];
    cu_timers_t *timers = cu_nb->timers;
    wallclock_gpu_t *timings = cu_nb->timings;
    nb_staging nbst = cu_nb->nbst;

    bool bCalcEner = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* don't launch wait/update timers & counters if there was no work to do

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (cu_nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_end = cu_nb->atdat->natoms_local;
    }
    else
    {
        adat_end = cu_nb->atdat->natoms;
    }

    if (cu_nb->bUseStreamSync)
    {
        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
    }
    else
    {
        /* Busy-wait until we get the signal pattern set in the last 4 bytes
         * of the l/nl float vector. This pattern corresponds to a floating
         * point number which can't be the result of the force calculation
         * (maximum (127) exponent and 0 mantissa).
         * The polling uses atomic compare-exchange.
         */
        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
    }
    /* timing data accumulation */
    if (cu_nb->bDoTime)
    {
        /* only increase counter once (at local F wait) */
        if (LOCAL_I(iloc))
        {
            timings->nb_c++;
            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
        }

        /* kernel timings */
        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);

        /* X/q H2D and F D2H timings */
        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
                                              timers->stop_nb_h2d[iloc]);
        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
                                              timers->stop_nb_d2h[iloc]);

        /* only count atdat and pair-list H2D at pair-search step */
        if (plist->bDoPrune)
        {
            /* atdat transfer timing (add only once, at local F wait) */
            if (LOCAL_A(aloc))
            {
                timings->pl_h2d_c++;
                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
                                                      timers->stop_atdat);
            }

            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
                                                  timers->stop_pl_h2d[iloc]);
        }
    }

    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iloc))
    {
        if (bCalcEner)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (bCalcFshift)
        {
            for (i = 0; i < SHIFTS; i++)
            {
                fshift[i][0] += nbst.fshift[i].x;
                fshift[i][1] += nbst.fshift[i].y;
                fshift[i][2] += nbst.fshift[i].z;
            }
        }
    }

    /* turn off pruning (doesn't matter if this is pair-search step or not) */
    plist->bDoPrune = false;
}
/*! Return the reference to the nbfp texture. */
const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
{
    return tex_nbfp;
}

/*! Return the reference to the coulomb_tab texture. */
const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
{
    return tex_coulomb_tab;
}
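/* These getters exist because texture references have file scope; the data
   management code elsewhere is expected to bind the device arrays to them,
   along the lines of (a sketch only -- the pointer and size names here are
   assumptions, not the actual call site):

       cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
       stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
                              nbparam->nbfp, &cd, nbfp_size_in_bytes);
*/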
/*! Set up the cache configuration for the non-bonded kernels.
 */
void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
{
    cudaError_t stat;

    for (int i = 0; i < eelCuNR; i++)
    {
        for (int j = 0; j < nEnergyKernelTypes; j++)
        {
            for (int k = 0; k < nPruneKernelTypes; k++)
            {
                /* Legacy kernel: 16/48 kB Shared/L1 */
                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");

                if (devinfo->prop.major >= 3)
                {
                    /* Default kernel on sm_3.x: 48/16 kB Shared/L1 */
                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
                }
                else
                {
                    /* On Fermi, preferring L1 gives ~2% higher performance. */
                    /* Default kernel on sm_2.x: 16/48 kB Shared/L1 */
                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
                }
                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
            }
        }
    }
}
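/* This is one-time setup; a plausible call site (an assumption -- the real one
   lives in the initialization path) would be right after device detection:

       nbnxn_cuda_set_cacheconfig(cu_nb->dev_info);

   Note that cudaFuncSetCacheConfig only expresses a preference; the driver may
   still choose a different shared-memory/L1 split for a given kernel. */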