Double precision SSE2 kernels
[gromacs.git] / src / gmxlib / nonbonded / nonbonded.c
blob75aae6193e163722871186b271c72d48510d83f0
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.2.0
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * GROningen Mixture of Alchemy and Childrens' Stories
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
40 #ifdef GMX_THREAD_MPI
41 #include <thread_mpi.h>
42 #endif
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include "typedefs.h"
47 #include "txtdump.h"
48 #include "smalloc.h"
49 #include "ns.h"
50 #include "vec.h"
51 #include "maths.h"
52 #include "macros.h"
53 #include "string2.h"
54 #include "force.h"
55 #include "names.h"
56 #include "main.h"
57 #include "xvgr.h"
58 #include "gmx_fatal.h"
59 #include "physics.h"
60 #include "force.h"
61 #include "bondf.h"
62 #include "nrnb.h"
63 #include "smalloc.h"
64 #include "nonbonded.h"
66 #include "nb_kernel.h"
67 #include "nb_free_energy.h"
68 #include "nb_generic.h"
69 #include "nb_generic_cg.h"
70 #include "nb_generic_adress.h"
72 /* Different default (c) and accelerated interaction-specific kernels */
73 #include "nb_kernel_c/nb_kernel_c.h"
75 #if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
76 # include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
77 #endif
78 #if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
79 # include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h"
80 #endif
81 #if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
82 # include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
83 #endif
84 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
85 # include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
86 #endif
87 #if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
88 # include "nb_kernel_sse2_double/nb_kernel_sse2_double.h"
89 #endif
91 #ifdef GMX_THREAD_MPI
92 static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
93 #endif
94 static gmx_bool nonbonded_setup_done = FALSE;
97 void
98 gmx_nonbonded_setup(FILE * fplog,
99 t_forcerec * fr,
100 gmx_bool bGenericKernelOnly)
102 #ifdef GMX_THREAD_MPI
103 tMPI_Thread_mutex_lock(&nonbonded_setup_mutex);
104 #endif
105 /* Here we are guaranteed only one thread made it. */
106 if(nonbonded_setup_done==FALSE)
108 if(bGenericKernelOnly==FALSE)
110 /* Add the generic kernels to the structure stored statically in nb_kernel.c */
111 nb_kernel_list_add_kernels(kernellist_c,kernellist_c_size);
113 if(!(fr!=NULL && fr->use_cpu_acceleration==FALSE))
115 /* Add interaction-specific kernels for different architectures */
116 /* Single precision */
117 #if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
118 nb_kernel_list_add_kernels(kernellist_sse2_single,kernellist_sse2_single_size);
119 #endif
120 #if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
121 nb_kernel_list_add_kernels(kernellist_sse4_1_single,kernellist_sse4_1_single_size);
122 #endif
123 #if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
124 nb_kernel_list_add_kernels(kernellist_avx_128_fma_single,kernellist_avx_128_fma_single_size);
125 #endif
126 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
127 nb_kernel_list_add_kernels(kernellist_avx_256_single,kernellist_avx_256_single_size);
128 #endif
129 /* Double precision */
130 #if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
131 nb_kernel_list_add_kernels(kernellist_sse2_double,kernellist_sse2_double_size);
132 #endif
133 ; /* empty statement to avoid a completely empty block */
136 /* Create a hash for faster lookups */
137 nb_kernel_list_hash_init();
139 nonbonded_setup_done=TRUE;
141 #ifdef GMX_THREAD_MPI
142 tMPI_Thread_mutex_unlock(&nonbonded_setup_mutex);
143 #endif
148 void
149 gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
151 const char * elec;
152 const char * elec_mod;
153 const char * vdw;
154 const char * vdw_mod;
155 const char * geom;
156 const char * other;
157 const char * vf;
159 struct
161 const char * arch;
162 int simd_padding_width;
164 arch_and_padding[] =
166 /* Single precision */
167 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
168 { "avx_256_single", 8 },
169 #endif
170 #if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
171 { "avx_128_fma_single", 4 },
172 #endif
173 #if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
174 { "sse4_1_single", 4 },
175 #endif
176 #if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
177 { "sse2_single", 4 },
178 #endif
179 /* Double precision */
180 #if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
181 /* Sic. Double precision SSE2 does not require neighbor list padding,
182 * since the kernels execute a loop unrolled a factor 2, followed by
183 * a possible single odd-element epilogue.
185 { "sse2_double", 1 },
186 #endif
187 { "c", 1 },
189 int narch = asize(arch_and_padding);
190 int i;
192 if(nonbonded_setup_done==FALSE)
194 /* We typically call this setup routine before starting timers,
195 * but if that has not been done for whatever reason we do it now.
197 gmx_nonbonded_setup(NULL,NULL,FALSE);
200 /* Not used yet */
201 other="";
203 nl->kernelptr_vf = NULL;
204 nl->kernelptr_v = NULL;
205 nl->kernelptr_f = NULL;
207 elec = gmx_nbkernel_elec_names[nl->ielec];
208 elec_mod = eintmod_names[nl->ielecmod];
209 vdw = gmx_nbkernel_vdw_names[nl->ivdw];
210 vdw_mod = eintmod_names[nl->ivdwmod];
211 geom = gmx_nblist_geometry_names[nl->igeometry];
213 if(nl->free_energy)
215 nl->kernelptr_vf = gmx_nb_free_energy_kernel;
216 nl->kernelptr_f = gmx_nb_free_energy_kernel;
217 nl->simd_padding_width = 1;
219 else if(!gmx_strcasecmp_min(geom,"CG-CG"))
221 nl->kernelptr_vf = gmx_nb_generic_cg_kernel;
222 nl->kernelptr_f = gmx_nb_generic_cg_kernel;
223 nl->simd_padding_width = 1;
225 else
227 /* Try to find a specific kernel first */
229 for(i=0;i<narch && nl->kernelptr_vf==NULL ;i++)
231 nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
232 nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
234 for(i=0;i<narch && nl->kernelptr_f==NULL ;i++)
236 nl->kernelptr_f = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
237 nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
239 /* If there is not force-only optimized kernel, is there a potential & force one? */
240 if(nl->kernelptr_f == NULL)
242 nl->kernelptr_f = nb_kernel_list_findkernel(NULL,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
243 nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
247 /* Give up, pick a generic one instead */
248 if(nl->kernelptr_vf==NULL)
250 nl->kernelptr_vf = gmx_nb_generic_kernel;
251 nl->kernelptr_f = gmx_nb_generic_kernel;
252 nl->simd_padding_width = 1;
253 if(log)
255 fprintf(log,
256 "WARNING - Slow generic NB kernel used for neighborlist with\n"
257 " Elec: '%s', Modifier: '%s'\n"
258 " Vdw: '%s', Modifier: '%s'\n"
259 " Geom: '%s', Other: '%s'\n\n",
260 elec,elec_mod,vdw,vdw_mod,geom,other);
265 return;
268 void do_nonbonded(t_commrec *cr,t_forcerec *fr,
269 rvec x[],rvec f_shortrange[],rvec f_longrange[],t_mdatoms *mdatoms,t_blocka *excl,
270 gmx_grppairener_t *grppener,rvec box_size,
271 t_nrnb *nrnb,real *lambda, real *dvdl,
272 int nls,int eNL,int flags)
274 t_nblist * nlist;
275 int n,n0,n1,i,i0,i1,sz,range;
276 t_nblists * nblists;
277 nb_kernel_data_t kernel_data;
278 nb_kernel_t * kernelptr=NULL;
279 rvec * f;
281 kernel_data.flags = flags;
282 kernel_data.exclusions = excl;
283 kernel_data.lambda = lambda;
284 kernel_data.dvdl = dvdl;
286 if(fr->bAllvsAll)
288 return;
291 if (eNL >= 0)
293 i0 = eNL;
294 i1 = i0+1;
296 else
298 i0 = 0;
299 i1 = eNL_NR;
302 if (nls >= 0)
304 n0 = nls;
305 n1 = nls+1;
307 else
309 n0 = 0;
310 n1 = fr->nnblists;
313 for(n=n0; (n<n1); n++)
315 nblists = &fr->nblists[n];
317 kernel_data.table_elec = &nblists->table_elec;
318 kernel_data.table_vdw = &nblists->table_vdw;
319 kernel_data.table_elec_vdw = &nblists->table_elec_vdw;
321 for(range=0;range<2;range++)
323 /* Are we doing short/long-range? */
324 if(range==0)
326 /* Short-range */
327 if(!(flags & GMX_NONBONDED_DO_SR))
329 continue;
331 kernel_data.energygrp_elec = grppener->ener[egCOULSR];
332 kernel_data.energygrp_vdw = grppener->ener[fr->bBHAM ? egBHAMSR : egLJSR];
333 kernel_data.energygrp_polarization = grppener->ener[egGB];
334 nlist = nblists->nlist_sr;
335 f = f_shortrange;
337 else if(range==1)
339 /* Long-range */
340 if(!(flags & GMX_NONBONDED_DO_LR))
342 continue;
344 kernel_data.energygrp_elec = grppener->ener[egCOULLR];
345 kernel_data.energygrp_vdw = grppener->ener[fr->bBHAM ? egBHAMLR : egLJLR];
346 kernel_data.energygrp_polarization = grppener->ener[egGB];
347 nlist = nblists->nlist_lr;
348 f = f_longrange;
351 for(i=i0; (i<i1); i++)
353 if (nlist[i].nri > 0)
355 if(flags & GMX_NONBONDED_DO_POTENTIAL)
357 /* Potential and force */
358 kernelptr = (nb_kernel_t *)nlist[i].kernelptr_vf;
360 else
362 /* Force only, no potential */
363 kernelptr = (nb_kernel_t *)nlist[i].kernelptr_f;
366 if(nlist[i].free_energy==0 && (flags & GMX_NONBONDED_DO_FOREIGNLAMBDA))
368 /* We don't need the non-perturbed interactions */
369 continue;
371 (*kernelptr)(&(nlist[i]),x,f,fr,mdatoms,&kernel_data,nrnb);
378 static void
379 nb_listed_warning_rlimit(const rvec *x,int ai, int aj,int * global_atom_index,real r, real rlimit)
381 gmx_warning("Listed nonbonded interaction between particles %d and %d\n"
382 "at distance %.3f which is larger than the table limit %.3f nm.\n\n"
383 "This is likely either a 1,4 interaction, or a listed interaction inside\n"
384 "a smaller molecule you are decoupling during a free energy calculation.\n"
385 "Since interactions at distances beyond the table cannot be computed,\n"
386 "they are skipped until they are inside the table limit again. You will\n"
387 "only see this message once, even if it occurs for several interactions.\n\n"
388 "IMPORTANT: This should not happen in a stable simulation, so there is\n"
389 "probably something wrong with your system. Only change the table-extension\n"
390 "distance in the mdp file if you are really sure that is the reason.\n",
391 glatnr(global_atom_index,ai),glatnr(global_atom_index,aj),r,rlimit);
393 if (debug)
395 fprintf(debug,
396 "%8f %8f %8f\n%8f %8f %8f\n1-4 (%d,%d) interaction not within cut-off! r=%g. Ignored\n",
397 x[ai][XX],x[ai][YY],x[ai][ZZ],x[aj][XX],x[aj][YY],x[aj][ZZ],
398 glatnr(global_atom_index,ai),glatnr(global_atom_index,aj),r);
404 /* This might logically belong better in the nb_generic.c module, but it is only
405 * used in do_nonbonded_listed(), and we want it to be inlined there to avoid an
406 * extra functional call for every single pair listed in the topology.
408 static real
409 nb_evaluate_single(real r2, real tabscale,real *vftab,
410 real qq, real c6, real c12, real *velec, real *vvdw)
412 real rinv,r,rtab,eps,eps2,Y,F,Geps,Heps2,Fp,VVe,FFe,VVd,FFd,VVr,FFr,fscal;
413 int ntab;
415 /* Do the tabulated interactions - first table lookup */
416 rinv = gmx_invsqrt(r2);
417 r = r2*rinv;
418 rtab = r*tabscale;
419 ntab = rtab;
420 eps = rtab-ntab;
421 eps2 = eps*eps;
422 ntab = 12*ntab;
423 /* Electrostatics */
424 Y = vftab[ntab];
425 F = vftab[ntab+1];
426 Geps = eps*vftab[ntab+2];
427 Heps2 = eps2*vftab[ntab+3];
428 Fp = F+Geps+Heps2;
429 VVe = Y+eps*Fp;
430 FFe = Fp+Geps+2.0*Heps2;
431 /* Dispersion */
432 Y = vftab[ntab+4];
433 F = vftab[ntab+5];
434 Geps = eps*vftab[ntab+6];
435 Heps2 = eps2*vftab[ntab+7];
436 Fp = F+Geps+Heps2;
437 VVd = Y+eps*Fp;
438 FFd = Fp+Geps+2.0*Heps2;
439 /* Repulsion */
440 Y = vftab[ntab+8];
441 F = vftab[ntab+9];
442 Geps = eps*vftab[ntab+10];
443 Heps2 = eps2*vftab[ntab+11];
444 Fp = F+Geps+Heps2;
445 VVr = Y+eps*Fp;
446 FFr = Fp+Geps+2.0*Heps2;
448 *velec = qq*VVe;
449 *vvdw = c6*VVd+c12*VVr;
451 fscal = -(qq*FFe+c6*FFd+c12*FFr)*tabscale*rinv;
453 return fscal;
457 real
458 do_nonbonded_listed(int ftype,int nbonds,
459 const t_iatom iatoms[],const t_iparams iparams[],
460 const rvec x[],rvec f[],rvec fshift[],
461 const t_pbc *pbc,const t_graph *g,
462 real *lambda, real *dvdl,
463 const t_mdatoms *md,
464 const t_forcerec *fr,gmx_grppairener_t *grppener,
465 int *global_atom_index)
467 int ielec,ivdw;
468 real qq,c6,c12;
469 rvec dx;
470 ivec dt;
471 int i,j,itype,ai,aj,gid;
472 int fshift_index;
473 real r2,rinv;
474 real fscal,velec,vvdw;
475 real * energygrp_elec;
476 real * energygrp_vdw;
477 static gmx_bool warned_rlimit=FALSE;
478 /* Free energy stuff */
479 gmx_bool bFreeEnergy;
480 real LFC[2],LFV[2],DLF[2],lfac_coul[2],lfac_vdw[2],dlfac_coul[2],dlfac_vdw[2];
481 real qqB,c6B,c12B,sigma2_def,sigma2_min;
484 switch (ftype) {
485 case F_LJ14:
486 case F_LJC14_Q:
487 energygrp_elec = grppener->ener[egCOUL14];
488 energygrp_vdw = grppener->ener[egLJ14];
489 break;
490 case F_LJC_PAIRS_NB:
491 energygrp_elec = grppener->ener[egCOULSR];
492 energygrp_vdw = grppener->ener[egLJSR];
493 break;
494 default:
495 energygrp_elec = NULL; /* Keep compiler happy */
496 energygrp_vdw = NULL; /* Keep compiler happy */
497 gmx_fatal(FARGS,"Unknown function type %d in do_nonbonded14",ftype);
498 break;
501 if(fr->efep != efepNO)
503 /* Lambda factor for state A=1-lambda and B=lambda */
504 LFC[0] = 1.0 - lambda[efptCOUL];
505 LFV[0] = 1.0 - lambda[efptVDW];
506 LFC[1] = lambda[efptCOUL];
507 LFV[1] = lambda[efptVDW];
509 /*derivative of the lambda factor for state A and B */
510 DLF[0] = -1;
511 DLF[1] = 1;
513 /* precalculate */
514 sigma2_def = pow(fr->sc_sigma6_def,1.0/3.0);
515 sigma2_min = pow(fr->sc_sigma6_min,1.0/3.0);
517 for (i=0;i<2;i++)
519 lfac_coul[i] = (fr->sc_power==2 ? (1-LFC[i])*(1-LFC[i]) : (1-LFC[i]));
520 dlfac_coul[i] = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power==2 ? (1-LFC[i]) : 1);
521 lfac_vdw[i] = (fr->sc_power==2 ? (1-LFV[i])*(1-LFV[i]) : (1-LFV[i]));
522 dlfac_vdw[i] = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power==2 ? (1-LFV[i]) : 1);
525 else
527 sigma2_min = sigma2_def = 0;
530 bFreeEnergy = FALSE;
531 for(i=0; (i<nbonds); )
533 itype = iatoms[i++];
534 ai = iatoms[i++];
535 aj = iatoms[i++];
536 gid = GID(md->cENER[ai],md->cENER[aj],md->nenergrp);
538 /* Get parameters */
539 switch (ftype) {
540 case F_LJ14:
541 bFreeEnergy =
542 (fr->efep != efepNO &&
543 ((md->nPerturbed && (md->bPerturbed[ai] || md->bPerturbed[aj])) ||
544 iparams[itype].lj14.c6A != iparams[itype].lj14.c6B ||
545 iparams[itype].lj14.c12A != iparams[itype].lj14.c12B));
546 qq = md->chargeA[ai]*md->chargeA[aj]*fr->epsfac*fr->fudgeQQ;
547 c6 = iparams[itype].lj14.c6A;
548 c12 = iparams[itype].lj14.c12A;
549 break;
550 case F_LJC14_Q:
551 qq = iparams[itype].ljc14.qi*iparams[itype].ljc14.qj*fr->epsfac*iparams[itype].ljc14.fqq;
552 c6 = iparams[itype].ljc14.c6;
553 c12 = iparams[itype].ljc14.c12;
554 break;
555 case F_LJC_PAIRS_NB:
556 qq = iparams[itype].ljcnb.qi*iparams[itype].ljcnb.qj*fr->epsfac;
557 c6 = iparams[itype].ljcnb.c6;
558 c12 = iparams[itype].ljcnb.c12;
559 break;
560 default:
561 /* Cannot happen since we called gmx_fatal() above in this case */
562 qq = c6 = c12 = 0; /* Keep compiler happy */
563 break;
566 /* To save flops in the optimized kernels, c6/c12 have 6.0/12.0 derivative prefactors
567 * included in the general nfbp array now. This means the tables are scaled down by the
568 * same factor, so when we use the original c6/c12 parameters from iparams[] they must
569 * be scaled up.
571 c6 *= 6.0;
572 c12 *= 12.0;
574 /* Do we need to apply full periodic boundary conditions? */
575 if(fr->bMolPBC==TRUE)
577 fshift_index = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
579 else
581 fshift_index = CENTRAL;
582 rvec_sub(x[ai],x[aj],dx);
584 r2 = norm2(dx);
586 if(r2>=fr->tab14.r*fr->tab14.r)
588 if(warned_rlimit==FALSE)
590 nb_listed_warning_rlimit(x,ai,aj,global_atom_index,sqrt(r2),fr->tab14.r);
591 warned_rlimit=TRUE;
593 continue;
596 if (bFreeEnergy)
598 /* Currently free energy is only supported for F_LJ14, so no need to check for that if we got here */
599 qqB = md->chargeB[ai]*md->chargeB[aj]*fr->epsfac*fr->fudgeQQ;
600 c6B = iparams[itype].lj14.c6B*6.0;
601 c12B = iparams[itype].lj14.c12B*12.0;
603 fscal = nb_free_energy_evaluate_single(r2,fr->sc_r_power,fr->sc_alphacoul,fr->sc_alphavdw,
604 fr->tab14.scale,fr->tab14.data,qq,c6,c12,qqB,c6B,c12B,
605 LFC,LFV,DLF,lfac_coul,lfac_vdw,dlfac_coul,dlfac_vdw,
606 fr->sc_sigma6_def,fr->sc_sigma6_min,sigma2_def,sigma2_min,&velec,&vvdw,dvdl);
608 else
610 /* Evaluate tabulated interaction without free energy */
611 fscal = nb_evaluate_single(r2,fr->tab14.scale,fr->tab14.data,qq,c6,c12,&velec,&vvdw);
614 energygrp_elec[gid] += velec;
615 energygrp_vdw[gid] += vvdw;
616 svmul(fscal,dx,dx);
618 /* Add the forces */
619 rvec_inc(f[ai],dx);
620 rvec_dec(f[aj],dx);
622 if (g)
624 /* Correct the shift forces using the graph */
625 ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
626 fshift_index = IVEC2IS(dt);
628 if(fshift_index!=CENTRAL)
630 rvec_inc(fshift[fshift_index],dx);
631 rvec_dec(fshift[CENTRAL],dx);
634 return 0.0;