1 /*
2 *
3 * This source code is part of
4 *
5 * G R O M A C S
6 *
7 * GROningen MAchine for Chemical Simulations
8 *
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
31 * And Hey:
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
33 */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
38 #include <math.h>
39 #include <string.h>
41 #include "typedefs.h"
42 #include "smalloc.h"
43 #include "genborn.h"
44 #include "vec.h"
45 #include "grompp.h"
46 #include "pdbio.h"
47 #include "names.h"
48 #include "physics.h"
49 #include "partdec.h"
50 #include "domdec.h"
51 #include "network.h"
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
54 #include "genborn.h"
56 #ifdef GMX_LIB_MPI
57 #include <mpi.h>
58 #endif
59 #ifdef GMX_THREADS
60 #include "tmpi.h"
61 #endif
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
71 #include "genborn_sse2_single.h"
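/* SSE2 single-precision Generalized Born routines.
 *
 * Three kernels are provided: calc_gb_rad_still_sse() computes Born radii
 * from the Still polarization-energy sum, calc_gb_rad_hct_obc_sse() computes
 * HCT or OBC radii from a common pairwise descreening loop, and
 * calc_gb_chainrule_sse() turns the stored dA/dx terms into forces.
 * gb_bonds_analytic() handles the bonded GB pairs. Neighbor-list j atoms are
 * processed eight at a time (two SSE registers of four), followed by a
 * four-wide loop and a masked tail for the remaining one to three atoms.
 */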
74 int
75 calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,
76 int natoms, gmx_localtop_t *top,
77 const t_atomtypes *atype, float *x, t_nblist *nl,
78 gmx_genborn_t *born)
80 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
81 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
82 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
83 int shift;
84 int *mdtype;
85 real shX,shY,shZ;
86 int *jjnr;
87 real *shiftvec;
89 float gpi_ai,gpi2;
90 float factor;
91 float *gb_radius;
92 float *vsolv;
93 float *work;
94 float *dadx;
96 __m128 ix,iy,iz;
97 __m128 jx,jy,jz;
98 __m128 dx,dy,dz;
99 __m128 tx,ty,tz;
100 __m128 jxB,jyB,jzB;
101 __m128 dxB,dyB,dzB;
102 __m128 txB,tyB,tzB;
103 __m128 rsq,rinv,rinv2,rinv4,rinv6;
104 __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
105 __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
106 __m128 ratioB,rajB,vajB,rvdwB;
107 __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
108 __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
109 __m128 mask,icf4,icf6,mask_cmp;
110 __m128 icf4B,icf6B,mask_cmpB;
112 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
113 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
114 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
116 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
117 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
118 const __m128 one = {1.0f, 1.0f , 1.0f , 1.0f };
119 const __m128 two = {2.0f , 2.0f , 2.0f, 2.0f };
120 const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
121 const __m128 four = {4.0f , 4.0f , 4.0f , 4.0f };
123 const __m128 still_p5inv = {STILL_P5INV, STILL_P5INV, STILL_P5INV, STILL_P5INV};
124 const __m128 still_pip5 = {STILL_PIP5, STILL_PIP5, STILL_PIP5, STILL_PIP5};
125 const __m128 still_p4 = {STILL_P4, STILL_P4, STILL_P4, STILL_P4};
127 factor = 0.5 * ONE_4PI_EPS0;
129 gb_radius = born->gb_radius;
130 vsolv = born->vsolv;
131 work = born->gpol_still_work;
132 jjnr = nl->jjnr;
133 shiftvec = fr->shift_vec[0];
134 dadx = fr->dadx;
136 jnrA = jnrB = jnrC = jnrD = 0;
137 jx = _mm_setzero_ps();
138 jy = _mm_setzero_ps();
139 jz = _mm_setzero_ps();
141 n = 0;
143 for(i=0;i<natoms;i++)
145 work[i]=0;
148 for(i=0;i<nl->nri;i++)
150 ii = nl->iinr[i];
151 ii3 = ii*3;
152 is3 = 3*nl->shift[i];
153 shX = shiftvec[is3];
154 shY = shiftvec[is3+1];
155 shZ = shiftvec[is3+2];
156 nj0 = nl->jindex[i];
157 nj1 = nl->jindex[i+1];
159 ix = _mm_set1_ps(shX+x[ii3+0]);
160 iy = _mm_set1_ps(shY+x[ii3+1]);
161 iz = _mm_set1_ps(shZ+x[ii3+2]);
163 offset = (nj1-nj0)%4;
165 /* Polarization energy for atom ai */
166 gpi = _mm_setzero_ps();
168 rai = _mm_load1_ps(gb_radius+ii);
169 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
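        /* Inner loop over j atoms, eight at a time. Each pair adds the Still
         * descreening term STILL_P4*vsolv_j*ccf/r^4 to the polarization sum
         * of atom i (gpi), and the symmetric term with vsolv_i to work[j].
         * For close pairs (ratio = r^2/(rai+raj)^2 <= STILL_P5INV) the
         * smoothing factor is ccf = (0.5*(1-cos(STILL_PIP5*ratio)))^2 with
         * derivative term dccf; otherwise ccf = 1 and dccf = 0. The factors
         * (4*ccf-dccf)/r^6, scaled by the same STILL_P4*vsolv prefactors,
         * are stored in dadx for the chain-rule kernel.
         */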
171 for(k=nj0;k<nj1-4-offset;k+=8)
173 jnrA = jjnr[k];
174 jnrB = jjnr[k+1];
175 jnrC = jjnr[k+2];
176 jnrD = jjnr[k+3];
177 jnrE = jjnr[k+4];
178 jnrF = jjnr[k+5];
179 jnrG = jjnr[k+6];
180 jnrH = jjnr[k+7];
182 j3A = 3*jnrA;
183 j3B = 3*jnrB;
184 j3C = 3*jnrC;
185 j3D = 3*jnrD;
186 j3E = 3*jnrE;
187 j3F = 3*jnrF;
188 j3G = 3*jnrG;
189 j3H = 3*jnrH;
191 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
192 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
194 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
195 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
196 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
197 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
199 dx = _mm_sub_ps(ix,jx);
200 dy = _mm_sub_ps(iy,jy);
201 dz = _mm_sub_ps(iz,jz);
202 dxB = _mm_sub_ps(ix,jxB);
203 dyB = _mm_sub_ps(iy,jyB);
204 dzB = _mm_sub_ps(iz,jzB);
206 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
207 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
208 rinv = gmx_mm_invsqrt_ps(rsq);
209 rinvB = gmx_mm_invsqrt_ps(rsqB);
210 rinv2 = _mm_mul_ps(rinv,rinv);
211 rinv2B = _mm_mul_ps(rinvB,rinvB);
212 rinv4 = _mm_mul_ps(rinv2,rinv2);
213 rinv4B = _mm_mul_ps(rinv2B,rinv2B);
214 rinv6 = _mm_mul_ps(rinv4,rinv2);
215 rinv6B = _mm_mul_ps(rinv4B,rinv2B);
217 rvdw = _mm_add_ps(rai,raj);
218 rvdwB = _mm_add_ps(rai,rajB);
219 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
220 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
222 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
223 mask_cmpB = _mm_cmple_ps(ratioB,still_p5inv);
225 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
226 if( 0 == _mm_movemask_ps(mask_cmp) )
228 /* if ratio>still_p5inv for ALL elements */
229 ccf = one;
230 dccf = _mm_setzero_ps();
232 else
234 ratio = _mm_min_ps(ratio,still_p5inv);
235 theta = _mm_mul_ps(ratio,still_pip5);
236 GMX_MM_SINCOS_PS(theta,sinq,cosq);
237 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
238 ccf = _mm_mul_ps(term,term);
239 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
240 _mm_mul_ps(sinq,theta));
242 if( 0 == _mm_movemask_ps(mask_cmpB) )
244 /* if ratio>still_p5inv for ALL elements */
245 ccfB = one;
246 dccfB = _mm_setzero_ps();
248 else
250 ratioB = _mm_min_ps(ratioB,still_p5inv);
251 thetaB = _mm_mul_ps(ratioB,still_pip5);
252 GMX_MM_SINCOS_PS(thetaB,sinqB,cosqB);
253 termB = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
254 ccfB = _mm_mul_ps(termB,termB);
255 dccfB = _mm_mul_ps(_mm_mul_ps(two,termB),
256 _mm_mul_ps(sinqB,thetaB));
259 prod = _mm_mul_ps(still_p4,vaj);
260 prodB = _mm_mul_ps(still_p4,vajB);
261 icf4 = _mm_mul_ps(ccf,rinv4);
262 icf4B = _mm_mul_ps(ccfB,rinv4B);
263 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
264 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
266 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
267 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
269 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
271 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
272 dadx+=4;
273 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
274 dadx+=4;
275 _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
276 dadx+=4;
277 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
278 dadx+=4;
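        /* The remaining j atoms are handled four at a time below, followed by
         * a masked tail of one to three atoms; the arithmetic is identical to
         * the eight-wide loop above.
         */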
281 for(;k<nj1-offset;k+=4)
283 jnrA = jjnr[k];
284 jnrB = jjnr[k+1];
285 jnrC = jjnr[k+2];
286 jnrD = jjnr[k+3];
288 j3A = 3*jnrA;
289 j3B = 3*jnrB;
290 j3C = 3*jnrC;
291 j3D = 3*jnrD;
293 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
295 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
296 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
298 dx = _mm_sub_ps(ix,jx);
299 dy = _mm_sub_ps(iy,jy);
300 dz = _mm_sub_ps(iz,jz);
302 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
303 rinv = gmx_mm_invsqrt_ps(rsq);
304 rinv2 = _mm_mul_ps(rinv,rinv);
305 rinv4 = _mm_mul_ps(rinv2,rinv2);
306 rinv6 = _mm_mul_ps(rinv4,rinv2);
308 rvdw = _mm_add_ps(rai,raj);
309 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
311 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
313 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
314 if(0 == _mm_movemask_ps(mask_cmp))
316 /* if ratio>still_p5inv for ALL elements */
317 ccf = one;
318 dccf = _mm_setzero_ps();
320 else
322 ratio = _mm_min_ps(ratio,still_p5inv);
323 theta = _mm_mul_ps(ratio,still_pip5);
324 GMX_MM_SINCOS_PS(theta,sinq,cosq);
325 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
326 ccf = _mm_mul_ps(term,term);
327 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
328 _mm_mul_ps(sinq,theta));
331 prod = _mm_mul_ps(still_p4,vaj);
332 icf4 = _mm_mul_ps(ccf,rinv4);
333 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
336 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
338 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
340 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
341 dadx+=4;
342 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
343 dadx+=4;
346 if(offset!=0)
348 if(offset==1)
350 jnrA = jjnr[k];
351 j3A = 3*jnrA;
352 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
353 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
354 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
355 mask = mask1;
357 else if(offset==2)
359 jnrA = jjnr[k];
360 jnrB = jjnr[k+1];
361 j3A = 3*jnrA;
362 j3B = 3*jnrB;
363 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
364 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
365 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
366 mask = mask2;
368 else
370 /* offset must be 3 */
371 jnrA = jjnr[k];
372 jnrB = jjnr[k+1];
373 jnrC = jjnr[k+2];
374 j3A = 3*jnrA;
375 j3B = 3*jnrB;
376 j3C = 3*jnrC;
377 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
378 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
379 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
380 mask = mask3;
383 dx = _mm_sub_ps(ix,jx);
384 dy = _mm_sub_ps(iy,jy);
385 dz = _mm_sub_ps(iz,jz);
387 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
388 rinv = gmx_mm_invsqrt_ps(rsq);
389 rinv2 = _mm_mul_ps(rinv,rinv);
390 rinv4 = _mm_mul_ps(rinv2,rinv2);
391 rinv6 = _mm_mul_ps(rinv4,rinv2);
393 rvdw = _mm_add_ps(rai,raj);
394 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
396 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
398 if(0 == _mm_movemask_ps(mask_cmp))
400 /* if ratio>still_p5inv for ALL elements */
401 ccf = one;
402 dccf = _mm_setzero_ps();
404 else
406 ratio = _mm_min_ps(ratio,still_p5inv);
407 theta = _mm_mul_ps(ratio,still_pip5);
408 GMX_MM_SINCOS_PS(theta,sinq,cosq);
409 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
410 ccf = _mm_mul_ps(term,term);
411 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
412 _mm_mul_ps(sinq,theta));
415 prod = _mm_mul_ps(still_p4,vaj);
416 icf4 = _mm_mul_ps(ccf,rinv4);
417 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
419 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
421 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
422 dadx+=4;
423 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
424 dadx+=4;
426 tmp = _mm_mul_ps(prod_ai,icf4);
428 if(offset==1)
430 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
432 else if(offset==2)
434 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
436 else
438 /* offset must be 3 */
439 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
442 gmx_mm_update_1pot_ps(gpi,work+ii);
445 /* Sum up the polarization energy from other nodes */
446 if(PARTDECOMP(cr))
448 gmx_sum(natoms, work, cr);
450 else if(DOMAINDECOMP(cr))
452 dd_atom_sum_real(cr->dd, work);
455 /* Compute the radii */
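    /* The Still Born radius follows directly from the accumulated polarization
     * energy: with gpi_ai = gpol[i] + work[i],
     *   bRad[i] = factor/|gpi_ai| = ONE_4PI_EPS0/(2*|gpi_ai|),
     * and invsqrta[i] caches 1/sqrt(bRad[i]) for later use.
     */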
456 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
458 if(born->use[i] != 0)
460                         gpi_ai = born->gpol[i] + work[i]; /* add the accumulated pair sum to the initial polarization energy */
461 gpi2 = gpi_ai * gpi_ai;
462 born->bRad[i] = factor*gmx_invsqrt(gpi2);
463 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
467 /* Extra (local) communication required for DD */
468 if(DOMAINDECOMP(cr))
470 dd_atom_spread_real(cr->dd, born->bRad);
471 dd_atom_spread_real(cr->dd, fr->invsqrta);
474 return 0;
478 int
479 calc_gb_rad_hct_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
480 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
482 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
483 int jnrA,jnrB,jnrC,jnrD;
484 int j3A,j3B,j3C,j3D;
485 int jnrE,jnrF,jnrG,jnrH;
486 int j3E,j3F,j3G,j3H;
487 float shX,shY,shZ;
488 float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
489 float sum_ai2, sum_ai3,tsum,tchain,doffset;
490 float *obc_param;
491 float *gb_radius;
492 float *work;
493 int * jjnr;
494 float *dadx;
495 float *shiftvec;
496 float min_rad,rad;
498 __m128 ix,iy,iz,jx,jy,jz;
499 __m128 dx,dy,dz,t1,t2,t3,t4;
500 __m128 rsq,rinv,r;
501 __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
502 __m128 uij,lij2,uij2,lij3,uij3,diff2;
503 __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
504 __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
505 __m128 dadx1,dadx2;
506 __m128 logterm;
507 __m128 mask;
508 __m128 obc_mask1,obc_mask2,obc_mask3;
509 __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
510 __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
511 __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
512 __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
513 __m128 lij_invB,sk2_invB,prodB;
514 __m128 sk_ajB,sk2_ajB,sk2_rinvB;
515 __m128 dadx1B,dadx2B;
516 __m128 logtermB;
517 __m128 obc_mask1B,obc_mask2B,obc_mask3B;
519 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
520 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
521 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
523 __m128 oneeighth = _mm_set1_ps(0.125);
524 __m128 onefourth = _mm_set1_ps(0.25);
526 const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
527 const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
528 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
529 const __m128 one = {1.0f , 1.0f , 1.0f , 1.0f };
530 const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
531 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
533 /* Set the dielectric offset */
534 doffset = born->gb_doffset;
535 gb_radius = born->gb_radius;
536 obc_param = born->param;
537 work = born->gpol_hct_work;
538 jjnr = nl->jjnr;
539 dadx = fr->dadx;
540 shiftvec = fr->shift_vec[0];
542 jx = _mm_setzero_ps();
543 jy = _mm_setzero_ps();
544 jz = _mm_setzero_ps();
546 jnrA = jnrB = jnrC = jnrD = 0;
548 for(i=0;i<born->nr;i++)
550 work[i] = 0;
553 for(i=0;i<nl->nri;i++)
555 ii = nl->iinr[i];
556 ii3 = ii*3;
557 is3 = 3*nl->shift[i];
558 shX = shiftvec[is3];
559 shY = shiftvec[is3+1];
560 shZ = shiftvec[is3+2];
561 nj0 = nl->jindex[i];
562 nj1 = nl->jindex[i+1];
564 ix = _mm_set1_ps(shX+x[ii3+0]);
565 iy = _mm_set1_ps(shY+x[ii3+1]);
566 iz = _mm_set1_ps(shZ+x[ii3+2]);
568 offset = (nj1-nj0)%4;
570 rai = _mm_load1_ps(gb_radius+ii);
571 rai_inv= gmx_mm_inv_ps(rai);
573 sum_ai = _mm_setzero_ps();
575 sk_ai = _mm_load1_ps(born->param+ii);
576 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
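        /* Inner loop for the HCT/OBC pairwise descreening integral, eight j
         * atoms at a time. Both directions are evaluated for every pair:
         * atom j descreening atom i is accumulated into sum_ai, and atom i
         * descreening atom j into work[j]. For the j->i direction the
         * integration limits are uij = 1/(r+sk_aj) and
         * lij = 1/max(rai, r-sk_aj), selected through the obc_mask
         * comparisons (i and j are swapped for the reverse direction), and
         * the derivative of each contribution is written to dadx in the
         * order the chain-rule kernel reads it back.
         */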
578 for(k=nj0;k<nj1-4-offset;k+=8)
580 jnrA = jjnr[k];
581 jnrB = jjnr[k+1];
582 jnrC = jjnr[k+2];
583 jnrD = jjnr[k+3];
584 jnrE = jjnr[k+4];
585 jnrF = jjnr[k+5];
586 jnrG = jjnr[k+6];
587 jnrH = jjnr[k+7];
589 j3A = 3*jnrA;
590 j3B = 3*jnrB;
591 j3C = 3*jnrC;
592 j3D = 3*jnrD;
593 j3E = 3*jnrE;
594 j3F = 3*jnrF;
595 j3G = 3*jnrG;
596 j3H = 3*jnrH;
598 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
599 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
600 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
601 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
602 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
603 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
605 dx = _mm_sub_ps(ix, jx);
606 dy = _mm_sub_ps(iy, jy);
607 dz = _mm_sub_ps(iz, jz);
608 dxB = _mm_sub_ps(ix, jxB);
609 dyB = _mm_sub_ps(iy, jyB);
610 dzB = _mm_sub_ps(iz, jzB);
612 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
613 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
615 rinv = gmx_mm_invsqrt_ps(rsq);
616 r = _mm_mul_ps(rsq,rinv);
617 rinvB = gmx_mm_invsqrt_ps(rsqB);
618 rB = _mm_mul_ps(rsqB,rinvB);
620 /* Compute raj_inv aj1-4 */
621 raj_inv = gmx_mm_inv_ps(raj);
622 raj_invB = gmx_mm_inv_ps(rajB);
624 /* Evaluate influence of atom aj -> ai */
625 t1 = _mm_add_ps(r,sk_aj);
626 t2 = _mm_sub_ps(r,sk_aj);
627 t3 = _mm_sub_ps(sk_aj,r);
628 t1B = _mm_add_ps(rB,sk_ajB);
629 t2B = _mm_sub_ps(rB,sk_ajB);
630 t3B = _mm_sub_ps(sk_ajB,rB);
631 obc_mask1 = _mm_cmplt_ps(rai, t1);
632 obc_mask2 = _mm_cmplt_ps(rai, t2);
633 obc_mask3 = _mm_cmplt_ps(rai, t3);
634 obc_mask1B = _mm_cmplt_ps(rai, t1B);
635 obc_mask2B = _mm_cmplt_ps(rai, t2B);
636 obc_mask3B = _mm_cmplt_ps(rai, t3B);
638 uij = gmx_mm_inv_ps(t1);
639 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
640 _mm_andnot_ps(obc_mask2,rai_inv));
641 dlij = _mm_and_ps(one,obc_mask2);
642 uij2 = _mm_mul_ps(uij, uij);
643 uij3 = _mm_mul_ps(uij2,uij);
644 lij2 = _mm_mul_ps(lij, lij);
645 lij3 = _mm_mul_ps(lij2,lij);
647 uijB = gmx_mm_inv_ps(t1B);
648 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
649 _mm_andnot_ps(obc_mask2B,rai_inv));
650 dlijB = _mm_and_ps(one,obc_mask2B);
651 uij2B = _mm_mul_ps(uijB, uijB);
652 uij3B = _mm_mul_ps(uij2B,uijB);
653 lij2B = _mm_mul_ps(lijB, lijB);
654 lij3B = _mm_mul_ps(lij2B,lijB);
656 diff2 = _mm_sub_ps(uij2,lij2);
657 lij_inv = gmx_mm_invsqrt_ps(lij2);
658 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
659 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
660 prod = _mm_mul_ps(onefourth,sk2_rinv);
662 diff2B = _mm_sub_ps(uij2B,lij2B);
663 lij_invB = gmx_mm_invsqrt_ps(lij2B);
664 sk2_ajB = _mm_mul_ps(sk_ajB,sk_ajB);
665 sk2_rinvB = _mm_mul_ps(sk2_ajB,rinvB);
666 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
668 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
669 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
671 t1 = _mm_sub_ps(lij,uij);
672 t2 = _mm_mul_ps(diff2,
673 _mm_sub_ps(_mm_mul_ps(onefourth,r),
674 prod));
675 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
676 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
677 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
678 t4 = _mm_and_ps(t4,obc_mask3);
679 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
681 t1B = _mm_sub_ps(lijB,uijB);
682 t2B = _mm_mul_ps(diff2B,
683 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
684 prodB));
685 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
686 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
687 t4B = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
688 t4B = _mm_and_ps(t4B,obc_mask3B);
689 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
691 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
693 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
694 _mm_mul_ps(prod,lij3));
695 t1 = _mm_sub_ps(t1,
696 _mm_mul_ps(onefourth,
697 _mm_add_ps(_mm_mul_ps(lij,rinv),
698 _mm_mul_ps(lij3,r))));
699 t2 = _mm_mul_ps(onefourth,
700 _mm_add_ps(_mm_mul_ps(uij,rinv),
701 _mm_mul_ps(uij3,r)));
702 t2 = _mm_sub_ps(t2,
703 _mm_add_ps(_mm_mul_ps(half,uij2),
704 _mm_mul_ps(prod,uij3)));
705 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
706 _mm_mul_ps(rinv,rinv));
707 t3 = _mm_sub_ps(t3,
708 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
709 _mm_add_ps(one,
710 _mm_mul_ps(sk2_rinv,rinv))));
711 t1 = _mm_mul_ps(rinv,
712 _mm_add_ps(_mm_mul_ps(dlij,t1),
713 _mm_add_ps(t2,t3)));
717 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
718 _mm_mul_ps(prodB,lij3B));
719 t1B = _mm_sub_ps(t1B,
720 _mm_mul_ps(onefourth,
721 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
722 _mm_mul_ps(lij3B,rB))));
723 t2B = _mm_mul_ps(onefourth,
724 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
725 _mm_mul_ps(uij3B,rB)));
726 t2B = _mm_sub_ps(t2B,
727 _mm_add_ps(_mm_mul_ps(half,uij2B),
728 _mm_mul_ps(prodB,uij3B)));
729 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
730 _mm_mul_ps(rinvB,rinvB));
731 t3B = _mm_sub_ps(t3B,
732 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
733 _mm_add_ps(one,
734 _mm_mul_ps(sk2_rinvB,rinvB))));
735 t1B = _mm_mul_ps(rinvB,
736 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
737 _mm_add_ps(t2B,t3B)));
739 dadx1 = _mm_and_ps(t1,obc_mask1);
740 dadx1B = _mm_and_ps(t1B,obc_mask1B);
743 /* Evaluate influence of atom ai -> aj */
744 t1 = _mm_add_ps(r,sk_ai);
745 t2 = _mm_sub_ps(r,sk_ai);
746 t3 = _mm_sub_ps(sk_ai,r);
747 t1B = _mm_add_ps(rB,sk_ai);
748 t2B = _mm_sub_ps(rB,sk_ai);
749 t3B = _mm_sub_ps(sk_ai,rB);
750 obc_mask1 = _mm_cmplt_ps(raj, t1);
751 obc_mask2 = _mm_cmplt_ps(raj, t2);
752 obc_mask3 = _mm_cmplt_ps(raj, t3);
753 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
754 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
755 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
757 uij = gmx_mm_inv_ps(t1);
758 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
759 _mm_andnot_ps(obc_mask2,raj_inv));
760 dlij = _mm_and_ps(one,obc_mask2);
761 uij2 = _mm_mul_ps(uij, uij);
762 uij3 = _mm_mul_ps(uij2,uij);
763 lij2 = _mm_mul_ps(lij, lij);
764 lij3 = _mm_mul_ps(lij2,lij);
766 uijB = gmx_mm_inv_ps(t1B);
767 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
768 _mm_andnot_ps(obc_mask2B,raj_invB));
769 dlijB = _mm_and_ps(one,obc_mask2B);
770 uij2B = _mm_mul_ps(uijB, uijB);
771 uij3B = _mm_mul_ps(uij2B,uijB);
772 lij2B = _mm_mul_ps(lijB, lijB);
773 lij3B = _mm_mul_ps(lij2B,lijB);
775 diff2 = _mm_sub_ps(uij2,lij2);
776 lij_inv = gmx_mm_invsqrt_ps(lij2);
777 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
778 prod = _mm_mul_ps(onefourth,sk2_rinv);
780 diff2B = _mm_sub_ps(uij2B,lij2B);
781 lij_invB = gmx_mm_invsqrt_ps(lij2B);
782 sk2_rinvB = _mm_mul_ps(sk2_ai,rinvB);
783 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
785 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
786 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
788 t1 = _mm_sub_ps(lij,uij);
789 t2 = _mm_mul_ps(diff2,
790 _mm_sub_ps(_mm_mul_ps(onefourth,r),
791 prod));
792 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
793 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
794 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
795 t4 = _mm_and_ps(t4,obc_mask3);
796 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
798 t1B = _mm_sub_ps(lijB,uijB);
799 t2B = _mm_mul_ps(diff2B,
800 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
801 prodB));
802 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
803 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
804 t4B = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
805 t4B = _mm_and_ps(t4B,obc_mask3B);
806 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
808 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
809 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
811 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
812 _mm_mul_ps(prod,lij3));
813 t1 = _mm_sub_ps(t1,
814 _mm_mul_ps(onefourth,
815 _mm_add_ps(_mm_mul_ps(lij,rinv),
816 _mm_mul_ps(lij3,r))));
817 t2 = _mm_mul_ps(onefourth,
818 _mm_add_ps(_mm_mul_ps(uij,rinv),
819 _mm_mul_ps(uij3,r)));
820 t2 = _mm_sub_ps(t2,
821 _mm_add_ps(_mm_mul_ps(half,uij2),
822 _mm_mul_ps(prod,uij3)));
823 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
824 _mm_mul_ps(rinv,rinv));
825 t3 = _mm_sub_ps(t3,
826 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
827 _mm_add_ps(one,
828 _mm_mul_ps(sk2_rinv,rinv))));
829 t1 = _mm_mul_ps(rinv,
830 _mm_add_ps(_mm_mul_ps(dlij,t1),
831 _mm_add_ps(t2,t3)));
834 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
835 _mm_mul_ps(prodB,lij3B));
836 t1B = _mm_sub_ps(t1B,
837 _mm_mul_ps(onefourth,
838 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
839 _mm_mul_ps(lij3B,rB))));
840 t2B = _mm_mul_ps(onefourth,
841 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
842 _mm_mul_ps(uij3B,rB)));
843 t2B = _mm_sub_ps(t2B,
844 _mm_add_ps(_mm_mul_ps(half,uij2B),
845 _mm_mul_ps(prodB,uij3B)));
846 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
847 _mm_mul_ps(rinvB,rinvB));
848 t3B = _mm_sub_ps(t3B,
849 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
850 _mm_add_ps(one,
851 _mm_mul_ps(sk2_rinvB,rinvB))));
852 t1B = _mm_mul_ps(rinvB,
853 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
854 _mm_add_ps(t2B,t3B)));
857 dadx2 = _mm_and_ps(t1,obc_mask1);
858 dadx2B = _mm_and_ps(t1B,obc_mask1B);
860 _mm_store_ps(dadx,dadx1);
861 dadx += 4;
862 _mm_store_ps(dadx,dadx2);
863 dadx += 4;
864 _mm_store_ps(dadx,dadx1B);
865 dadx += 4;
866 _mm_store_ps(dadx,dadx2B);
867 dadx += 4;
869 } /* end normal inner loop */
871 for(;k<nj1-offset;k+=4)
873 jnrA = jjnr[k];
874 jnrB = jjnr[k+1];
875 jnrC = jjnr[k+2];
876 jnrD = jjnr[k+3];
878 j3A = 3*jnrA;
879 j3B = 3*jnrB;
880 j3C = 3*jnrC;
881 j3D = 3*jnrD;
883 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
884 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
885 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
887 dx = _mm_sub_ps(ix, jx);
888 dy = _mm_sub_ps(iy, jy);
889 dz = _mm_sub_ps(iz, jz);
891 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
893 rinv = gmx_mm_invsqrt_ps(rsq);
894 r = _mm_mul_ps(rsq,rinv);
896 /* Compute raj_inv aj1-4 */
897 raj_inv = gmx_mm_inv_ps(raj);
899 /* Evaluate influence of atom aj -> ai */
900 t1 = _mm_add_ps(r,sk_aj);
901 obc_mask1 = _mm_cmplt_ps(rai, t1);
903 if(_mm_movemask_ps(obc_mask1))
905 /* If any of the elements has rai<dr+sk, this is executed */
906 t2 = _mm_sub_ps(r,sk_aj);
907 t3 = _mm_sub_ps(sk_aj,r);
909 obc_mask2 = _mm_cmplt_ps(rai, t2);
910 obc_mask3 = _mm_cmplt_ps(rai, t3);
912 uij = gmx_mm_inv_ps(t1);
913 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
914 _mm_andnot_ps(obc_mask2,rai_inv));
915 dlij = _mm_and_ps(one,obc_mask2);
916 uij2 = _mm_mul_ps(uij, uij);
917 uij3 = _mm_mul_ps(uij2,uij);
918 lij2 = _mm_mul_ps(lij, lij);
919 lij3 = _mm_mul_ps(lij2,lij);
920 diff2 = _mm_sub_ps(uij2,lij2);
921 lij_inv = gmx_mm_invsqrt_ps(lij2);
922 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
923 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
924 prod = _mm_mul_ps(onefourth,sk2_rinv);
925 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
926 t1 = _mm_sub_ps(lij,uij);
927 t2 = _mm_mul_ps(diff2,
928 _mm_sub_ps(_mm_mul_ps(onefourth,r),
929 prod));
930 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
931 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
932 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
933 t4 = _mm_and_ps(t4,obc_mask3);
934 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
935 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
936 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
937 _mm_mul_ps(prod,lij3));
938 t1 = _mm_sub_ps(t1,
939 _mm_mul_ps(onefourth,
940 _mm_add_ps(_mm_mul_ps(lij,rinv),
941 _mm_mul_ps(lij3,r))));
942 t2 = _mm_mul_ps(onefourth,
943 _mm_add_ps(_mm_mul_ps(uij,rinv),
944 _mm_mul_ps(uij3,r)));
945 t2 = _mm_sub_ps(t2,
946 _mm_add_ps(_mm_mul_ps(half,uij2),
947 _mm_mul_ps(prod,uij3)));
948 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
949 _mm_mul_ps(rinv,rinv));
950 t3 = _mm_sub_ps(t3,
951 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
952 _mm_add_ps(one,
953 _mm_mul_ps(sk2_rinv,rinv))));
954 t1 = _mm_mul_ps(rinv,
955 _mm_add_ps(_mm_mul_ps(dlij,t1),
956 _mm_add_ps(t2,t3)));
958 dadx1 = _mm_and_ps(t1,obc_mask1);
960 else
962 dadx1 = _mm_setzero_ps();
965 /* Evaluate influence of atom ai -> aj */
966 t1 = _mm_add_ps(r,sk_ai);
967 obc_mask1 = _mm_cmplt_ps(raj, t1);
969 if(_mm_movemask_ps(obc_mask1))
971 t2 = _mm_sub_ps(r,sk_ai);
972 t3 = _mm_sub_ps(sk_ai,r);
973 obc_mask2 = _mm_cmplt_ps(raj, t2);
974 obc_mask3 = _mm_cmplt_ps(raj, t3);
976 uij = gmx_mm_inv_ps(t1);
977 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
978 _mm_andnot_ps(obc_mask2,raj_inv));
979 dlij = _mm_and_ps(one,obc_mask2);
980 uij2 = _mm_mul_ps(uij, uij);
981 uij3 = _mm_mul_ps(uij2,uij);
982 lij2 = _mm_mul_ps(lij, lij);
983 lij3 = _mm_mul_ps(lij2,lij);
984 diff2 = _mm_sub_ps(uij2,lij2);
985 lij_inv = gmx_mm_invsqrt_ps(lij2);
986 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
987 prod = _mm_mul_ps(onefourth,sk2_rinv);
988 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
989 t1 = _mm_sub_ps(lij,uij);
990 t2 = _mm_mul_ps(diff2,
991 _mm_sub_ps(_mm_mul_ps(onefourth,r),
992 prod));
993 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
994 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
995 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
996 t4 = _mm_and_ps(t4,obc_mask3);
997 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
999 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
1001 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1002 _mm_mul_ps(prod,lij3));
1003 t1 = _mm_sub_ps(t1,
1004 _mm_mul_ps(onefourth,
1005 _mm_add_ps(_mm_mul_ps(lij,rinv),
1006 _mm_mul_ps(lij3,r))));
1007 t2 = _mm_mul_ps(onefourth,
1008 _mm_add_ps(_mm_mul_ps(uij,rinv),
1009 _mm_mul_ps(uij3,r)));
1010 t2 = _mm_sub_ps(t2,
1011 _mm_add_ps(_mm_mul_ps(half,uij2),
1012 _mm_mul_ps(prod,uij3)));
1013 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1014 _mm_mul_ps(rinv,rinv));
1015 t3 = _mm_sub_ps(t3,
1016 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1017 _mm_add_ps(one,
1018 _mm_mul_ps(sk2_rinv,rinv))));
1019 t1 = _mm_mul_ps(rinv,
1020 _mm_add_ps(_mm_mul_ps(dlij,t1),
1021 _mm_add_ps(t2,t3)));
1022 dadx2 = _mm_and_ps(t1,obc_mask1);
1024 else
1026 dadx2 = _mm_setzero_ps();
1029 _mm_store_ps(dadx,dadx1);
1030 dadx += 4;
1031 _mm_store_ps(dadx,dadx2);
1032 dadx += 4;
1033 } /* end normal inner loop */
1035 if(offset!=0)
1037 if(offset==1)
1039 jnrA = jjnr[k];
1040 j3A = 3*jnrA;
1041 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1042 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1043 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1044 mask = mask1;
1046 else if(offset==2)
1048 jnrA = jjnr[k];
1049 jnrB = jjnr[k+1];
1050 j3A = 3*jnrA;
1051 j3B = 3*jnrB;
1052 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1053 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1054 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1055 mask = mask2;
1057 else
1059 /* offset must be 3 */
1060 jnrA = jjnr[k];
1061 jnrB = jjnr[k+1];
1062 jnrC = jjnr[k+2];
1063 j3A = 3*jnrA;
1064 j3B = 3*jnrB;
1065 j3C = 3*jnrC;
1066 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1067 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1068 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1069 mask = mask3;
1072 dx = _mm_sub_ps(ix, jx);
1073 dy = _mm_sub_ps(iy, jy);
1074 dz = _mm_sub_ps(iz, jz);
1076 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
1078 rinv = gmx_mm_invsqrt_ps(rsq);
1079 r = _mm_mul_ps(rsq,rinv);
1081 /* Compute raj_inv aj1-4 */
1082 raj_inv = gmx_mm_inv_ps(raj);
1084 /* Evaluate influence of atom aj -> ai */
1085 t1 = _mm_add_ps(r,sk_aj);
1086 obc_mask1 = _mm_cmplt_ps(rai, t1);
1087 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1089 if(_mm_movemask_ps(obc_mask1))
1091 t2 = _mm_sub_ps(r,sk_aj);
1092 t3 = _mm_sub_ps(sk_aj,r);
1093 obc_mask2 = _mm_cmplt_ps(rai, t2);
1094 obc_mask3 = _mm_cmplt_ps(rai, t3);
1096 uij = gmx_mm_inv_ps(t1);
1097 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1098 _mm_andnot_ps(obc_mask2,rai_inv));
1099 dlij = _mm_and_ps(one,obc_mask2);
1100 uij2 = _mm_mul_ps(uij, uij);
1101 uij3 = _mm_mul_ps(uij2,uij);
1102 lij2 = _mm_mul_ps(lij, lij);
1103 lij3 = _mm_mul_ps(lij2,lij);
1104 diff2 = _mm_sub_ps(uij2,lij2);
1105 lij_inv = gmx_mm_invsqrt_ps(lij2);
1106 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
1107 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
1108 prod = _mm_mul_ps(onefourth,sk2_rinv);
1109 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1110 t1 = _mm_sub_ps(lij,uij);
1111 t2 = _mm_mul_ps(diff2,
1112 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1113 prod));
1114 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1115 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1116 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1117 t4 = _mm_and_ps(t4,obc_mask3);
1118 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1119 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1120 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1121 _mm_mul_ps(prod,lij3));
1122 t1 = _mm_sub_ps(t1,
1123 _mm_mul_ps(onefourth,
1124 _mm_add_ps(_mm_mul_ps(lij,rinv),
1125 _mm_mul_ps(lij3,r))));
1126 t2 = _mm_mul_ps(onefourth,
1127 _mm_add_ps(_mm_mul_ps(uij,rinv),
1128 _mm_mul_ps(uij3,r)));
1129 t2 = _mm_sub_ps(t2,
1130 _mm_add_ps(_mm_mul_ps(half,uij2),
1131 _mm_mul_ps(prod,uij3)));
1132 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1133 _mm_mul_ps(rinv,rinv));
1134 t3 = _mm_sub_ps(t3,
1135 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1136 _mm_add_ps(one,
1137 _mm_mul_ps(sk2_rinv,rinv))));
1138 t1 = _mm_mul_ps(rinv,
1139 _mm_add_ps(_mm_mul_ps(dlij,t1),
1140 _mm_add_ps(t2,t3)));
1141 dadx1 = _mm_and_ps(t1,obc_mask1);
1143 else
1145 dadx1 = _mm_setzero_ps();
1148 /* Evaluate influence of atom ai -> aj */
1149 t1 = _mm_add_ps(r,sk_ai);
1150 obc_mask1 = _mm_cmplt_ps(raj, t1);
1151 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1153 if(_mm_movemask_ps(obc_mask1))
1155 t2 = _mm_sub_ps(r,sk_ai);
1156 t3 = _mm_sub_ps(sk_ai,r);
1157 obc_mask2 = _mm_cmplt_ps(raj, t2);
1158 obc_mask3 = _mm_cmplt_ps(raj, t3);
1160 uij = gmx_mm_inv_ps(t1);
1161 lij = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1162 _mm_andnot_ps(obc_mask2,raj_inv));
1163 dlij = _mm_and_ps(one,obc_mask2);
1164 uij2 = _mm_mul_ps(uij, uij);
1165 uij3 = _mm_mul_ps(uij2,uij);
1166 lij2 = _mm_mul_ps(lij, lij);
1167 lij3 = _mm_mul_ps(lij2,lij);
1168 diff2 = _mm_sub_ps(uij2,lij2);
1169 lij_inv = gmx_mm_invsqrt_ps(lij2);
1170 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
1171 prod = _mm_mul_ps(onefourth,sk2_rinv);
1172 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1173 t1 = _mm_sub_ps(lij,uij);
1174 t2 = _mm_mul_ps(diff2,
1175 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1176 prod));
1177 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1178 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1179 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1180 t4 = _mm_and_ps(t4,obc_mask3);
1181 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1183 tmp = _mm_and_ps(t1,obc_mask1);
1185 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1186 _mm_mul_ps(prod,lij3));
1187 t1 = _mm_sub_ps(t1,
1188 _mm_mul_ps(onefourth,
1189 _mm_add_ps(_mm_mul_ps(lij,rinv),
1190 _mm_mul_ps(lij3,r))));
1191 t2 = _mm_mul_ps(onefourth,
1192 _mm_add_ps(_mm_mul_ps(uij,rinv),
1193 _mm_mul_ps(uij3,r)));
1194 t2 = _mm_sub_ps(t2,
1195 _mm_add_ps(_mm_mul_ps(half,uij2),
1196 _mm_mul_ps(prod,uij3)));
1197 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1198 _mm_mul_ps(rinv,rinv));
1199 t3 = _mm_sub_ps(t3,
1200 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1201 _mm_add_ps(one,
1202 _mm_mul_ps(sk2_rinv,rinv))));
1203 t1 = _mm_mul_ps(rinv,
1204 _mm_add_ps(_mm_mul_ps(dlij,t1),
1205 _mm_add_ps(t2,t3)));
1206 dadx2 = _mm_and_ps(t1,obc_mask1);
1208 else
1210 dadx2 = _mm_setzero_ps();
1211 tmp = _mm_setzero_ps();
1214 _mm_store_ps(dadx,dadx1);
1215 dadx += 4;
1216 _mm_store_ps(dadx,dadx2);
1217 dadx += 4;
1219 if(offset==1)
1221 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1223 else if(offset==2)
1225 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1227 else
1229 /* offset must be 3 */
1230 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1234 gmx_mm_update_1pot_ps(sum_ai,work+ii);
1238 /* Parallel summations */
1239 if(PARTDECOMP(cr))
1241 gmx_sum(natoms, work, cr);
1243 else if(DOMAINDECOMP(cr))
1245 dd_atom_sum_real(cr->dd, work);
1248 if(gb_algorithm==egbHCT)
1250 /* HCT */
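        /* HCT: 1/bRad[i] = 1/(rho_i - doffset) - work[i], where rho_i is the
         * atom-type GB radius, doffset the dielectric offset and work[i] the
         * accumulated descreening sum; the radius is clamped from below to
         * the unreduced radius rho_i.
         */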
1251 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1253 if(born->use[i] != 0)
1255 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1256 sum = 1.0/rr - work[i];
1257 min_rad = rr + doffset;
1258 rad = 1.0/sum;
1260 born->bRad[i] = rad > min_rad ? rad : min_rad;
1261 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1265 /* Extra communication required for DD */
1266 if(DOMAINDECOMP(cr))
1268 dd_atom_spread_real(cr->dd, born->bRad);
1269 dd_atom_spread_real(cr->dd, fr->invsqrta);
1272 else
1274 /* OBC */
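        /* OBC: with Psi = (rho_i - doffset)*work[i],
         *   1/bRad[i] = 1/(rho_i - doffset)
         *             - tanh(obc_alpha*Psi - obc_beta*Psi^2 + obc_gamma*Psi^3)/rho_i
         * and drobc[i] stores the corresponding chain-rule factor
         *   (1 - tanh^2)*(obc_alpha - 2*obc_beta*Psi + 3*obc_gamma*Psi^2)
         *   *(rho_i - doffset)/rho_i.
         */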
1275 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1277 if(born->use[i] != 0)
1279 rr = top->atomtypes.gb_radius[md->typeA[i]];
1280 rr_inv2 = 1.0/rr;
1281 rr = rr-doffset;
1282 rr_inv = 1.0/rr;
1283 sum = rr * work[i];
1284 sum2 = sum * sum;
1285 sum3 = sum2 * sum;
1287 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1288 born->bRad[i] = rr_inv - tsum*rr_inv2;
1289 born->bRad[i] = 1.0 / born->bRad[i];
1291 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1293 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1294 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1297 /* Extra (local) communication required for DD */
1298 if(DOMAINDECOMP(cr))
1300 dd_atom_spread_real(cr->dd, born->bRad);
1301 dd_atom_spread_real(cr->dd, fr->invsqrta);
1302 dd_atom_spread_real(cr->dd, born->drobc);
1308 return 0;
1313 float calc_gb_chainrule_sse(int natoms, t_nblist *nl, float *dadx, float *dvda,
1314 float *x, float *f, float *fshift, float *shiftvec,
1315 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1317 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1318 int jnrA,jnrB,jnrC,jnrD;
1319 int j3A,j3B,j3C,j3D;
1320 int jnrE,jnrF,jnrG,jnrH;
1321 int j3E,j3F,j3G,j3H;
1322 int * jjnr;
1324 float rbi,shX,shY,shZ;
1325 float *rb;
1327 __m128 ix,iy,iz;
1328 __m128 jx,jy,jz;
1329 __m128 jxB,jyB,jzB;
1330 __m128 fix,fiy,fiz;
1331 __m128 dx,dy,dz;
1332 __m128 tx,ty,tz;
1333 __m128 dxB,dyB,dzB;
1334 __m128 txB,tyB,tzB;
1336 __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1337 __m128 xmm1,xmm2,xmm3;
1339 const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
1341 rb = born->work;
1343 jjnr = nl->jjnr;
1345 /* Loop to get the proper form for the Born radius term, sse style */
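    /* rb[k] combines the Born radius with dvda and the algorithm-specific
     * radius derivative:
     *   Still: rb = 2*b^2*dvda/ONE_4PI_EPS0
     *   HCT:   rb = b^2*dvda
     *   OBC:   rb = b^2*drobc*dvda
     * The index k = i % natoms wraps back into [0,natoms) over the extended
     * loop range n0..n1.
     */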
1346 offset=natoms%4;
1348 n0 = md->start;
1349 n1 = md->start+md->homenr+1+natoms/2;
1351 if(gb_algorithm==egbSTILL)
1353 for(i=n0;i<n1;i++)
1355 k = i % natoms;
1356 rbi = born->bRad[k];
1357 rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
1360 else if(gb_algorithm==egbHCT)
1362 for(i=n0;i<n1;i++)
1364 k = i % natoms;
1365 rbi = born->bRad[k];
1366 rb[k] = rbi * rbi * dvda[k];
1369 else if(gb_algorithm==egbOBC)
1371 for(i=n0;i<n1;i++)
1373 k = i % natoms;
1374 rbi = born->bRad[k];
1375 rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
1379 jz = _mm_setzero_ps();
1381 n = j3A = j3B = j3C = j3D = 0;
1383 for(i=0;i<nl->nri;i++)
1385 ii = nl->iinr[i];
1386 ii3 = ii*3;
1387 is3 = 3*nl->shift[i];
1388 shX = shiftvec[is3];
1389 shY = shiftvec[is3+1];
1390 shZ = shiftvec[is3+2];
1391 nj0 = nl->jindex[i];
1392 nj1 = nl->jindex[i+1];
1394 ix = _mm_set1_ps(shX+x[ii3+0]);
1395 iy = _mm_set1_ps(shY+x[ii3+1]);
1396 iz = _mm_set1_ps(shZ+x[ii3+2]);
1398 offset = (nj1-nj0)%4;
1400 rbai = _mm_load1_ps(rb+ii);
1401 fix = _mm_setzero_ps();
1402 fiy = _mm_setzero_ps();
1403 fiz = _mm_setzero_ps();
1406 for(k=nj0;k<nj1-offset;k+=4)
1408 jnrA = jjnr[k];
1409 jnrB = jjnr[k+1];
1410 jnrC = jjnr[k+2];
1411 jnrD = jjnr[k+3];
1413 j3A = 3*jnrA;
1414 j3B = 3*jnrB;
1415 j3C = 3*jnrC;
1416 j3D = 3*jnrD;
1418 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1420 dx = _mm_sub_ps(ix,jx);
1421 dy = _mm_sub_ps(iy,jy);
1422 dz = _mm_sub_ps(iz,jz);
1424 GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1426 /* load chain rule terms for j1-4 */
1427 f_gb = _mm_load_ps(dadx);
1428 dadx += 4;
1429 f_gb_ai = _mm_load_ps(dadx);
1430 dadx += 4;
1432 /* calculate scalar force */
1433 f_gb = _mm_mul_ps(f_gb,rbai);
1434 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1435 f_gb = _mm_add_ps(f_gb,f_gb_ai);
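            /* f_gb now combines the two stored dA/dx terms with the rb factors
             * of atoms i and j; projected onto dx/dy/dz it gives the pair
             * force, which is accumulated on atom i and subtracted from the
             * j atoms below.
             */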
1437 tx = _mm_mul_ps(f_gb,dx);
1438 ty = _mm_mul_ps(f_gb,dy);
1439 tz = _mm_mul_ps(f_gb,dz);
1441 fix = _mm_add_ps(fix,tx);
1442 fiy = _mm_add_ps(fiy,ty);
1443 fiz = _mm_add_ps(fiz,tz);
1445 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1448         /* deal with the remaining (odd) elements */
1449 if(offset!=0)
1451 if(offset==1)
1453 jnrA = jjnr[k];
1454 j3A = 3*jnrA;
1455 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1456 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1458 else if(offset==2)
1460 jnrA = jjnr[k];
1461 jnrB = jjnr[k+1];
1462 j3A = 3*jnrA;
1463 j3B = 3*jnrB;
1464 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1465 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1467 else
1469 /* offset must be 3 */
1470 jnrA = jjnr[k];
1471 jnrB = jjnr[k+1];
1472 jnrC = jjnr[k+2];
1473 j3A = 3*jnrA;
1474 j3B = 3*jnrB;
1475 j3C = 3*jnrC;
1476 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1477 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1480 dx = _mm_sub_ps(ix,jx);
1481 dy = _mm_sub_ps(iy,jy);
1482 dz = _mm_sub_ps(iz,jz);
1484 /* load chain rule terms for j1-4 */
1485 f_gb = _mm_load_ps(dadx);
1486 dadx += 4;
1487 f_gb_ai = _mm_load_ps(dadx);
1488 dadx += 4;
1490 /* calculate scalar force */
1491 f_gb = _mm_mul_ps(f_gb,rbai);
1492 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1493 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1495 tx = _mm_mul_ps(f_gb,dx);
1496 ty = _mm_mul_ps(f_gb,dy);
1497 tz = _mm_mul_ps(f_gb,dz);
1499 fix = _mm_add_ps(fix,tx);
1500 fiy = _mm_add_ps(fiy,ty);
1501 fiz = _mm_add_ps(fiz,tz);
1503 if(offset==1)
1505 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1507 else if(offset==2)
1509 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1511 else
1513 /* offset must be 3 */
1514 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1518         /* fix/fiy/fiz now contain four partial force terms that should all be
1519          * added to the i particle forces and shift forces.
1520          */
1521 gmx_mm_update_iforce_1atom_ps(fix,fiy,fiz,f+ii3,fshift+is3);
1524 return 0;
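/* Bonded (1-2, 1-3 and 1-4) GB pair interactions, evaluated analytically
 * four pairs at a time with a masked block for the remaining pairs. For
 * each pair the GB energy is accumulated into vgb_tot, the derivatives
 * with respect to the two Born radii are added to dvda[], and the distance
 * derivative is applied directly to the forces in f[].
 */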
1528 float gb_bonds_analytic(real *x, real *f, real *charge, real *bRad, real *dvda,
1529 t_idef *idef, real epsilon_r, real gb_epsilon_solvent, real facel)
1532 int i,j,nral,nri;
1533 int type1,type2,type3,type4,ai1,ai2,ai3,ai4,aj1,aj2,aj3,aj4;
1534 int ai13,ai23,ai33,ai43,aj13,aj23,aj33,aj43;
1535 int offset;
1537 float vctot;
1539 __m128 ix,iy,iz,jx,jy,jz,dx,dy,dz;
1540 __m128 rsq11,rinv,r,t1,t2,t3;
1541 __m128 isai,isaj,isaprod,inv_isaprod,expterm;
1542 __m128 ci,cj,qq,fac,dva,dva_i,dva_j,di,dj;
1543 __m128 vgb,vgb_tot,fgb,fgb2,fijC,fscal;
1544 __m128 tx,ty,tz,fix1,fiy1,fiz1;
1545 __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1546 __m128 mask;
1548 const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
1549 const __m128 qrtr = {0.25f , 0.25f , 0.25f , 0.25f };
1550 const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
1551 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
1552 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
1554 t_iatom *forceatoms;
1556 /* Keep the compiler happy */
1557 vgb_tot = _mm_setzero_ps();
1558 vgb = _mm_setzero_ps();
1559 xmm1 = _mm_setzero_ps();
1560 xmm2 = _mm_setzero_ps();
1561 xmm3 = _mm_setzero_ps();
1562 xmm4 = _mm_setzero_ps();
1563 ai1 = ai2 = ai3 = ai4 = 0;
1564 aj1 = aj2 = aj3 = aj4 = 0;
1565 ai13 = ai23 = ai33 = ai43 = 0;
1566 aj13 = aj23 = aj33 = aj43 = 0;
1568 /* Scale the electrostatics by gb_epsilon_solvent */
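    /* The GB Coulomb prefactor becomes -facel*(1/epsilon_r - 1/gb_epsilon_solvent);
     * it multiplies qi*qj for every bonded GB pair below.
     */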
1569 facel = (-1.0) * facel * ((1.0/epsilon_r) - 1.0/gb_epsilon_solvent);
1570 fac = _mm_load1_ps(&facel);
1572 vctot = 0.0;
1574 for(j=F_GB12;j<=F_GB14;j++)
1576 forceatoms = idef->il[j].iatoms;
1578 /* Number of atoms involved in each GB interaction plus the interaction type */
1579 nral = NRAL(j)+1;
1580 nri = idef->il[j].nr/nral;
1581 offset = nri%4;
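        /* Each iatoms entry is (type, ai, aj): nral counts the two atoms plus
         * the interaction type. Four GB pairs are processed per iteration of
         * the main loop; the one to three leftover pairs are handled in the
         * masked offset block at the end.
         */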
1583 /* In the for loop, i is updated for every element in the forceatom list, so we have to stop
1584          * the loop at offset*nral before the maximum idef->il[j].nr number, instead of just offset.
1585          */
1586 for(i=0;i<idef->il[j].nr-(offset*nral); )
1588 /* Load everything separately for now, test with sse load and shuffles later
1589 * Also, to avoid reading in the interaction type, we just increment i to pass over
1590              * the types in the forceatoms array; this saves some memory accesses.
1591              */
1592 i++;
1593 ai1 = forceatoms[i++];
1594 aj1 = forceatoms[i++];
1596 i++;
1597 ai2 = forceatoms[i++];
1598 aj2 = forceatoms[i++];
1600 i++;
1601 ai3 = forceatoms[i++];
1602 aj3 = forceatoms[i++];
1604 i++;
1605 ai4 = forceatoms[i++];
1606 aj4 = forceatoms[i++];
1608 ai13 = ai1*3;
1609 ai23 = ai2*3;
1610 ai33 = ai3*3;
1611 ai43 = ai4*3;
1613 aj13 = aj1*3;
1614 aj23 = aj2*3;
1615 aj33 = aj3*3;
1616 aj43 = aj4*3;
1618 /* Load particle ai1-4 and transpose */
1619 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
1620 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
1621 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+ai33));
1622 xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+ai43));
1624 xmm5 = _mm_load1_ps(x+ai13+2);
1625 xmm6 = _mm_load1_ps(x+ai23+2);
1626 xmm7 = _mm_load1_ps(x+ai33+2);
1627 xmm8 = _mm_load1_ps(x+ai43+2);
1629 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1630 xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
1631 iz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
1633 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1634 xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
1635 ix = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
1636 iy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
1638 /* Load particle aj1-4 and transpose */
1639 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
1640 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
1641 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
1642 xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
1644 xmm5 = _mm_load1_ps(x+aj13+2);
1645 xmm6 = _mm_load1_ps(x+aj23+2);
1646 xmm7 = _mm_load1_ps(x+aj33+2);
1647 xmm8 = _mm_load1_ps(x+aj43+2);
1649 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1650 xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
1651 jz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
1653 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1654 xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
1655 jx = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
1656 jy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
1658 /* Distances */
1659 dx = _mm_sub_ps(ix, jx);
1660 dy = _mm_sub_ps(iy, jy);
1661 dz = _mm_sub_ps(iz, jz);
1663 rsq11 = _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx) , _mm_mul_ps(dy,dy) ) , _mm_mul_ps(dz,dz) );
1665 rinv = gmx_mm_inv_ps(rsq11);
1666 r = _mm_mul_ps(rsq11,rinv);
1668 /* Load Born radii for ai's and aj's */
1669 xmm1 = _mm_load_ss(bRad+ai1);
1670 xmm2 = _mm_load_ss(bRad+ai2);
1671 xmm3 = _mm_load_ss(bRad+ai3);
1672 xmm4 = _mm_load_ss(bRad+ai4);
1674 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1675 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1676 isai = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1678 xmm1 = _mm_load_ss(bRad+aj1);
1679 xmm2 = _mm_load_ss(bRad+aj2);
1680 xmm3 = _mm_load_ss(bRad+aj3);
1681 xmm4 = _mm_load_ss(bRad+aj4);
1683 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1684 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1685 isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1687 isaprod = _mm_mul_ps(isai,isaj); /* rb2 in tinker */
1688 inv_isaprod = _mm_mul_ps(isaprod,isaprod);
1689 inv_isaprod = gmx_mm_inv_ps(inv_isaprod); /* 1/rb2 in tinker*/
1691 /* Load charges for ai's and aj's */
1692 xmm1 = _mm_load_ss(charge+ai1);
1693 xmm2 = _mm_load_ss(charge+ai2);
1694 xmm3 = _mm_load_ss(charge+ai3);
1695 xmm4 = _mm_load_ss(charge+ai4);
1697 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1698 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1699 ci = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1701 xmm1 = _mm_load_ss(charge+aj1);
1702 xmm2 = _mm_load_ss(charge+aj2);
1703 xmm3 = _mm_load_ss(charge+aj3);
1704 xmm4 = _mm_load_ss(charge+aj4);
1706 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1707 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1708 cj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1710 qq = _mm_mul_ps(ci,fac);
1711 qq = _mm_mul_ps(cj,qq);
1713 /* Calculate scalar GB force */
1714 expterm = _mm_mul_ps(rsq11,inv_isaprod);
1715 expterm = _mm_mul_ps(expterm,qrtr);
1716 expterm = _mm_mul_ps(expterm,neg);
1717 expterm = gmx_mm_exp_ps(expterm);
1719 fgb2 = _mm_mul_ps(isaprod,expterm);
1720 fgb2 = _mm_add_ps(fgb2,rsq11);
1722 fgb = gmx_mm_inv_ps(fgb2);
1724 /* Potential energy */
1725 vgb = _mm_mul_ps(qq,fgb);
1726 vgb_tot = _mm_add_ps(vgb_tot,vgb);
1728 fijC = _mm_mul_ps(qrtr,r);
1729 fijC = _mm_mul_ps(fijC,expterm);
1730 fijC = _mm_sub_ps(r,fijC);
1731 fijC = _mm_mul_ps(fijC,vgb);
1732 fijC = _mm_mul_ps(fijC,neg);
1733 fijC = _mm_mul_ps(fijC,fgb);
1734 fijC = _mm_mul_ps(fijC,fgb); /* fijC = fijC in tab code, de */
1736 /* Chain rule terms */
1737 dva = _mm_mul_ps(rsq11,inv_isaprod);
1738 dva = _mm_mul_ps(dva,eigth);
1739 dva = _mm_add_ps(dva,half);
1740 dva = _mm_mul_ps(dva,expterm);
1741 dva = _mm_mul_ps(dva,vgb);
1742 dva = _mm_mul_ps(dva,neg);
1743 dva = _mm_mul_ps(dva,fgb);
1744 dva = _mm_mul_ps(dva,fgb); /* dva * isaj = dvatmp*isai*isai */
1746 /* Calculate vectorial force */
1747 fscal = _mm_mul_ps(fijC,rinv);
1748 fscal = _mm_mul_ps(fscal,neg);
1750 tx = _mm_mul_ps(fscal,dx);
1751 ty = _mm_mul_ps(fscal,dy);
1752 tz = _mm_mul_ps(fscal,dz);
1754 /* Load, update and store chain rule terms for ai and aj atoms */
1755 dva_i = _mm_mul_ps(dva,isaj);
1756 dva_j = _mm_mul_ps(dva,isai);
1758 xmm1 = _mm_load_ss(dvda+ai1);
1759 xmm2 = _mm_load_ss(dvda+aj1);
1760 xmm1 = _mm_add_ss(xmm1,dva_i);
1761 xmm2 = _mm_add_ss(xmm2,dva_j);
1762 _mm_store_ss(dvda+ai1,xmm1);
1763 _mm_store_ss(dvda+aj1,xmm2);
1765 /* Need to shuffle dva_i and dva_j */
1766 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1767 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1769 xmm1 = _mm_load_ss(dvda+ai2);
1770 xmm2 = _mm_load_ss(dvda+aj2);
1771 xmm1 = _mm_add_ss(xmm1,dva_i);
1772 xmm2 = _mm_add_ss(xmm2,dva_j);
1773 _mm_store_ss(dvda+ai2,xmm1);
1774 _mm_store_ss(dvda+aj2,xmm2);
1776 /* Need to shuffle dva_i and dva_j */
1777 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1778 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1780 xmm1 = _mm_load_ss(dvda+ai3);
1781 xmm2 = _mm_load_ss(dvda+aj3);
1782 xmm1 = _mm_add_ss(xmm1,dva_i);
1783 xmm2 = _mm_add_ss(xmm2,dva_j);
1784 _mm_store_ss(dvda+ai3,xmm1);
1785 _mm_store_ss(dvda+aj3,xmm2);
1787 /* Need to shuffle dva_i and dva_j */
1788 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1789 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1791 xmm1 = _mm_load_ss(dvda+ai4);
1792 xmm2 = _mm_load_ss(dvda+aj4);
1793 xmm1 = _mm_add_ss(xmm1,dva_i);
1794 xmm2 = _mm_add_ss(xmm2,dva_j);
1795 _mm_store_ss(dvda+ai4,xmm1);
1796 _mm_store_ss(dvda+aj4,xmm2);
1798             /* Load, update and store partial forces on the ai and aj atoms.
1799              * This needs to be done in the order ai1+aj1, ai2+aj2 etc.,
1800              * since loading all four values at once would mean the force
1801              * is not updated properly if an atom appears in more than one pair.
1802              */
1803 /* ai1 and aj1 */
1804 xmm1 = _mm_load_ss(f+ai13);
1805 xmm2 = _mm_load_ss(f+ai13+1);
1806 xmm3 = _mm_load_ss(f+ai13+2);
1808 xmm4 = _mm_load_ss(f+aj13);
1809 xmm5 = _mm_load_ss(f+aj13+1);
1810 xmm6 = _mm_load_ss(f+aj13+2);
1812 xmm1 = _mm_add_ss(xmm1,tx);
1813 xmm2 = _mm_add_ss(xmm2,ty);
1814 xmm3 = _mm_add_ss(xmm3,tz);
1816 xmm4 = _mm_sub_ss(xmm4,tx);
1817 xmm5 = _mm_sub_ss(xmm5,ty);
1818 xmm6 = _mm_sub_ss(xmm6,tz);
1820 _mm_store_ss(f+ai13,xmm1);
1821 _mm_store_ss(f+ai13+1,xmm2);
1822 _mm_store_ss(f+ai13+2,xmm3);
1824 _mm_store_ss(f+aj13,xmm4);
1825 _mm_store_ss(f+aj13+1,xmm5);
1826 _mm_store_ss(f+aj13+2,xmm6);
1828 /* ai2 and aj2 */
1829 xmm1 = _mm_load_ss(f+ai23);
1830 xmm2 = _mm_load_ss(f+ai23+1);
1831 xmm3 = _mm_load_ss(f+ai23+2);
1833 xmm4 = _mm_load_ss(f+aj23);
1834 xmm5 = _mm_load_ss(f+aj23+1);
1835 xmm6 = _mm_load_ss(f+aj23+2);
1837 /* Need to shuffle tx/ty/tz */
1838 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1839 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1840 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1842 xmm1 = _mm_add_ss(xmm1,tx);
1843 xmm2 = _mm_add_ss(xmm2,ty);
1844 xmm3 = _mm_add_ss(xmm3,tz);
1846 xmm4 = _mm_sub_ss(xmm4,tx);
1847 xmm5 = _mm_sub_ss(xmm5,ty);
1848 xmm6 = _mm_sub_ss(xmm6,tz);
1850 _mm_store_ss(f+ai23,xmm1);
1851 _mm_store_ss(f+ai23+1,xmm2);
1852 _mm_store_ss(f+ai23+2,xmm3);
1854 _mm_store_ss(f+aj23,xmm4);
1855 _mm_store_ss(f+aj23+1,xmm5);
1856 _mm_store_ss(f+aj23+2,xmm6);
1858 /* ai3 and aj3 */
1859 xmm1 = _mm_load_ss(f+ai33);
1860 xmm2 = _mm_load_ss(f+ai33+1);
1861 xmm3 = _mm_load_ss(f+ai33+2);
1863 xmm4 = _mm_load_ss(f+aj33);
1864 xmm5 = _mm_load_ss(f+aj33+1);
1865 xmm6 = _mm_load_ss(f+aj33+2);
1867 /* Need to shuffle tx/ty/tz */
1868 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1869 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1870 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1872 xmm1 = _mm_add_ss(xmm1,tx);
1873 xmm2 = _mm_add_ss(xmm2,ty);
1874 xmm3 = _mm_add_ss(xmm3,tz);
1876 xmm4 = _mm_sub_ss(xmm4,tx);
1877 xmm5 = _mm_sub_ss(xmm5,ty);
1878 xmm6 = _mm_sub_ss(xmm6,tz);
1880 _mm_store_ss(f+ai33,xmm1);
1881 _mm_store_ss(f+ai33+1,xmm2);
1882 _mm_store_ss(f+ai33+2,xmm3);
1884 _mm_store_ss(f+aj33,xmm4);
1885 _mm_store_ss(f+aj33+1,xmm5);
1886 _mm_store_ss(f+aj33+2,xmm6);
1888 /* ai4 and aj4 */
1889 xmm1 = _mm_load_ss(f+ai43);
1890 xmm2 = _mm_load_ss(f+ai43+1);
1891 xmm3 = _mm_load_ss(f+ai43+2);
1893 xmm4 = _mm_load_ss(f+aj43);
1894 xmm5 = _mm_load_ss(f+aj43+1);
1895 xmm6 = _mm_load_ss(f+aj43+2);
1897 /* Need to shuffle tx/ty/tz */
1898 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1899 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1900 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1902 xmm1 = _mm_add_ss(xmm1,tx);
1903 xmm2 = _mm_add_ss(xmm2,ty);
1904 xmm3 = _mm_add_ss(xmm3,tz);
1906 xmm4 = _mm_sub_ss(xmm4,tx);
1907 xmm5 = _mm_sub_ss(xmm5,ty);
1908 xmm6 = _mm_sub_ss(xmm6,tz);
1910 _mm_store_ss(f+ai43,xmm1);
1911 _mm_store_ss(f+ai43+1,xmm2);
1912 _mm_store_ss(f+ai43+2,xmm3);
1914 _mm_store_ss(f+aj43,xmm4);
1915 _mm_store_ss(f+aj43+1,xmm5);
1916 _mm_store_ss(f+aj43+2,xmm6);
1919 if(offset!=0)
1921 if(offset==1)
1923 i++;
1924 ai1 = forceatoms[i++];
1925 aj1 = forceatoms[i++];
1927 ai13 = ai1*3;
1928 aj13 = aj1*3;
1930 /* Load ai and aj coordinates */
1931 xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+ai13));
1932 iz = _mm_load1_ps(x+ai13+2);
1934 ix = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0));
1935 iy = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1));
1937 xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
1938 jz = _mm_load1_ps(x+aj13+2);
1940 jx = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0));
1941 jy = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1));
1943 /* Load Born radii for ai1 and aj1 */
1944 isai = _mm_set_ps(0.0f, 0.0f, 0.0f, bRad[ai1]);
1945 isaj = _mm_set_ps(0.0f, 0.0f, 0.0f, bRad[aj1]);
1947 /* Load charges for ai1 and aj1 */
1948 ci = _mm_set_ps(0.0f, 0.0f, 0.0f, charge[ai1]);
1949 cj = _mm_set_ps(0.0f, 0.0f, 0.0f, charge[aj1]);
1951 /* Load chain rule terms for ai1 and aj1 */
1952 di = _mm_set_ps(0.0f,0.0f,0.0f,dvda[ai1]);
1953 dj = _mm_set_ps(0.0f,0.0f,0.0f,dvda[aj1]);
1955 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0,0xffffffff) );
1957 else if(offset==2)
1959 i++;
1960 ai1 = forceatoms[i++];
1961 aj1 = forceatoms[i++];
1963 i++;
1964 ai2 = forceatoms[i++];
1965 aj2 = forceatoms[i++];
1967 ai13 = ai1 * 3;
1968 ai23 = ai2 * 3;
1969 aj13 = aj1 * 3;
1970 aj23 = aj2 * 3;
1972 /* Load ai1-2 and aj1-2 coordinates */
1973 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
1974 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
1976 xmm5 = _mm_load1_ps(x+ai13+2);
1977 xmm6 = _mm_load1_ps(x+ai23+2);
1979 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1980 iz = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
1982 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1983 ix = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
1984 iy = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
1986 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
1987 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
1989 xmm5 = _mm_load1_ps(x+aj13+2);
1990 xmm6 = _mm_load1_ps(x+aj23+2);
1992 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1993 jz = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
1995 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1996 jx = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
1997 jy = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
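/* _mm_loadh_pi puts each atom's {x,y} in the upper half of a register; the shuffles above transpose the two pairs into separate x and y registers, and the broadcast z loads are merged the same way. */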
1999 /* Load Born radii for ai1-2 and aj1-2 */
2000 isai = _mm_set_ps(0.0f, 0.0f, bRad[ai2], bRad[ai1]);
2001 isaj = _mm_set_ps(0.0f, 0.0f, bRad[aj2], bRad[aj1]);
2003 /* Load charges for ai1-2 and aj1-2 */
2004 ci = _mm_set_ps(0.0f, 0.0f, charge[ai2], charge[ai1]);
2005 cj = _mm_set_ps(0.0f, 0.0f, charge[aj2], charge[aj1]);
2007 /* Load chain rule terms for ai1-2 and aj1-2 */
2008 di = _mm_set_ps(0.0f,0.0f,dvda[ai2],dvda[ai1]);
2009 dj = _mm_set_ps(0.0f,0.0f,dvda[aj2],dvda[aj1]);
2011 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0xffffffff,0xffffffff) );
2012 }
2013 else
2014 {
2015 i++;
2016 ai1 = forceatoms[i++];
2017 aj1 = forceatoms[i++];
2019 i++;
2020 ai2 = forceatoms[i++];
2021 aj2 = forceatoms[i++];
2023 i++;
2024 ai3 = forceatoms[i++];
2025 aj3 = forceatoms[i++];
2027 ai13 = ai1 * 3;
2028 ai23 = ai2 * 3;
2029 ai33 = ai3 * 3;
2031 aj13 = aj1 * 3;
2032 aj23 = aj2 * 3;
2033 aj33 = aj3 * 3;
2035 /* Load ai1-3 and aj1-3 coordinates */
2036 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
2037 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
2038 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+ai33));
2040 xmm5 = _mm_load1_ps(x+ai13+2);
2041 xmm6 = _mm_load1_ps(x+ai23+2);
2042 xmm7 = _mm_load1_ps(x+ai33+2);
2044 xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
2045 iz = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
2047 xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
2048 xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
2050 ix = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
2051 iy = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
2053 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
2054 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
2055 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
2057 xmm5 = _mm_load1_ps(x+aj13+2);
2058 xmm6 = _mm_load1_ps(x+aj23+2);
2059 xmm7 = _mm_load1_ps(x+aj33+2);
2061 xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
2062 jz = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
2064 xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
2065 xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
2067 jx = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
2068 jy = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
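/* Same gather/transpose as the offset==2 case, now with three valid lanes; the fourth lane duplicates atom 3 and is removed by the mask below. */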
2070 /* Load Born radii for ai1-3 and aj1-3 */
2071 isai = _mm_set_ps(0.0f, bRad[ai3], bRad[ai2], bRad[ai1]);
2072 isaj = _mm_set_ps(0.0f, bRad[aj3], bRad[aj2], bRad[aj1]);
2074 /* Load charges for ai1-3 and aj1-3 */
2075 ci = _mm_set_ps(0.0f, charge[ai3], charge[ai2], charge[ai1]);
2076 cj = _mm_set_ps(0.0f, charge[aj3], charge[aj2], charge[aj1]);
2078 /* Load chain rule terms for ai1-3 and aj1-3 */
2079 di = _mm_set_ps(0.0f,dvda[ai3],dvda[ai2],dvda[ai1]);
2080 dj = _mm_set_ps(0.0f,dvda[aj3],dvda[aj2],dvda[aj1]);
2082 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff) );
2083 }
2085 ix = _mm_and_ps( mask, ix);
2086 iy = _mm_and_ps( mask, iy);
2087 iz = _mm_and_ps( mask, iz);
2089 jx = _mm_and_ps( mask, jx);
2090 jy = _mm_and_ps( mask, jy);
2091 jz = _mm_and_ps( mask, jz);
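/* Zero the unused lanes so they cannot contribute; the energy is masked again below and only the valid lanes are ever written back to f[] and dvda[]. */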
2093 /* Distances */
2094 dx = _mm_sub_ps(ix, jx);
2095 dy = _mm_sub_ps(iy, jy);
2096 dz = _mm_sub_ps(iz, jz);
2098 rsq11 = _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx) , _mm_mul_ps(dy,dy) ) , _mm_mul_ps(dz,dz) );
2100 rinv = gmx_mm_invsqrt_ps(rsq11); /* 1/r, so r = rsq*rinv below */
2101 r = _mm_mul_ps(rsq11,rinv);
2103 isaprod = _mm_mul_ps(isai,isaj);
2104 inv_isaprod = _mm_mul_ps(isaprod,isaprod);
2105 inv_isaprod = gmx_mm_inv_ps(inv_isaprod);
2107 qq = _mm_mul_ps(ci,fac);
2108 qq = _mm_mul_ps(cj,qq);
2110 /* Calculate scalar GB force */
2111 expterm = _mm_mul_ps(rsq11,inv_isaprod);
2112 expterm = _mm_mul_ps(expterm,qrtr);
2113 expterm = _mm_mul_ps(expterm,neg);
2114 expterm = gmx_mm_exp_ps(expterm);
2116 fgb2 = _mm_mul_ps(isaprod,expterm);
2117 fgb2 = _mm_add_ps(fgb2,rsq11);
2119 fgb = gmx_mm_invsqrt_ps(fgb2);
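/* fgb2 = rsq + isaprod*exp(-rsq*inv_isaprod/4), which plays the role of f_GB^2 for this pair, so fgb = 1/f_GB and the pair energy accumulated below is qq*fgb. */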
2121 /* Potential energy */
2122 vgb = _mm_mul_ps(qq,fgb);
2123 vgb = _mm_and_ps(mask,vgb);
2124 vgb_tot = _mm_add_ps(vgb_tot,vgb);
2126 fijC = _mm_mul_ps(qrtr,r);
2127 fijC = _mm_mul_ps(fijC,expterm);
2128 fijC = _mm_sub_ps(r,fijC);
2129 fijC = _mm_mul_ps(fijC,vgb);
2130 fijC = _mm_mul_ps(fijC,neg);
2131 fijC = _mm_mul_ps(fijC,fgb);
2132 fijC = _mm_mul_ps(fijC,fgb); /* fscal = fijC */
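/* fijC is dV/dr for this pair; fscal = -fijC*rinv below converts it into the factor multiplying (dx,dy,dz), i.e. the force added to atom i and subtracted from atom j. */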
2134 /* Chain rule terms */
2135 dva = _mm_mul_ps(rsq11,inv_isaprod);
2136 dva = _mm_mul_ps(dva,eigth);
2137 dva = _mm_add_ps(dva,half);
2138 dva = _mm_mul_ps(dva,expterm);
2139 dva = _mm_mul_ps(dva,vgb);
2140 dva = _mm_mul_ps(dva,neg);
2141 dva = _mm_mul_ps(dva,fgb);
2142 dva = _mm_mul_ps(dva,fgb);
2144 /* Calculate vectorial force */
2145 fscal = _mm_mul_ps(fijC,rinv);
2146 fscal = _mm_mul_ps(fscal,neg);
2148 tx = _mm_mul_ps(fscal,dx);
2149 ty = _mm_mul_ps(fscal,dy);
2150 tz = _mm_mul_ps(fscal,dz);
2152 /* Calculate chain rule terms */
2153 dva_i = _mm_mul_ps(dva,isaj);
2154 dva_j = _mm_mul_ps(dva,isai);
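/* dva_i/dva_j are the per-atom Born-radius chain-rule contributions of this pair; they are accumulated into dvda[] below for the later chain-rule pass over the radii. */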
2156 if(offset==1)
2157 {
2158 /* Load, update and store chain rule terms for ai1 and aj1 */
2159 xmm1 = _mm_load_ss(dvda+ai1);
2160 xmm2 = _mm_load_ss(dvda+aj1);
2161 xmm1 = _mm_add_ss(xmm1,dva_i);
2162 xmm2 = _mm_add_ss(xmm2,dva_j);
2163 _mm_store_ss(dvda+ai1,xmm1);
2164 _mm_store_ss(dvda+aj1,xmm2);
2166 /* Load, update and store partial forces on ai1 and aj1 */
2167 xmm1 = _mm_load_ss(f+ai13);
2168 xmm2 = _mm_load_ss(f+ai13+1);
2169 xmm3 = _mm_load_ss(f+ai13+2);
2171 xmm4 = _mm_load_ss(f+aj13);
2172 xmm5 = _mm_load_ss(f+aj13+1);
2173 xmm6 = _mm_load_ss(f+aj13+2);
2175 xmm1 = _mm_add_ss(xmm1,tx);
2176 xmm2 = _mm_add_ss(xmm2,ty);
2177 xmm3 = _mm_add_ss(xmm3,tz);
2179 xmm4 = _mm_sub_ss(xmm4,tx);
2180 xmm5 = _mm_sub_ss(xmm5,ty);
2181 xmm6 = _mm_sub_ss(xmm6,tz);
2183 _mm_store_ss(f+ai13,xmm1);
2184 _mm_store_ss(f+ai13+1,xmm2);
2185 _mm_store_ss(f+ai13+2,xmm3);
2187 _mm_store_ss(f+aj13,xmm4);
2188 _mm_store_ss(f+aj13+1,xmm5);
2189 _mm_store_ss(f+aj13+2,xmm6);
2190 }
2191 else if(offset==2)
2192 {
2193 /* Load, update and store chain rule terms for ai1-2 and aj1-2 atoms */
2194 dva_i = _mm_mul_ps(dva,isaj);
2195 dva_j = _mm_mul_ps(dva,isai);
2197 xmm1 = _mm_load_ss(dvda+ai1);
2198 xmm2 = _mm_load_ss(dvda+aj1);
2199 xmm1 = _mm_add_ss(xmm1,dva_i);
2200 xmm2 = _mm_add_ss(xmm2,dva_j);
2201 _mm_store_ss(dvda+ai1,xmm1);
2202 _mm_store_ss(dvda+aj1,xmm2);
2204 /* Need to shuffle dva_i and dva_j */
2205 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2206 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2208 xmm1 = _mm_load_ss(dvda+ai2);
2209 xmm2 = _mm_load_ss(dvda+aj2);
2210 xmm1 = _mm_add_ss(xmm1,dva_i);
2211 xmm2 = _mm_add_ss(xmm2,dva_j);
2212 _mm_store_ss(dvda+ai2,xmm1);
2213 _mm_store_ss(dvda+aj2,xmm2);
2215 /* Load, update and store partial forces on ai1-2 and aj1-2 */
2216 xmm1 = _mm_load_ss(f+ai13);
2217 xmm2 = _mm_load_ss(f+ai13+1);
2218 xmm3 = _mm_load_ss(f+ai13+2);
2220 xmm4 = _mm_load_ss(f+aj13);
2221 xmm5 = _mm_load_ss(f+aj13+1);
2222 xmm6 = _mm_load_ss(f+aj13+2);
2224 xmm1 = _mm_add_ss(xmm1,tx);
2225 xmm2 = _mm_add_ss(xmm2,ty);
2226 xmm3 = _mm_add_ss(xmm3,tz);
2228 xmm4 = _mm_sub_ss(xmm4,tx);
2229 xmm5 = _mm_sub_ss(xmm5,ty);
2230 xmm6 = _mm_sub_ss(xmm6,tz);
2232 _mm_store_ss(f+ai13,xmm1);
2233 _mm_store_ss(f+ai13+1,xmm2);
2234 _mm_store_ss(f+ai13+2,xmm3);
2236 _mm_store_ss(f+aj13,xmm4);
2237 _mm_store_ss(f+aj13+1,xmm5);
2238 _mm_store_ss(f+aj13+2,xmm6);
2240 /* ai2 and aj2 */
2241 xmm1 = _mm_load_ss(f+ai23);
2242 xmm2 = _mm_load_ss(f+ai23+1);
2243 xmm3 = _mm_load_ss(f+ai23+2);
2245 xmm4 = _mm_load_ss(f+aj23);
2246 xmm5 = _mm_load_ss(f+aj23+1);
2247 xmm6 = _mm_load_ss(f+aj23+2);
2249 /* Need to shuffle tx/ty/tz */
2250 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2251 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2252 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2254 xmm1 = _mm_add_ss(xmm1,tx);
2255 xmm2 = _mm_add_ss(xmm2,ty);
2256 xmm3 = _mm_add_ss(xmm3,tz);
2258 xmm4 = _mm_sub_ss(xmm4,tx);
2259 xmm5 = _mm_sub_ss(xmm5,ty);
2260 xmm6 = _mm_sub_ss(xmm6,tz);
2262 _mm_store_ss(f+ai23,xmm1);
2263 _mm_store_ss(f+ai23+1,xmm2);
2264 _mm_store_ss(f+ai23+2,xmm3);
2266 _mm_store_ss(f+aj23,xmm4);
2267 _mm_store_ss(f+aj23+1,xmm5);
2268 _mm_store_ss(f+aj23+2,xmm6);
2269 }
2270 else
2271 {
2272 /* Load, update and store chain rule terms for ai1-3 and aj1-3 atoms */
2273 dva_i = _mm_mul_ps(dva,isaj);
2274 dva_j = _mm_mul_ps(dva,isai);
2276 xmm1 = _mm_load_ss(dvda+ai1);
2277 xmm2 = _mm_load_ss(dvda+aj1);
2278 xmm1 = _mm_add_ss(xmm1,dva_i);
2279 xmm2 = _mm_add_ss(xmm2,dva_j);
2280 _mm_store_ss(dvda+ai1,xmm1);
2281 _mm_store_ss(dvda+aj1,xmm2);
2283 /* Need to shuffle dva_i and dva_j */
2284 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2285 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2287 xmm1 = _mm_load_ss(dvda+ai2);
2288 xmm2 = _mm_load_ss(dvda+aj2);
2289 xmm1 = _mm_add_ss(xmm1,dva_i);
2290 xmm2 = _mm_add_ss(xmm2,dva_j);
2291 _mm_store_ss(dvda+ai2,xmm1);
2292 _mm_store_ss(dvda+aj2,xmm2);
2294 /* Need to shuffle dva_i and dva_j */
2295 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2296 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2298 xmm1 = _mm_load_ss(dvda+ai3);
2299 xmm2 = _mm_load_ss(dvda+aj3);
2300 xmm1 = _mm_add_ss(xmm1,dva_i);
2301 xmm2 = _mm_add_ss(xmm2,dva_j);
2302 _mm_store_ss(dvda+ai3,xmm1);
2303 _mm_store_ss(dvda+aj3,xmm2);
2305 /* Load, update and store partial forces on ai1-3 and aj1-3 */
2306 xmm1 = _mm_load_ss(f+ai13);
2307 xmm2 = _mm_load_ss(f+ai13+1);
2308 xmm3 = _mm_load_ss(f+ai13+2);
2310 xmm4 = _mm_load_ss(f+aj13);
2311 xmm5 = _mm_load_ss(f+aj13+1);
2312 xmm6 = _mm_load_ss(f+aj13+2);
2314 xmm1 = _mm_add_ss(xmm1,tx);
2315 xmm2 = _mm_add_ss(xmm2,ty);
2316 xmm3 = _mm_add_ss(xmm3,tz);
2318 xmm4 = _mm_sub_ss(xmm4,tx);
2319 xmm5 = _mm_sub_ss(xmm5,ty);
2320 xmm6 = _mm_sub_ss(xmm6,tz);
2322 _mm_store_ss(f+ai13,xmm1);
2323 _mm_store_ss(f+ai13+1,xmm2);
2324 _mm_store_ss(f+ai13+2,xmm3);
2326 _mm_store_ss(f+aj13,xmm4);
2327 _mm_store_ss(f+aj13+1,xmm5);
2328 _mm_store_ss(f+aj13+2,xmm6);
2330 /* ai2 and aj2 */
2331 xmm1 = _mm_load_ss(f+ai23);
2332 xmm2 = _mm_load_ss(f+ai23+1);
2333 xmm3 = _mm_load_ss(f+ai23+2);
2335 xmm4 = _mm_load_ss(f+aj23);
2336 xmm5 = _mm_load_ss(f+aj23+1);
2337 xmm6 = _mm_load_ss(f+aj23+2);
2339 /* Need to shuffle tx/ty/tz */
2340 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2341 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2342 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2344 xmm1 = _mm_add_ss(xmm1,tx);
2345 xmm2 = _mm_add_ss(xmm2,ty);
2346 xmm3 = _mm_add_ss(xmm3,tz);
2348 xmm4 = _mm_sub_ss(xmm4,tx);
2349 xmm5 = _mm_sub_ss(xmm5,ty);
2350 xmm6 = _mm_sub_ss(xmm6,tz);
2352 _mm_store_ss(f+ai23,xmm1);
2353 _mm_store_ss(f+ai23+1,xmm2);
2354 _mm_store_ss(f+ai23+2,xmm3);
2356 _mm_store_ss(f+aj23,xmm4);
2357 _mm_store_ss(f+aj23+1,xmm5);
2358 _mm_store_ss(f+aj23+2,xmm6);
2360 /* ai3 and aj3 */
2361 xmm1 = _mm_load_ss(f+ai33);
2362 xmm2 = _mm_load_ss(f+ai33+1);
2363 xmm3 = _mm_load_ss(f+ai33+2);
2365 xmm4 = _mm_load_ss(f+aj33);
2366 xmm5 = _mm_load_ss(f+aj33+1);
2367 xmm6 = _mm_load_ss(f+aj33+2);
2369 /* Need to shuffle tx/ty/tz */
2370 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2371 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2372 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2374 xmm1 = _mm_add_ss(xmm1,tx);
2375 xmm2 = _mm_add_ss(xmm2,ty);
2376 xmm3 = _mm_add_ss(xmm3,tz);
2378 xmm4 = _mm_sub_ss(xmm4,tx);
2379 xmm5 = _mm_sub_ss(xmm5,ty);
2380 xmm6 = _mm_sub_ss(xmm6,tz);
2382 _mm_store_ss(f+ai33,xmm1);
2383 _mm_store_ss(f+ai33+1,xmm2);
2384 _mm_store_ss(f+ai33+2,xmm3);
2386 _mm_store_ss(f+aj33,xmm4);
2387 _mm_store_ss(f+aj33+1,xmm5);
2388 _mm_store_ss(f+aj33+2,xmm6);
2389 }
2390 }
2393 /* End processing for potential energy */
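/* Horizontal sum: fold the upper two lanes of vgb_tot onto the lower two, then add element 1 onto element 0, leaving the total GB energy in element 0 for the scalar store. */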
2394 vgb = _mm_movehl_ps(vgb,vgb_tot);
2395 vgb_tot = _mm_add_ps(vgb_tot,vgb);
2396 vgb = _mm_shuffle_ps(vgb_tot,vgb_tot,_MM_SHUFFLE(1,1,1,1));
2397 vgb_tot = _mm_add_ss(vgb_tot,vgb);
2399 _mm_store_ss(&vctot,vgb_tot);
2401 return vctot;
2402 }
2407 #else
2408 /* keep compiler happy */
2409 int genborn_sse_dummy;
2411 #endif /* SSE intrinsics available */