[gromacs/rigid-bodies.git] / src / mdlib / genborn_sse2_double.c
blob: 4be1567e82411bcff97e8cd6f74f0cc2da9ebbf8
commit: Fixed include statements such that double precision version of genborn.c
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Only compile this file if SSE2 intrinsics are available */
31 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
35 #include "genborn_sse2_double.h"
37 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
38 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
39 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
40 #elif defined(__GNUC__)
41 #define gmx_castsi128_pd(a) ((__m128d)(a))
42 #define gmx_castpd_si128(a) ((__m128i)(a))
43 #else
44 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
45 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
46 #endif
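/* The gmx_castsi128_pd()/gmx_castpd_si128() helpers above only reinterpret the
 * 128-bit register between integer and floating-point views; no value
 * conversion takes place. MSVC/icc expose this as _mm_castsi128_pd() and
 * _mm_castpd_si128(), gcc accepts a direct vector-type cast, and the last
 * fallback simply re-reads the bits through a pointer of the other type.
 */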
49 static inline void
50 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
52 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
53 const __m128d half = {0.5,0.5};
54 const __m128d one = {1.0,1.0};
55 const __m128i izero = _mm_set1_epi32(0);
56 const __m128i ione = _mm_set1_epi32(1);
57 const __m128i itwo = _mm_set1_epi32(2);
58 const __m128i ithree = _mm_set1_epi32(3);
59 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
60 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
61 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
62 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
63 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
64 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
65 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
66 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
67 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
68 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
69 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
70 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
71 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
72 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
74 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
75 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
77 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
78 __m128i q;
79 __m128i offsetSin,offsetCos;
80 __m128d sinMask,cosMask,isTiny;
81 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
83 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
84 xl = _mm_mul_pd(x,two_over_pi);
86 /* q=integer part of xl, rounded _away_ from 0.0 */
87 /* Add 0.5 away from 0.0 */
88 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
90 q = _mm_cvttpd_epi32(xl);
91 qd = _mm_cvtepi32_pd(q);
92 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
94 /* Compute offset based on quadrant the arg falls in */
95 offsetSin = _mm_and_si128(q,ithree);
96 offsetCos = _mm_add_epi32(offsetSin,ione);
98 /* Remainder in range [-pi/4..pi/4] */
99 p1 = _mm_mul_pd(qd,sincosd_kc1);
100 xl = _mm_mul_pd(qd,sincosd_kc2);
101 p1 = _mm_sub_pd(x,p1);
102 xl = _mm_sub_pd(p1,xl);
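/* Illustrative scalar form of the reduction above (not used by the code):
 *
 *   q  = nearest integer to x*2/pi, ties rounded away from zero;
 *   xl = (x - q*kc1) - q*kc2;   remainder, roughly in [-pi/4, pi/4]
 *
 * sincosd_kc1 + sincosd_kc2 together approximate pi/2; splitting the constant
 * this way (Cody-Waite style) keeps q*kc1 essentially free of rounding error,
 * so the remainder stays accurate even for arguments several periods from zero.
 */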
104 absxl = _mm_andnot_pd(signbit,xl);
105 isTiny = _mm_cmpgt_pd(tiny,absxl);
107 xl2 = _mm_mul_pd(xl,xl);
108 xl3 = _mm_mul_pd(xl2,xl);
110 ct0 = _mm_mul_pd(xl2,xl2);
111 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
112 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
113 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
114 st1 = _mm_mul_pd(sincosd_sc0,xl2);
115 st2 = _mm_mul_pd(sincosd_sc2,xl2);
116 st3 = _mm_mul_pd(sincosd_sc4,xl2);
117 ct1 = _mm_add_pd(ct1,sincosd_cc1);
118 ct2 = _mm_add_pd(ct2,sincosd_cc3);
119 ct3 = _mm_add_pd(ct3,sincosd_cc5);
120 st1 = _mm_add_pd(st1,sincosd_sc1);
121 st2 = _mm_add_pd(st2,sincosd_sc3);
122 st3 = _mm_add_pd(st3,sincosd_sc5);
124 ct4 = _mm_mul_pd(ct2,ct0);
125 ct4 = _mm_add_pd(ct4,ct3);
127 st4 = _mm_mul_pd(st2,ct0);
128 st4 = _mm_add_pd(st4,st3);
129 ct5 = _mm_mul_pd(ct0,ct0);
131 ct6 = _mm_mul_pd(ct5,ct1);
132 ct6 = _mm_add_pd(ct6,ct4);
134 st6 = _mm_mul_pd(ct5,st1);
135 st6 = _mm_add_pd(st6,st4);
137 cx = _mm_mul_pd(ct6,xl2);
138 cx = _mm_add_pd(cx,one);
140 sx = _mm_mul_pd(st6,xl3);
141 sx = _mm_add_pd(sx,xl);
143 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
144 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
145 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
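/* The _mm_or_pd(_mm_and_pd(mask,a), _mm_andnot_pd(mask,b)) pattern used here,
 * and throughout this file, is a branchless per-element select: each lane gets
 * a where the comparison mask is all ones and b where it is zero, i.e. the
 * SSE2 equivalent of (mask ? a : b) for each double.
 */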
147 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
148 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
150 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
151 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
153 /* Flip the sign of the result when (offset mod 4) = 1 or 2 */
154 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
155 tsn = _mm_xor_pd(signbit,ts);
156 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
158 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
159 tcn = _mm_xor_pd(signbit,tc);
160 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
162 *sinval = ts;
163 *cosval = tc;
165 return;
168 __m128d log_pd(__m128d x)
170 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
171 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
173 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
174 const __m128d const_one = _mm_set1_pd(1.0);
175 const __m128d const_two = _mm_set1_pd(2.0);
177 /* Almost full single precision accuracy (~20 bits worst case) */
178 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
179 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
180 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
181 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
182 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
183 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
184 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
185 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
186 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
187 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
189 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
190 __m128i xmmi,xmmj;
192 xmmi = gmx_castpd_si128(x);
193 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
194 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
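/* At this point xmm1 holds the unbiased exponent e of each input as a double,
 * and xmm0 holds the mantissa m rescaled into [1,2) (exponent field forced to
 * that of 1.0). The rational approximation below, multiplied by (m - 1),
 * evaluates log2(m), and since
 *
 *   log(x) = log(2^e * m) = (e + log2(m)) * log(2)
 *
 * the sum is finally scaled by const_loge = log(2).
 */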
196 xmm2 = _mm_mul_pd(P5,xmm0);
197 xmm2 = _mm_add_pd(xmm2,P4);
198 xmm2 = _mm_mul_pd(xmm2,xmm0);
199 xmm2 = _mm_add_pd(xmm2,P3);
200 xmm2 = _mm_mul_pd(xmm2,xmm0);
201 xmm2 = _mm_add_pd(xmm2,P2);
202 xmm2 = _mm_mul_pd(xmm2,xmm0);
203 xmm2 = _mm_add_pd(xmm2,P1);
204 xmm2 = _mm_mul_pd(xmm2,xmm0);
205 xmm2 = _mm_add_pd(xmm2,P0);
207 xmm3 = _mm_mul_pd(Q4,xmm0);
208 xmm3 = _mm_add_pd(xmm3,Q3);
209 xmm3 = _mm_mul_pd(xmm3,xmm0);
210 xmm3 = _mm_add_pd(xmm3,Q2);
211 xmm3 = _mm_mul_pd(xmm3,xmm0);
212 xmm3 = _mm_add_pd(xmm3,Q1);
213 xmm3 = _mm_mul_pd(xmm3,xmm0);
214 xmm3 = _mm_add_pd(xmm3,const_one);
216 /* xmm4=1.0/xmm3 */
217 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
218 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
219 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
220 xmm2 = _mm_mul_pd(xmm2,xmm4);
222 xmm0 = _mm_sub_pd(xmm0, const_one);
223 xmm0 = _mm_mul_pd(xmm0,xmm2);
225 xmm0 = _mm_add_pd(xmm0,xmm1);
227 return _mm_mul_pd(xmm0, const_loge);
230 /* This exp() routine provides accuracy of about 1e-9 to 1e-11.
231 * The polynomial minimax coefficients are actually accurate to about 1e-14,
232 * but we lose some accuracy in the polynomial evaluation. */
234 __m128d exp_pd(__m128d x)
236 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
237 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
239 const __m128i base = _mm_set_epi32(0,0,1023,1023);
240 const __m128d half = _mm_set1_pd(0.5);
241 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
243 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
244 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
245 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
246 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
247 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
248 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
249 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
250 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
251 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
252 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
254 __m128d xmm0,xmm1;
255 __m128i xmmi;
257 xmm0 = _mm_mul_pd(x,log2e);
258 xmm0 = _mm_min_pd(xmm0,lim1);
259 xmm0 = _mm_max_pd(xmm0,lim2);
260 xmm1 = _mm_sub_pd(xmm0,half);
262 xmmi = _mm_cvtpd_epi32(xmm1);
263 xmm1 = _mm_cvtepi32_pd(xmmi);
265 xmmi = _mm_add_epi32(xmmi,base);
266 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
267 xmmi = _mm_slli_epi64(xmmi,52);
269 xmm0 = _mm_sub_pd(xmm0,xmm1);
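/* Sketch of the strategy: with y = x*log2(e) split into an integer part n
 * (essentially floor(y), obtained via the convert of y-0.5) and a fraction
 * f = y - n in [0,1), we have exp(x) = 2^y = 2^n * 2^f. The biased exponent
 * n+1023 was just shifted into bits 52-62 of xmmi, so gmx_castsi128_pd(xmmi)
 * is exactly 2^n; the polynomial below approximates 2^f, and the final
 * multiply combines the two factors.
 */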
271 xmm1 = _mm_mul_pd(exp_P9,xmm0);
272 xmm1 = _mm_add_pd(xmm1,exp_P8);
273 xmm1 = _mm_mul_pd(xmm1,xmm0);
274 xmm1 = _mm_add_pd(xmm1,exp_P7);
275 xmm1 = _mm_mul_pd(xmm1,xmm0);
276 xmm1 = _mm_add_pd(xmm1,exp_P6);
277 xmm1 = _mm_mul_pd(xmm1,xmm0);
278 xmm1 = _mm_add_pd(xmm1,exp_P5);
279 xmm1 = _mm_mul_pd(xmm1,xmm0);
280 xmm1 = _mm_add_pd(xmm1,exp_P4);
281 xmm1 = _mm_mul_pd(xmm1,xmm0);
282 xmm1 = _mm_add_pd(xmm1,exp_P3);
283 xmm1 = _mm_mul_pd(xmm1,xmm0);
284 xmm1 = _mm_add_pd(xmm1,exp_P2);
285 xmm1 = _mm_mul_pd(xmm1,xmm0);
286 xmm1 = _mm_add_pd(xmm1,exp_P1);
287 xmm1 = _mm_mul_pd(xmm1,xmm0);
288 xmm1 = _mm_add_pd(xmm1,exp_P0);
289 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
291 return xmm1;
294 static inline __m128d
295 my_invrsq_pd(__m128d x)
297 const __m128d three = {3.0, 3.0};
298 const __m128d half = {0.5, 0.5};
300 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
301 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
303 /* First Newton-Raphson step, accuracy is now 24 bits */
304 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
306 /* Return second Newton-Raphson step, accuracy 48 bits */
307 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
310 static inline __m128d
311 my_inv_pd(__m128d x)
313 const __m128d two = {2.0, 2.0};
315 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
316 __m128d t1 = _mm_cvtps_pd(t);
317 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
319 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
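/* Scalar form of the Newton-Raphson refinements used by the two helpers above
 * (each step roughly doubles the number of correct bits in the estimate y,
 * starting from the ~12-bit _mm_rsqrt_ps()/_mm_rcp_ps() results converted
 * back to double):
 *
 *   1/sqrt(x):   y_new = 0.5*y*(3.0 - x*y*y);
 *   1/x:         y_new = y*(2.0 - x*y);
 */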
324 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
325 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
327 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
328 int at0,at1,nj0,nj1,offset,taj1,taj2;
329 int shift;
331 double factor,gpi_ai,gpi_tmp,gpi2;
332 double shX,shY,shZ;
334 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
335 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
336 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
337 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
338 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
340 const __m128d half = {0.5, 0.5};
341 const __m128d three = {3.0, 3.0};
342 const __m128d one = {1.0, 1.0};
343 const __m128d two = {2.0, 2.0};
344 const __m128d zero = {0.0, 0.0};
345 const __m128d four = {4.0, 4.0};
347 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
348 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
349 const __m128d p4 = {STILL_P4, STILL_P4};
351 factor = 0.5 * ONE_4PI_EPS0;
352 n = 0;
354 /* Keep the compiler happy */
355 raj = _mm_setzero_pd();
356 vaj = _mm_setzero_pd();
357 jx = _mm_setzero_pd();
358 jy = _mm_setzero_pd();
359 jz = _mm_setzero_pd();
360 xmm2 = _mm_setzero_pd();
361 xmm7 = _mm_setzero_pd();
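/* Outline of this kernel: for every pair (ai,aj) in the neighborlist the loop
 * below adds the Still polarization term p4*vsolv(aj)*ccf(r)/r^4 to atom ai's
 * accumulator (and the symmetric term with vsolv(ai) to atom aj's entry in
 * gpol_still_work[]), and stores the corresponding r^-6 chain-rule factors in
 * fr->dadx for later use in calc_gb_chainrule_sse2_double(). The final loop
 * converts the sums into Born radii as bRad[i] = factor/|gpol[i] + sum_i|.
 */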
363 for(i=0;i<born->nr;i++)
365 born->gpol_still_work[i]=0;
368 for(i=0;i<nl->nri;i++)
370 ai = nl->iinr[i];
371 ai3 = ai * 3;
373 nj0 = nl->jindex[i];
374 nj1 = nl->jindex[i+1];
376 /* Load shifts for this list */
377 shift = nl->shift[i];
378 shX = fr->shift_vec[shift][0];
379 shY = fr->shift_vec[shift][1];
380 shZ = fr->shift_vec[shift][2];
382 /* Splat the shifts */
383 sX = _mm_load1_pd(&shX);
384 sY = _mm_load1_pd(&shY);
385 sZ = _mm_load1_pd(&shZ);
387 offset = (nj1-nj0)%2;
389 /* Polarization energy for atom ai */
390 gpi = _mm_setzero_pd();
392 /* Load particle ai coordinates and add shifts */
393 ix = _mm_load1_pd(x+ai3);
394 iy = _mm_load1_pd(x+ai3+1);
395 iz = _mm_load1_pd(x+ai3+2);
397 ix = _mm_add_pd(sX,ix);
398 iy = _mm_add_pd(sY,iy);
399 iz = _mm_add_pd(sZ,iz);
401 /* Load particle ai gb_radius */
402 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
403 vai = _mm_set1_pd(born->vsolv[ai]);
404 prod_ai = _mm_mul_pd(p4,vai);
406 for(k=nj0;k<nj1-offset;k+=2)
408 aj1 = nl->jjnr[k];
409 aj2 = nl->jjnr[k+1];
411 aj13 = aj1 * 3;
412 aj23 = aj2 * 3;
414 taj1 = md->typeA[aj1];
415 taj2 = md->typeA[aj2];
417 /* Load particle aj1-2 coordinates and compute ai->aj distances */
418 xmm1 = _mm_loadu_pd(x+aj13);
419 xmm2 = _mm_loadu_pd(x+aj23);
420 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
421 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
423 jz = _mm_loadl_pd(jz,x+aj13+2);
424 jz = _mm_loadh_pd(jz,x+aj23+2);
426 dx = _mm_sub_pd(ix,jx);
427 dy = _mm_sub_pd(iy,jy);
428 dz = _mm_sub_pd(iz,jz);
430 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
431 rinv = my_invrsq_pd(rsq11);
433 rinv2 = _mm_mul_pd(rinv,rinv);
434 rinv4 = _mm_mul_pd(rinv2,rinv2);
435 rinv6 = _mm_mul_pd(rinv4,rinv2);
437 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
438 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
440 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
441 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
443 rvdw = _mm_add_pd(rai,raj);
444 rvdw = _mm_mul_pd(rvdw,rvdw);
445 ratio = _mm_div_pd(rsq11,rvdw);
447 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
449 switch(_mm_movemask_pd(mask_cmp))
451 case 0x3: /* _mm_movemask_pd() returns a 2-bit mask, so 0x3 means both elements are set */
452 ccf = one;
453 dccf = zero;
454 break;
455 default:
456 theta = _mm_mul_pd(ratio,pip5);
457 sincos_sse2double(theta,&sinq,&cosq);
459 term = _mm_sub_pd(one,cosq);
460 term = _mm_mul_pd(half,term);
461 ccf = _mm_mul_pd(term,term);
462 dccf = _mm_mul_pd(two,term);
463 dccf = _mm_mul_pd(dccf,sinq);
464 dccf = _mm_mul_pd(dccf,pip5);
465 dccf = _mm_mul_pd(dccf,ratio);
467 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
468 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
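/* ccf is the Still close-contact function: 1 when the atoms are well
 * separated (ratio > STILL_P5INV) and otherwise the smooth switch
 * ccf = [0.5*(1 - cos(ratio*STILL_PIP5))]^2, with dccf carrying the factor
 * needed for its derivative in the chain-rule term below. The masked blends
 * above select the correct branch per element, so the switch() is only a
 * shortcut for the case where both elements are beyond the switching region.
 */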
471 prod = _mm_mul_pd(p4,vaj);
472 icf4 = _mm_mul_pd(ccf,rinv4);
473 xmm2 = _mm_mul_pd(icf4,prod);
474 xmm3 = _mm_mul_pd(icf4,prod_ai);
475 gpi = _mm_add_pd(gpi,xmm2);
477 /* Load, add and store atom aj gpol energy */
478 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
479 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
481 xmm3 = _mm_add_pd(xmm7,xmm3);
483 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
484 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
486 /* Chain rule terms */
487 ccf = _mm_mul_pd(four,ccf);
488 xmm3 = _mm_sub_pd(ccf,dccf);
489 icf6 = _mm_mul_pd(xmm3,rinv6);
490 xmm1 = _mm_mul_pd(icf6,prod);
491 xmm2 = _mm_mul_pd(icf6,prod_ai);
493 /* As with single precision, we need to shuffle things around to change the
494 * order of the interactions from (ai->aj1, ai->aj2) to (ai->aj1, aj1->ai,
495 * ai->aj2, aj2->ai), so that two store operations suffice instead of four. */
497 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
498 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
500 _mm_storeu_pd(fr->dadx+n,xmm3);
501 n = n + 2;
502 _mm_storeu_pd(fr->dadx+n,xmm4);
503 n = n + 2;
506 /* Deal with odd elements */
507 if(offset!=0)
509 aj1 = nl->jjnr[k];
510 aj13 = aj1 * 3;
511 taj1 = md->typeA[aj1];
513 jx = _mm_load_sd(x+aj13);
514 jy = _mm_load_sd(x+aj13+1);
515 jz = _mm_load_sd(x+aj13+2);
517 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
518 vaj = _mm_load_sd(born->vsolv+aj1);
520 dx = _mm_sub_sd(ix,jx);
521 dy = _mm_sub_pd(iy,jy);
522 dz = _mm_sub_pd(iz,jz);
524 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
525 rinv = my_invrsq_pd(rsq11);
527 rinv2 = _mm_mul_sd(rinv,rinv);
528 rinv4 = _mm_mul_sd(rinv2,rinv2);
529 rinv6 = _mm_mul_sd(rinv4,rinv2);
531 rvdw = _mm_add_sd(rai,raj);
532 rvdw = _mm_mul_sd(rvdw,rvdw);
533 ratio = _mm_div_sd(rsq11,rvdw);
535 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
537 switch(_mm_movemask_pd(mask_cmp))
539 case 0x1: case 0x3: /* only the low element is used in this remainder loop */
540 ccf = one;
541 dccf = zero;
542 break;
543 default:
544 theta = _mm_mul_sd(ratio,pip5);
545 sincos_sse2double(theta,&sinq,&cosq);
547 term = _mm_sub_sd(one,cosq);
548 term = _mm_mul_sd(half,term);
549 ccf = _mm_mul_sd(term,term);
550 dccf = _mm_mul_sd(two,term);
551 dccf = _mm_mul_sd(dccf,sinq);
552 dccf = _mm_mul_sd(dccf,pip5);
553 dccf = _mm_mul_sd(dccf,ratio);
555 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
556 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
559 prod = _mm_mul_sd(p4,vaj);
560 icf4 = _mm_mul_sd(ccf,rinv4);
561 xmm2 = _mm_mul_sd(icf4,prod);
562 xmm3 = _mm_mul_sd(icf4,prod_ai);
563 gpi = _mm_add_sd(gpi,xmm2);
565 /* Load, add and store atom aj gpol energy */
566 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
567 xmm3 = _mm_add_sd(xmm7,xmm3);
568 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
570 /* Chain rule terms */
571 ccf = _mm_mul_sd(four,ccf);
572 xmm3 = _mm_sub_sd(ccf,dccf);
573 icf6 = _mm_mul_sd(xmm3,rinv6);
574 xmm1 = _mm_mul_sd(icf6,prod);
575 xmm2 = _mm_mul_sd(icf6,prod_ai);
577 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
578 _mm_storel_pd(fr->dadx+n,xmm1);
579 n = n + 1;
580 _mm_storel_pd(fr->dadx+n,xmm2);
581 n = n + 1;
582 } /* End offset */
584 /* Do end processing ... */
585 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
586 gpi = _mm_add_pd(gpi,xmm2);
587 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
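/* Horizontal sum: after the unpack/add/shuffle above both lanes of gpi hold
 * the sum of its two partial accumulators (the low lane of the unpack is a
 * don't-care that the _MM_SHUFFLE2(1,1) discards), so the scalar _sd ops
 * below can add the total to atom ai's entry. The same idiom is used in the
 * HCT/OBC kernels further down.
 */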
589 /* Load, add and store atom ai polarisation energy */
590 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
591 gpi = _mm_add_sd(gpi,xmm2);
592 _mm_store_sd(born->gpol_still_work+ai,gpi);
595 /* Parallel summations */
596 if(PARTDECOMP(cr))
598 gmx_sum(natoms,born->gpol_still_work, cr);
600 else if(DOMAINDECOMP(cr))
602 dd_atom_sum_real(cr->dd, born->gpol_still_work);
605 /* Compute the radii */
606 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
608 if(born->use[i] != 0)
610 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
611 gpi2 = gpi_ai*gpi_ai;
613 born->bRad[i]=factor*gmx_invsqrt(gpi2);
614 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
618 /* Extra (local) communication required for DD */
619 if(DOMAINDECOMP(cr))
621 dd_atom_spread_real(cr->dd, born->bRad);
622 dd_atom_spread_real(cr->dd, fr->invsqrta);
625 return 0;
628 int
629 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
630 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
632 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
633 int p1, p2, shift;
634 double rr,sum,sum_tmp,min_rad,rad,doff;
635 double shX,shY,shZ;
637 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
638 __m128d t1,t2,t3,rsq11,rinv,r,rai;
639 __m128d rai_inv,sk,sk2,lij,dlij,duij;
640 __m128d uij,lij2,uij2,lij3,uij3,diff2;
641 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
642 __m128d mask_cmp,mask_cmp2,mask_cmp3;
643 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
644 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
645 __m128d sk_ai, sk2_ai,raj,raj_inv;
647 const __m128d neg = {-1.0, -1.0};
648 const __m128d zero = {0.0, 0.0};
649 const __m128d eigth = {0.125, 0.125};
650 const __m128d qrtr = {0.25, 0.25};
651 const __m128d half = {0.5, 0.5};
652 const __m128d one = {1.0, 1.0};
653 const __m128d two = {2.0, 2.0};
654 const __m128d three = {3.0, 3.0};
656 /* Keep the compiler happy */
657 tmp_ai = _mm_setzero_pd();
658 tmp = _mm_setzero_pd();
659 xmm7 = _mm_setzero_pd();
660 xmm8 = _mm_setzero_pd();
661 raj_inv = _mm_setzero_pd();
662 raj = _mm_setzero_pd();
663 sk = _mm_setzero_pd();
664 jx = _mm_setzero_pd();
665 jy = _mm_setzero_pd();
666 jz = _mm_setzero_pd();
668 /* Set the dielectric offset */
669 doff = born->gb_doffset;
670 doffset = _mm_load1_pd(&doff);
671 n = 0;
673 for(i=0;i<born->nr;i++)
675 born->gpol_hct_work[i] = 0;
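/* The pair loop below evaluates the HCT pairwise descreening integral in both
 * directions of every pair and accumulates it in gpol_hct_work[], while the
 * t1/t2/t3 blocks build its derivative with respect to r for fr->dadx. An
 * illustrative scalar sketch of the per-pair term follows (not compiled; the
 * function and variable names are local to this sketch): rai is the
 * dielectric-offset radius of the descreened atom, rai_inv its inverse, r the
 * interatomic distance and sk the neighbour's scaled (born->param) radius.
 */
#if 0
static double hct_pair_term(double rai, double rai_inv, double r, double sk)
{
    double lij, uij, diff2, term;

    if (rai >= r + sk)
    {
        return 0.0;                         /* no descreening of this atom */
    }
    lij   = (rai > r - sk) ? rai_inv : 1.0/(r - sk);
    uij   = 1.0/(r + sk);
    diff2 = uij*uij - lij*lij;

    term  = lij - uij + 0.25*r*diff2 + (0.5/r)*log(uij/lij)
            - 0.25*(sk*sk/r)*diff2;

    if (rai < sk - r)
    {
        term += 2.0*(rai_inv - lij);        /* atom completely engulfed */
    }
    return 0.5*term;
}
#endif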
678 for(i=0;i<nl->nri;i++)
680 ai = nl->iinr[i];
681 ai3 = ai * 3;
683 nj0 = nl->jindex[i];
684 nj1 = nl->jindex[i+1];
686 /* Load shifts for this list */
687 shift = nl->shift[i];
688 shX = fr->shift_vec[shift][0];
689 shY = fr->shift_vec[shift][1];
690 shZ = fr->shift_vec[shift][2];
692 /* Splat the shifts */
693 sX = _mm_load1_pd(&shX);
694 sY = _mm_load1_pd(&shY);
695 sZ = _mm_load1_pd(&shZ);
697 offset = (nj1-nj0)%2;
699 /* Load rai */
700 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
701 rai = _mm_load1_pd(&rr);
702 rr = 1.0/rr;
703 rai_inv = _mm_load1_pd(&rr);
705 /* Zero out sums for polarisation energies */
706 sum_ai = _mm_setzero_pd();
708 /* Load ai coordinates and add shifts */
709 ix = _mm_load1_pd(x+ai3);
710 iy = _mm_load1_pd(x+ai3+1);
711 iz = _mm_load1_pd(x+ai3+2);
713 ix = _mm_add_pd(sX,ix);
714 iy = _mm_add_pd(sY,iy);
715 iz = _mm_add_pd(sZ,iz);
717 sk_ai = _mm_load1_pd(born->param+ai);
718 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
720 for(k=nj0;k<nj1-offset;k+=2)
722 aj1 = nl->jjnr[k];
723 aj2 = nl->jjnr[k+1];
725 aj13 = aj1 * 3;
726 aj23 = aj2 * 3;
728 /* Load particle aj1-2 coordinates */
729 xmm1 = _mm_loadu_pd(x+aj13);
730 xmm2 = _mm_loadu_pd(x+aj23);
731 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
732 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
734 jz = _mm_loadl_pd(jz, x+aj13+2);
735 jz = _mm_loadh_pd(jz, x+aj23+2);
737 dx = _mm_sub_pd(ix,jx);
738 dy = _mm_sub_pd(iy,jy);
739 dz = _mm_sub_pd(iz,jz);
741 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
742 rinv = my_invrsq_pd(rsq11);
743 r = _mm_mul_pd(rinv,rsq11);
745 sk = _mm_loadl_pd(sk,born->param+aj1);
746 sk = _mm_loadh_pd(sk,born->param+aj2);
748 /* Load aj1,aj2 raj */
749 p1 = md->typeA[aj1];
750 p2 = md->typeA[aj2];
752 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
753 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
754 raj = _mm_sub_pd(raj,doffset);
756 /* Compute 1.0/raj */
757 raj_inv = my_inv_pd(raj);
759 /* INTERACTION aj->ai STARTS HERE */
760 /* conditional mask for rai<dr+sk */
761 xmm1 = _mm_add_pd(r,sk);
762 mask_cmp = _mm_cmplt_pd(rai,xmm1);
764 /* conditional for rai>dr-sk, ends with mask_cmp2 */
765 xmm2 = _mm_sub_pd(r,sk);
766 xmm3 = my_inv_pd(xmm2);
767 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
769 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
770 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
772 uij = my_inv_pd(xmm1);
773 lij2 = _mm_mul_pd(lij,lij);
774 lij3 = _mm_mul_pd(lij2,lij);
775 uij2 = _mm_mul_pd(uij,uij);
776 uij3 = _mm_mul_pd(uij2,uij);
778 diff2 = _mm_sub_pd(uij2,lij2);
780 lij_inv = my_invrsq_pd(lij2);
781 sk2 = _mm_mul_pd(sk,sk);
782 sk2_inv = _mm_mul_pd(sk2,rinv);
783 prod = _mm_mul_pd(qrtr,sk2_inv);
785 log_term = _mm_mul_pd(uij,lij_inv);
786 log_term = log_pd(log_term);
788 xmm1 = _mm_sub_pd(lij,uij);
789 xmm2 = _mm_mul_pd(qrtr,r);
790 xmm2 = _mm_mul_pd(xmm2,diff2);
791 xmm1 = _mm_add_pd(xmm1,xmm2);
792 xmm2 = _mm_mul_pd(half,rinv);
793 xmm2 = _mm_mul_pd(xmm2,log_term);
794 xmm1 = _mm_add_pd(xmm1,xmm2);
795 xmm9 = _mm_mul_pd(neg,diff2);
796 xmm2 = _mm_mul_pd(xmm9,prod);
797 tmp_ai = _mm_add_pd(xmm1,xmm2);
799 /* conditional for rai<sk-dr */
800 xmm3 = _mm_sub_pd(sk,r);
801 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
803 xmm4 = _mm_sub_pd(rai_inv,lij);
804 xmm4 = _mm_mul_pd(two,xmm4);
805 xmm4 = _mm_add_pd(tmp_ai,xmm4);
807 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
809 /* tmp_ai now contains two partial values, not all of which are to be used. */
810 /* Which ones are used is governed by the mask_cmp mask. */
811 tmp_ai = _mm_mul_pd(half,tmp_ai);
812 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
813 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
815 xmm2 = _mm_mul_pd(half,lij2);
816 xmm3 = _mm_mul_pd(prod,lij3);
817 xmm2 = _mm_add_pd(xmm2,xmm3);
818 xmm3 = _mm_mul_pd(lij,rinv);
819 xmm4 = _mm_mul_pd(lij3,r);
820 xmm3 = _mm_add_pd(xmm3,xmm4);
821 xmm3 = _mm_mul_pd(qrtr,xmm3);
822 t1 = _mm_sub_pd(xmm2,xmm3);
824 xmm2 = _mm_mul_pd(half,uij2);
825 xmm2 = _mm_mul_pd(neg,xmm2);
826 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
827 xmm3 = _mm_mul_pd(xmm3,uij3);
828 xmm2 = _mm_sub_pd(xmm2,xmm3);
829 xmm3 = _mm_mul_pd(uij,rinv);
830 xmm4 = _mm_mul_pd(uij3,r);
831 xmm3 = _mm_add_pd(xmm3,xmm4);
832 xmm3 = _mm_mul_pd(qrtr,xmm3);
833 t2 = _mm_add_pd(xmm2,xmm3);
835 xmm2 = _mm_mul_pd(sk2_inv,rinv);
836 xmm2 = _mm_add_pd(one,xmm2);
837 xmm2 = _mm_mul_pd(eigth,xmm2);
838 xmm2 = _mm_mul_pd(xmm2,xmm9);
839 xmm3 = _mm_mul_pd(log_term, rinv);
840 xmm3 = _mm_mul_pd(xmm3,rinv);
841 xmm3 = _mm_mul_pd(qrtr,xmm3);
842 t3 = _mm_add_pd(xmm2,xmm3);
844 /* chain rule terms */
845 xmm2 = _mm_mul_pd(dlij,t1);
846 xmm2 = _mm_add_pd(xmm2,t2);
847 xmm2 = _mm_add_pd(xmm2,t3);
849 /* temporary storage of chain rule terms, since we have to compute
850 the reciprocal terms also before storing them */
851 chrule = _mm_mul_pd(xmm2,rinv);
853 /* INTERACTION ai->aj starts here */
854 /* Conditional mask for raj<dr+sk_ai */
855 xmm1 = _mm_add_pd(r,sk_ai);
856 mask_cmp = _mm_cmplt_pd(raj,xmm1);
858 /* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
859 xmm2 = _mm_sub_pd(r,sk_ai);
860 xmm3 = my_inv_pd(xmm2);
861 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
863 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
864 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
866 uij = my_inv_pd(xmm1);
867 lij2 = _mm_mul_pd(lij,lij);
868 lij3 = _mm_mul_pd(lij2,lij);
869 uij2 = _mm_mul_pd(uij,uij);
870 uij3 = _mm_mul_pd(uij2,uij);
872 diff2 = _mm_sub_pd(uij2,lij2);
874 lij_inv = my_invrsq_pd(lij2);
876 sk2 = sk2_ai;
877 sk2_inv = _mm_mul_pd(sk2,rinv);
878 prod = _mm_mul_pd(qrtr,sk2_inv);
880 log_term = _mm_mul_pd(uij,lij_inv);
881 log_term = log_pd(log_term);
883 xmm1 = _mm_sub_pd(lij,uij);
884 xmm2 = _mm_mul_pd(qrtr,r);
885 xmm2 = _mm_mul_pd(xmm2,diff2);
886 xmm1 = _mm_add_pd(xmm1,xmm2);
887 xmm2 = _mm_mul_pd(half,rinv);
888 xmm2 = _mm_mul_pd(xmm2,log_term);
889 xmm1 = _mm_add_pd(xmm1,xmm2);
890 xmm9 = _mm_mul_pd(neg,diff2);
891 xmm2 = _mm_mul_pd(xmm9,prod);
892 tmp = _mm_add_pd(xmm1,xmm2);
894 /* conditional for raj<sk_ai-dr */
895 xmm3 = _mm_sub_pd(sk_ai,r);
896 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
898 xmm4 = _mm_sub_pd(raj_inv,lij);
899 xmm4 = _mm_mul_pd(two,xmm4);
900 xmm4 = _mm_add_pd(tmp,xmm4);
902 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
904 /* tmp now contains two partial values, not all of which are to be used. */
905 /* Which ones are used is governed by the mask_cmp mask. */
906 tmp = _mm_mul_pd(half,tmp);
907 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
909 /* Load, add and store ai->aj pol energy */
910 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
911 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
913 xmm7 = _mm_add_pd(xmm7,tmp);
915 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
916 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
918 /* Start chain rule terms */
919 xmm2 = _mm_mul_pd(half,lij2);
920 xmm3 = _mm_mul_pd(prod,lij3);
921 xmm2 = _mm_add_pd(xmm2,xmm3);
922 xmm3 = _mm_mul_pd(lij,rinv);
923 xmm4 = _mm_mul_pd(lij3,r);
924 xmm3 = _mm_add_pd(xmm3,xmm4);
925 xmm3 = _mm_mul_pd(qrtr,xmm3);
926 t1 = _mm_sub_pd(xmm2,xmm3);
928 xmm2 = _mm_mul_pd(half,uij2);
929 xmm2 = _mm_mul_pd(neg,xmm2);
930 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
931 xmm3 = _mm_mul_pd(xmm3,uij3);
932 xmm2 = _mm_sub_pd(xmm2,xmm3);
933 xmm3 = _mm_mul_pd(uij,rinv);
934 xmm4 = _mm_mul_pd(uij3,r);
935 xmm3 = _mm_add_pd(xmm3,xmm4);
936 xmm3 = _mm_mul_pd(qrtr,xmm3);
937 t2 = _mm_add_pd(xmm2,xmm3);
939 xmm2 = _mm_mul_pd(sk2_inv,rinv);
940 xmm2 = _mm_add_pd(one,xmm2);
941 xmm2 = _mm_mul_pd(eigth,xmm2);
942 xmm2 = _mm_mul_pd(xmm2,xmm9);
943 xmm3 = _mm_mul_pd(log_term, rinv);
944 xmm3 = _mm_mul_pd(xmm3,rinv);
945 xmm3 = _mm_mul_pd(qrtr,xmm3);
946 t3 = _mm_add_pd(xmm2,xmm3);
948 /* chain rule terms */
949 xmm2 = _mm_mul_pd(dlij,t1);
950 xmm2 = _mm_add_pd(xmm2,t2);
951 xmm2 = _mm_add_pd(xmm2,t3);
952 chrule_ai = _mm_mul_pd(xmm2,rinv);
954 /* Store chain rule terms,
955 * with the same unpacking rationale as in the Still kernel above. */
957 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
958 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
960 _mm_storeu_pd(fr->dadx+n, xmm3);
961 n = n + 2;
962 _mm_storeu_pd(fr->dadx+n, xmm4);
963 n = n + 2;
966 if(offset!=0)
968 aj1 = nl->jjnr[k];
969 aj13 = aj1 * 3;
970 p1 = md->typeA[aj1];
972 jx = _mm_load_sd(x+aj13);
973 jy = _mm_load_sd(x+aj13+1);
974 jz = _mm_load_sd(x+aj13+2);
976 sk = _mm_load_sd(born->param+aj1);
978 dx = _mm_sub_sd(ix,jx);
979 dy = _mm_sub_pd(iy,jy);
980 dz = _mm_sub_pd(iz,jz);
982 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
983 rinv = my_invrsq_pd(rsq11);
984 r = _mm_mul_sd(rinv,rsq11);
986 /* Load raj */
987 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
988 raj = _mm_sub_sd(raj,doffset);
989 raj_inv = my_inv_pd(raj);
991 /* OFFSET INTERACTION aj->ai STARTS HERE */
992 /* conditional mask for rai<dr+sk */
993 xmm1 = _mm_add_sd(r,sk);
994 mask_cmp = _mm_cmplt_sd(rai,xmm1);
996 /* conditional for rai>dr-sk, ends with mask_cmp2 */
997 xmm2 = _mm_sub_sd(r,sk);
998 xmm3 = my_inv_pd(xmm2);
999 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1001 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1002 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1004 uij = my_inv_pd(xmm1);
1005 lij2 = _mm_mul_sd(lij,lij);
1006 lij3 = _mm_mul_sd(lij2,lij);
1007 uij2 = _mm_mul_sd(uij,uij);
1008 uij3 = _mm_mul_sd(uij2,uij);
1010 diff2 = _mm_sub_sd(uij2,lij2);
1012 lij_inv = my_invrsq_pd(lij2);
1013 sk2 = _mm_mul_sd(sk,sk);
1014 sk2_inv = _mm_mul_sd(sk2,rinv);
1015 prod = _mm_mul_sd(qrtr,sk2_inv);
1017 log_term = _mm_mul_pd(uij,lij_inv);
1018 log_term = log_pd(log_term);
1020 xmm1 = _mm_sub_sd(lij,uij);
1021 xmm2 = _mm_mul_sd(qrtr,r);
1022 xmm2 = _mm_mul_sd(xmm2,diff2);
1023 xmm1 = _mm_add_sd(xmm1,xmm2);
1024 xmm2 = _mm_mul_sd(half,rinv);
1025 xmm2 = _mm_mul_sd(xmm2,log_term);
1026 xmm1 = _mm_add_sd(xmm1,xmm2);
1027 xmm9 = _mm_mul_sd(neg,diff2);
1028 xmm2 = _mm_mul_sd(xmm9,prod);
1030 tmp_ai = _mm_add_sd(xmm1,xmm2);
1032 /* conditional for rai<sk-dr */
1033 xmm3 = _mm_sub_sd(sk,r);
1034 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1036 xmm4 = _mm_sub_sd(rai_inv,lij);
1037 xmm4 = _mm_mul_sd(two,xmm4);
1038 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1040 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1042 /* tmp_ai now contains two partial values, not all of which are to be used. */
1043 /* Which ones are used is governed by the mask_cmp mask. */
1044 tmp_ai = _mm_mul_pd(half,tmp_ai);
1045 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1046 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1048 xmm2 = _mm_mul_sd(half,lij2);
1049 xmm3 = _mm_mul_sd(prod,lij3);
1050 xmm2 = _mm_add_sd(xmm2,xmm3);
1051 xmm3 = _mm_mul_sd(lij,rinv);
1052 xmm4 = _mm_mul_sd(lij3,r);
1053 xmm3 = _mm_add_sd(xmm3,xmm4);
1054 xmm3 = _mm_mul_sd(qrtr,xmm3);
1055 t1 = _mm_sub_sd(xmm2,xmm3);
1057 xmm2 = _mm_mul_sd(half,uij2);
1058 xmm2 = _mm_mul_sd(neg,xmm2);
1059 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1060 xmm3 = _mm_mul_sd(xmm3,uij3);
1061 xmm2 = _mm_sub_sd(xmm2,xmm3);
1062 xmm3 = _mm_mul_sd(uij,rinv);
1063 xmm4 = _mm_mul_sd(uij3,r);
1064 xmm3 = _mm_add_sd(xmm3,xmm4);
1065 xmm3 = _mm_mul_sd(qrtr,xmm3);
1066 t2 = _mm_add_sd(xmm2,xmm3);
1068 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1069 xmm2 = _mm_add_sd(one,xmm2);
1070 xmm2 = _mm_mul_sd(eigth,xmm2);
1071 xmm2 = _mm_mul_sd(xmm2,xmm9);
1072 xmm3 = _mm_mul_sd(log_term, rinv);
1073 xmm3 = _mm_mul_sd(xmm3,rinv);
1074 xmm3 = _mm_mul_sd(qrtr,xmm3);
1075 t3 = _mm_add_sd(xmm2,xmm3);
1077 /* chain rule terms */
1078 xmm2 = _mm_mul_sd(dlij,t1);
1079 xmm2 = _mm_add_sd(xmm2,t2);
1080 xmm2 = _mm_add_sd(xmm2,t3);
1082 /* temporary storage of chain rule terms, since we have to compute
1083 the reciprocal terms also before storing them */
1084 chrule = _mm_mul_sd(xmm2,rinv);
1086 /* OFFSET INTERACTION ai->aj starts here */
1087 /* conditional mask for raj<dr+sk_ai */
1088 xmm1 = _mm_add_sd(r,sk_ai);
1089 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1091 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1092 xmm2 = _mm_sub_sd(r,sk_ai);
1093 xmm3 = my_inv_pd(xmm2);
1094 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1096 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1097 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1099 uij = my_inv_pd(xmm1);
1100 lij2 = _mm_mul_sd(lij,lij);
1101 lij3 = _mm_mul_sd(lij2,lij);
1102 uij2 = _mm_mul_sd(uij,uij);
1103 uij3 = _mm_mul_sd(uij2,uij);
1105 diff2 = _mm_sub_sd(uij2,lij2);
1107 lij_inv = my_invrsq_pd(lij2);
1109 sk2 = sk2_ai;
1110 sk2_inv = _mm_mul_sd(sk2,rinv);
1111 prod = _mm_mul_sd(qrtr,sk2_inv);
1113 log_term = _mm_mul_pd(uij,lij_inv);
1114 log_term = log_pd(log_term);
1116 xmm1 = _mm_sub_sd(lij,uij);
1117 xmm2 = _mm_mul_sd(qrtr,r);
1118 xmm2 = _mm_mul_sd(xmm2,diff2);
1119 xmm1 = _mm_add_sd(xmm1,xmm2);
1120 xmm2 = _mm_mul_sd(half,rinv);
1121 xmm2 = _mm_mul_sd(xmm2,log_term);
1122 xmm1 = _mm_add_sd(xmm1,xmm2);
1123 xmm9 = _mm_mul_sd(neg,diff2);
1124 xmm2 = _mm_mul_sd(xmm9,prod);
1125 tmp = _mm_add_sd(xmm1,xmm2);
1127 /* conditional for raj<sk_ai-dr */
1128 xmm3 = _mm_sub_sd(sk_ai,r);
1129 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1131 xmm4 = _mm_sub_sd(raj_inv,lij);
1132 xmm4 = _mm_mul_sd(two,xmm4);
1133 xmm4 = _mm_add_sd(tmp,xmm4);
1135 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1137 /* tmp now contains two partial values, not all of which are to be used. */
1138 /* Which ones are used is governed by the mask_cmp mask. */
1139 tmp = _mm_mul_pd(half,tmp);
1140 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1142 /* Load, add and store gpol energy */
1143 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1144 xmm7 = _mm_add_sd(xmm7,tmp);
1145 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1147 /* Start chain rule terms, t1 */
1148 xmm2 = _mm_mul_sd(half,lij2);
1149 xmm3 = _mm_mul_sd(prod,lij3);
1150 xmm2 = _mm_add_sd(xmm2,xmm3);
1151 xmm3 = _mm_mul_sd(lij,rinv);
1152 xmm4 = _mm_mul_sd(lij3,r);
1153 xmm3 = _mm_add_sd(xmm3,xmm4);
1154 xmm3 = _mm_mul_sd(qrtr,xmm3);
1155 t1 = _mm_sub_sd(xmm2,xmm3);
1157 xmm2 = _mm_mul_sd(half,uij2);
1158 xmm2 = _mm_mul_sd(neg,xmm2);
1159 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1160 xmm3 = _mm_mul_sd(xmm3,uij3);
1161 xmm2 = _mm_sub_sd(xmm2,xmm3);
1162 xmm3 = _mm_mul_sd(uij,rinv);
1163 xmm4 = _mm_mul_sd(uij3,r);
1164 xmm3 = _mm_add_sd(xmm3,xmm4);
1165 xmm3 = _mm_mul_sd(qrtr,xmm3);
1166 t2 = _mm_add_sd(xmm2,xmm3);
1168 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1169 xmm2 = _mm_add_sd(one,xmm2);
1170 xmm2 = _mm_mul_sd(eigth,xmm2);
1171 xmm2 = _mm_mul_sd(xmm2,xmm9);
1172 xmm3 = _mm_mul_sd(log_term, rinv);
1173 xmm3 = _mm_mul_sd(xmm3,rinv);
1174 xmm3 = _mm_mul_sd(qrtr,xmm3);
1175 t3 = _mm_add_sd(xmm2,xmm3);
1177 /* chain rule terms */
1178 xmm2 = _mm_mul_sd(dlij,t1);
1179 xmm2 = _mm_add_sd(xmm2,t2);
1180 xmm2 = _mm_add_sd(xmm2,t3);
1181 chrule_ai = _mm_mul_sd(xmm2,rinv);
1183 _mm_store_sd(fr->dadx+n, chrule);
1184 n = n + 1;
1185 _mm_store_sd(fr->dadx+n, chrule_ai);
1186 n = n + 1;
1189 /* Do end processing ... */
1190 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1191 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1192 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1194 /* Load, add and store atom ai polarisation energy */
1195 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1196 sum_ai = _mm_add_sd(sum_ai,xmm2);
1197 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1200 /* Parallel summations */
1201 if(PARTDECOMP(cr))
1203 gmx_sum(natoms, born->gpol_hct_work, cr);
1205 else if(DOMAINDECOMP(cr))
1207 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1210 /* Compute the radii */
1211 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1213 if(born->use[i] != 0)
1215 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1216 sum = 1.0/rr - born->gpol_hct_work[i];
1217 min_rad = rr + doff;
1218 rad = 1.0/sum;
1220 born->bRad[i] = rad > min_rad ? rad : min_rad;
1221 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1225 /* Extra (local) communication required for DD */
1226 if(DOMAINDECOMP(cr))
1228 dd_atom_spread_real(cr->dd, born->bRad);
1229 dd_atom_spread_real(cr->dd, fr->invsqrta);
1232 return 0;
1235 int
1236 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1237 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1239 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1240 int p1,p2,p3,p4,shift;
1241 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1242 double tsum,tchain,rr_inv,rr_inv2,gbr;
1243 double shX,shY,shZ;
1245 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1246 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1247 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1248 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1249 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1250 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1251 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1252 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1254 const __m128d neg = {-1.0, -1.0};
1255 const __m128d zero = {0.0, 0.0};
1256 const __m128d eigth = {0.125, 0.125};
1257 const __m128d qrtr = {0.25, 0.25};
1258 const __m128d half = {0.5, 0.5};
1259 const __m128d one = {1.0, 1.0};
1260 const __m128d two = {2.0, 2.0};
1261 const __m128d three = {3.0, 3.0};
1263 /* Keep the compiler happy */
1264 tmp_ai = _mm_setzero_pd();
1265 tmp = _mm_setzero_pd();
1266 raj_inv = _mm_setzero_pd();
1267 raj = _mm_setzero_pd();
1268 sk = _mm_setzero_pd();
1269 jx = _mm_setzero_pd();
1270 jy = _mm_setzero_pd();
1271 jz = _mm_setzero_pd();
1272 xmm7 = _mm_setzero_pd();
1273 xmm8 = _mm_setzero_pd();
1275 /* Set the dielectric offset */
1276 doff = born->gb_doffset;
1277 doffset = _mm_load1_pd(&doff);
1279 n = 0;
1281 for(i=0;i<born->nr;i++)
1283 born->gpol_hct_work[i] = 0;
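/* The pair loop below is the same descreening calculation as in
 * calc_gb_rad_hct_sse2_double() above (identical sums into gpol_hct_work[]
 * and identical dadx layout); only the final conversion of the sums into
 * Born radii differs, using the OBC tanh rescaling after the parallel
 * summation.
 */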
1286 for(i=0;i<nl->nri;i++)
1288 ai = nl->iinr[i];
1289 ai3 = ai * 3;
1291 nj0 = nl->jindex[i];
1292 nj1 = nl->jindex[i+1];
1294 /* Load shifts for this list */
1295 shift = nl->shift[i];
1296 shX = fr->shift_vec[shift][0];
1297 shY = fr->shift_vec[shift][1];
1298 shZ = fr->shift_vec[shift][2];
1300 /* Splat the shifts */
1301 sX = _mm_load1_pd(&shX);
1302 sY = _mm_load1_pd(&shY);
1303 sZ = _mm_load1_pd(&shZ);
1305 offset = (nj1-nj0)%2;
1307 /* Load rai */
1308 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1309 rai = _mm_load1_pd(&rr);
1310 rr = 1.0/rr;
1311 rai_inv = _mm_load1_pd(&rr);
1313 /* Load ai coordinates and add shifts */
1314 ix = _mm_load1_pd(x+ai3);
1315 iy = _mm_load1_pd(x+ai3+1);
1316 iz = _mm_load1_pd(x+ai3+2);
1318 ix = _mm_add_pd(sX,ix);
1319 iy = _mm_add_pd(sY,iy);
1320 iz = _mm_add_pd(sZ,iz);
1322 /* Zero out sums for polarisation energies */
1323 sum_ai = _mm_setzero_pd();
1325 sk_ai = _mm_load1_pd(born->param+ai);
1326 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1328 for(k=nj0;k<nj1-offset;k+=2)
1330 aj1 = nl->jjnr[k];
1331 aj2 = nl->jjnr[k+1];
1333 aj13 = aj1 * 3;
1334 aj23 = aj2 * 3;
1336 /* Load particle aj1-2 coordinates */
1337 xmm1 = _mm_loadu_pd(x+aj13);
1338 xmm2 = _mm_loadu_pd(x+aj23);
1339 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1340 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1342 jz = _mm_loadl_pd(jz, x+aj13+2);
1343 jz = _mm_loadh_pd(jz, x+aj23+2);
1345 dx = _mm_sub_pd(ix,jx);
1346 dy = _mm_sub_pd(iy,jy);
1347 dz = _mm_sub_pd(iz,jz);
1349 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1350 rinv = my_invrsq_pd(rsq11);
1351 r = _mm_mul_pd(rinv,rsq11);
1353 /* Load atom aj1,aj2 raj */
1354 p1 = md->typeA[aj1];
1355 p2 = md->typeA[aj2];
1357 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1358 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1359 raj = _mm_sub_pd(raj,doffset);
1361 /* Compute 1.0/raj */
1362 raj_inv = my_inv_pd(raj);
1364 sk = _mm_loadl_pd(sk,born->param+aj1);
1365 sk = _mm_loadh_pd(sk,born->param+aj2);
1367 /* INTERACTION aj->ai STARTS HERE */
1368 /* conditional mask for rai<dr+sk */
1369 xmm1 = _mm_add_pd(r,sk);
1370 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1372 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1373 xmm2 = _mm_sub_pd(r,sk);
1374 xmm3 = my_inv_pd(xmm2);
1375 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1377 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1378 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1380 uij = my_inv_pd(xmm1);
1381 lij2 = _mm_mul_pd(lij,lij);
1382 lij3 = _mm_mul_pd(lij2,lij);
1383 uij2 = _mm_mul_pd(uij,uij);
1384 uij3 = _mm_mul_pd(uij2,uij);
1386 diff2 = _mm_sub_pd(uij2,lij2);
1388 lij_inv = my_invrsq_pd(lij2);
1389 sk2 = _mm_mul_pd(sk,sk);
1390 sk2_inv = _mm_mul_pd(sk2,rinv);
1391 prod = _mm_mul_pd(qrtr,sk2_inv);
1393 log_term = _mm_mul_pd(uij,lij_inv);
1394 log_term = log_pd(log_term);
1396 xmm1 = _mm_sub_pd(lij,uij);
1397 xmm2 = _mm_mul_pd(qrtr,r);
1398 xmm2 = _mm_mul_pd(xmm2,diff2);
1399 xmm1 = _mm_add_pd(xmm1,xmm2);
1400 xmm2 = _mm_mul_pd(half,rinv);
1401 xmm2 = _mm_mul_pd(xmm2,log_term);
1402 xmm1 = _mm_add_pd(xmm1,xmm2);
1403 xmm9 = _mm_mul_pd(neg,diff2);
1404 xmm2 = _mm_mul_pd(xmm9,prod);
1405 tmp_ai = _mm_add_pd(xmm1,xmm2);
1407 /* conditional for rai<sk-dr */
1408 xmm3 = _mm_sub_pd(sk,r);
1409 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1411 xmm4 = _mm_sub_pd(rai_inv,lij);
1412 xmm4 = _mm_mul_pd(two,xmm4);
1413 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1415 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1417 /* tmp_ai now contains two partial values, not all of which are to be used. */
1418 /* Which ones are used is governed by the mask_cmp mask. */
1419 tmp_ai = _mm_mul_pd(half,tmp_ai);
1420 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1421 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1423 /* Start the dadx chain rule terms */
1424 xmm2 = _mm_mul_pd(half,lij2);
1425 xmm3 = _mm_mul_pd(prod,lij3);
1426 xmm2 = _mm_add_pd(xmm2,xmm3);
1427 xmm3 = _mm_mul_pd(lij,rinv);
1428 xmm4 = _mm_mul_pd(lij3,r);
1429 xmm3 = _mm_add_pd(xmm3,xmm4);
1430 xmm3 = _mm_mul_pd(qrtr,xmm3);
1431 t1 = _mm_sub_pd(xmm2,xmm3);
1433 xmm2 = _mm_mul_pd(half,uij2);
1434 xmm2 = _mm_mul_pd(neg,xmm2);
1435 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1436 xmm3 = _mm_mul_pd(xmm3,uij3);
1437 xmm2 = _mm_sub_pd(xmm2,xmm3);
1438 xmm3 = _mm_mul_pd(uij,rinv);
1439 xmm4 = _mm_mul_pd(uij3,r);
1440 xmm3 = _mm_add_pd(xmm3,xmm4);
1441 xmm3 = _mm_mul_pd(qrtr,xmm3);
1442 t2 = _mm_add_pd(xmm2,xmm3);
1444 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1445 xmm2 = _mm_add_pd(one,xmm2);
1446 xmm2 = _mm_mul_pd(eigth,xmm2);
1447 xmm2 = _mm_mul_pd(xmm2,xmm9);
1448 xmm3 = _mm_mul_pd(log_term, rinv);
1449 xmm3 = _mm_mul_pd(xmm3,rinv);
1450 xmm3 = _mm_mul_pd(qrtr,xmm3);
1451 t3 = _mm_add_pd(xmm2,xmm3);
1453 /* chain rule terms */
1454 xmm2 = _mm_mul_pd(dlij,t1);
1455 xmm2 = _mm_add_pd(xmm2,t2);
1456 xmm2 = _mm_add_pd(xmm2,t3);
1458 /* temporary storage of chain rule terms, since we have to compute
1459 the reciprocal terms also before storing them */
1460 chrule = _mm_mul_pd(xmm2,rinv);
1462 /* INTERACTION ai->aj STARTS HERE */
1463 /* conditional mask for raj<dr+sk_ai */
1464 xmm1 = _mm_add_pd(r,sk_ai);
1465 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1467 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1468 xmm2 = _mm_sub_pd(r,sk_ai);
1469 xmm3 = my_inv_pd(xmm2);
1470 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1472 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1473 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1475 uij = my_inv_pd(xmm1);
1476 lij2 = _mm_mul_pd(lij,lij);
1477 lij3 = _mm_mul_pd(lij2,lij);
1478 uij2 = _mm_mul_pd(uij,uij);
1479 uij3 = _mm_mul_pd(uij2,uij);
1481 diff2 = _mm_sub_pd(uij2,lij2);
1483 lij_inv = my_invrsq_pd(lij2);
1485 sk2 = sk2_ai;
1486 sk2_inv = _mm_mul_pd(sk2,rinv);
1487 prod = _mm_mul_pd(qrtr,sk2_inv);
1489 log_term = _mm_mul_pd(uij,lij_inv);
1490 log_term = log_pd(log_term);
1492 xmm1 = _mm_sub_pd(lij,uij);
1493 xmm2 = _mm_mul_pd(qrtr,r);
1494 xmm2 = _mm_mul_pd(xmm2,diff2);
1495 xmm1 = _mm_add_pd(xmm1,xmm2);
1496 xmm2 = _mm_mul_pd(half,rinv);
1497 xmm2 = _mm_mul_pd(xmm2,log_term);
1498 xmm1 = _mm_add_pd(xmm1,xmm2);
1499 xmm9 = _mm_mul_pd(neg,diff2);
1500 xmm2 = _mm_mul_pd(xmm9,prod);
1501 tmp = _mm_add_pd(xmm1,xmm2);
1503 /* conditional for raj<sk_ai-dr */
1504 xmm3 = _mm_sub_pd(sk_ai,r);
1505 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1507 xmm4 = _mm_sub_pd(raj_inv,lij);
1508 xmm4 = _mm_mul_pd(two,xmm4);
1509 xmm4 = _mm_add_pd(tmp,xmm4);
1511 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1513 /* tmp now contains two partial values, not all of which are to be used. */
1514 /* Which ones are used is governed by the mask_cmp mask. */
1515 tmp = _mm_mul_pd(half,tmp);
1516 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1518 /* Load, add and store ai->aj pol energy */
1519 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1520 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1522 xmm7 = _mm_add_pd(xmm7,tmp);
1524 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1525 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1527 /* Start dadx chain rule terms */
1528 xmm2 = _mm_mul_pd(half,lij2);
1529 xmm3 = _mm_mul_pd(prod,lij3);
1530 xmm2 = _mm_add_pd(xmm2,xmm3);
1531 xmm3 = _mm_mul_pd(lij,rinv);
1532 xmm4 = _mm_mul_pd(lij3,r);
1533 xmm3 = _mm_add_pd(xmm3,xmm4);
1534 xmm3 = _mm_mul_pd(qrtr,xmm3);
1535 t1 = _mm_sub_pd(xmm2,xmm3);
1537 xmm2 = _mm_mul_pd(half,uij2);
1538 xmm2 = _mm_mul_pd(neg,xmm2);
1539 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1540 xmm3 = _mm_mul_pd(xmm3,uij3);
1541 xmm2 = _mm_sub_pd(xmm2,xmm3);
1542 xmm3 = _mm_mul_pd(uij,rinv);
1543 xmm4 = _mm_mul_pd(uij3,r);
1544 xmm3 = _mm_add_pd(xmm3,xmm4);
1545 xmm3 = _mm_mul_pd(qrtr,xmm3);
1546 t2 = _mm_add_pd(xmm2,xmm3);
1548 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1549 xmm2 = _mm_add_pd(one,xmm2);
1550 xmm2 = _mm_mul_pd(eigth,xmm2);
1551 xmm2 = _mm_mul_pd(xmm2,xmm9);
1552 xmm3 = _mm_mul_pd(log_term, rinv);
1553 xmm3 = _mm_mul_pd(xmm3,rinv);
1554 xmm3 = _mm_mul_pd(qrtr,xmm3);
1555 t3 = _mm_add_pd(xmm2,xmm3);
1557 /* chain rule terms */
1558 xmm2 = _mm_mul_pd(dlij,t1);
1559 xmm2 = _mm_add_pd(xmm2,t2);
1560 xmm2 = _mm_add_pd(xmm2,t3);
1561 chrule_ai = _mm_mul_pd(xmm2,rinv);
1563 /* Store chain rule terms,
1564 * with the same unpacking rationale as in the Still kernel above. */
1566 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1567 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1569 _mm_storeu_pd(fr->dadx+n, xmm3);
1570 n = n + 2;
1571 _mm_storeu_pd(fr->dadx+n, xmm4);
1572 n = n + 2;
1575 if(offset!=0)
1577 aj1 = nl->jjnr[k];
1578 aj13 = aj1 * 3;
1579 p1 = md->typeA[aj1];
1581 jx = _mm_load_sd(x+aj13);
1582 jy = _mm_load_sd(x+aj13+1);
1583 jz = _mm_load_sd(x+aj13+2);
1585 sk = _mm_load_sd(born->param+aj1);
1587 /* Load raj */
1588 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1589 raj = _mm_sub_sd(raj,doffset);
1590 raj_inv = my_inv_pd(raj);
1592 dx = _mm_sub_sd(ix,jx);
1593 dy = _mm_sub_pd(iy,jy);
1594 dz = _mm_sub_pd(iz,jz);
1596 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1597 rinv = my_invrsq_pd(rsq11);
1598 r = _mm_mul_sd(rinv,rsq11);
1600 /* OFFSET INTERACTION aj->ai STARTS HERE */
1601 /* conditional mask for rai<dr+sk */
1602 xmm1 = _mm_add_sd(r,sk);
1603 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1605 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1606 xmm2 = _mm_sub_sd(r,sk);
1607 xmm3 = my_inv_pd(xmm2);
1608 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1610 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1611 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1613 uij = my_inv_pd(xmm1);
1614 lij2 = _mm_mul_sd(lij,lij);
1615 lij3 = _mm_mul_sd(lij2,lij);
1616 uij2 = _mm_mul_sd(uij,uij);
1617 uij3 = _mm_mul_sd(uij2,uij);
1619 diff2 = _mm_sub_sd(uij2,lij2);
1621 lij_inv = my_invrsq_pd(lij2);
1622 sk2 = _mm_mul_sd(sk,sk);
1623 sk2_inv = _mm_mul_sd(sk2,rinv);
1624 prod = _mm_mul_sd(qrtr,sk2_inv);
1626 log_term = _mm_mul_pd(uij,lij_inv);
1627 log_term = log_pd(log_term);
1629 xmm1 = _mm_sub_sd(lij,uij);
1630 xmm2 = _mm_mul_sd(qrtr,r);
1631 xmm2 = _mm_mul_sd(xmm2,diff2);
1632 xmm1 = _mm_add_sd(xmm1,xmm2);
1633 xmm2 = _mm_mul_sd(half,rinv);
1634 xmm2 = _mm_mul_sd(xmm2,log_term);
1635 xmm1 = _mm_add_sd(xmm1,xmm2);
1636 xmm9 = _mm_mul_sd(neg,diff2);
1637 xmm2 = _mm_mul_sd(xmm9,prod);
1639 tmp_ai = _mm_add_sd(xmm1,xmm2);
1641 /* conditional for rai<sk-dr */
1642 xmm3 = _mm_sub_sd(sk,r);
1643 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1645 xmm4 = _mm_sub_sd(rai_inv,lij);
1646 xmm4 = _mm_mul_sd(two,xmm4);
1647 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1649 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1651 /* tmp_ai now contains two partial values, not all of which are to be used. */
1652 /* Which ones are used is governed by the mask_cmp mask. */
1653 tmp_ai = _mm_mul_pd(half,tmp_ai);
1654 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1655 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1657 xmm2 = _mm_mul_sd(half,lij2);
1658 xmm3 = _mm_mul_sd(prod,lij3);
1659 xmm2 = _mm_add_sd(xmm2,xmm3);
1660 xmm3 = _mm_mul_sd(lij,rinv);
1661 xmm4 = _mm_mul_sd(lij3,r);
1662 xmm3 = _mm_add_sd(xmm3,xmm4);
1663 xmm3 = _mm_mul_sd(qrtr,xmm3);
1664 t1 = _mm_sub_sd(xmm2,xmm3);
1666 xmm2 = _mm_mul_sd(half,uij2);
1667 xmm2 = _mm_mul_sd(neg,xmm2);
1668 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1669 xmm3 = _mm_mul_sd(xmm3,uij3);
1670 xmm2 = _mm_sub_sd(xmm2,xmm3);
1671 xmm3 = _mm_mul_sd(uij,rinv);
1672 xmm4 = _mm_mul_sd(uij3,r);
1673 xmm3 = _mm_add_sd(xmm3,xmm4);
1674 xmm3 = _mm_mul_sd(qrtr,xmm3);
1675 t2 = _mm_add_sd(xmm2,xmm3);
1677 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1678 xmm2 = _mm_add_sd(one,xmm2);
1679 xmm2 = _mm_mul_sd(eigth,xmm2);
1680 xmm2 = _mm_mul_sd(xmm2,xmm9);
1681 xmm3 = _mm_mul_sd(log_term, rinv);
1682 xmm3 = _mm_mul_sd(xmm3,rinv);
1683 xmm3 = _mm_mul_sd(qrtr,xmm3);
1684 t3 = _mm_add_sd(xmm2,xmm3);
1686 /* chain rule terms */
1687 xmm2 = _mm_mul_sd(dlij,t1);
1688 xmm2 = _mm_add_sd(xmm2,t2);
1689 xmm2 = _mm_add_sd(xmm2,t3);
1690 chrule = _mm_mul_sd(xmm2,rinv);
1692 /* OFFSET INTERACTION ai->aj STARTS HERE */
1693 /* conditional mask for raj<dr+sk_ai */
1694 xmm1 = _mm_add_sd(r,sk_ai);
1695 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1697 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1698 xmm2 = _mm_sub_sd(r,sk_ai);
1699 xmm3 = my_inv_pd(xmm2);
1700 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1702 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1703 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1705 uij = my_inv_pd(xmm1);
1706 lij2 = _mm_mul_sd(lij,lij);
1707 lij3 = _mm_mul_sd(lij2,lij);
1708 uij2 = _mm_mul_sd(uij,uij);
1709 uij3 = _mm_mul_sd(uij2,uij);
1711 diff2 = _mm_sub_sd(uij2,lij2);
1713 lij_inv = my_invrsq_pd(lij2);
1715 sk2 = sk2_ai;
1716 sk2_inv = _mm_mul_sd(sk2,rinv);
1717 prod = _mm_mul_sd(qrtr,sk2_inv);
1719 log_term = _mm_mul_pd(uij,lij_inv);
1720 log_term = log_pd(log_term);
1722 xmm1 = _mm_sub_sd(lij,uij);
1723 xmm2 = _mm_mul_sd(qrtr,r);
1724 xmm2 = _mm_mul_sd(xmm2,diff2);
1725 xmm1 = _mm_add_sd(xmm1,xmm2);
1726 xmm2 = _mm_mul_sd(half,rinv);
1727 xmm2 = _mm_mul_sd(xmm2,log_term);
1728 xmm1 = _mm_add_sd(xmm1,xmm2);
1729 xmm9 = _mm_mul_sd(neg,diff2);
1730 xmm2 = _mm_mul_sd(xmm9,prod);
1731 tmp = _mm_add_sd(xmm1,xmm2);
1733 /* conditional for raj<sk_ai-dr */
1734 xmm3 = _mm_sub_sd(sk_ai,r);
1735 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1737 xmm4 = _mm_sub_sd(raj_inv,lij);
1738 xmm4 = _mm_mul_sd(two,xmm4);
1739 xmm4 = _mm_add_sd(tmp,xmm4);
1741 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1743 /* tmp now contains two partial values, not all of which are to be used. */
1744 /* Which ones are used is governed by the mask_cmp mask. */
1745 tmp = _mm_mul_pd(half,tmp);
1746 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1748 /* Load, add and store gpol energy */
1749 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1750 xmm7 = _mm_add_sd(xmm7,tmp);
1751 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1753 /* Start chain rule terms, t1 */
1754 xmm2 = _mm_mul_sd(half,lij2);
1755 xmm3 = _mm_mul_sd(prod,lij3);
1756 xmm2 = _mm_add_sd(xmm2,xmm3);
1757 xmm3 = _mm_mul_sd(lij,rinv);
1758 xmm4 = _mm_mul_sd(lij3,r);
1759 xmm3 = _mm_add_sd(xmm3,xmm4);
1760 xmm3 = _mm_mul_sd(qrtr,xmm3);
1761 t1 = _mm_sub_sd(xmm2,xmm3);
1763 xmm2 = _mm_mul_sd(half,uij2);
1764 xmm2 = _mm_mul_sd(neg,xmm2);
1765 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1766 xmm3 = _mm_mul_sd(xmm3,uij3);
1767 xmm2 = _mm_sub_sd(xmm2,xmm3);
1768 xmm3 = _mm_mul_sd(uij,rinv);
1769 xmm4 = _mm_mul_sd(uij3,r);
1770 xmm3 = _mm_add_sd(xmm3,xmm4);
1771 xmm3 = _mm_mul_sd(qrtr,xmm3);
1772 t2 = _mm_add_sd(xmm2,xmm3);
1774 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1775 xmm2 = _mm_add_sd(one,xmm2);
1776 xmm2 = _mm_mul_sd(eigth,xmm2);
1777 xmm2 = _mm_mul_sd(xmm2,xmm9);
1778 xmm3 = _mm_mul_sd(log_term, rinv);
1779 xmm3 = _mm_mul_sd(xmm3,rinv);
1780 xmm3 = _mm_mul_sd(qrtr,xmm3);
1781 t3 = _mm_add_sd(xmm2,xmm3);
1783 /* chain rule terms */
1784 xmm2 = _mm_mul_sd(dlij,t1);
1785 xmm2 = _mm_add_sd(xmm2,t2);
1786 xmm2 = _mm_add_sd(xmm2,t3);
1787 chrule_ai = _mm_mul_sd(xmm2,rinv);
1789 _mm_store_sd(fr->dadx+n, chrule);
1790 n = n + 1;
1791 _mm_store_sd(fr->dadx+n, chrule_ai);
1792 n = n + 1;
1795 /* Do end processing ... */
1796 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1797 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1798 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1800 /* Load, add and store atom ai polarisation energy */
1801 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1802 sum_ai = _mm_add_sd(sum_ai,xmm2);
1803 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1806 /* Parallel summations */
1807 if(PARTDECOMP(cr))
1809 gmx_sum(natoms, born->gpol_hct_work, cr);
1811 else if(DOMAINDECOMP(cr))
1813 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1816 /* Compute the radii */
1817 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1819 if(born->use[i] != 0)
1821 rr = top->atomtypes.gb_radius[md->typeA[i]];
1822 rr_inv2 = 1.0/rr;
1823 rr = rr-doff;
1824 rr_inv = 1.0/rr;
1825 sum = rr * born->gpol_hct_work[i];
1826 sum2 = sum * sum;
1827 sum3 = sum2 * sum;
1829 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1830 born->bRad[i] = rr_inv - tsum*rr_inv2;
1831 born->bRad[i] = 1.0 / born->bRad[i];
1833 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1835 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1836 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
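/* In scalar form the loop above implements the OBC rescaling (illustrative
 * sketch below is not compiled; names are local to the sketch):
 *
 *   psi  = (rho - doffset) * I          with I = gpol_hct_work[i]
 *   1/R  = 1/(rho - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho
 *
 * and drobc[i] stores the corresponding chain-rule factor used later when the
 * forces are assembled.
 */
#if 0
static double obc_radius(double rho, double doffset, double I,
                         double alpha, double beta, double gamma)
{
    double rho_t = rho - doffset;
    double psi   = rho_t*I;
    double tsum  = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    return 1.0/(1.0/rho_t - tsum/rho);
}
#endif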
1840 /* Extra (local) communication required for DD */
1841 if(DOMAINDECOMP(cr))
1843 dd_atom_spread_real(cr->dd, born->bRad);
1844 dd_atom_spread_real(cr->dd, fr->invsqrta);
1845 dd_atom_spread_real(cr->dd, born->drobc);
1848 return 0;
1854 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1855 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1857 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1858 int shift;
1859 double rbi,shX,shY,shZ;
1860 double *rb;
1862 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1863 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1864 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1866 t1 = t2 = t3 = _mm_setzero_pd();
1867 rb = born->work;
1869 /* Loop to get the proper form for the Born radius term */
1870 if(gb_algorithm==egbSTILL) {
1871 for(i=0;i<natoms;i++)
1873 rbi = born->bRad[i];
1874 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1878 else if(gb_algorithm==egbHCT) {
1879 for(i=0;i<natoms;i++)
1881 rbi = born->bRad[i];
1882 rb[i] = rbi * rbi * dvda[i];
1886 else if(gb_algorithm==egbOBC) {
1887 for(i=0;i<natoms;i++)
1889 rbi = born->bRad[i];
1890 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1894 n = 0;
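/* rb[i] converts dvda[i], the derivative of the energy with respect to the
 * Born radius of atom i, into a derivative with respect to the per-atom sum
 * accumulated by the radius kernels, so it can be combined directly with the
 * dadx factors stored there. For each pair the loop below then forms the
 * scalar prefactor
 *     fgb = rb[ai]*dadx(ai due to aj) + rb[aj]*dadx(aj due to ai)
 * multiplies it by the distance vector, adds the result to the force on ai
 * (and the i-list shift force) and subtracts it from the force on aj.
 */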
1896 for(i=0;i<nl->nri;i++)
1898 ai = nl->iinr[i];
1899 ai3 = ai * 3;
1901 nj0 = nl->jindex[i];
1902 nj1 = nl->jindex[i+1];
1904 /* Load shifts for this list */
1905 shift = 3*nl->shift[i];
1906 shX = shift_vec[shift+0];
1907 shY = shift_vec[shift+1];
1908 shZ = shift_vec[shift+2];
1910 /* Splat the shifts */
1911 sX = _mm_load1_pd(&shX);
1912 sY = _mm_load1_pd(&shY);
1913 sZ = _mm_load1_pd(&shZ);
1915 offset = (nj1-nj0)%2;
1917 /* Load particle ai coordinates and add shifts */
1918 ix = _mm_load1_pd(xd+ai3);
1919 iy = _mm_load1_pd(xd+ai3+1);
1920 iz = _mm_load1_pd(xd+ai3+2);
1922 ix = _mm_add_pd(sX,ix);
1923 iy = _mm_add_pd(sY,iy);
1924 iz = _mm_add_pd(sZ,iz);
1926 /* Load particle ai dvda */
1927 dva = _mm_load1_pd(rb+ai);
1929 fix = _mm_setzero_pd();
1930 fiy = _mm_setzero_pd();
1931 fiz = _mm_setzero_pd();
1933 for(k=nj0;k<nj1-offset;k+=2)
1935 aj1 = nl->jjnr[k];
1936 aj2 = nl->jjnr[k+1];
1938 /* Load dvda_j */
1939 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1941 aj13 = aj1 * 3;
1942 aj23 = aj2 * 3;
1944 /* Load particle aj1-2 coordinates */
1945 xmm1 = _mm_loadu_pd(xd+aj13);
1946 xmm2 = _mm_loadu_pd(xd+aj23);
1948 xmm5 = _mm_load_sd(xd+aj13+2);
1949 xmm6 = _mm_load_sd(xd+aj23+2);
1951 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1952 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1953 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1955 /* load derivative of born radii w.r.t. coordinates */
1956 xmm7 = _mm_loadu_pd(dadx+n);
1957 n = n + 2;
1958 xmm8 = _mm_loadu_pd(dadx+n);
1959 n = n + 2;
1961 /* Shuffle to get the ai and aj components right */
1962 dax = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
1963 dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));
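/* The radius kernels stored the derivatives of each pair interleaved as
 * (ai due to aj1, aj1 due to ai, ai due to aj2, aj2 due to ai); the two
 * shuffles above de-interleave them, so dax holds the terms multiplying
 * dva = rb[ai] and dax_ai the terms multiplying dvaj = rb[aj1]/rb[aj2].
 */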
1965 /* distances */
1966 dx = _mm_sub_pd(ix,jx);
1967 dy = _mm_sub_pd(iy,jy);
1968 dz = _mm_sub_pd(iz,jz);
1970 /* scalar force */
1971 fgb = _mm_mul_pd(dva,dax);
1972 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1973 fgb = _mm_add_pd(fgb,fgb_ai);
1975 /* partial forces */
1976 t1 = _mm_mul_pd(fgb,dx);
1977 t2 = _mm_mul_pd(fgb,dy);
1978 t3 = _mm_mul_pd(fgb,dz);
1980 /* update the i force */
1981 fix = _mm_add_pd(fix,t1);
1982 fiy = _mm_add_pd(fiy,t2);
1983 fiz = _mm_add_pd(fiz,t3);
1985 /* accumulate forces from memory */
1986 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
1987 xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */
1989 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1990 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1992 /* transpose */
1993 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1994 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1995 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
1997 /* subtract partial forces */
1998 xmm5 = _mm_sub_pd(xmm5, t1);
1999 xmm6 = _mm_sub_pd(xmm6, t2);
2000 xmm7 = _mm_sub_pd(xmm7, t3);
2002 /* transpose back fx and fy */
2003 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2004 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2006 /* store the force, first fx's and fy's */
2007 _mm_storeu_pd(f+aj13,xmm1);
2008 _mm_storeu_pd(f+aj23,xmm2);
2010 /* then fz */
2011 _mm_storel_pd(f+aj13+2,xmm7);
2012 _mm_storeh_pd(f+aj23+2,xmm7);
2015 if(offset!=0)
2017 aj1 = nl->jjnr[k];
2019 dvaj = _mm_load_sd(rb+aj1);
2021 aj13 = aj1 * 3;
2023 jx = _mm_load_sd(xd+aj13);
2024 jy = _mm_load_sd(xd+aj13+1);
2025 jz = _mm_load_sd(xd+aj13+2);
2027 dax = _mm_load_sd(dadx+n);
2028 n = n + 1;
2029 dax_ai = _mm_load_sd(dadx+n);
2030 n = n + 1;
2032 dx = _mm_sub_sd(ix,jx);
2033 dy = _mm_sub_sd(iy,jy);
2034 dz = _mm_sub_sd(iz,jz);
2036 fgb = _mm_mul_sd(dva,dax);
2037 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2038 fgb = _mm_add_pd(fgb,fgb_ai);
2040 t1 = _mm_mul_sd(fgb,dx);
2041 t2 = _mm_mul_sd(fgb,dy);
2042 t3 = _mm_mul_sd(fgb,dz);
2044 fix = _mm_add_sd(fix,t1);
2045 fiy = _mm_add_sd(fiy,t2);
2046 fiz = _mm_add_sd(fiz,t3);
2048 /* accumulate forces from memory */
2049 xmm5 = _mm_load_sd(f+aj13);
2050 xmm6 = _mm_load_sd(f+aj13+1);
2051 xmm7 = _mm_load_sd(f+aj13+2);
2053 /* subtract partial forces */
2054 xmm5 = _mm_sub_sd(xmm5, t1);
2055 xmm6 = _mm_sub_sd(xmm6, t2);
2056 xmm7 = _mm_sub_sd(xmm7, t3);
2058 /* store the force */
2059 _mm_store_sd(f+aj13,xmm5);
2060 _mm_store_sd(f+aj13+1,xmm6);
2061 _mm_store_sd(f+aj13+2,xmm7);
2064 /* fix/fiy/fiz now contain two partial terms that should all be
2065 * added to the i particle forces */
2067 t1 = _mm_unpacklo_pd(t1,fix);
2068 t2 = _mm_unpacklo_pd(t2,fiy);
2069 t3 = _mm_unpacklo_pd(t3,fiz);
2071 fix = _mm_add_pd(fix,t1);
2072 fiy = _mm_add_pd(fiy,t2);
2073 fiz = _mm_add_pd(fiz,t3);
2075 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2076 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2077 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2079 /* load, add and store i forces */
2080 xmm1 = _mm_load_sd(f+ai3);
2081 xmm2 = _mm_load_sd(f+ai3+1);
2082 xmm3 = _mm_load_sd(f+ai3+2);
2084 fix = _mm_add_sd(fix,xmm1);
2085 fiy = _mm_add_sd(fiy,xmm2);
2086 fiz = _mm_add_sd(fiz,xmm3);
2088 _mm_store_sd(f+ai3,fix);
2089 _mm_store_sd(f+ai3+1,fiy);
2090 _mm_store_sd(f+ai3+2,fiz);
2092 /* load, add and store i shift forces */
2093 xmm1 = _mm_load_sd(fshift+shift);
2094 xmm2 = _mm_load_sd(fshift+shift+1);
2095 xmm3 = _mm_load_sd(fshift+shift+2);
2097 fix = _mm_add_sd(fix,xmm1);
2098 fiy = _mm_add_sd(fiy,xmm2);
2099 fiz = _mm_add_sd(fiz,xmm3);
2101 _mm_store_sd(fshift+shift,fix);
2102 _mm_store_sd(fshift+shift+1,fiy);
2103 _mm_store_sd(fshift+shift+2,fiz);
2107 return 0;
2110 #else
2111 /* keep compiler happy */
2112 int genborn_sse_dummy;
2114 #endif /* SSE2 intrinsics available */