Temporarily disabled double precision SSE2 generalized born (waiting for transcendentals)
src/mdlib/genborn_sse2_double.c
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
21 #include "genborn.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Double precision SSE2 genborn disabled while waiting for transcendental implementations... */
31 #if 0
33 /* Only compile this file if SSE2 intrinsics are available */
34 /* #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) ) */
35 #include <xmmintrin.h>
36 #include <emmintrin.h>
38 #include "genborn_sse2_double.h"
40 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
41 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
42 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
43 #elif defined(__GNUC__)
44 #define gmx_castsi128_pd(a) ((__m128d)(a))
45 #define gmx_castpd_si128(a) ((__m128i)(a))
46 #else
47 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
48 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
49 #endif
52 static inline void
53 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
55 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
56 const __m128d half = {0.5,0.5};
57 const __m128d one = {1.0,1.0};
58 const __m128i izero = _mm_set1_epi32(0);
59 const __m128i ione = _mm_set1_epi32(1);
60 const __m128i itwo = _mm_set1_epi32(2);
61 const __m128i ithree = _mm_set1_epi32(3);
62 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
63 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
64 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
65 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
66 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
67 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
68 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
69 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
70 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
71 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
72 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
73 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
74 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
75 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
77 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
78 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
80 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
81 __m128i q;
82 __m128i offsetSin,offsetCos;
83 __m128d sinMask,cosMask,isTiny;
84 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
86 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
87 xl = _mm_mul_pd(x,two_over_pi);
89 /* q=integer part of xl, rounded _away_ from 0.0 */
90 /* Add 0.5 away from 0.0 */
91 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
93 q = _mm_cvttpd_epi32(xl);
94 qd = _mm_cvtepi32_pd(q);
95 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
97 /* Compute offset based on quadrant the arg falls in */
98 offsetSin = _mm_and_si128(q,ithree);
99 offsetCos = _mm_add_epi32(offsetSin,ione);
101 /* Remainder in range [-pi/4..pi/4] */
102 p1 = _mm_mul_pd(qd,sincosd_kc1);
103 xl = _mm_mul_pd(qd,sincosd_kc2);
104 p1 = _mm_sub_pd(x,p1);
105 xl = _mm_sub_pd(p1,xl);
107 absxl = _mm_andnot_pd(signbit,xl);
108 isTiny = _mm_cmpgt_pd(tiny,absxl);
110 xl2 = _mm_mul_pd(xl,xl);
111 xl3 = _mm_mul_pd(xl2,xl);
113 ct0 = _mm_mul_pd(xl2,xl2);
114 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
115 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
116 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
117 st1 = _mm_mul_pd(sincosd_sc0,xl2);
118 st2 = _mm_mul_pd(sincosd_sc2,xl2);
119 st3 = _mm_mul_pd(sincosd_sc4,xl2);
120 ct1 = _mm_add_pd(ct1,sincosd_cc1);
121 ct2 = _mm_add_pd(ct2,sincosd_cc3);
122 ct3 = _mm_add_pd(ct3,sincosd_cc5);
123 st1 = _mm_add_pd(st1,sincosd_sc1);
124 st2 = _mm_add_pd(st2,sincosd_sc3);
125 st3 = _mm_add_pd(st3,sincosd_sc5);
127 ct4 = _mm_mul_pd(ct2,ct0);
128 ct4 = _mm_add_pd(ct4,ct3);
130 st4 = _mm_mul_pd(st2,ct0);
131 st4 = _mm_add_pd(st4,st3);
132 ct5 = _mm_mul_pd(ct0,ct0);
134 ct6 = _mm_mul_pd(ct5,ct1);
135 ct6 = _mm_add_pd(ct6,ct4);
137 st6 = _mm_mul_pd(ct5,st1);
138 st6 = _mm_add_pd(st6,st4);
140 cx = _mm_mul_pd(ct6,xl2);
141 cx = _mm_add_pd(cx,one);
143 sx = _mm_mul_pd(st6,xl3);
144 sx = _mm_add_pd(sx,xl);
146 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
147 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
148 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
150 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
151 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
153 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
154 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
156 	/* Flip the sign of the result when bit 1 of the offset is set, i.e. (offset mod 4) = 2 or 3 */
157 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
158 tsn = _mm_xor_pd(signbit,ts);
159 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
161 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
162 tcn = _mm_xor_pd(signbit,tc);
163 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
165 *sinval = ts;
166 *cosval = tc;
168 return;
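/*
 * Illustrative scalar sketch of the range reduction used by sincos_sse2double()
 * above (this helper is hypothetical and not part of the original kernels):
 * the angle is scaled by 2/pi, rounded to the nearest quadrant, and the
 * remainder is formed with the split constant kc1+kc2 ~= pi/2 so the
 * subtraction loses little precision.  The library sin()/cos() calls stand in
 * for the minimax polynomials evaluated in the SSE2 code.
 */
static void sincos_scalar_sketch(double x, double *sinval, double *cosval)
{
    const double kc1 = 13176794.0 / 8388608.0;                    /* high part of pi/2 */
    const double kc2 = 7.5497899548918821691639751442098584e-8;   /* low part of pi/2  */
    double xl = x * (2.0 / M_PI);
    int    q  = (int) (xl + (xl >= 0.0 ? 0.5 : -0.5));             /* nearest quadrant  */
    double r  = (x - q * kc1) - q * kc2;                           /* remainder, |r| <= pi/4 */
    double s  = sin(r);
    double c  = cos(r);

    switch (q & 3)
    {
        case 0:  *sinval =  s; *cosval =  c; break;
        case 1:  *sinval =  c; *cosval = -s; break;
        case 2:  *sinval = -s; *cosval = -c; break;
        default: *sinval = -c; *cosval =  s; break;
    }
}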
171 __m128d log_pd(__m128d x)
173 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
174 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
176 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
177 const __m128d const_one = _mm_set1_pd(1.0);
178 const __m128d const_two = _mm_set1_pd(2.0);
180 /* Almost full single precision accuracy (~20 bits worst case) */
181 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
182 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
183 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
184 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
185 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
186 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
187 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
188 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
189 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
190 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
192 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
193 __m128i xmmi,xmmj;
195 xmmi = gmx_castpd_si128(x);
196 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
197 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
199 xmm2 = _mm_mul_pd(P5,xmm0);
200 xmm2 = _mm_add_pd(xmm2,P4);
201 xmm2 = _mm_mul_pd(xmm2,xmm0);
202 xmm2 = _mm_add_pd(xmm2,P3);
203 xmm2 = _mm_mul_pd(xmm2,xmm0);
204 xmm2 = _mm_add_pd(xmm2,P2);
205 xmm2 = _mm_mul_pd(xmm2,xmm0);
206 xmm2 = _mm_add_pd(xmm2,P1);
207 xmm2 = _mm_mul_pd(xmm2,xmm0);
208 xmm2 = _mm_add_pd(xmm2,P0);
210 xmm3 = _mm_mul_pd(Q4,xmm0);
211 xmm3 = _mm_add_pd(xmm3,Q3);
212 xmm3 = _mm_mul_pd(xmm3,xmm0);
213 xmm3 = _mm_add_pd(xmm3,Q2);
214 xmm3 = _mm_mul_pd(xmm3,xmm0);
215 xmm3 = _mm_add_pd(xmm3,Q1);
216 xmm3 = _mm_mul_pd(xmm3,xmm0);
217 xmm3 = _mm_add_pd(xmm3,const_one);
219 /* xmm4=1.0/xmm3 */
220 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
221 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
222 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
223 xmm2 = _mm_mul_pd(xmm2,xmm4);
225 xmm0 = _mm_sub_pd(xmm0, const_one);
226 xmm0 = _mm_mul_pd(xmm0,xmm2);
228 xmm0 = _mm_add_pd(xmm0,xmm1);
230 return _mm_mul_pd(xmm0, const_loge);
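/*
 * Illustrative scalar sketch of the decomposition used by log_pd() above
 * (hypothetical helper, not part of the original kernels): split x into a
 * mantissa m in [1,2) and an exponent e, approximate log2(m) (a rational
 * minimax fit P/Q in the SSE2 code, frexp()/log2() here) and scale by ln(2).
 */
static double log_scalar_sketch(double x)
{
    int    e;
    double m = 2.0 * frexp(x, &e);  /* x = m * 2^(e-1), with m in [1,2) */
    return ((double) (e - 1) + log2(m)) * 0.69314718055994529; /* * ln(2) */
}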
233 /* This exp() routine provides accuracy of about 1e-9 to 1e-11.
234  * The polynomial minimax coefficients are actually accurate to about 1e-14,
235  * but we lose some accuracy in the polynomial evaluation. */
237 __m128d exp_pd(__m128d x)
239 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
240 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
242 const __m128i base = _mm_set_epi32(0,0,1023,1023);
243 const __m128d half = _mm_set1_pd(0.5);
244 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
246 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
247 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
248 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
249 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
250 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
251 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
252 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
253 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
254 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
255 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
257 __m128d xmm0,xmm1;
258 __m128i xmmi;
260 xmm0 = _mm_mul_pd(x,log2e);
261 xmm0 = _mm_min_pd(xmm0,lim1);
262 xmm0 = _mm_max_pd(xmm0,lim2);
263 xmm1 = _mm_sub_pd(xmm0,half);
265 xmmi = _mm_cvtpd_epi32(xmm1);
266 xmm1 = _mm_cvtepi32_pd(xmmi);
268 xmmi = _mm_add_epi32(xmmi,base);
269 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
270 xmmi = _mm_slli_epi64(xmmi,52);
272 xmm0 = _mm_sub_pd(xmm0,xmm1);
274 xmm1 = _mm_mul_pd(exp_P9,xmm0);
275 xmm1 = _mm_add_pd(xmm1,exp_P8);
276 xmm1 = _mm_mul_pd(xmm1,xmm0);
277 xmm1 = _mm_add_pd(xmm1,exp_P7);
278 xmm1 = _mm_mul_pd(xmm1,xmm0);
279 xmm1 = _mm_add_pd(xmm1,exp_P6);
280 xmm1 = _mm_mul_pd(xmm1,xmm0);
281 xmm1 = _mm_add_pd(xmm1,exp_P5);
282 xmm1 = _mm_mul_pd(xmm1,xmm0);
283 xmm1 = _mm_add_pd(xmm1,exp_P4);
284 xmm1 = _mm_mul_pd(xmm1,xmm0);
285 xmm1 = _mm_add_pd(xmm1,exp_P3);
286 xmm1 = _mm_mul_pd(xmm1,xmm0);
287 xmm1 = _mm_add_pd(xmm1,exp_P2);
288 xmm1 = _mm_mul_pd(xmm1,xmm0);
289 xmm1 = _mm_add_pd(xmm1,exp_P1);
290 xmm1 = _mm_mul_pd(xmm1,xmm0);
291 xmm1 = _mm_add_pd(xmm1,exp_P0);
292 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
294 return xmm1;
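/*
 * Illustrative scalar sketch of exp_pd() above (hypothetical helper, not part
 * of the original kernels): exp(x) = 2^(x*log2(e)) is split into an integer
 * part n, handled exactly through the exponent field, and a fractional part f,
 * handled by a degree-9 polynomial in the SSE2 code (exp2() stands in for it).
 */
static double exp_scalar_sketch(double x)
{
    double y = x * 1.4426950408889634;  /* x * log2(e)                    */
    double n = floor(y);                /* exp_pd() rounds y-0.5 instead  */
    double f = y - n;                   /* fraction in [0,1)              */
    return ldexp(exp2(f), (int) n);     /* 2^f scaled by 2^n              */
}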
297 static inline __m128d
298 my_invrsq_pd(__m128d x)
300 const __m128d three = {3.0, 3.0};
301 const __m128d half = {0.5, 0.5};
303 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
304 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
306 	/* First Newton-Raphson step, accuracy is now 24 bits */
307 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
309 	/* Return second Newton-Raphson step, accuracy 48 bits */
310 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
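/*
 * Scalar form of the Newton-Raphson refinement above (hypothetical helper,
 * shown only to document the math): starting from the ~12-bit _mm_rsqrt_ps()
 * estimate y0 for 1/sqrt(x), each step y <- 0.5*y*(3 - x*y*y) roughly doubles
 * the number of correct bits, so two steps reach ~48 bits.
 */
static double invsqrt_nr_sketch(double x, double y0)
{
    double y1 = 0.5 * y0 * (3.0 - x * y0 * y0); /* ~24 bits */
    return     0.5 * y1 * (3.0 - x * y1 * y1);  /* ~48 bits */
}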
313 static inline __m128d
314 my_inv_pd(__m128d x)
316 const __m128d two = {2.0, 2.0};
318 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
319 __m128d t1 = _mm_cvtps_pd(t);
320 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
322 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
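/*
 * Matching scalar form for the reciprocal above (hypothetical helper):
 * from the _mm_rcp_ps() estimate y0 for 1/x, each step y <- y*(2 - x*y)
 * doubles the accuracy.
 */
static double inv_nr_sketch(double x, double y0)
{
    double y1 = y0 * (2.0 - x * y0);
    return     y1 * (2.0 - x * y1);
}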
327 int calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
328 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
330 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
331 int at0,at1,nj0,nj1,offset,taj1,taj2;
332 int shift;
334 double factor,gpi_ai,gpi_tmp,gpi2;
335 double shX,shY,shZ;
337 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
338 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
339 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
340 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
341 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
343 const __m128d half = {0.5, 0.5};
344 const __m128d three = {3.0, 3.0};
345 const __m128d one = {1.0, 1.0};
346 const __m128d two = {2.0, 2.0};
347 const __m128d zero = {0.0, 0.0};
348 const __m128d four = {4.0, 4.0};
350 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
351 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
352 const __m128d p4 = {STILL_P4, STILL_P4};
354 factor = 0.5 * ONE_4PI_EPS0;
355 n = 0;
357 /* Keep the compiler happy */
358 raj = _mm_setzero_pd();
359 vaj = _mm_setzero_pd();
360 jx = _mm_setzero_pd();
361 jy = _mm_setzero_pd();
362 jz = _mm_setzero_pd();
363 xmm2 = _mm_setzero_pd();
364 xmm7 = _mm_setzero_pd();
366 for(i=0;i<born->nr;i++)
368 born->gpol_still_work[i]=0;
371 for(i=0;i<nl->nri;i++)
373 ai = nl->iinr[i];
374 ai3 = ai * 3;
376 nj0 = nl->jindex[i];
377 nj1 = nl->jindex[i+1];
379 /* Load shifts for this list */
380 shift = nl->shift[i];
381 shX = fr->shift_vec[shift][0];
382 shY = fr->shift_vec[shift][1];
383 shZ = fr->shift_vec[shift][2];
385 /* Splat the shifts */
386 sX = _mm_load1_pd(&shX);
387 sY = _mm_load1_pd(&shY);
388 sZ = _mm_load1_pd(&shZ);
390 offset = (nj1-nj0)%2;
392 /* Polarization energy for atom ai */
393 gpi = _mm_setzero_pd();
395 /* Load particle ai coordinates and add shifts */
396 ix = _mm_load1_pd(x+ai3);
397 iy = _mm_load1_pd(x+ai3+1);
398 iz = _mm_load1_pd(x+ai3+2);
400 ix = _mm_add_pd(sX,ix);
401 iy = _mm_add_pd(sY,iy);
402 iz = _mm_add_pd(sZ,iz);
404 /* Load particle ai gb_radius */
405 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
406 vai = _mm_set1_pd(born->vsolv[ai]);
407 prod_ai = _mm_mul_pd(p4,vai);
409 for(k=nj0;k<nj1-offset;k+=2)
411 aj1 = nl->jjnr[k];
412 aj2 = nl->jjnr[k+1];
414 aj13 = aj1 * 3;
415 aj23 = aj2 * 3;
417 taj1 = md->typeA[aj1];
418 taj2 = md->typeA[aj2];
420 /* Load particle aj1-2 coordinates and compute ai->aj distances */
421 xmm1 = _mm_loadu_pd(x+aj13);
422 xmm2 = _mm_loadu_pd(x+aj23);
423 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
424 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
426 jz = _mm_loadl_pd(jz,x+aj13+2);
427 jz = _mm_loadh_pd(jz,x+aj23+2);
429 dx = _mm_sub_pd(ix,jx);
430 dy = _mm_sub_pd(iy,jy);
431 dz = _mm_sub_pd(iz,jz);
433 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
434 rinv = my_invrsq_pd(rsq11);
436 rinv2 = _mm_mul_pd(rinv,rinv);
437 rinv4 = _mm_mul_pd(rinv2,rinv2);
438 rinv6 = _mm_mul_pd(rinv4,rinv2);
440 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
441 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
443 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
444 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
446 rvdw = _mm_add_pd(rai,raj);
447 rvdw = _mm_mul_pd(rvdw,rvdw);
448 ratio = _mm_div_pd(rsq11,rvdw);
450 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
452 switch(_mm_movemask_pd(mask_cmp))
454 			case 0x3: /* both elements beyond the cutoff; _mm_movemask_pd() only sets bits 0-1 */
455 ccf = one;
456 dccf = zero;
457 break;
458 default:
459 theta = _mm_mul_pd(ratio,pip5);
460 sincos_sse2double(theta,&sinq,&cosq);
462 term = _mm_sub_pd(one,cosq);
463 term = _mm_mul_pd(half,term);
464 ccf = _mm_mul_pd(term,term);
465 dccf = _mm_mul_pd(two,term);
466 dccf = _mm_mul_pd(dccf,sinq);
467 dccf = _mm_mul_pd(dccf,pip5);
468 dccf = _mm_mul_pd(dccf,ratio);
470 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
471 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
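			/* ccf/dccf now hold the Still close-contact scaling: inside the cutoff        */
			/* (ratio <= 1/P5) the switching function is ccf = ((1 - cos(ratio*PIP5))/2)^2, */
			/* with dccf the corresponding derivative factor used in the chain rule;        */
			/* outside the cutoff ccf = 1 and dccf = 0 (selected via mask_cmp above).       */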
474 prod = _mm_mul_pd(p4,vaj);
475 icf4 = _mm_mul_pd(ccf,rinv4);
476 xmm2 = _mm_mul_pd(icf4,prod);
477 xmm3 = _mm_mul_pd(icf4,prod_ai);
478 gpi = _mm_add_pd(gpi,xmm2);
480 			/* Load, add and store atom aj gpol energy */
481 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
482 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
484 xmm3 = _mm_add_pd(xmm7,xmm3);
486 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
487 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
489 /* Chain rule terms */
490 ccf = _mm_mul_pd(four,ccf);
491 xmm3 = _mm_sub_pd(ccf,dccf);
492 icf6 = _mm_mul_pd(xmm3,rinv6);
493 xmm1 = _mm_mul_pd(icf6,prod);
494 xmm2 = _mm_mul_pd(icf6,prod_ai);
496 			/* As in single precision, we shuffle the results to change the order of
497 			 * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai,
498 			 * so that 2 stores suffice instead of 4. */
500 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
501 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
503 _mm_storeu_pd(fr->dadx+n,xmm3);
504 n = n + 2;
505 _mm_storeu_pd(fr->dadx+n,xmm4);
506 n = n + 2;
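			/* Resulting dadx layout for this j-pair (read back in the same interleaved   */
			/* order in calc_gb_chainrule_sse2_double() below):                           */
			/*   dadx[n-4] : term paired with ai (for aj1)   dadx[n-3] : term for aj1     */
			/*   dadx[n-2] : term paired with ai (for aj2)   dadx[n-1] : term for aj2     */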
509 /* Deal with odd elements */
510 if(offset!=0)
512 aj1 = nl->jjnr[k];
513 aj13 = aj1 * 3;
514 taj1 = md->typeA[aj1];
516 jx = _mm_load_sd(x+aj13);
517 jy = _mm_load_sd(x+aj13+1);
518 jz = _mm_load_sd(x+aj13+2);
520 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
521 vaj = _mm_load_sd(born->vsolv+aj1);
523 dx = _mm_sub_sd(ix,jx);
524 dy = _mm_sub_pd(iy,jy);
525 dz = _mm_sub_pd(iz,jz);
527 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
528 rinv = my_invrsq_pd(rsq11);
530 rinv2 = _mm_mul_sd(rinv,rinv);
531 rinv4 = _mm_mul_sd(rinv2,rinv2);
532 rinv6 = _mm_mul_sd(rinv4,rinv2);
534 rvdw = _mm_add_sd(rai,raj);
535 rvdw = _mm_mul_sd(rvdw,rvdw);
536 ratio = _mm_div_sd(rsq11,rvdw);
538 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
540 switch(_mm_movemask_pd(mask_cmp))
542 				case 0x3: /* _mm_movemask_pd() only sets bits 0-1 */
543 ccf = one;
544 dccf = zero;
545 break;
546 default:
547 theta = _mm_mul_sd(ratio,pip5);
548 sincos_sse2double(theta,&sinq,&cosq);
550 term = _mm_sub_sd(one,cosq);
551 term = _mm_mul_sd(half,term);
552 ccf = _mm_mul_sd(term,term);
553 dccf = _mm_mul_sd(two,term);
554 dccf = _mm_mul_sd(dccf,sinq);
555 dccf = _mm_mul_sd(dccf,pip5);
556 dccf = _mm_mul_sd(dccf,ratio);
558 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
559 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
562 prod = _mm_mul_sd(p4,vaj);
563 icf4 = _mm_mul_sd(ccf,rinv4);
564 xmm2 = _mm_mul_sd(icf4,prod);
565 xmm3 = _mm_mul_sd(icf4,prod_ai);
566 gpi = _mm_add_sd(gpi,xmm2);
568 			/* Load, add and store atom aj gpol energy */
569 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
570 xmm3 = _mm_add_sd(xmm7,xmm3);
571 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
573 /* Chain rule terms */
574 ccf = _mm_mul_sd(four,ccf);
575 xmm3 = _mm_sub_sd(ccf,dccf);
576 icf6 = _mm_mul_sd(xmm3,rinv6);
577 xmm1 = _mm_mul_sd(icf6,prod);
578 xmm2 = _mm_mul_sd(icf6,prod_ai);
580 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
581 _mm_storel_pd(fr->dadx+n,xmm1);
582 n = n + 1;
583 _mm_storel_pd(fr->dadx+n,xmm2);
584 n = n + 1;
585 } /* End offset */
587 /* Do end processing ... */
588 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
589 gpi = _mm_add_pd(gpi,xmm2);
590 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
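		/* The unpack/add/shuffle above is a horizontal add of the two lanes of gpi;   */
		/* after the shuffle the total sits in both elements, ready for the scalar     */
		/* load/add/store below.                                                       */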
592 /* Load, add and store atom ai polarisation energy */
593 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
594 gpi = _mm_add_sd(gpi,xmm2);
595 _mm_store_sd(born->gpol_still_work+ai,gpi);
598 /* Parallel summations */
599 if(PARTDECOMP(cr))
601 gmx_sum(natoms,born->gpol_still_work, cr);
603 else if(DOMAINDECOMP(cr))
605 dd_atom_sum_real(cr->dd, born->gpol_still_work);
608 /* Compute the radii */
609 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
611 if(born->use[i] != 0)
613 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
614 gpi2 = gpi_ai*gpi_ai;
616 born->bRad[i]=factor*gmx_invsqrt(gpi2);
617 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
621 	/* Extra (local) communication required for DD */
622 if(DOMAINDECOMP(cr))
624 dd_atom_spread_real(cr->dd, born->bRad);
625 dd_atom_spread_real(cr->dd, fr->invsqrta);
628 return 0;
631 int
632 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
633 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
635 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
636 int p1, p2, shift;
637 double rr,sum,sum_tmp,min_rad,rad,doff;
638 double shX,shY,shZ;
640 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
641 __m128d t1,t2,t3,rsq11,rinv,r,rai;
642 __m128d rai_inv,sk,sk2,lij,dlij,duij;
643 __m128d uij,lij2,uij2,lij3,uij3,diff2;
644 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
645 __m128d mask_cmp,mask_cmp2,mask_cmp3;
646 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
647 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
648 __m128d sk_ai, sk2_ai,raj,raj_inv;
650 const __m128d neg = {-1.0, -1.0};
651 const __m128d zero = {0.0, 0.0};
652 const __m128d eigth = {0.125, 0.125};
653 const __m128d qrtr = {0.25, 0.25};
654 const __m128d half = {0.5, 0.5};
655 const __m128d one = {1.0, 1.0};
656 const __m128d two = {2.0, 2.0};
657 const __m128d three = {3.0, 3.0};
659 /* Keep the compiler happy */
660 tmp_ai = _mm_setzero_pd();
661 tmp = _mm_setzero_pd();
662 xmm7 = _mm_setzero_pd();
663 xmm8 = _mm_setzero_pd();
664 raj_inv = _mm_setzero_pd();
665 raj = _mm_setzero_pd();
666 sk = _mm_setzero_pd();
667 jx = _mm_setzero_pd();
668 jy = _mm_setzero_pd();
669 jz = _mm_setzero_pd();
671 /* Set the dielectric offset */
672 doff = born->gb_doffset;
673 doffset = _mm_load1_pd(&doff);
674 n = 0;
676 for(i=0;i<born->nr;i++)
678 born->gpol_hct_work[i] = 0;
681 for(i=0;i<nl->nri;i++)
683 ai = nl->iinr[i];
684 ai3 = ai * 3;
686 nj0 = nl->jindex[i];
687 nj1 = nl->jindex[i+1];
689 /* Load shifts for this list */
690 shift = nl->shift[i];
691 shX = fr->shift_vec[shift][0];
692 shY = fr->shift_vec[shift][1];
693 shZ = fr->shift_vec[shift][2];
695 /* Splat the shifts */
696 sX = _mm_load1_pd(&shX);
697 sY = _mm_load1_pd(&shY);
698 sZ = _mm_load1_pd(&shZ);
700 offset = (nj1-nj0)%2;
702 /* Load rai */
703 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
704 rai = _mm_load1_pd(&rr);
705 rr = 1.0/rr;
706 rai_inv = _mm_load1_pd(&rr);
708 /* Zero out sums for polarisation energies */
709 sum_ai = _mm_setzero_pd();
711 /* Load ai coordinates and add shifts */
712 ix = _mm_load1_pd(x+ai3);
713 iy = _mm_load1_pd(x+ai3+1);
714 iz = _mm_load1_pd(x+ai3+2);
716 ix = _mm_add_pd(sX,ix);
717 iy = _mm_add_pd(sY,iy);
718 iz = _mm_add_pd(sZ,iz);
720 sk_ai = _mm_load1_pd(born->param+ai);
721 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
723 for(k=nj0;k<nj1-offset;k+=2)
725 aj1 = nl->jjnr[k];
726 aj2 = nl->jjnr[k+1];
728 aj13 = aj1 * 3;
729 aj23 = aj2 * 3;
731 /* Load particle aj1-2 coordinates */
732 xmm1 = _mm_loadu_pd(x+aj13);
733 xmm2 = _mm_loadu_pd(x+aj23);
734 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
735 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
737 jz = _mm_loadl_pd(jz, x+aj13+2);
738 jz = _mm_loadh_pd(jz, x+aj23+2);
740 dx = _mm_sub_pd(ix,jx);
741 dy = _mm_sub_pd(iy,jy);
742 dz = _mm_sub_pd(iz,jz);
744 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
745 rinv = my_invrsq_pd(rsq11);
746 r = _mm_mul_pd(rinv,rsq11);
748 sk = _mm_loadl_pd(sk,born->param+aj1);
749 sk = _mm_loadh_pd(sk,born->param+aj2);
751 /* Load aj1,aj2 raj */
752 p1 = md->typeA[aj1];
753 p2 = md->typeA[aj2];
755 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
756 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
757 raj = _mm_sub_pd(raj,doffset);
759 /* Compute 1.0/raj */
760 raj_inv = my_inv_pd(raj);
762 			/* INTERACTION aj->ai STARTS HERE */
763 /* conditional mask for rai<dr+sk */
764 xmm1 = _mm_add_pd(r,sk);
765 mask_cmp = _mm_cmplt_pd(rai,xmm1);
767 /* conditional for rai>dr-sk, ends with mask_cmp2 */
768 xmm2 = _mm_sub_pd(r,sk);
769 xmm3 = my_inv_pd(xmm2);
770 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
772 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
773 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
775 uij = my_inv_pd(xmm1);
776 lij2 = _mm_mul_pd(lij,lij);
777 lij3 = _mm_mul_pd(lij2,lij);
778 uij2 = _mm_mul_pd(uij,uij);
779 uij3 = _mm_mul_pd(uij2,uij);
781 diff2 = _mm_sub_pd(uij2,lij2);
783 lij_inv = my_invrsq_pd(lij2);
784 sk2 = _mm_mul_pd(sk,sk);
785 sk2_inv = _mm_mul_pd(sk2,rinv);
786 prod = _mm_mul_pd(qrtr,sk2_inv);
788 log_term = _mm_mul_pd(uij,lij_inv);
789 log_term = log_pd(log_term);
791 xmm1 = _mm_sub_pd(lij,uij);
792 xmm2 = _mm_mul_pd(qrtr,r);
793 xmm2 = _mm_mul_pd(xmm2,diff2);
794 xmm1 = _mm_add_pd(xmm1,xmm2);
795 xmm2 = _mm_mul_pd(half,rinv);
796 xmm2 = _mm_mul_pd(xmm2,log_term);
797 xmm1 = _mm_add_pd(xmm1,xmm2);
798 xmm9 = _mm_mul_pd(neg,diff2);
799 xmm2 = _mm_mul_pd(xmm9,prod);
800 tmp_ai = _mm_add_pd(xmm1,xmm2);
802 			/* conditional for rai<sk-dr */
803 xmm3 = _mm_sub_pd(sk,r);
804 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
806 xmm4 = _mm_sub_pd(rai_inv,lij);
807 xmm4 = _mm_mul_pd(two,xmm4);
808 xmm4 = _mm_add_pd(tmp_ai,xmm4);
810 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
812 			/* tmp_ai now contains two partial values; which of them are actually used */
813 			/* is governed by the mask_cmp mask. */
814 tmp_ai = _mm_mul_pd(half,tmp_ai);
815 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
816 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
818 xmm2 = _mm_mul_pd(half,lij2);
819 xmm3 = _mm_mul_pd(prod,lij3);
820 xmm2 = _mm_add_pd(xmm2,xmm3);
821 xmm3 = _mm_mul_pd(lij,rinv);
822 xmm4 = _mm_mul_pd(lij3,r);
823 xmm3 = _mm_add_pd(xmm3,xmm4);
824 xmm3 = _mm_mul_pd(qrtr,xmm3);
825 t1 = _mm_sub_pd(xmm2,xmm3);
827 xmm2 = _mm_mul_pd(half,uij2);
828 xmm2 = _mm_mul_pd(neg,xmm2);
829 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
830 xmm3 = _mm_mul_pd(xmm3,uij3);
831 xmm2 = _mm_sub_pd(xmm2,xmm3);
832 xmm3 = _mm_mul_pd(uij,rinv);
833 xmm4 = _mm_mul_pd(uij3,r);
834 xmm3 = _mm_add_pd(xmm3,xmm4);
835 xmm3 = _mm_mul_pd(qrtr,xmm3);
836 t2 = _mm_add_pd(xmm2,xmm3);
838 xmm2 = _mm_mul_pd(sk2_inv,rinv);
839 xmm2 = _mm_add_pd(one,xmm2);
840 xmm2 = _mm_mul_pd(eigth,xmm2);
841 xmm2 = _mm_mul_pd(xmm2,xmm9);
842 xmm3 = _mm_mul_pd(log_term, rinv);
843 xmm3 = _mm_mul_pd(xmm3,rinv);
844 xmm3 = _mm_mul_pd(qrtr,xmm3);
845 t3 = _mm_add_pd(xmm2,xmm3);
847 /* chain rule terms */
848 xmm2 = _mm_mul_pd(dlij,t1);
849 xmm2 = _mm_add_pd(xmm2,t2);
850 xmm2 = _mm_add_pd(xmm2,t3);
852 			/* Temporary storage of the chain-rule term; we still have to compute
853 			   the reciprocal (ai->aj) term before storing both */
854 chrule = _mm_mul_pd(xmm2,rinv);
856 /* INTERACTION ai->aj starts here */
857 /* Conditional mask for raj<dr+sk_ai */
858 xmm1 = _mm_add_pd(r,sk_ai);
859 mask_cmp = _mm_cmplt_pd(raj,xmm1);
861 			/* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
862 xmm2 = _mm_sub_pd(r,sk_ai);
863 xmm3 = my_inv_pd(xmm2);
864 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
866 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
867 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
869 uij = my_inv_pd(xmm1);
870 lij2 = _mm_mul_pd(lij,lij);
871 lij3 = _mm_mul_pd(lij2,lij);
872 uij2 = _mm_mul_pd(uij,uij);
873 uij3 = _mm_mul_pd(uij2,uij);
875 diff2 = _mm_sub_pd(uij2,lij2);
877 lij_inv = my_invrsq_pd(lij2);
879 sk2 = sk2_ai;
880 sk2_inv = _mm_mul_pd(sk2,rinv);
881 prod = _mm_mul_pd(qrtr,sk2_inv);
883 log_term = _mm_mul_pd(uij,lij_inv);
884 log_term = log_pd(log_term);
886 xmm1 = _mm_sub_pd(lij,uij);
887 xmm2 = _mm_mul_pd(qrtr,r);
888 xmm2 = _mm_mul_pd(xmm2,diff2);
889 xmm1 = _mm_add_pd(xmm1,xmm2);
890 xmm2 = _mm_mul_pd(half,rinv);
891 xmm2 = _mm_mul_pd(xmm2,log_term);
892 xmm1 = _mm_add_pd(xmm1,xmm2);
893 xmm9 = _mm_mul_pd(neg,diff2);
894 xmm2 = _mm_mul_pd(xmm9,prod);
895 tmp = _mm_add_pd(xmm1,xmm2);
897 			/* conditional for raj<sk_ai-dr */
898 			xmm3      = _mm_sub_pd(sk_ai,r);
899 			mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
901 xmm4 = _mm_sub_pd(raj_inv,lij);
902 xmm4 = _mm_mul_pd(two,xmm4);
903 xmm4 = _mm_add_pd(tmp,xmm4);
905 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
907 			/* tmp now contains two partial values; which of them are actually used */
908 			/* is governed by the mask_cmp mask. */
909 tmp = _mm_mul_pd(half,tmp);
910 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
912 /* Load, add and store ai->aj pol energy */
913 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
914 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
916 xmm7 = _mm_add_pd(xmm7,tmp);
918 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
919 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
921 /* Start chain rule terms */
922 xmm2 = _mm_mul_pd(half,lij2);
923 xmm3 = _mm_mul_pd(prod,lij3);
924 xmm2 = _mm_add_pd(xmm2,xmm3);
925 xmm3 = _mm_mul_pd(lij,rinv);
926 xmm4 = _mm_mul_pd(lij3,r);
927 xmm3 = _mm_add_pd(xmm3,xmm4);
928 xmm3 = _mm_mul_pd(qrtr,xmm3);
929 t1 = _mm_sub_pd(xmm2,xmm3);
931 xmm2 = _mm_mul_pd(half,uij2);
932 xmm2 = _mm_mul_pd(neg,xmm2);
933 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
934 xmm3 = _mm_mul_pd(xmm3,uij3);
935 xmm2 = _mm_sub_pd(xmm2,xmm3);
936 xmm3 = _mm_mul_pd(uij,rinv);
937 xmm4 = _mm_mul_pd(uij3,r);
938 xmm3 = _mm_add_pd(xmm3,xmm4);
939 xmm3 = _mm_mul_pd(qrtr,xmm3);
940 t2 = _mm_add_pd(xmm2,xmm3);
942 xmm2 = _mm_mul_pd(sk2_inv,rinv);
943 xmm2 = _mm_add_pd(one,xmm2);
944 xmm2 = _mm_mul_pd(eigth,xmm2);
945 xmm2 = _mm_mul_pd(xmm2,xmm9);
946 xmm3 = _mm_mul_pd(log_term, rinv);
947 xmm3 = _mm_mul_pd(xmm3,rinv);
948 xmm3 = _mm_mul_pd(qrtr,xmm3);
949 t3 = _mm_add_pd(xmm2,xmm3);
951 /* chain rule terms */
952 xmm2 = _mm_mul_pd(dlij,t1);
953 xmm2 = _mm_add_pd(xmm2,t2);
954 xmm2 = _mm_add_pd(xmm2,t3);
955 chrule_ai = _mm_mul_pd(xmm2,rinv);
957 			/* Store chain rule terms;
958 			 * same unpacking rationale as with Still above. */
960 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
961 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
963 _mm_storeu_pd(fr->dadx+n, xmm3);
964 n = n + 2;
965 _mm_storeu_pd(fr->dadx+n, xmm4);
966 n = n + 2;
969 if(offset!=0)
971 aj1 = nl->jjnr[k];
972 aj13 = aj1 * 3;
973 p1 = md->typeA[aj1];
975 jx = _mm_load_sd(x+aj13);
976 jy = _mm_load_sd(x+aj13+1);
977 jz = _mm_load_sd(x+aj13+2);
979 sk = _mm_load_sd(born->param+aj1);
981 dx = _mm_sub_sd(ix,jx);
982 dy = _mm_sub_pd(iy,jy);
983 dz = _mm_sub_pd(iz,jz);
985 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
986 rinv = my_invrsq_pd(rsq11);
987 r = _mm_mul_sd(rinv,rsq11);
989 /* Load raj */
990 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
991 raj = _mm_sub_sd(raj,doffset);
992 raj_inv = my_inv_pd(raj);
994 			/* OFFSET INTERACTION aj->ai STARTS HERE */
995 /* conditional mask for rai<dr+sk */
996 xmm1 = _mm_add_sd(r,sk);
997 mask_cmp = _mm_cmplt_sd(rai,xmm1);
999 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1000 xmm2 = _mm_sub_sd(r,sk);
1001 xmm3 = my_inv_pd(xmm2);
1002 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1004 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1005 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1007 uij = my_inv_pd(xmm1);
1008 lij2 = _mm_mul_sd(lij,lij);
1009 lij3 = _mm_mul_sd(lij2,lij);
1010 uij2 = _mm_mul_sd(uij,uij);
1011 uij3 = _mm_mul_sd(uij2,uij);
1013 diff2 = _mm_sub_sd(uij2,lij2);
1015 lij_inv = my_invrsq_pd(lij2);
1016 sk2 = _mm_mul_sd(sk,sk);
1017 sk2_inv = _mm_mul_sd(sk2,rinv);
1018 prod = _mm_mul_sd(qrtr,sk2_inv);
1020 log_term = _mm_mul_pd(uij,lij_inv);
1021 log_term = log_pd(log_term);
1023 xmm1 = _mm_sub_sd(lij,uij);
1024 xmm2 = _mm_mul_sd(qrtr,r);
1025 xmm2 = _mm_mul_sd(xmm2,diff2);
1026 xmm1 = _mm_add_sd(xmm1,xmm2);
1027 xmm2 = _mm_mul_sd(half,rinv);
1028 xmm2 = _mm_mul_sd(xmm2,log_term);
1029 xmm1 = _mm_add_sd(xmm1,xmm2);
1030 xmm9 = _mm_mul_sd(neg,diff2);
1031 xmm2 = _mm_mul_sd(xmm9,prod);
1033 tmp_ai = _mm_add_sd(xmm1,xmm2);
1035 			/* conditional for rai<sk-dr */
1036 xmm3 = _mm_sub_sd(sk,r);
1037 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1039 xmm4 = _mm_sub_sd(rai_inv,lij);
1040 xmm4 = _mm_mul_sd(two,xmm4);
1041 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1043 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1045 			/* tmp_ai now contains two partial values; which of them are actually used */
1046 			/* is governed by the mask_cmp mask. */
1047 tmp_ai = _mm_mul_pd(half,tmp_ai);
1048 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1049 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1051 xmm2 = _mm_mul_sd(half,lij2);
1052 xmm3 = _mm_mul_sd(prod,lij3);
1053 xmm2 = _mm_add_sd(xmm2,xmm3);
1054 xmm3 = _mm_mul_sd(lij,rinv);
1055 xmm4 = _mm_mul_sd(lij3,r);
1056 xmm3 = _mm_add_sd(xmm3,xmm4);
1057 xmm3 = _mm_mul_sd(qrtr,xmm3);
1058 t1 = _mm_sub_sd(xmm2,xmm3);
1060 xmm2 = _mm_mul_sd(half,uij2);
1061 xmm2 = _mm_mul_sd(neg,xmm2);
1062 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1063 xmm3 = _mm_mul_sd(xmm3,uij3);
1064 xmm2 = _mm_sub_sd(xmm2,xmm3);
1065 xmm3 = _mm_mul_sd(uij,rinv);
1066 xmm4 = _mm_mul_sd(uij3,r);
1067 xmm3 = _mm_add_sd(xmm3,xmm4);
1068 xmm3 = _mm_mul_sd(qrtr,xmm3);
1069 t2 = _mm_add_sd(xmm2,xmm3);
1071 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1072 xmm2 = _mm_add_sd(one,xmm2);
1073 xmm2 = _mm_mul_sd(eigth,xmm2);
1074 xmm2 = _mm_mul_sd(xmm2,xmm9);
1075 xmm3 = _mm_mul_sd(log_term, rinv);
1076 xmm3 = _mm_mul_sd(xmm3,rinv);
1077 xmm3 = _mm_mul_sd(qrtr,xmm3);
1078 t3 = _mm_add_sd(xmm2,xmm3);
1080 /* chain rule terms */
1081 xmm2 = _mm_mul_sd(dlij,t1);
1082 xmm2 = _mm_add_sd(xmm2,t2);
1083 xmm2 = _mm_add_sd(xmm2,t3);
1085 			/* Temporary storage of the chain-rule term; we still have to compute
1086 			   the reciprocal (ai->aj) term before storing both */
1087 chrule = _mm_mul_sd(xmm2,rinv);
1089 /* OFFSET INTERACTION ai->aj starts here */
1090 			/* conditional mask for raj<dr+sk_ai */
1091 xmm1 = _mm_add_sd(r,sk_ai);
1092 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1094 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1095 xmm2 = _mm_sub_sd(r,sk_ai);
1096 xmm3 = my_inv_pd(xmm2);
1097 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1099 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1100 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1102 uij = my_inv_pd(xmm1);
1103 lij2 = _mm_mul_sd(lij,lij);
1104 lij3 = _mm_mul_sd(lij2,lij);
1105 uij2 = _mm_mul_sd(uij,uij);
1106 uij3 = _mm_mul_sd(uij2,uij);
1108 diff2 = _mm_sub_sd(uij2,lij2);
1110 lij_inv = my_invrsq_pd(lij2);
1112 sk2 = sk2_ai;
1113 sk2_inv = _mm_mul_sd(sk2,rinv);
1114 prod = _mm_mul_sd(qrtr,sk2_inv);
1116 log_term = _mm_mul_pd(uij,lij_inv);
1117 log_term = log_pd(log_term);
1119 xmm1 = _mm_sub_sd(lij,uij);
1120 xmm2 = _mm_mul_sd(qrtr,r);
1121 xmm2 = _mm_mul_sd(xmm2,diff2);
1122 xmm1 = _mm_add_sd(xmm1,xmm2);
1123 xmm2 = _mm_mul_sd(half,rinv);
1124 xmm2 = _mm_mul_sd(xmm2,log_term);
1125 xmm1 = _mm_add_sd(xmm1,xmm2);
1126 xmm9 = _mm_mul_sd(neg,diff2);
1127 xmm2 = _mm_mul_sd(xmm9,prod);
1128 tmp = _mm_add_sd(xmm1,xmm2);
1130 			/* conditional for raj<sk_ai-dr */
1131 			xmm3      = _mm_sub_sd(sk_ai,r);
1132 			mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1134 xmm4 = _mm_sub_sd(raj_inv,lij);
1135 xmm4 = _mm_mul_sd(two,xmm4);
1136 xmm4 = _mm_add_sd(tmp,xmm4);
1138 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1140 			/* tmp now contains two partial values; which of them are actually used */
1141 			/* is governed by the mask_cmp mask. */
1142 tmp = _mm_mul_pd(half,tmp);
1143 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1145 /* Load, add and store gpol energy */
1146 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1147 xmm7 = _mm_add_sd(xmm7,tmp);
1148 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1150 /* Start chain rule terms, t1 */
1151 xmm2 = _mm_mul_sd(half,lij2);
1152 xmm3 = _mm_mul_sd(prod,lij3);
1153 xmm2 = _mm_add_sd(xmm2,xmm3);
1154 xmm3 = _mm_mul_sd(lij,rinv);
1155 xmm4 = _mm_mul_sd(lij3,r);
1156 xmm3 = _mm_add_sd(xmm3,xmm4);
1157 xmm3 = _mm_mul_sd(qrtr,xmm3);
1158 t1 = _mm_sub_sd(xmm2,xmm3);
1160 xmm2 = _mm_mul_sd(half,uij2);
1161 xmm2 = _mm_mul_sd(neg,xmm2);
1162 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1163 xmm3 = _mm_mul_sd(xmm3,uij3);
1164 xmm2 = _mm_sub_sd(xmm2,xmm3);
1165 xmm3 = _mm_mul_sd(uij,rinv);
1166 xmm4 = _mm_mul_sd(uij3,r);
1167 xmm3 = _mm_add_sd(xmm3,xmm4);
1168 xmm3 = _mm_mul_sd(qrtr,xmm3);
1169 t2 = _mm_add_sd(xmm2,xmm3);
1171 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1172 xmm2 = _mm_add_sd(one,xmm2);
1173 xmm2 = _mm_mul_sd(eigth,xmm2);
1174 xmm2 = _mm_mul_sd(xmm2,xmm9);
1175 xmm3 = _mm_mul_sd(log_term, rinv);
1176 xmm3 = _mm_mul_sd(xmm3,rinv);
1177 xmm3 = _mm_mul_sd(qrtr,xmm3);
1178 t3 = _mm_add_sd(xmm2,xmm3);
1180 /* chain rule terms */
1181 xmm2 = _mm_mul_sd(dlij,t1);
1182 xmm2 = _mm_add_sd(xmm2,t2);
1183 xmm2 = _mm_add_sd(xmm2,t3);
1184 chrule_ai = _mm_mul_sd(xmm2,rinv);
1186 _mm_store_sd(fr->dadx+n, chrule);
1187 n = n + 1;
1188 _mm_store_sd(fr->dadx+n, chrule_ai);
1189 n = n + 1;
1192 /* Do end processing ... */
1193 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1194 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1195 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1197 /* Load, add and store atom ai polarisation energy */
1198 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1199 sum_ai = _mm_add_sd(sum_ai,xmm2);
1200 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1203 /* Parallel summations */
1204 if(PARTDECOMP(cr))
1206 gmx_sum(natoms, born->gpol_hct_work, cr);
1208 else if(DOMAINDECOMP(cr))
1210 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1213 /* Compute the radii */
1214 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1216 if(born->use[i] != 0)
1218 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1219 sum = 1.0/rr - born->gpol_hct_work[i];
1220 min_rad = rr + doff;
1221 rad = 1.0/sum;
1223 born->bRad[i] = rad > min_rad ? rad : min_rad;
1224 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1228 /* Extra (local) communication required for DD */
1229 if(DOMAINDECOMP(cr))
1231 dd_atom_spread_real(cr->dd, born->bRad);
1232 dd_atom_spread_real(cr->dd, fr->invsqrta);
1235 return 0;
1238 int
1239 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1240 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1242 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1243 int p1,p2,p3,p4,shift;
1244 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1245 double tsum,tchain,rr_inv,rr_inv2,gbr;
1246 double shX,shY,shZ;
1248 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1249 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1250 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1251 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1252 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1253 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1254 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1255 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1257 const __m128d neg = {-1.0, -1.0};
1258 const __m128d zero = {0.0, 0.0};
1259 const __m128d eigth = {0.125, 0.125};
1260 const __m128d qrtr = {0.25, 0.25};
1261 const __m128d half = {0.5, 0.5};
1262 const __m128d one = {1.0, 1.0};
1263 const __m128d two = {2.0, 2.0};
1264 const __m128d three = {3.0, 3.0};
1266 /* Keep the compiler happy */
1267 tmp_ai = _mm_setzero_pd();
1268 tmp = _mm_setzero_pd();
1269 raj_inv = _mm_setzero_pd();
1270 raj = _mm_setzero_pd();
1271 sk = _mm_setzero_pd();
1272 jx = _mm_setzero_pd();
1273 jy = _mm_setzero_pd();
1274 jz = _mm_setzero_pd();
1275 xmm7 = _mm_setzero_pd();
1276 xmm8 = _mm_setzero_pd();
1278 /* Set the dielectric offset */
1279 doff = born->gb_doffset;
1280 doffset = _mm_load1_pd(&doff);
1282 n = 0;
1284 for(i=0;i<born->nr;i++)
1286 born->gpol_hct_work[i] = 0;
1289 for(i=0;i<nl->nri;i++)
1291 ai = nl->iinr[i];
1292 ai3 = ai * 3;
1294 nj0 = nl->jindex[i];
1295 nj1 = nl->jindex[i+1];
1297 /* Load shifts for this list */
1298 shift = nl->shift[i];
1299 shX = fr->shift_vec[shift][0];
1300 shY = fr->shift_vec[shift][1];
1301 shZ = fr->shift_vec[shift][2];
1303 /* Splat the shifts */
1304 sX = _mm_load1_pd(&shX);
1305 sY = _mm_load1_pd(&shY);
1306 sZ = _mm_load1_pd(&shZ);
1308 offset = (nj1-nj0)%2;
1310 /* Load rai */
1311 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1312 rai = _mm_load1_pd(&rr);
1313 rr = 1.0/rr;
1314 rai_inv = _mm_load1_pd(&rr);
1316 /* Load ai coordinates and add shifts */
1317 ix = _mm_load1_pd(x+ai3);
1318 iy = _mm_load1_pd(x+ai3+1);
1319 iz = _mm_load1_pd(x+ai3+2);
1321 ix = _mm_add_pd(sX,ix);
1322 iy = _mm_add_pd(sY,iy);
1323 iz = _mm_add_pd(sZ,iz);
1325 /* Zero out sums for polarisation energies */
1326 sum_ai = _mm_setzero_pd();
1328 sk_ai = _mm_load1_pd(born->param+ai);
1329 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1331 for(k=nj0;k<nj1-offset;k+=2)
1333 aj1 = nl->jjnr[k];
1334 aj2 = nl->jjnr[k+1];
1336 aj13 = aj1 * 3;
1337 aj23 = aj2 * 3;
1339 /* Load particle aj1-2 coordinates */
1340 xmm1 = _mm_loadu_pd(x+aj13);
1341 xmm2 = _mm_loadu_pd(x+aj23);
1342 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1343 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1345 jz = _mm_loadl_pd(jz, x+aj13+2);
1346 jz = _mm_loadh_pd(jz, x+aj23+2);
1348 dx = _mm_sub_pd(ix,jx);
1349 dy = _mm_sub_pd(iy,jy);
1350 dz = _mm_sub_pd(iz,jz);
1352 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1353 rinv = my_invrsq_pd(rsq11);
1354 r = _mm_mul_pd(rinv,rsq11);
1356 /* Load atom aj1,aj2 raj */
1357 p1 = md->typeA[aj1];
1358 p2 = md->typeA[aj2];
1360 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1361 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1362 raj = _mm_sub_pd(raj,doffset);
1364 /* Compute 1.0/raj */
1365 raj_inv = my_inv_pd(raj);
1367 sk = _mm_loadl_pd(sk,born->param+aj1);
1368 sk = _mm_loadh_pd(sk,born->param+aj2);
1370 /* INTERACTION aj->ai STARTS HERE */
1371 /* conditional mask for rai<dr+sk */
1372 xmm1 = _mm_add_pd(r,sk);
1373 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1375 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1376 xmm2 = _mm_sub_pd(r,sk);
1377 xmm3 = my_inv_pd(xmm2);
1378 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1380 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1381 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1383 uij = my_inv_pd(xmm1);
1384 lij2 = _mm_mul_pd(lij,lij);
1385 lij3 = _mm_mul_pd(lij2,lij);
1386 uij2 = _mm_mul_pd(uij,uij);
1387 uij3 = _mm_mul_pd(uij2,uij);
1389 diff2 = _mm_sub_pd(uij2,lij2);
1391 lij_inv = my_invrsq_pd(lij2);
1392 sk2 = _mm_mul_pd(sk,sk);
1393 sk2_inv = _mm_mul_pd(sk2,rinv);
1394 prod = _mm_mul_pd(qrtr,sk2_inv);
1396 log_term = _mm_mul_pd(uij,lij_inv);
1397 log_term = log_pd(log_term);
1399 xmm1 = _mm_sub_pd(lij,uij);
1400 xmm2 = _mm_mul_pd(qrtr,r);
1401 xmm2 = _mm_mul_pd(xmm2,diff2);
1402 xmm1 = _mm_add_pd(xmm1,xmm2);
1403 xmm2 = _mm_mul_pd(half,rinv);
1404 xmm2 = _mm_mul_pd(xmm2,log_term);
1405 xmm1 = _mm_add_pd(xmm1,xmm2);
1406 xmm9 = _mm_mul_pd(neg,diff2);
1407 xmm2 = _mm_mul_pd(xmm9,prod);
1408 tmp_ai = _mm_add_pd(xmm1,xmm2);
1410 			/* conditional for rai<sk-dr */
1411 xmm3 = _mm_sub_pd(sk,r);
1412 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1414 xmm4 = _mm_sub_pd(rai_inv,lij);
1415 xmm4 = _mm_mul_pd(two,xmm4);
1416 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1418 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1420 			/* tmp_ai now contains two partial values; which of them are actually used */
1421 			/* is governed by the mask_cmp mask. */
1422 tmp_ai = _mm_mul_pd(half,tmp_ai);
1423 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1424 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1426 /* Start the dadx chain rule terms */
1427 xmm2 = _mm_mul_pd(half,lij2);
1428 xmm3 = _mm_mul_pd(prod,lij3);
1429 xmm2 = _mm_add_pd(xmm2,xmm3);
1430 xmm3 = _mm_mul_pd(lij,rinv);
1431 xmm4 = _mm_mul_pd(lij3,r);
1432 xmm3 = _mm_add_pd(xmm3,xmm4);
1433 xmm3 = _mm_mul_pd(qrtr,xmm3);
1434 t1 = _mm_sub_pd(xmm2,xmm3);
1436 xmm2 = _mm_mul_pd(half,uij2);
1437 xmm2 = _mm_mul_pd(neg,xmm2);
1438 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1439 xmm3 = _mm_mul_pd(xmm3,uij3);
1440 xmm2 = _mm_sub_pd(xmm2,xmm3);
1441 xmm3 = _mm_mul_pd(uij,rinv);
1442 xmm4 = _mm_mul_pd(uij3,r);
1443 xmm3 = _mm_add_pd(xmm3,xmm4);
1444 xmm3 = _mm_mul_pd(qrtr,xmm3);
1445 t2 = _mm_add_pd(xmm2,xmm3);
1447 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1448 xmm2 = _mm_add_pd(one,xmm2);
1449 xmm2 = _mm_mul_pd(eigth,xmm2);
1450 xmm2 = _mm_mul_pd(xmm2,xmm9);
1451 xmm3 = _mm_mul_pd(log_term, rinv);
1452 xmm3 = _mm_mul_pd(xmm3,rinv);
1453 xmm3 = _mm_mul_pd(qrtr,xmm3);
1454 t3 = _mm_add_pd(xmm2,xmm3);
1456 /* chain rule terms */
1457 xmm2 = _mm_mul_pd(dlij,t1);
1458 xmm2 = _mm_add_pd(xmm2,t2);
1459 xmm2 = _mm_add_pd(xmm2,t3);
1461 			/* Temporary storage of the chain-rule term; we still have to compute
1462 			   the reciprocal (ai->aj) term before storing both */
1463 chrule = _mm_mul_pd(xmm2,rinv);
1465 /* INTERACTION ai->aj STARTS HERE */
1466 /* conditional mask for raj<dr+sk_ai */
1467 xmm1 = _mm_add_pd(r,sk_ai);
1468 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1470 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1471 xmm2 = _mm_sub_pd(r,sk_ai);
1472 xmm3 = my_inv_pd(xmm2);
1473 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1475 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1476 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1478 uij = my_inv_pd(xmm1);
1479 lij2 = _mm_mul_pd(lij,lij);
1480 lij3 = _mm_mul_pd(lij2,lij);
1481 uij2 = _mm_mul_pd(uij,uij);
1482 uij3 = _mm_mul_pd(uij2,uij);
1484 diff2 = _mm_sub_pd(uij2,lij2);
1486 lij_inv = my_invrsq_pd(lij2);
1488 sk2 = sk2_ai;
1489 sk2_inv = _mm_mul_pd(sk2,rinv);
1490 prod = _mm_mul_pd(qrtr,sk2_inv);
1492 log_term = _mm_mul_pd(uij,lij_inv);
1493 log_term = log_pd(log_term);
1495 xmm1 = _mm_sub_pd(lij,uij);
1496 xmm2 = _mm_mul_pd(qrtr,r);
1497 xmm2 = _mm_mul_pd(xmm2,diff2);
1498 xmm1 = _mm_add_pd(xmm1,xmm2);
1499 xmm2 = _mm_mul_pd(half,rinv);
1500 xmm2 = _mm_mul_pd(xmm2,log_term);
1501 xmm1 = _mm_add_pd(xmm1,xmm2);
1502 xmm9 = _mm_mul_pd(neg,diff2);
1503 xmm2 = _mm_mul_pd(xmm9,prod);
1504 tmp = _mm_add_pd(xmm1,xmm2);
1506 			/* conditional for raj<sk_ai-dr */
1507 			xmm3      = _mm_sub_pd(sk_ai,r);
1508 			mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1510 xmm4 = _mm_sub_pd(raj_inv,lij);
1511 xmm4 = _mm_mul_pd(two,xmm4);
1512 xmm4 = _mm_add_pd(tmp,xmm4);
1514 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1516 			/* tmp now contains two partial values; which of them are actually used */
1517 			/* is governed by the mask_cmp mask. */
1518 tmp = _mm_mul_pd(half,tmp);
1519 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1521 /* Load, add and store ai->aj pol energy */
1522 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1523 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1525 xmm7 = _mm_add_pd(xmm7,tmp);
1527 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1528 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1530 /* Start dadx chain rule terms */
1531 xmm2 = _mm_mul_pd(half,lij2);
1532 xmm3 = _mm_mul_pd(prod,lij3);
1533 xmm2 = _mm_add_pd(xmm2,xmm3);
1534 xmm3 = _mm_mul_pd(lij,rinv);
1535 xmm4 = _mm_mul_pd(lij3,r);
1536 xmm3 = _mm_add_pd(xmm3,xmm4);
1537 xmm3 = _mm_mul_pd(qrtr,xmm3);
1538 t1 = _mm_sub_pd(xmm2,xmm3);
1540 xmm2 = _mm_mul_pd(half,uij2);
1541 xmm2 = _mm_mul_pd(neg,xmm2);
1542 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1543 xmm3 = _mm_mul_pd(xmm3,uij3);
1544 xmm2 = _mm_sub_pd(xmm2,xmm3);
1545 xmm3 = _mm_mul_pd(uij,rinv);
1546 xmm4 = _mm_mul_pd(uij3,r);
1547 xmm3 = _mm_add_pd(xmm3,xmm4);
1548 xmm3 = _mm_mul_pd(qrtr,xmm3);
1549 t2 = _mm_add_pd(xmm2,xmm3);
1551 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1552 xmm2 = _mm_add_pd(one,xmm2);
1553 xmm2 = _mm_mul_pd(eigth,xmm2);
1554 xmm2 = _mm_mul_pd(xmm2,xmm9);
1555 xmm3 = _mm_mul_pd(log_term, rinv);
1556 xmm3 = _mm_mul_pd(xmm3,rinv);
1557 xmm3 = _mm_mul_pd(qrtr,xmm3);
1558 t3 = _mm_add_pd(xmm2,xmm3);
1560 /* chain rule terms */
1561 xmm2 = _mm_mul_pd(dlij,t1);
1562 xmm2 = _mm_add_pd(xmm2,t2);
1563 xmm2 = _mm_add_pd(xmm2,t3);
1564 chrule_ai = _mm_mul_pd(xmm2,rinv);
1566 			/* Store chain rule terms;
1567 			 * same unpacking rationale as with Still above. */
1569 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1570 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1572 _mm_storeu_pd(fr->dadx+n, xmm3);
1573 n = n + 2;
1574 _mm_storeu_pd(fr->dadx+n, xmm4);
1575 n = n + 2;
1578 if(offset!=0)
1580 aj1 = nl->jjnr[k];
1581 aj13 = aj1 * 3;
1582 p1 = md->typeA[aj1];
1584 jx = _mm_load_sd(x+aj13);
1585 jy = _mm_load_sd(x+aj13+1);
1586 jz = _mm_load_sd(x+aj13+2);
1588 sk = _mm_load_sd(born->param+aj1);
1590 /* Load raj */
1591 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1592 raj = _mm_sub_sd(raj,doffset);
1593 raj_inv = my_inv_pd(raj);
1595 dx = _mm_sub_sd(ix,jx);
1596 dy = _mm_sub_pd(iy,jy);
1597 dz = _mm_sub_pd(iz,jz);
1599 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1600 rinv = my_invrsq_pd(rsq11);
1601 r = _mm_mul_sd(rinv,rsq11);
1603 /* OFFSET INTERACTION aj->ai STARTS HERE */
1604 /* conditional mask for rai<dr+sk */
1605 xmm1 = _mm_add_sd(r,sk);
1606 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1608 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1609 xmm2 = _mm_sub_sd(r,sk);
1610 xmm3 = my_inv_pd(xmm2);
1611 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1613 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1614 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1616 uij = my_inv_pd(xmm1);
1617 lij2 = _mm_mul_sd(lij,lij);
1618 lij3 = _mm_mul_sd(lij2,lij);
1619 uij2 = _mm_mul_sd(uij,uij);
1620 uij3 = _mm_mul_sd(uij2,uij);
1622 diff2 = _mm_sub_sd(uij2,lij2);
1624 lij_inv = my_invrsq_pd(lij2);
1625 sk2 = _mm_mul_sd(sk,sk);
1626 sk2_inv = _mm_mul_sd(sk2,rinv);
1627 prod = _mm_mul_sd(qrtr,sk2_inv);
1629 log_term = _mm_mul_pd(uij,lij_inv);
1630 log_term = log_pd(log_term);
1632 xmm1 = _mm_sub_sd(lij,uij);
1633 xmm2 = _mm_mul_sd(qrtr,r);
1634 xmm2 = _mm_mul_sd(xmm2,diff2);
1635 xmm1 = _mm_add_sd(xmm1,xmm2);
1636 xmm2 = _mm_mul_sd(half,rinv);
1637 xmm2 = _mm_mul_sd(xmm2,log_term);
1638 xmm1 = _mm_add_sd(xmm1,xmm2);
1639 xmm9 = _mm_mul_sd(neg,diff2);
1640 xmm2 = _mm_mul_sd(xmm9,prod);
1642 tmp_ai = _mm_add_sd(xmm1,xmm2);
1644 			/* conditional for rai<sk-dr */
1645 xmm3 = _mm_sub_sd(sk,r);
1646 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1648 xmm4 = _mm_sub_sd(rai_inv,lij);
1649 xmm4 = _mm_mul_sd(two,xmm4);
1650 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1652 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1654 			/* tmp_ai now contains two partial values; which of them are actually used */
1655 			/* is governed by the mask_cmp mask. */
1656 tmp_ai = _mm_mul_pd(half,tmp_ai);
1657 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1658 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1660 xmm2 = _mm_mul_sd(half,lij2);
1661 xmm3 = _mm_mul_sd(prod,lij3);
1662 xmm2 = _mm_add_sd(xmm2,xmm3);
1663 xmm3 = _mm_mul_sd(lij,rinv);
1664 xmm4 = _mm_mul_sd(lij3,r);
1665 xmm3 = _mm_add_sd(xmm3,xmm4);
1666 xmm3 = _mm_mul_sd(qrtr,xmm3);
1667 t1 = _mm_sub_sd(xmm2,xmm3);
1669 xmm2 = _mm_mul_sd(half,uij2);
1670 xmm2 = _mm_mul_sd(neg,xmm2);
1671 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1672 xmm3 = _mm_mul_sd(xmm3,uij3);
1673 xmm2 = _mm_sub_sd(xmm2,xmm3);
1674 xmm3 = _mm_mul_sd(uij,rinv);
1675 xmm4 = _mm_mul_sd(uij3,r);
1676 xmm3 = _mm_add_sd(xmm3,xmm4);
1677 xmm3 = _mm_mul_sd(qrtr,xmm3);
1678 t2 = _mm_add_sd(xmm2,xmm3);
1680 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1681 xmm2 = _mm_add_sd(one,xmm2);
1682 xmm2 = _mm_mul_sd(eigth,xmm2);
1683 xmm2 = _mm_mul_sd(xmm2,xmm9);
1684 xmm3 = _mm_mul_sd(log_term, rinv);
1685 xmm3 = _mm_mul_sd(xmm3,rinv);
1686 xmm3 = _mm_mul_sd(qrtr,xmm3);
1687 t3 = _mm_add_sd(xmm2,xmm3);
1689 /* chain rule terms */
1690 xmm2 = _mm_mul_sd(dlij,t1);
1691 xmm2 = _mm_add_sd(xmm2,t2);
1692 xmm2 = _mm_add_sd(xmm2,t3);
1693 chrule = _mm_mul_sd(xmm2,rinv);
1695 /* OFFSET INTERACTION ai->aj STARTS HERE */
1696 /* conditional mask for raj<dr+sk_ai */
1697 xmm1 = _mm_add_sd(r,sk_ai);
1698 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1700 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1701 xmm2 = _mm_sub_sd(r,sk_ai);
1702 xmm3 = my_inv_pd(xmm2);
1703 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1705 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1706 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1708 uij = my_inv_pd(xmm1);
1709 lij2 = _mm_mul_sd(lij,lij);
1710 lij3 = _mm_mul_sd(lij2,lij);
1711 uij2 = _mm_mul_sd(uij,uij);
1712 uij3 = _mm_mul_sd(uij2,uij);
1714 diff2 = _mm_sub_sd(uij2,lij2);
1716 lij_inv = my_invrsq_pd(lij2);
1718 sk2 = sk2_ai;
1719 sk2_inv = _mm_mul_sd(sk2,rinv);
1720 prod = _mm_mul_sd(qrtr,sk2_inv);
1722 log_term = _mm_mul_pd(uij,lij_inv);
1723 log_term = log_pd(log_term);
1725 xmm1 = _mm_sub_sd(lij,uij);
1726 xmm2 = _mm_mul_sd(qrtr,r);
1727 xmm2 = _mm_mul_sd(xmm2,diff2);
1728 xmm1 = _mm_add_sd(xmm1,xmm2);
1729 xmm2 = _mm_mul_sd(half,rinv);
1730 xmm2 = _mm_mul_sd(xmm2,log_term);
1731 xmm1 = _mm_add_sd(xmm1,xmm2);
1732 xmm9 = _mm_mul_sd(neg,diff2);
1733 xmm2 = _mm_mul_sd(xmm9,prod);
1734 tmp = _mm_add_sd(xmm1,xmm2);
1736 			/* conditional for raj<sk_ai-dr */
1737 			xmm3      = _mm_sub_sd(sk_ai,r);
1738 			mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1740 xmm4 = _mm_sub_sd(raj_inv,lij);
1741 xmm4 = _mm_mul_sd(two,xmm4);
1742 xmm4 = _mm_add_sd(tmp,xmm4);
1744 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1746 			/* tmp now contains two partial values; which of them are actually used */
1747 			/* is governed by the mask_cmp mask. */
1748 tmp = _mm_mul_pd(half,tmp);
1749 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1751 /* Load, add and store gpol energy */
1752 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1753 xmm7 = _mm_add_sd(xmm7,tmp);
1754 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1756 /* Start chain rule terms, t1 */
1757 xmm2 = _mm_mul_sd(half,lij2);
1758 xmm3 = _mm_mul_sd(prod,lij3);
1759 xmm2 = _mm_add_sd(xmm2,xmm3);
1760 xmm3 = _mm_mul_sd(lij,rinv);
1761 xmm4 = _mm_mul_sd(lij3,r);
1762 xmm3 = _mm_add_sd(xmm3,xmm4);
1763 xmm3 = _mm_mul_sd(qrtr,xmm3);
1764 t1 = _mm_sub_sd(xmm2,xmm3);
1766 xmm2 = _mm_mul_sd(half,uij2);
1767 xmm2 = _mm_mul_sd(neg,xmm2);
1768 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1769 xmm3 = _mm_mul_sd(xmm3,uij3);
1770 xmm2 = _mm_sub_sd(xmm2,xmm3);
1771 xmm3 = _mm_mul_sd(uij,rinv);
1772 xmm4 = _mm_mul_sd(uij3,r);
1773 xmm3 = _mm_add_sd(xmm3,xmm4);
1774 xmm3 = _mm_mul_sd(qrtr,xmm3);
1775 t2 = _mm_add_sd(xmm2,xmm3);
1777 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1778 xmm2 = _mm_add_sd(one,xmm2);
1779 xmm2 = _mm_mul_sd(eigth,xmm2);
1780 xmm2 = _mm_mul_sd(xmm2,xmm9);
1781 xmm3 = _mm_mul_sd(log_term, rinv);
1782 xmm3 = _mm_mul_sd(xmm3,rinv);
1783 xmm3 = _mm_mul_sd(qrtr,xmm3);
1784 t3 = _mm_add_sd(xmm2,xmm3);
1786 /* chain rule terms */
1787 xmm2 = _mm_mul_sd(dlij,t1);
1788 xmm2 = _mm_add_sd(xmm2,t2);
1789 xmm2 = _mm_add_sd(xmm2,t3);
1790 chrule_ai = _mm_mul_sd(xmm2,rinv);
1792 _mm_store_sd(fr->dadx+n, chrule);
1793 n = n + 1;
1794 _mm_store_sd(fr->dadx+n, chrule_ai);
1795 n = n + 1;
1798 /* Do end processing ... */
1799 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1800 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1801 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1803 /* Load, add and store atom ai polarisation energy */
1804 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1805 sum_ai = _mm_add_sd(sum_ai,xmm2);
1806 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1809 /* Parallel summations */
1810 if(PARTDECOMP(cr))
1812 gmx_sum(natoms, born->gpol_hct_work, cr);
1814 else if(DOMAINDECOMP(cr))
1816 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1819 /* Compute the radii */
1820 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1822 if(born->use[i] != 0)
1824 rr = top->atomtypes.gb_radius[md->typeA[i]];
1825 rr_inv2 = 1.0/rr;
1826 rr = rr-doff;
1827 rr_inv = 1.0/rr;
1828 sum = rr * born->gpol_hct_work[i];
1829 sum2 = sum * sum;
1830 sum3 = sum2 * sum;
1832 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1833 born->bRad[i] = rr_inv - tsum*rr_inv2;
1834 born->bRad[i] = 1.0 / born->bRad[i];
1836 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1838 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1839 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
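			/* The loop above evaluates the OBC expression                               */
			/*   1/R_i = 1/(rho_i - doffset)                                             */
			/*           - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3) / rho_i,           */
			/* with Psi = (rho_i - doffset) * gpol_hct_work[i]; drobc stores the         */
			/* derivative of the tanh term w.r.t. the accumulated sum (divided by        */
			/* rho_i) for use in calc_gb_chainrule_sse2_double() below.                  */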
1843 /* Extra (local) communication required for DD */
1844 if(DOMAINDECOMP(cr))
1846 dd_atom_spread_real(cr->dd, born->bRad);
1847 dd_atom_spread_real(cr->dd, fr->invsqrta);
1848 dd_atom_spread_real(cr->dd, born->drobc);
1851 return 0;
1857 int calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1858 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1860 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1861 int shift;
1862 double rbi,shX,shY,shZ;
1863 double *rb;
1865 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1866 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1867 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1869 t1 = t2 = t3 = _mm_setzero_pd();
1870 rb = born->work;
1872 /* Loop to get the proper form for the Born radius term */
1873 if(gb_algorithm==egbSTILL) {
1874 for(i=0;i<natoms;i++)
1876 rbi = born->bRad[i];
1877 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1881 else if(gb_algorithm==egbHCT) {
1882 for(i=0;i<natoms;i++)
1884 rbi = born->bRad[i];
1885 rb[i] = rbi * rbi * dvda[i];
1889 else if(gb_algorithm==egbOBC) {
1890 for(i=0;i<natoms;i++)
1892 rbi = born->bRad[i];
1893 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
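	/* The rb[] prefactors above fold the algorithm-specific derivative of the   */
	/* Born radius w.r.t. its accumulated sum into dvda, so the pair loop below  */
	/* only needs the stored dadx terms: loosely, for each pair                  */
	/*   f_pair = (rb[ai]*dadx_ai_term + rb[aj]*dadx_aj_term) * dx.              */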
1897 n = 0;
1899 for(i=0;i<nl->nri;i++)
1901 ai = nl->iinr[i];
1902 ai3 = ai * 3;
1904 nj0 = nl->jindex[i];
1905 nj1 = nl->jindex[i+1];
1907 /* Load shifts for this list */
1908 shift = 3*nl->shift[i];
1909 shX = shift_vec[shift+0];
1910 shY = shift_vec[shift+1];
1911 shZ = shift_vec[shift+2];
1913 /* Splat the shifts */
1914 sX = _mm_load1_pd(&shX);
1915 sY = _mm_load1_pd(&shY);
1916 sZ = _mm_load1_pd(&shZ);
1918 offset = (nj1-nj0)%2;
1920 /* Load particle ai coordinates and add shifts */
1921 ix = _mm_load1_pd(xd+ai3);
1922 iy = _mm_load1_pd(xd+ai3+1);
1923 iz = _mm_load1_pd(xd+ai3+2);
1925 ix = _mm_add_pd(sX,ix);
1926 iy = _mm_add_pd(sY,iy);
1927 iz = _mm_add_pd(sZ,iz);
1929 /* Load particle ai dvda */
1930 dva = _mm_load1_pd(rb+ai);
1932 fix = _mm_setzero_pd();
1933 fiy = _mm_setzero_pd();
1934 fiz = _mm_setzero_pd();
1936 for(k=nj0;k<nj1-offset;k+=2)
1938 aj1 = nl->jjnr[k];
1939 aj2 = nl->jjnr[k+1];
1941 /* Load dvda_j */
1942 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1944 aj13 = aj1 * 3;
1945 aj23 = aj2 * 3;
1947 /* Load particle aj1-2 coordinates */
1948 xmm1 = _mm_loadu_pd(xd+aj13);
1949 xmm2 = _mm_loadu_pd(xd+aj23);
1951 xmm5 = _mm_load_sd(xd+aj13+2);
1952 xmm6 = _mm_load_sd(xd+aj23+2);
1954 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1955 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1956 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1958 /* load derivative of born radii w.r.t. coordinates */
1959 xmm7 = _mm_loadu_pd(dadx+n);
1960 n = n + 2;
1961 xmm8 = _mm_loadu_pd(dadx+n);
1962 n = n + 2;
1964 /* Shuffle to get the ai and aj components right */
1965 			dax    = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(0,0)); /* even dadx entries, paired with rb[ai] */
1966 			dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(1,1)); /* odd dadx entries, paired with rb[aj]  */
1968 /* distances */
1969 dx = _mm_sub_pd(ix,jx);
1970 dy = _mm_sub_pd(iy,jy);
1971 dz = _mm_sub_pd(iz,jz);
1973 /* scalar force */
1974 fgb = _mm_mul_pd(dva,dax);
1975 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1976 fgb = _mm_add_pd(fgb,fgb_ai);
1978 /* partial forces */
1979 t1 = _mm_mul_pd(fgb,dx);
1980 t2 = _mm_mul_pd(fgb,dy);
1981 t3 = _mm_mul_pd(fgb,dz);
1983 /* update the i force */
1984 fix = _mm_add_pd(fix,t1);
1985 fiy = _mm_add_pd(fiy,t2);
1986 fiz = _mm_add_pd(fiz,t3);
1988 /* accumulate forces from memory */
1989 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fx2 */
1990 xmm2 = _mm_loadu_pd(f+aj23); /* fy1 fy2 */
1992 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1993 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1995 /* transpose */
1996 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1997 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1998 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
2000 /* subtract partial forces */
2001 xmm5 = _mm_sub_pd(xmm5, t1);
2002 xmm6 = _mm_sub_pd(xmm6, t2);
2003 xmm7 = _mm_sub_pd(xmm7, t3);
2005 /* transpose back fx and fy */
2006 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2007 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2009 /* store the force, first fx's and fy's */
2010 _mm_storeu_pd(f+aj13,xmm1);
2011 _mm_storeu_pd(f+aj23,xmm2);
2013 /* then fz */
2014 _mm_storel_pd(f+aj13+2,xmm7);
2015 _mm_storeh_pd(f+aj23+2,xmm7);
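			/* Note that the same pair force added to fix/fiy/fiz for atom ai above is   */
			/* subtracted from atoms aj1/aj2 here, i.e. the dR/dx contributions obey     */
			/* Newton's third law.                                                       */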
2018 if(offset!=0)
2020 aj1 = nl->jjnr[k];
2022 dvaj = _mm_load_sd(rb+aj1);
2024 aj13 = aj1 * 3;
2026 jx = _mm_load_sd(xd+aj13);
2027 jy = _mm_load_sd(xd+aj13+1);
2028 jz = _mm_load_sd(xd+aj13+2);
2030 dax = _mm_load_sd(dadx+n);
2031 n = n + 1;
2032 dax_ai = _mm_load_sd(dadx+n);
2033 n = n + 1;
2035 dx = _mm_sub_sd(ix,jx);
2036 dy = _mm_sub_sd(iy,jy);
2037 dz = _mm_sub_sd(iz,jz);
2039 fgb = _mm_mul_sd(dva,dax);
2040 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2041 fgb = _mm_add_pd(fgb,fgb_ai);
2043 t1 = _mm_mul_sd(fgb,dx);
2044 t2 = _mm_mul_sd(fgb,dy);
2045 t3 = _mm_mul_sd(fgb,dz);
2047 fix = _mm_add_sd(fix,t1);
2048 fiy = _mm_add_sd(fiy,t2);
2049 fiz = _mm_add_sd(fiz,t3);
2051 /* accumulate forces from memory */
2052 xmm5 = _mm_load_sd(f+aj13);
2053 xmm6 = _mm_load_sd(f+aj13+1);
2054 xmm7 = _mm_load_sd(f+aj13+2);
2056 /* subtract partial forces */
2057 xmm5 = _mm_sub_sd(xmm5, t1);
2058 xmm6 = _mm_sub_sd(xmm6, t2);
2059 xmm7 = _mm_sub_sd(xmm7, t3);
2061 /* store the force */
2062 _mm_store_sd(f+aj13,xmm5);
2063 _mm_store_sd(f+aj13+1,xmm6);
2064 _mm_store_sd(f+aj13+2,xmm7);
2067 		/* fix/fiy/fiz now contain two partial terms that should all be
2068 		 * added to the i particle forces. */
2070 t1 = _mm_unpacklo_pd(t1,fix);
2071 t2 = _mm_unpacklo_pd(t2,fiy);
2072 t3 = _mm_unpacklo_pd(t3,fiz);
2074 fix = _mm_add_pd(fix,t1);
2075 fiy = _mm_add_pd(fiy,t2);
2076 fiz = _mm_add_pd(fiz,t3);
2078 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2079 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2080 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2082 /* load, add and store i forces */
2083 xmm1 = _mm_load_sd(f+ai3);
2084 xmm2 = _mm_load_sd(f+ai3+1);
2085 xmm3 = _mm_load_sd(f+ai3+2);
2087 fix = _mm_add_sd(fix,xmm1);
2088 fiy = _mm_add_sd(fiy,xmm2);
2089 fiz = _mm_add_sd(fiz,xmm3);
2091 _mm_store_sd(f+ai3,fix);
2092 _mm_store_sd(f+ai3+1,fiy);
2093 _mm_store_sd(f+ai3+2,fiz);
2095 /* load, add and store i shift forces */
2096 xmm1 = _mm_load_sd(fshift+shift);
2097 xmm2 = _mm_load_sd(fshift+shift+1);
2098 xmm3 = _mm_load_sd(fshift+shift+2);
2100 fix = _mm_add_sd(fix,xmm1);
2101 fiy = _mm_add_sd(fiy,xmm2);
2102 fiz = _mm_add_sd(fiz,xmm3);
2104 _mm_store_sd(fshift+shift,fix);
2105 _mm_store_sd(fshift+shift+1,fiy);
2106 _mm_store_sd(fshift+shift+2,fiz);
2110 return 0;
2113 #else
2114 /* keep compiler happy */
2115 int genborn_sse_dummy;
2117 #endif /* SSE2 intrinsics available */