#include "gmx_fatal.h"
#include "mtop_util.h"

/* Only compile this file if SSE2 intrinsics are available */
#if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
#include <xmmintrin.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
#define gmx_castpd_si128(a) _mm_castpd_si128(a)
#elif defined(__GNUC__)
#define gmx_castsi128_pd(a) ((__m128d)(a))
#define gmx_castpd_si128(a) ((__m128i)(a))
#else
static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
#endif
static void
sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
{
    const __m128d two_over_pi = {2.0/M_PI, 2.0/M_PI};
    const __m128d half        = {0.5, 0.5};
    const __m128d one         = {1.0, 1.0};
    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0), (13176794.0 / 8388608.0)};
    const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8, 7.5497899548918821691639751442098584e-8};
    const __m128d sincosd_cc0 = {0.00000000206374484196, 0.00000000206374484196};
    const __m128d sincosd_cc1 = {-0.00000027555365134677, -0.00000027555365134677};
    const __m128d sincosd_cc2 = {0.00002480157946764225, 0.00002480157946764225};
    const __m128d sincosd_cc3 = {-0.00138888888730525966, -0.00138888888730525966};
    const __m128d sincosd_cc4 = {0.04166666666651986722, 0.04166666666651986722};
    const __m128d sincosd_cc5 = {-0.49999999999999547304, -0.49999999999999547304};
    const __m128d sincosd_sc0 = {0.00000000015893606014, 0.00000000015893606014};
    const __m128d sincosd_sc1 = {-0.00000002505069049138, -0.00000002505069049138};
    const __m128d sincosd_sc2 = {0.00000275573131527032, 0.00000275573131527032};
    const __m128d sincosd_sc3 = {-0.00019841269827816117, -0.00019841269827816117};
    const __m128d sincosd_sc4 = {0.00833333333331908278, 0.00833333333331908278};
    const __m128d sincosd_sc5 = {-0.16666666666666612594, -0.16666666666666612594};

    __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
    __m128d tiny    = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));

    __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
    __m128i q;
    __m128i offsetSin,offsetCos;
    __m128d sinMask,cosMask,isTiny;
    __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;

    /* Rescale the angle to the range 0..4, and find which quadrant it is in */
    xl = _mm_mul_pd(x,two_over_pi);

    /* q=integer part of xl, rounded _away_ from 0.0 */
    /* Add 0.5 away from 0.0 */
    xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));

    q  = _mm_cvttpd_epi32(xl);
    qd = _mm_cvtepi32_pd(q);
    q  = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));

    /* Compute offset based on quadrant the arg falls in */
    offsetSin = _mm_and_si128(q,ithree);
    offsetCos = _mm_add_epi32(offsetSin,ione);

    /* Remainder in range [-pi/4..pi/4] */
    p1 = _mm_mul_pd(qd,sincosd_kc1);
    xl = _mm_mul_pd(qd,sincosd_kc2);
    p1 = _mm_sub_pd(x,p1);
    xl = _mm_sub_pd(p1,xl);

    absxl  = _mm_andnot_pd(signbit,xl);
    isTiny = _mm_cmpgt_pd(tiny,absxl);

    xl2 = _mm_mul_pd(xl,xl);
    xl3 = _mm_mul_pd(xl2,xl);

    ct0 = _mm_mul_pd(xl2,xl2);
    ct1 = _mm_mul_pd(sincosd_cc0,xl2);
    ct2 = _mm_mul_pd(sincosd_cc2,xl2);
    ct3 = _mm_mul_pd(sincosd_cc4,xl2);
    st1 = _mm_mul_pd(sincosd_sc0,xl2);
    st2 = _mm_mul_pd(sincosd_sc2,xl2);
    st3 = _mm_mul_pd(sincosd_sc4,xl2);
    ct1 = _mm_add_pd(ct1,sincosd_cc1);
    ct2 = _mm_add_pd(ct2,sincosd_cc3);
    ct3 = _mm_add_pd(ct3,sincosd_cc5);
    st1 = _mm_add_pd(st1,sincosd_sc1);
    st2 = _mm_add_pd(st2,sincosd_sc3);
    st3 = _mm_add_pd(st3,sincosd_sc5);

    ct4 = _mm_mul_pd(ct2,ct0);
    ct4 = _mm_add_pd(ct4,ct3);

    st4 = _mm_mul_pd(st2,ct0);
    st4 = _mm_add_pd(st4,st3);
    ct5 = _mm_mul_pd(ct0,ct0);

    ct6 = _mm_mul_pd(ct5,ct1);
    ct6 = _mm_add_pd(ct6,ct4);

    st6 = _mm_mul_pd(ct5,st1);
    st6 = _mm_add_pd(st6,st4);

    cx = _mm_mul_pd(ct6,xl2);
    cx = _mm_add_pd(cx,one);

    sx = _mm_mul_pd(st6,xl3);
    sx = _mm_add_pd(sx,xl);

    /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
    sx = _mm_or_pd( _mm_and_pd(isTiny,xl)  , _mm_andnot_pd(isTiny,sx) );
    cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );

    sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
    cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));

    ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
    tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );

    /* Flip the sign of the result when (offset mod 4) = 1 or 2 */
    sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
    tsn     = _mm_xor_pd(signbit,ts);
    ts      = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );

    cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
    tcn     = _mm_xor_pd(signbit,tc);
    tc      = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );

    *sinval = ts;
    *cosval = tc;

    return;
}
__m128d
log_pd(__m128d x)
{
    const __m128i exp_mask   = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
    const __m128i exp_bias   = _mm_set_epi32(0,1023,0,1023);

    const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
    const __m128d const_one  = _mm_set1_pd(1.0);
    const __m128d const_two  = _mm_set1_pd(2.0);

    /* Almost full single precision accuracy (~20 bits worst case) */
    const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
    const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
    const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
    const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
    const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
    const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
    const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
    const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
    const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
    const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);

    __m128d xmm0,xmm1,xmm2,xmm3,xmm4;
    __m128i xmmi;

    xmmi = gmx_castpd_si128(x);
    /* Extract the biased exponent, remove the bias and convert to double */
    xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
    /* Mask out the exponent bits and put the mantissa in the range [1.0,2.0) */
    xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);

    /* Numerator polynomial P(m), Horner's scheme */
    xmm2 = _mm_mul_pd(P5,xmm0);
    xmm2 = _mm_add_pd(xmm2,P4);
    xmm2 = _mm_mul_pd(xmm2,xmm0);
    xmm2 = _mm_add_pd(xmm2,P3);
    xmm2 = _mm_mul_pd(xmm2,xmm0);
    xmm2 = _mm_add_pd(xmm2,P2);
    xmm2 = _mm_mul_pd(xmm2,xmm0);
    xmm2 = _mm_add_pd(xmm2,P1);
    xmm2 = _mm_mul_pd(xmm2,xmm0);
    xmm2 = _mm_add_pd(xmm2,P0);

    /* Denominator polynomial Q(m) */
    xmm3 = _mm_mul_pd(Q4,xmm0);
    xmm3 = _mm_add_pd(xmm3,Q3);
    xmm3 = _mm_mul_pd(xmm3,xmm0);
    xmm3 = _mm_add_pd(xmm3,Q2);
    xmm3 = _mm_mul_pd(xmm3,xmm0);
    xmm3 = _mm_add_pd(xmm3,Q1);
    xmm3 = _mm_mul_pd(xmm3,xmm0);
    xmm3 = _mm_add_pd(xmm3,const_one);

    /* 1.0/Q(m): single-precision estimate refined with two Newton-Raphson steps */
    xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
    xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
    xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
    xmm2 = _mm_mul_pd(xmm2,xmm4);

    xmm0 = _mm_sub_pd(xmm0, const_one);
    xmm0 = _mm_mul_pd(xmm0,xmm2);

    xmm0 = _mm_add_pd(xmm0,xmm1);

    return _mm_mul_pd(xmm0, const_loge);
}
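/*
 * Reading aid (a sketch, not part of the original code): log_pd() splits
 * x = 2^e * m with m in [1.0,2.0) by masking the IEEE-754 exponent field,
 * approximates log2(m) with the rational expression (m-1)*P(m)/Q(m) above,
 * and returns ln(x) = ln(2) * (e + log2(m)). For example, x = 8.0 gives
 * e = 3 and m = 1.0, so the result is 3*ln(2) ~= 2.0794.
 */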
/* This exp() routine provides accuracy of 10E-9 to 10E-11.
 * The polynomial minimax coefficients are actually accurate to 10E-14,
 * but we lose some accuracy in the polynomial evaluation.
 */
__m128d
exp_pd(__m128d x)
{
    const __m128d lim1  = _mm_set1_pd(1025.0);         /* 1025.00000e+0d */
    const __m128d lim2  = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */

    const __m128i base  = _mm_set_epi32(0,0,1023,1023);
    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d log2e = _mm_set1_pd(1.4426950408889634);

    const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
    const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
    const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
    const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
    const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
    const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
    const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
    const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
    const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
    const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);

    __m128d xmm0,xmm1;
    __m128i xmmi;

    /* Transform to a base-2 exponent and clamp to the representable range */
    xmm0 = _mm_mul_pd(x,log2e);
    xmm0 = _mm_min_pd(xmm0,lim1);
    xmm0 = _mm_max_pd(xmm0,lim2);
    xmm1 = _mm_sub_pd(xmm0,half);

    /* Integer part n of the base-2 exponent */
    xmmi = _mm_cvtpd_epi32(xmm1);
    xmm1 = _mm_cvtepi32_pd(xmmi);

    /* Build 2^n by placing n+1023 in the IEEE-754 exponent field */
    xmmi = _mm_add_epi32(xmmi,base);
    xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
    xmmi = _mm_slli_epi64(xmmi,52);

    /* Fractional remainder */
    xmm0 = _mm_sub_pd(xmm0,xmm1);

    /* Polynomial for 2^frac, Horner's scheme */
    xmm1 = _mm_mul_pd(exp_P9,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P8);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P7);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P6);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P5);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P4);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P3);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P2);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P1);
    xmm1 = _mm_mul_pd(xmm1,xmm0);
    xmm1 = _mm_add_pd(xmm1,exp_P0);
    xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));

    return xmm1;
}
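/*
 * Reading aid (a sketch, not part of the original code): exp_pd() computes
 * exp(x) = 2^(x*log2(e)) as 2^n * 2^f, with n an integer and f the remainder.
 * 2^n is assembled directly in the IEEE-754 bit pattern: (n+1023)<<52 places
 * the biased exponent in the right field, e.g. n=2 gives the bit pattern
 * 0x4010000000000000 = 4.0. The polynomial in exp_P0..exp_P9 then supplies
 * the 2^f factor that multiplies it.
 */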
static inline __m128d
my_invrsq_pd(__m128d x)
{
    const __m128d three = {3.0, 3.0};
    const __m128d half  = {0.5, 0.5};

    __m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
    __m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

    /* First Newton-Raphson step, accuracy is now 24 bits */
    __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));

    /* Return second Newton-Raphson step, accuracy 48 bits */
    return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}
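/*
 * Scalar restatement of the refinement used above (a sketch, not part of the
 * original code): starting from the ~12-bit estimate y0 = rsqrt(x), each
 * Newton-Raphson step
 *
 *     y' = 0.5 * y * (3.0 - x*y*y)
 *
 * roughly doubles the number of correct bits, which is why two steps take
 * the single-precision seed to the ~48-bit accuracy noted in the comments.
 */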
static inline __m128d
my_inv_pd(__m128d x)
{
    const __m128d two = {2.0, 2.0};

    __m128  t  = _mm_rcp_ps(_mm_cvtpd_ps(x));
    __m128d t1 = _mm_cvtps_pd(t);
    __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));

    return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
}
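/*
 * Scalar restatement (a sketch, not part of the original code): my_inv_pd()
 * refines the single-precision reciprocal estimate y0 = rcp(x) with two
 * Newton-Raphson steps of the form
 *
 *     y' = y * (2.0 - x*y)
 *
 * The same update appears inline in log_pd() above for 1.0/Q(m).
 */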
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
    int at0,at1,nj0,nj1,offset,taj1,taj2;
    int shift;

    double factor,gpi_ai,gpi_tmp,gpi2;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
    __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
    __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;

    const __m128d half  = {0.5, 0.5};
    const __m128d three = {3.0, 3.0};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d four  = {4.0, 4.0};

    const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
    const __m128d pip5  = {STILL_PIP5, STILL_PIP5};
    const __m128d p4    = {STILL_P4, STILL_P4};

    factor = 0.5 * ONE_4PI_EPS0;
    n      = 0;

    /* Keep the compiler happy */
    raj  = _mm_setzero_pd();
    vaj  = _mm_setzero_pd();
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();
    xmm2 = _mm_setzero_pd();
    xmm7 = _mm_setzero_pd();

    for(i=0;i<born->nr;i++)
    {
        born->gpol_still_work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        /* Load particle ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Load particle ai gb_radius */
        rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
        vai = _mm_set1_pd(born->vsolv[ai]);
        prod_ai = _mm_mul_pd(p4,vai);

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            taj1 = md->typeA[aj1];
            taj2 = md->typeA[aj2];

            /* Load particle aj1-2 coordinates and compute ai->aj distances */
            xmm1 = _mm_loadu_pd(x+aj13);
            xmm2 = _mm_loadu_pd(x+aj23);
            jx   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));

            jz = _mm_loadl_pd(jz,x+aj13+2);
            jz = _mm_loadh_pd(jz,x+aj23+2);

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            rinv2 = _mm_mul_pd(rinv,rinv);
            rinv4 = _mm_mul_pd(rinv2,rinv2);
            rinv6 = _mm_mul_pd(rinv4,rinv2);

            vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
            vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);

            raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
            raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);

            rvdw  = _mm_add_pd(rai,raj);
            rvdw  = _mm_mul_pd(rvdw,rvdw);
            ratio = _mm_div_pd(rsq11,rvdw);

            mask_cmp = _mm_cmpgt_pd(ratio,p5inv);

            switch(_mm_movemask_pd(mask_cmp))
            {
                case 0x3:
                    /* Both ratios are above the cutoff: ccf=1, dccf=0 */
                    ccf  = one;
                    dccf = zero;
                    break;
                default:
                    theta = _mm_mul_pd(ratio,pip5);
                    sincos_sse2double(theta,&sinq,&cosq);

                    term = _mm_sub_pd(one,cosq);
                    term = _mm_mul_pd(half,term);
                    ccf  = _mm_mul_pd(term,term);
                    dccf = _mm_mul_pd(two,term);
                    dccf = _mm_mul_pd(dccf,sinq);
                    dccf = _mm_mul_pd(dccf,pip5);
                    dccf = _mm_mul_pd(dccf,ratio);

                    ccf  = _mm_or_pd(_mm_and_pd(mask_cmp,one)  ,_mm_andnot_pd(mask_cmp,ccf));
                    dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
                    break;
            }

            prod = _mm_mul_pd(p4,vaj);
            icf4 = _mm_mul_pd(ccf,rinv4);
            xmm2 = _mm_mul_pd(icf4,prod);
            xmm3 = _mm_mul_pd(icf4,prod_ai);
            gpi  = _mm_add_pd(gpi,xmm2);

            /* Load, add and store atom aj gpol energy */
            xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
            xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);

            xmm3 = _mm_add_pd(xmm7,xmm3);

            _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
            _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);

            /* Chain rule terms */
            ccf  = _mm_mul_pd(four,ccf);
            xmm3 = _mm_sub_pd(ccf,dccf);
            icf6 = _mm_mul_pd(xmm3,rinv6);
            xmm1 = _mm_mul_pd(icf6,prod);
            xmm2 = _mm_mul_pd(icf6,prod_ai);

            /* As with single precision, we need to shift stuff around, to change the order of
             * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai etc,
             * to do 2 instead of 4 store operations
             */
            xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
            xmm4 = _mm_unpackhi_pd(xmm1,xmm2);

            _mm_storeu_pd(fr->dadx+n,xmm3);
            n = n + 2;
            _mm_storeu_pd(fr->dadx+n,xmm4);
            n = n + 2;
        }

        /* Deal with odd elements */
        if(offset!=0)
        {
            aj1  = nl->jjnr[k];
            aj13 = aj1 * 3;
            taj1 = md->typeA[aj1];

            jx = _mm_load_sd(x+aj13);
            jy = _mm_load_sd(x+aj13+1);
            jz = _mm_load_sd(x+aj13+2);

            raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
            vaj = _mm_load_sd(born->vsolv+aj1);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            rinv2 = _mm_mul_sd(rinv,rinv);
            rinv4 = _mm_mul_sd(rinv2,rinv2);
            rinv6 = _mm_mul_sd(rinv4,rinv2);

            rvdw  = _mm_add_sd(rai,raj);
            rvdw  = _mm_mul_sd(rvdw,rvdw);
            ratio = _mm_div_sd(rsq11,rvdw);

            mask_cmp = _mm_cmpgt_sd(ratio,p5inv);

            switch(_mm_movemask_pd(mask_cmp))
            {
                case 0x1:
                    /* The single remaining ratio is above the cutoff: ccf=1, dccf=0 */
                    ccf  = one;
                    dccf = zero;
                    break;
                default:
                    theta = _mm_mul_sd(ratio,pip5);
                    sincos_sse2double(theta,&sinq,&cosq);

                    term = _mm_sub_sd(one,cosq);
                    term = _mm_mul_sd(half,term);
                    ccf  = _mm_mul_sd(term,term);
                    dccf = _mm_mul_sd(two,term);
                    dccf = _mm_mul_sd(dccf,sinq);
                    dccf = _mm_mul_sd(dccf,pip5);
                    dccf = _mm_mul_sd(dccf,ratio);

                    ccf  = _mm_or_pd(_mm_and_pd(mask_cmp,one)  ,_mm_andnot_pd(mask_cmp,ccf));
                    dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
                    break;
            }

            prod = _mm_mul_sd(p4,vaj);
            icf4 = _mm_mul_sd(ccf,rinv4);
            xmm2 = _mm_mul_sd(icf4,prod);
            xmm3 = _mm_mul_sd(icf4,prod_ai);
            gpi  = _mm_add_sd(gpi,xmm2);

            /* Load, add and store atom aj gpol energy */
            xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
            xmm3 = _mm_add_sd(xmm7,xmm3);
            _mm_store_sd(born->gpol_still_work+aj1,xmm3);

            /* Chain rule terms */
            ccf  = _mm_mul_sd(four,ccf);
            xmm3 = _mm_sub_sd(ccf,dccf);
            icf6 = _mm_mul_sd(xmm3,rinv6);
            xmm1 = _mm_mul_sd(icf6,prod);
            xmm2 = _mm_mul_sd(icf6,prod_ai);

            /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
            _mm_storel_pd(fr->dadx+n,xmm1);
            n = n + 1;
            _mm_storel_pd(fr->dadx+n,xmm2);
            n = n + 1;
        }

        /* Do end processing ... */
        xmm2 = _mm_unpacklo_pd(xmm2,gpi);
        gpi  = _mm_add_pd(gpi,xmm2);
        gpi  = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2 = _mm_load_sd(born->gpol_still_work+ai);
        gpi  = _mm_add_sd(gpi,xmm2);
        _mm_store_sd(born->gpol_still_work+ai,gpi);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,born->gpol_still_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_still_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            gpi_ai = born->gpol[i] + born->gpol_still_work[i];
            gpi2   = gpi_ai*gpi_ai;

            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
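/*
 * Reading aid (a sketch, not part of the original code): for every pair the
 * Still kernel stores two derivatives of the Born radii into fr->dadx, in
 * the interleaved order the unpack instructions above produce:
 *
 *     dadx[n+0] : ai->aj1 term    dadx[n+1] : aj1->ai term
 *     dadx[n+2] : ai->aj2 term    dadx[n+3] : aj2->ai term
 *
 * calc_gb_chainrule_sse2_double() at the end of this file walks the array in
 * exactly the same order when turning the derivatives into forces.
 */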
int
calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                            const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
    int p1,p2,shift;
    double rr,sum,sum_tmp,min_rad,rad,doff;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,r,rai;
    __m128d rai_inv,sk,sk2,lij,dlij,duij;
    __m128d uij,lij2,uij2,lij3,uij3,diff2;
    __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
    __m128d mask_cmp,mask_cmp2,mask_cmp3;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
    __m128d sum_ai,chrule,chrule_ai,tmp_ai;
    __m128d sk_ai,sk2_ai,raj,raj_inv;

    const __m128d neg   = {-1.0, -1.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d eigth = {0.125, 0.125};
    const __m128d qrtr  = {0.25, 0.25};
    const __m128d half  = {0.5, 0.5};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d three = {3.0, 3.0};

    /* Keep the compiler happy */
    tmp_ai  = _mm_setzero_pd();
    tmp     = _mm_setzero_pd();
    xmm7    = _mm_setzero_pd();
    xmm8    = _mm_setzero_pd();
    raj_inv = _mm_setzero_pd();
    raj     = _mm_setzero_pd();
    sk      = _mm_setzero_pd();
    jx      = _mm_setzero_pd();
    jy      = _mm_setzero_pd();
    jz      = _mm_setzero_pd();

    /* Set the dielectric offset */
    doff    = born->gb_doffset;
    doffset = _mm_load1_pd(&doff);

    n = 0;

    for(i=0;i<born->nr;i++)
    {
        born->gpol_hct_work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        rr      = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
        rai     = _mm_load1_pd(&rr);
        rr      = 1.0/rr;
        rai_inv = _mm_load1_pd(&rr);

        /* Zero out sums for polarisation energies */
        sum_ai = _mm_setzero_pd();

        /* Load ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        sk_ai  = _mm_load1_pd(born->param+ai);
        sk2_ai = _mm_mul_pd(sk_ai,sk_ai);

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            /* Load particle aj1-2 coordinates */
            xmm1 = _mm_loadu_pd(x+aj13);
            xmm2 = _mm_loadu_pd(x+aj23);
            jx   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));

            jz = _mm_loadl_pd(jz, x+aj13+2);
            jz = _mm_loadh_pd(jz, x+aj23+2);

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);
            r     = _mm_mul_pd(rinv,rsq11);

            sk = _mm_loadl_pd(sk,born->param+aj1);
            sk = _mm_loadh_pd(sk,born->param+aj2);

            /* Load aj1,aj2 raj */
            p1 = md->typeA[aj1];
            p2 = md->typeA[aj2];

            raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
            raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
            raj = _mm_sub_pd(raj,doffset);

            /* Compute 1.0/raj */
            raj_inv = my_inv_pd(raj);

            /* INTERACTION aj->ai STARTS HERE */
            /* conditional mask for rai<dr+sk */
            xmm1     = _mm_add_pd(r,sk);
            mask_cmp = _mm_cmplt_pd(rai,xmm1);

            /* conditional for rai>dr-sk, ends with mask_cmp2 */
            xmm2      = _mm_sub_pd(r,sk);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_pd(lij,lij);
            lij3 = _mm_mul_pd(lij2,lij);
            uij2 = _mm_mul_pd(uij,uij);
            uij3 = _mm_mul_pd(uij2,uij);

            diff2 = _mm_sub_pd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);
            sk2     = _mm_mul_pd(sk,sk);
            sk2_inv = _mm_mul_pd(sk2,rinv);
            prod    = _mm_mul_pd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1   = _mm_sub_pd(lij,uij);
            xmm2   = _mm_mul_pd(qrtr,r);
            xmm2   = _mm_mul_pd(xmm2,diff2);
            xmm1   = _mm_add_pd(xmm1,xmm2);
            xmm2   = _mm_mul_pd(half,rinv);
            xmm2   = _mm_mul_pd(xmm2,log_term);
            xmm1   = _mm_add_pd(xmm1,xmm2);
            xmm9   = _mm_mul_pd(neg,diff2);
            xmm2   = _mm_mul_pd(xmm9,prod);
            tmp_ai = _mm_add_pd(xmm1,xmm2);

            /* conditional for rai<sk-dr */
            xmm3      = _mm_sub_pd(sk,r);
            mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */

            xmm4 = _mm_sub_pd(rai_inv,lij);
            xmm4 = _mm_mul_pd(two,xmm4);
            xmm4 = _mm_add_pd(tmp_ai,xmm4);

            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/

            /* tmp_ai now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp_ai = _mm_mul_pd(half,tmp_ai);
            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
            sum_ai = _mm_add_pd(sum_ai,tmp_ai);

            xmm2 = _mm_mul_pd(half,lij2);
            xmm3 = _mm_mul_pd(prod,lij3);
            xmm2 = _mm_add_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(lij,rinv);
            xmm4 = _mm_mul_pd(lij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t1   = _mm_sub_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(half,uij2);
            xmm2 = _mm_mul_pd(neg,xmm2);
            xmm3 = _mm_mul_pd(qrtr,sk2_inv);
            xmm3 = _mm_mul_pd(xmm3,uij3);
            xmm2 = _mm_sub_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(uij,rinv);
            xmm4 = _mm_mul_pd(uij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t2   = _mm_add_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(sk2_inv,rinv);
            xmm2 = _mm_add_pd(one,xmm2);
            xmm2 = _mm_mul_pd(eigth,xmm2);
            xmm2 = _mm_mul_pd(xmm2,xmm9);
            xmm3 = _mm_mul_pd(log_term, rinv);
            xmm3 = _mm_mul_pd(xmm3,rinv);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t3   = _mm_add_pd(xmm2,xmm3);

            /* chain rule terms */
            xmm2 = _mm_mul_pd(dlij,t1);
            xmm2 = _mm_add_pd(xmm2,t2);
            xmm2 = _mm_add_pd(xmm2,t3);

            /* temporary storage of chain rule terms, since we have to compute
             * the reciprocal terms also before storing them */
            chrule = _mm_mul_pd(xmm2,rinv);

            /* INTERACTION ai->aj starts here */
            /* Conditional mask for raj<dr+sk_ai */
            xmm1     = _mm_add_pd(r,sk_ai);
            mask_cmp = _mm_cmplt_pd(raj,xmm1);

            /* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
            xmm2      = _mm_sub_pd(r,sk_ai);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_pd(lij,lij);
            lij3 = _mm_mul_pd(lij2,lij);
            uij2 = _mm_mul_pd(uij,uij);
            uij3 = _mm_mul_pd(uij2,uij);

            diff2 = _mm_sub_pd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);

            sk2     = sk2_ai;
            sk2_inv = _mm_mul_pd(sk2,rinv);
            prod    = _mm_mul_pd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_pd(lij,uij);
            xmm2 = _mm_mul_pd(qrtr,r);
            xmm2 = _mm_mul_pd(xmm2,diff2);
            xmm1 = _mm_add_pd(xmm1,xmm2);
            xmm2 = _mm_mul_pd(half,rinv);
            xmm2 = _mm_mul_pd(xmm2,log_term);
            xmm1 = _mm_add_pd(xmm1,xmm2);
            xmm9 = _mm_mul_pd(neg,diff2);
            xmm2 = _mm_mul_pd(xmm9,prod);
            tmp  = _mm_add_pd(xmm1,xmm2);

            /* conditional for raj<sk_ai-dr */
            xmm3      = _mm_sub_pd(sk_ai,r);
            mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */

            xmm4 = _mm_sub_pd(raj_inv,lij);
            xmm4 = _mm_mul_pd(two,xmm4);
            xmm4 = _mm_add_pd(tmp,xmm4);

            tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/

            /* tmp now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp = _mm_mul_pd(half,tmp);
            tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/

            /* Load, add and store ai->aj pol energy */
            xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
            xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);

            xmm7 = _mm_add_pd(xmm7,tmp);

            _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
            _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);

            /* Start chain rule terms */
            xmm2 = _mm_mul_pd(half,lij2);
            xmm3 = _mm_mul_pd(prod,lij3);
            xmm2 = _mm_add_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(lij,rinv);
            xmm4 = _mm_mul_pd(lij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t1   = _mm_sub_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(half,uij2);
            xmm2 = _mm_mul_pd(neg,xmm2);
            xmm3 = _mm_mul_pd(qrtr,sk2_inv);
            xmm3 = _mm_mul_pd(xmm3,uij3);
            xmm2 = _mm_sub_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(uij,rinv);
            xmm4 = _mm_mul_pd(uij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t2   = _mm_add_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(sk2_inv,rinv);
            xmm2 = _mm_add_pd(one,xmm2);
            xmm2 = _mm_mul_pd(eigth,xmm2);
            xmm2 = _mm_mul_pd(xmm2,xmm9);
            xmm3 = _mm_mul_pd(log_term, rinv);
            xmm3 = _mm_mul_pd(xmm3,rinv);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t3   = _mm_add_pd(xmm2,xmm3);

            /* chain rule terms */
            xmm2      = _mm_mul_pd(dlij,t1);
            xmm2      = _mm_add_pd(xmm2,t2);
            xmm2      = _mm_add_pd(xmm2,t3);
            chrule_ai = _mm_mul_pd(xmm2,rinv);

            /* Store chain rule terms
             * same unpacking rationale as with Still above
             */
            xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
            xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);

            _mm_storeu_pd(fr->dadx+n, xmm3);
            n = n + 2;
            _mm_storeu_pd(fr->dadx+n, xmm4);
            n = n + 2;
        }

        /* Deal with odd elements */
        if(offset!=0)
        {
            aj1  = nl->jjnr[k];
            aj13 = aj1 * 3;
            p1   = md->typeA[aj1];

            jx = _mm_load_sd(x+aj13);
            jy = _mm_load_sd(x+aj13+1);
            jz = _mm_load_sd(x+aj13+2);

            sk = _mm_load_sd(born->param+aj1);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);
            r     = _mm_mul_sd(rinv,rsq11);

            raj     = _mm_load_sd(top->atomtypes.gb_radius+p1);
            raj     = _mm_sub_sd(raj,doffset);
            raj_inv = my_inv_pd(raj);

            /* OFFSET INTERACTIONS aj->ai STARTS HERE */
            /* conditional mask for rai<dr+sk */
            xmm1     = _mm_add_sd(r,sk);
            mask_cmp = _mm_cmplt_sd(rai,xmm1);

            /* conditional for rai>dr-sk, ends with mask_cmp2 */
            xmm2      = _mm_sub_sd(r,sk);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_sd(lij,lij);
            lij3 = _mm_mul_sd(lij2,lij);
            uij2 = _mm_mul_sd(uij,uij);
            uij3 = _mm_mul_sd(uij2,uij);

            diff2 = _mm_sub_sd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);
            sk2     = _mm_mul_sd(sk,sk);
            sk2_inv = _mm_mul_sd(sk2,rinv);
            prod    = _mm_mul_sd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_sd(lij,uij);
            xmm2 = _mm_mul_sd(qrtr,r);
            xmm2 = _mm_mul_sd(xmm2,diff2);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm2 = _mm_mul_sd(half,rinv);
            xmm2 = _mm_mul_sd(xmm2,log_term);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm9 = _mm_mul_sd(neg,diff2);
            xmm2 = _mm_mul_sd(xmm9,prod);

            tmp_ai = _mm_add_sd(xmm1,xmm2);

            /* conditional for rai<sk-dr */
            xmm3      = _mm_sub_sd(sk,r);
            mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */

            xmm4 = _mm_sub_sd(rai_inv,lij);
            xmm4 = _mm_mul_sd(two,xmm4);
            xmm4 = _mm_add_sd(tmp_ai,xmm4);

            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/

            /* tmp_ai now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp_ai = _mm_mul_pd(half,tmp_ai);
            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
            sum_ai = _mm_add_sd(sum_ai,tmp_ai);

            xmm2 = _mm_mul_sd(half,lij2);
            xmm3 = _mm_mul_sd(prod,lij3);
            xmm2 = _mm_add_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(lij,rinv);
            xmm4 = _mm_mul_sd(lij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t1   = _mm_sub_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(half,uij2);
            xmm2 = _mm_mul_sd(neg,xmm2);
            xmm3 = _mm_mul_sd(qrtr,sk2_inv);
            xmm3 = _mm_mul_sd(xmm3,uij3);
            xmm2 = _mm_sub_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(uij,rinv);
            xmm4 = _mm_mul_sd(uij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t2   = _mm_add_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(sk2_inv,rinv);
            xmm2 = _mm_add_sd(one,xmm2);
            xmm2 = _mm_mul_sd(eigth,xmm2);
            xmm2 = _mm_mul_sd(xmm2,xmm9);
            xmm3 = _mm_mul_sd(log_term, rinv);
            xmm3 = _mm_mul_sd(xmm3,rinv);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t3   = _mm_add_sd(xmm2,xmm3);

            /* chain rule terms */
            xmm2 = _mm_mul_sd(dlij,t1);
            xmm2 = _mm_add_sd(xmm2,t2);
            xmm2 = _mm_add_sd(xmm2,t3);

            /* temporary storage of chain rule terms, since we have to compute
             * the reciprocal terms also before storing them */
            chrule = _mm_mul_sd(xmm2,rinv);

            /* OFFSET INTERACTION ai->aj starts here */
            /* conditional mask for raj<dr+sk_ai */
            xmm1     = _mm_add_sd(r,sk_ai);
            mask_cmp = _mm_cmplt_sd(raj,xmm1);

            /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
            xmm2      = _mm_sub_sd(r,sk_ai);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_sd(lij,lij);
            lij3 = _mm_mul_sd(lij2,lij);
            uij2 = _mm_mul_sd(uij,uij);
            uij3 = _mm_mul_sd(uij2,uij);

            diff2 = _mm_sub_sd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);

            sk2     = sk2_ai;
            sk2_inv = _mm_mul_sd(sk2,rinv);
            prod    = _mm_mul_sd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_sd(lij,uij);
            xmm2 = _mm_mul_sd(qrtr,r);
            xmm2 = _mm_mul_sd(xmm2,diff2);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm2 = _mm_mul_sd(half,rinv);
            xmm2 = _mm_mul_sd(xmm2,log_term);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm9 = _mm_mul_sd(neg,diff2);
            xmm2 = _mm_mul_sd(xmm9,prod);
            tmp  = _mm_add_sd(xmm1,xmm2);

            /* conditional for raj<sk_ai-dr */
            xmm3      = _mm_sub_sd(sk_ai,r);
            mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */

            xmm4 = _mm_sub_sd(raj_inv,lij);
            xmm4 = _mm_mul_sd(two,xmm4);
            xmm4 = _mm_add_sd(tmp,xmm4);

            tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/

            /* tmp now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp = _mm_mul_pd(half,tmp);
            tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/

            /* Load, add and store gpol energy */
            xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
            xmm7 = _mm_add_sd(xmm7,tmp);
            _mm_store_sd(born->gpol_hct_work+aj1,xmm7);

            /* Start chain rule terms, t1 */
            xmm2 = _mm_mul_sd(half,lij2);
            xmm3 = _mm_mul_sd(prod,lij3);
            xmm2 = _mm_add_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(lij,rinv);
            xmm4 = _mm_mul_sd(lij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t1   = _mm_sub_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(half,uij2);
            xmm2 = _mm_mul_sd(neg,xmm2);
            xmm3 = _mm_mul_sd(qrtr,sk2_inv);
            xmm3 = _mm_mul_sd(xmm3,uij3);
            xmm2 = _mm_sub_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(uij,rinv);
            xmm4 = _mm_mul_sd(uij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t2   = _mm_add_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(sk2_inv,rinv);
            xmm2 = _mm_add_sd(one,xmm2);
            xmm2 = _mm_mul_sd(eigth,xmm2);
            xmm2 = _mm_mul_sd(xmm2,xmm9);
            xmm3 = _mm_mul_sd(log_term, rinv);
            xmm3 = _mm_mul_sd(xmm3,rinv);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t3   = _mm_add_sd(xmm2,xmm3);

            /* chain rule terms */
            xmm2      = _mm_mul_sd(dlij,t1);
            xmm2      = _mm_add_sd(xmm2,t2);
            xmm2      = _mm_add_sd(xmm2,t3);
            chrule_ai = _mm_mul_sd(xmm2,rinv);

            _mm_store_sd(fr->dadx+n, chrule);
            n = n + 1;
            _mm_store_sd(fr->dadx+n, chrule_ai);
            n = n + 1;
        }

        /* Do end processing ... */
        tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
        sum_ai = _mm_add_pd(sum_ai,tmp_ai);
        sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2   = _mm_load_sd(born->gpol_hct_work+ai);
        sum_ai = _mm_add_sd(sum_ai,xmm2);
        _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, born->gpol_hct_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            rr      = top->atomtypes.gb_radius[md->typeA[i]]-doff;
            sum     = 1.0/rr - born->gpol_hct_work[i];
            min_rad = rr + doff;
            rad     = 1.0/sum;

            born->bRad[i]   = rad > min_rad ? rad : min_rad;
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
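/*
 * Scalar restatement of the vectorized arithmetic above (a sketch, not part
 * of the original code). With dr the interatomic distance, sk the descreening
 * parameter of the other atom, lij/uij the reciprocal integration bounds and
 * diff2 = uij^2 - lij^2, each accepted pair contributes
 *
 *     tmp = lij - uij + 0.25*dr*diff2
 *           + 0.5*(1.0/dr)*log(uij/lij)
 *           - 0.25*(sk*sk/dr)*diff2
 *
 * (times 0.5, plus the 2*(1/rai - lij) correction when rai < sk-dr) to
 * born->gpol_hct_work[], from which the radius is 1.0/(1.0/rai - sum).
 */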
int
calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                            const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
    int p1,p2,p3,p4,shift;
    double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
    double tsum,tchain,rr_inv,rr_inv2,gbr;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,r,rai;
    __m128d rai_inv,sk,sk2,lij,dlij,duij;
    __m128d uij,lij2,uij2,lij3,uij3,diff2;
    __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
    __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
    __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;

    const __m128d neg   = {-1.0, -1.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d eigth = {0.125, 0.125};
    const __m128d qrtr  = {0.25, 0.25};
    const __m128d half  = {0.5, 0.5};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d three = {3.0, 3.0};

    /* Keep the compiler happy */
    tmp_ai  = _mm_setzero_pd();
    tmp     = _mm_setzero_pd();
    raj_inv = _mm_setzero_pd();
    raj     = _mm_setzero_pd();
    sk      = _mm_setzero_pd();
    jx      = _mm_setzero_pd();
    jy      = _mm_setzero_pd();
    jz      = _mm_setzero_pd();
    xmm7    = _mm_setzero_pd();
    xmm8    = _mm_setzero_pd();

    /* Set the dielectric offset */
    doff    = born->gb_doffset;
    doffset = _mm_load1_pd(&doff);

    n = 0;

    for(i=0;i<born->nr;i++)
    {
        born->gpol_hct_work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        rr      = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
        rai     = _mm_load1_pd(&rr);
        rr      = 1.0/rr;
        rai_inv = _mm_load1_pd(&rr);

        /* Load ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Zero out sums for polarisation energies */
        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ai);
        sk2_ai = _mm_mul_pd(sk_ai,sk_ai);

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            /* Load particle aj1-2 coordinates */
            xmm1 = _mm_loadu_pd(x+aj13);
            xmm2 = _mm_loadu_pd(x+aj23);
            jx   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));

            jz = _mm_loadl_pd(jz, x+aj13+2);
            jz = _mm_loadh_pd(jz, x+aj23+2);

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);
            r     = _mm_mul_pd(rinv,rsq11);

            /* Load atom aj1,aj2 raj */
            p1 = md->typeA[aj1];
            p2 = md->typeA[aj2];

            raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
            raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
            raj = _mm_sub_pd(raj,doffset);

            /* Compute 1.0/raj */
            raj_inv = my_inv_pd(raj);

            sk = _mm_loadl_pd(sk,born->param+aj1);
            sk = _mm_loadh_pd(sk,born->param+aj2);

            /* INTERACTION aj->ai STARTS HERE */
            /* conditional mask for rai<dr+sk */
            xmm1     = _mm_add_pd(r,sk);
            mask_cmp = _mm_cmplt_pd(rai,xmm1);

            /* conditional for rai>dr-sk, ends with mask_cmp2 */
            xmm2      = _mm_sub_pd(r,sk);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_pd(lij,lij);
            lij3 = _mm_mul_pd(lij2,lij);
            uij2 = _mm_mul_pd(uij,uij);
            uij3 = _mm_mul_pd(uij2,uij);

            diff2 = _mm_sub_pd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);
            sk2     = _mm_mul_pd(sk,sk);
            sk2_inv = _mm_mul_pd(sk2,rinv);
            prod    = _mm_mul_pd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1   = _mm_sub_pd(lij,uij);
            xmm2   = _mm_mul_pd(qrtr,r);
            xmm2   = _mm_mul_pd(xmm2,diff2);
            xmm1   = _mm_add_pd(xmm1,xmm2);
            xmm2   = _mm_mul_pd(half,rinv);
            xmm2   = _mm_mul_pd(xmm2,log_term);
            xmm1   = _mm_add_pd(xmm1,xmm2);
            xmm9   = _mm_mul_pd(neg,diff2);
            xmm2   = _mm_mul_pd(xmm9,prod);
            tmp_ai = _mm_add_pd(xmm1,xmm2);

            /* conditional for rai<sk-dr */
            xmm3      = _mm_sub_pd(sk,r);
            mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */

            xmm4 = _mm_sub_pd(rai_inv,lij);
            xmm4 = _mm_mul_pd(two,xmm4);
            xmm4 = _mm_add_pd(tmp_ai,xmm4);

            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/

            /* tmp_ai now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp_ai = _mm_mul_pd(half,tmp_ai);
            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
            sum_ai = _mm_add_pd(sum_ai,tmp_ai);

            /* Start the dadx chain rule terms */
            xmm2 = _mm_mul_pd(half,lij2);
            xmm3 = _mm_mul_pd(prod,lij3);
            xmm2 = _mm_add_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(lij,rinv);
            xmm4 = _mm_mul_pd(lij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t1   = _mm_sub_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(half,uij2);
            xmm2 = _mm_mul_pd(neg,xmm2);
            xmm3 = _mm_mul_pd(qrtr,sk2_inv);
            xmm3 = _mm_mul_pd(xmm3,uij3);
            xmm2 = _mm_sub_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(uij,rinv);
            xmm4 = _mm_mul_pd(uij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t2   = _mm_add_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(sk2_inv,rinv);
            xmm2 = _mm_add_pd(one,xmm2);
            xmm2 = _mm_mul_pd(eigth,xmm2);
            xmm2 = _mm_mul_pd(xmm2,xmm9);
            xmm3 = _mm_mul_pd(log_term, rinv);
            xmm3 = _mm_mul_pd(xmm3,rinv);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t3   = _mm_add_pd(xmm2,xmm3);

            /* chain rule terms */
            xmm2 = _mm_mul_pd(dlij,t1);
            xmm2 = _mm_add_pd(xmm2,t2);
            xmm2 = _mm_add_pd(xmm2,t3);

            /* temporary storage of chain rule terms, since we have to compute
             * the reciprocal terms also before storing them */
            chrule = _mm_mul_pd(xmm2,rinv);

            /* INTERACTION ai->aj STARTS HERE */
            /* conditional mask for raj<dr+sk_ai */
            xmm1     = _mm_add_pd(r,sk_ai);
            mask_cmp = _mm_cmplt_pd(raj,xmm1);

            /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
            xmm2      = _mm_sub_pd(r,sk_ai);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_pd(lij,lij);
            lij3 = _mm_mul_pd(lij2,lij);
            uij2 = _mm_mul_pd(uij,uij);
            uij3 = _mm_mul_pd(uij2,uij);

            diff2 = _mm_sub_pd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);

            sk2     = sk2_ai;
            sk2_inv = _mm_mul_pd(sk2,rinv);
            prod    = _mm_mul_pd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_pd(lij,uij);
            xmm2 = _mm_mul_pd(qrtr,r);
            xmm2 = _mm_mul_pd(xmm2,diff2);
            xmm1 = _mm_add_pd(xmm1,xmm2);
            xmm2 = _mm_mul_pd(half,rinv);
            xmm2 = _mm_mul_pd(xmm2,log_term);
            xmm1 = _mm_add_pd(xmm1,xmm2);
            xmm9 = _mm_mul_pd(neg,diff2);
            xmm2 = _mm_mul_pd(xmm9,prod);
            tmp  = _mm_add_pd(xmm1,xmm2);

            /* conditional for raj<sk_ai-dr */
            xmm3      = _mm_sub_pd(sk_ai,r);
            mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */

            xmm4 = _mm_sub_pd(raj_inv,lij);
            xmm4 = _mm_mul_pd(two,xmm4);
            xmm4 = _mm_add_pd(tmp,xmm4);

            tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/

            /* tmp now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp = _mm_mul_pd(half,tmp);
            tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/

            /* Load, add and store ai->aj pol energy */
            xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
            xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);

            xmm7 = _mm_add_pd(xmm7,tmp);

            _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
            _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);

            /* Start dadx chain rule terms */
            xmm2 = _mm_mul_pd(half,lij2);
            xmm3 = _mm_mul_pd(prod,lij3);
            xmm2 = _mm_add_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(lij,rinv);
            xmm4 = _mm_mul_pd(lij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t1   = _mm_sub_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(half,uij2);
            xmm2 = _mm_mul_pd(neg,xmm2);
            xmm3 = _mm_mul_pd(qrtr,sk2_inv);
            xmm3 = _mm_mul_pd(xmm3,uij3);
            xmm2 = _mm_sub_pd(xmm2,xmm3);
            xmm3 = _mm_mul_pd(uij,rinv);
            xmm4 = _mm_mul_pd(uij3,r);
            xmm3 = _mm_add_pd(xmm3,xmm4);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t2   = _mm_add_pd(xmm2,xmm3);

            xmm2 = _mm_mul_pd(sk2_inv,rinv);
            xmm2 = _mm_add_pd(one,xmm2);
            xmm2 = _mm_mul_pd(eigth,xmm2);
            xmm2 = _mm_mul_pd(xmm2,xmm9);
            xmm3 = _mm_mul_pd(log_term, rinv);
            xmm3 = _mm_mul_pd(xmm3,rinv);
            xmm3 = _mm_mul_pd(qrtr,xmm3);
            t3   = _mm_add_pd(xmm2,xmm3);

            /* chain rule terms */
            xmm2      = _mm_mul_pd(dlij,t1);
            xmm2      = _mm_add_pd(xmm2,t2);
            xmm2      = _mm_add_pd(xmm2,t3);
            chrule_ai = _mm_mul_pd(xmm2,rinv);

            /* Store chain rule terms
             * same unpacking rationale as with Still above
             */
            xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
            xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);

            _mm_storeu_pd(fr->dadx+n, xmm3);
            n = n + 2;
            _mm_storeu_pd(fr->dadx+n, xmm4);
            n = n + 2;
        }

        /* Deal with odd elements */
        if(offset!=0)
        {
            aj1  = nl->jjnr[k];
            aj13 = aj1 * 3;
            p1   = md->typeA[aj1];

            jx = _mm_load_sd(x+aj13);
            jy = _mm_load_sd(x+aj13+1);
            jz = _mm_load_sd(x+aj13+2);

            sk = _mm_load_sd(born->param+aj1);

            raj     = _mm_load_sd(top->atomtypes.gb_radius+p1);
            raj     = _mm_sub_sd(raj,doffset);
            raj_inv = my_inv_pd(raj);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);
            r     = _mm_mul_sd(rinv,rsq11);

            /* OFFSET INTERACTION aj->ai STARTS HERE */
            /* conditional mask for rai<dr+sk */
            xmm1     = _mm_add_sd(r,sk);
            mask_cmp = _mm_cmplt_sd(rai,xmm1);

            /* conditional for rai>dr-sk, ends with mask_cmp2 */
            xmm2      = _mm_sub_sd(r,sk);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_sd(lij,lij);
            lij3 = _mm_mul_sd(lij2,lij);
            uij2 = _mm_mul_sd(uij,uij);
            uij3 = _mm_mul_sd(uij2,uij);

            diff2 = _mm_sub_sd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);
            sk2     = _mm_mul_sd(sk,sk);
            sk2_inv = _mm_mul_sd(sk2,rinv);
            prod    = _mm_mul_sd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_sd(lij,uij);
            xmm2 = _mm_mul_sd(qrtr,r);
            xmm2 = _mm_mul_sd(xmm2,diff2);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm2 = _mm_mul_sd(half,rinv);
            xmm2 = _mm_mul_sd(xmm2,log_term);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm9 = _mm_mul_sd(neg,diff2);
            xmm2 = _mm_mul_sd(xmm9,prod);

            tmp_ai = _mm_add_sd(xmm1,xmm2);

            /* conditional for rai<sk-dr */
            xmm3      = _mm_sub_sd(sk,r);
            mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */

            xmm4 = _mm_sub_sd(rai_inv,lij);
            xmm4 = _mm_mul_sd(two,xmm4);
            xmm4 = _mm_add_sd(tmp_ai,xmm4);

            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/

            /* tmp_ai now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp_ai = _mm_mul_pd(half,tmp_ai);
            tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
            sum_ai = _mm_add_sd(sum_ai,tmp_ai);

            xmm2 = _mm_mul_sd(half,lij2);
            xmm3 = _mm_mul_sd(prod,lij3);
            xmm2 = _mm_add_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(lij,rinv);
            xmm4 = _mm_mul_sd(lij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t1   = _mm_sub_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(half,uij2);
            xmm2 = _mm_mul_sd(neg,xmm2);
            xmm3 = _mm_mul_sd(qrtr,sk2_inv);
            xmm3 = _mm_mul_sd(xmm3,uij3);
            xmm2 = _mm_sub_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(uij,rinv);
            xmm4 = _mm_mul_sd(uij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t2   = _mm_add_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(sk2_inv,rinv);
            xmm2 = _mm_add_sd(one,xmm2);
            xmm2 = _mm_mul_sd(eigth,xmm2);
            xmm2 = _mm_mul_sd(xmm2,xmm9);
            xmm3 = _mm_mul_sd(log_term, rinv);
            xmm3 = _mm_mul_sd(xmm3,rinv);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t3   = _mm_add_sd(xmm2,xmm3);

            /* chain rule terms */
            xmm2   = _mm_mul_sd(dlij,t1);
            xmm2   = _mm_add_sd(xmm2,t2);
            xmm2   = _mm_add_sd(xmm2,t3);
            chrule = _mm_mul_sd(xmm2,rinv);

            /* OFFSET INTERACTION ai->aj STARTS HERE */
            /* conditional mask for raj<dr+sk_ai */
            xmm1     = _mm_add_sd(r,sk_ai);
            mask_cmp = _mm_cmplt_sd(raj,xmm1);

            /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
            xmm2      = _mm_sub_sd(r,sk_ai);
            xmm3      = my_inv_pd(xmm2);
            mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);

            lij  = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
            dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));

            uij  = my_inv_pd(xmm1);
            lij2 = _mm_mul_sd(lij,lij);
            lij3 = _mm_mul_sd(lij2,lij);
            uij2 = _mm_mul_sd(uij,uij);
            uij3 = _mm_mul_sd(uij2,uij);

            diff2 = _mm_sub_sd(uij2,lij2);

            lij_inv = my_invrsq_pd(lij2);

            sk2     = sk2_ai;
            sk2_inv = _mm_mul_sd(sk2,rinv);
            prod    = _mm_mul_sd(qrtr,sk2_inv);

            log_term = _mm_mul_pd(uij,lij_inv);
            log_term = log_pd(log_term);

            xmm1 = _mm_sub_sd(lij,uij);
            xmm2 = _mm_mul_sd(qrtr,r);
            xmm2 = _mm_mul_sd(xmm2,diff2);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm2 = _mm_mul_sd(half,rinv);
            xmm2 = _mm_mul_sd(xmm2,log_term);
            xmm1 = _mm_add_sd(xmm1,xmm2);
            xmm9 = _mm_mul_sd(neg,diff2);
            xmm2 = _mm_mul_sd(xmm9,prod);
            tmp  = _mm_add_sd(xmm1,xmm2);

            /* conditional for raj<sk_ai-dr */
            xmm3      = _mm_sub_sd(sk_ai,r);
            mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */

            xmm4 = _mm_sub_sd(raj_inv,lij);
            xmm4 = _mm_mul_sd(two,xmm4);
            xmm4 = _mm_add_sd(tmp,xmm4);

            tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/

            /* tmp now contains two partial values, not all of which are to be used.
             * Which ones are used is governed by the mask_cmp mask.
             */
            tmp = _mm_mul_pd(half,tmp);
            tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/

            /* Load, add and store gpol energy */
            xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
            xmm7 = _mm_add_sd(xmm7,tmp);
            _mm_store_sd(born->gpol_hct_work+aj1,xmm7);

            /* Start chain rule terms, t1 */
            xmm2 = _mm_mul_sd(half,lij2);
            xmm3 = _mm_mul_sd(prod,lij3);
            xmm2 = _mm_add_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(lij,rinv);
            xmm4 = _mm_mul_sd(lij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t1   = _mm_sub_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(half,uij2);
            xmm2 = _mm_mul_sd(neg,xmm2);
            xmm3 = _mm_mul_sd(qrtr,sk2_inv);
            xmm3 = _mm_mul_sd(xmm3,uij3);
            xmm2 = _mm_sub_sd(xmm2,xmm3);
            xmm3 = _mm_mul_sd(uij,rinv);
            xmm4 = _mm_mul_sd(uij3,r);
            xmm3 = _mm_add_sd(xmm3,xmm4);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t2   = _mm_add_sd(xmm2,xmm3);

            xmm2 = _mm_mul_sd(sk2_inv,rinv);
            xmm2 = _mm_add_sd(one,xmm2);
            xmm2 = _mm_mul_sd(eigth,xmm2);
            xmm2 = _mm_mul_sd(xmm2,xmm9);
            xmm3 = _mm_mul_sd(log_term, rinv);
            xmm3 = _mm_mul_sd(xmm3,rinv);
            xmm3 = _mm_mul_sd(qrtr,xmm3);
            t3   = _mm_add_sd(xmm2,xmm3);

            /* chain rule terms */
            xmm2      = _mm_mul_sd(dlij,t1);
            xmm2      = _mm_add_sd(xmm2,t2);
            xmm2      = _mm_add_sd(xmm2,t3);
            chrule_ai = _mm_mul_sd(xmm2,rinv);

            _mm_store_sd(fr->dadx+n, chrule);
            n = n + 1;
            _mm_store_sd(fr->dadx+n, chrule_ai);
            n = n + 1;
        }

        /* Do end processing ... */
        tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
        sum_ai = _mm_add_pd(sum_ai,tmp_ai);
        sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2   = _mm_load_sd(born->gpol_hct_work+ai);
        sum_ai = _mm_add_sd(sum_ai,xmm2);
        _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, born->gpol_hct_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            rr      = top->atomtypes.gb_radius[md->typeA[i]];
            rr_inv2 = 1.0/rr;
            rr      = rr-doff;
            rr_inv  = 1.0/rr;
            sum     = rr * born->gpol_hct_work[i];
            sum2    = sum  * sum;
            sum3    = sum2 * sum;

            tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
            born->bRad[i] = rr_inv - tsum*rr_inv2;
            born->bRad[i] = 1.0 / born->bRad[i];

            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

            tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
            born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
        dd_atom_spread_real(cr->dd, born->drobc);
    }

    return 0;
}
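/*
 * Scalar restatement of the OBC end processing above (a sketch, not part of
 * the original code). With psi = (rr-doff)*gpol_hct_work[i] the scaled
 * pairwise sum, the radius and its derivative factor are
 *
 *     tsum   = tanh(alpha*psi - beta*psi^2 + gamma*psi^3)
 *     1/bRad = 1.0/(rr-doff) - tsum/rr
 *     drobc  = (1 - tsum^2) * (rr-doff)*(alpha - 2*beta*psi + 3*gamma*psi^2) / rr
 *
 * i.e. the Onufriev-Bashford-Case correction applied to the HCT sum.
 */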
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
                              double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
{
    int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
    int shift;
    double rbi,shX,shY,shZ;
    double *rb;

    __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
    __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;

    t1 = t2 = t3 = _mm_setzero_pd();
    rb = born->work;

    /* Loop to get the proper form for the Born radius term */
    if(gb_algorithm==egbSTILL) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    n = 0;

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = 3*nl->shift[i];
        shX   = shift_vec[shift+0];
        shY   = shift_vec[shift+1];
        shZ   = shift_vec[shift+2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Load particle ai coordinates and add shifts */
        ix = _mm_load1_pd(xd+ai3);
        iy = _mm_load1_pd(xd+ai3+1);
        iz = _mm_load1_pd(xd+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Load particle ai dvda */
        dva = _mm_load1_pd(rb+ai);

        fix = _mm_setzero_pd();
        fiy = _mm_setzero_pd();
        fiz = _mm_setzero_pd();

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            dvaj = _mm_set_pd(rb[aj2],rb[aj1]);

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            /* Load particle aj1-2 coordinates */
            xmm1 = _mm_loadu_pd(xd+aj13);
            xmm2 = _mm_loadu_pd(xd+aj23);

            xmm5 = _mm_load_sd(xd+aj13+2);
            xmm6 = _mm_load_sd(xd+aj23+2);

            jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

            /* load derivative of born radii w.r.t. coordinates */
            xmm7 = _mm_loadu_pd(dadx+n);
            n    = n + 2;
            xmm8 = _mm_loadu_pd(dadx+n);
            n    = n + 2;

            /* Shuffle to get the ai and aj components right */
            dax    = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
            dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            fgb    = _mm_mul_pd(dva,dax);
            fgb_ai = _mm_mul_pd(dvaj,dax_ai);
            fgb    = _mm_add_pd(fgb,fgb_ai);

            /* partial forces */
            t1 = _mm_mul_pd(fgb,dx);
            t2 = _mm_mul_pd(fgb,dy);
            t3 = _mm_mul_pd(fgb,dz);

            /* update the i force */
            fix = _mm_add_pd(fix,t1);
            fiy = _mm_add_pd(fiy,t2);
            fiz = _mm_add_pd(fiz,t3);

            /* accumulate forces from memory */
            xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
            xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */

            xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
            xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */

            xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
            xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
            xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5 = _mm_sub_pd(xmm5, t1);
            xmm6 = _mm_sub_pd(xmm6, t2);
            xmm7 = _mm_sub_pd(xmm7, t3);

            /* transpose back fx and fy */
            xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
            xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));

            /* store the force, first fx's and fy's */
            _mm_storeu_pd(f+aj13,xmm1);
            _mm_storeu_pd(f+aj23,xmm2);

            _mm_storel_pd(f+aj13+2,xmm7);
            _mm_storeh_pd(f+aj23+2,xmm7);
        }

        /* deal with odd elements */
        if(offset!=0)
        {
            aj1 = nl->jjnr[k];

            dvaj = _mm_load_sd(rb+aj1);

            aj13 = aj1 * 3;

            jx = _mm_load_sd(xd+aj13);
            jy = _mm_load_sd(xd+aj13+1);
            jz = _mm_load_sd(xd+aj13+2);

            dax    = _mm_load_sd(dadx+n);
            n      = n + 1;
            dax_ai = _mm_load_sd(dadx+n);
            n      = n + 1;

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            fgb    = _mm_mul_sd(dva,dax);
            fgb_ai = _mm_mul_sd(dvaj,dax_ai);
            fgb    = _mm_add_pd(fgb,fgb_ai);

            t1 = _mm_mul_sd(fgb,dx);
            t2 = _mm_mul_sd(fgb,dy);
            t3 = _mm_mul_sd(fgb,dz);

            fix = _mm_add_sd(fix,t1);
            fiy = _mm_add_sd(fiy,t2);
            fiz = _mm_add_sd(fiz,t3);

            /* accumulate forces from memory */
            xmm5 = _mm_load_sd(f+aj13);
            xmm6 = _mm_load_sd(f+aj13+1);
            xmm7 = _mm_load_sd(f+aj13+2);

            /* subtract partial forces */
            xmm5 = _mm_sub_sd(xmm5, t1);
            xmm6 = _mm_sub_sd(xmm6, t2);
            xmm7 = _mm_sub_sd(xmm7, t3);

            /* store the force */
            _mm_store_sd(f+aj13,xmm5);
            _mm_store_sd(f+aj13+1,xmm6);
            _mm_store_sd(f+aj13+2,xmm7);
        }

        /* fix/fiy/fiz now contain two partial terms, that all should be
         * added to the i particle forces
         */
        t1 = _mm_unpacklo_pd(t1,fix);
        t2 = _mm_unpacklo_pd(t2,fiy);
        t3 = _mm_unpacklo_pd(t3,fiz);

        fix = _mm_add_pd(fix,t1);
        fiy = _mm_add_pd(fiy,t2);
        fiz = _mm_add_pd(fiz,t3);

        fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
        fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
        fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

        /* load, add and store i forces */
        xmm1 = _mm_load_sd(f+ai3);
        xmm2 = _mm_load_sd(f+ai3+1);
        xmm3 = _mm_load_sd(f+ai3+2);

        fix = _mm_add_sd(fix,xmm1);
        fiy = _mm_add_sd(fiy,xmm2);
        fiz = _mm_add_sd(fiz,xmm3);

        _mm_store_sd(f+ai3,fix);
        _mm_store_sd(f+ai3+1,fiy);
        _mm_store_sd(f+ai3+2,fiz);

        /* load, add and store i shift forces */
        xmm1 = _mm_load_sd(fshift+shift);
        xmm2 = _mm_load_sd(fshift+shift+1);
        xmm3 = _mm_load_sd(fshift+shift+2);

        fix = _mm_add_sd(fix,xmm1);
        fiy = _mm_add_sd(fiy,xmm2);
        fiz = _mm_add_sd(fiz,xmm3);

        _mm_store_sd(fshift+shift,fix);
        _mm_store_sd(fshift+shift+1,fiy);
        _mm_store_sd(fshift+shift+2,fiz);
    }

    return 0;
}
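/*
 * Scalar restatement of the force assembly above (a sketch, not part of the
 * original code). For a pair (ai,aj) with stored derivatives dadx_ij (ai->aj)
 * and dadx_ji (aj->ai) and the algorithm-specific prefactors rb[]:
 *
 *     fgb = rb[ai]*dadx_ij + rb[aj]*dadx_ji
 *     f[ai] += fgb*(x[ai]-x[aj]);   f[aj] -= fgb*(x[ai]-x[aj]);  // same for y,z
 *
 * i.e. the chain-rule contribution of the Born-radius derivatives to the
 * forces, accumulated into f[] and the periodic shift forces fshift[].
 */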
#else
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE2 intrinsics available */