2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
97 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
98 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
105 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
106 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
107 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
108 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
111 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
114 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
117 __m128i ifour
= _mm_set1_epi32(4);
118 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
120 __m128 dummy_mask
,cutoff_mask
;
121 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one
= _mm_set1_ps(1.0);
123 __m128 two
= _mm_set1_ps(2.0);
129 jindex
= nlist
->jindex
;
131 shiftidx
= nlist
->shift
;
133 shiftvec
= fr
->shift_vec
[0];
134 fshift
= fr
->fshift
[0];
135 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
136 charge
= mdatoms
->chargeA
;
137 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
138 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
139 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
140 nvdwtype
= fr
->ntype
;
142 vdwtype
= mdatoms
->typeA
;
144 vftab
= kernel_data
->table_vdw
->data
;
145 vftabscale
= _mm_set1_ps(kernel_data
->table_vdw
->scale
);
147 /* Setup water-specific parameters */
148 inr
= nlist
->iinr
[0];
149 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
150 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
151 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
152 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
154 jq1
= _mm_set1_ps(charge
[inr
+1]);
155 jq2
= _mm_set1_ps(charge
[inr
+2]);
156 jq3
= _mm_set1_ps(charge
[inr
+3]);
157 vdwjidx0A
= 2*vdwtype
[inr
+0];
158 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
159 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
160 qq11
= _mm_mul_ps(iq1
,jq1
);
161 qq12
= _mm_mul_ps(iq1
,jq2
);
162 qq13
= _mm_mul_ps(iq1
,jq3
);
163 qq21
= _mm_mul_ps(iq2
,jq1
);
164 qq22
= _mm_mul_ps(iq2
,jq2
);
165 qq23
= _mm_mul_ps(iq2
,jq3
);
166 qq31
= _mm_mul_ps(iq3
,jq1
);
167 qq32
= _mm_mul_ps(iq3
,jq2
);
168 qq33
= _mm_mul_ps(iq3
,jq3
);
170 /* Avoid stupid compiler warnings */
171 jnrA
= jnrB
= jnrC
= jnrD
= 0;
180 for(iidx
=0;iidx
<4*DIM
;iidx
++)
185 /* Start outer loop over neighborlists */
186 for(iidx
=0; iidx
<nri
; iidx
++)
188 /* Load shift vector for this list */
189 i_shift_offset
= DIM
*shiftidx
[iidx
];
191 /* Load limits for loop over neighbors */
192 j_index_start
= jindex
[iidx
];
193 j_index_end
= jindex
[iidx
+1];
195 /* Get outer coordinate index */
197 i_coord_offset
= DIM
*inr
;
199 /* Load i particle coords and add shift vector */
200 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
201 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
203 fix0
= _mm_setzero_ps();
204 fiy0
= _mm_setzero_ps();
205 fiz0
= _mm_setzero_ps();
206 fix1
= _mm_setzero_ps();
207 fiy1
= _mm_setzero_ps();
208 fiz1
= _mm_setzero_ps();
209 fix2
= _mm_setzero_ps();
210 fiy2
= _mm_setzero_ps();
211 fiz2
= _mm_setzero_ps();
212 fix3
= _mm_setzero_ps();
213 fiy3
= _mm_setzero_ps();
214 fiz3
= _mm_setzero_ps();
216 /* Reset potential sums */
217 velecsum
= _mm_setzero_ps();
218 vvdwsum
= _mm_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA
= DIM
*jnrA
;
230 j_coord_offsetB
= DIM
*jnrB
;
231 j_coord_offsetC
= DIM
*jnrC
;
232 j_coord_offsetD
= DIM
*jnrD
;
234 /* load j atom coordinates */
235 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
236 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
237 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
238 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
240 /* Calculate displacement vector */
241 dx00
= _mm_sub_ps(ix0
,jx0
);
242 dy00
= _mm_sub_ps(iy0
,jy0
);
243 dz00
= _mm_sub_ps(iz0
,jz0
);
244 dx11
= _mm_sub_ps(ix1
,jx1
);
245 dy11
= _mm_sub_ps(iy1
,jy1
);
246 dz11
= _mm_sub_ps(iz1
,jz1
);
247 dx12
= _mm_sub_ps(ix1
,jx2
);
248 dy12
= _mm_sub_ps(iy1
,jy2
);
249 dz12
= _mm_sub_ps(iz1
,jz2
);
250 dx13
= _mm_sub_ps(ix1
,jx3
);
251 dy13
= _mm_sub_ps(iy1
,jy3
);
252 dz13
= _mm_sub_ps(iz1
,jz3
);
253 dx21
= _mm_sub_ps(ix2
,jx1
);
254 dy21
= _mm_sub_ps(iy2
,jy1
);
255 dz21
= _mm_sub_ps(iz2
,jz1
);
256 dx22
= _mm_sub_ps(ix2
,jx2
);
257 dy22
= _mm_sub_ps(iy2
,jy2
);
258 dz22
= _mm_sub_ps(iz2
,jz2
);
259 dx23
= _mm_sub_ps(ix2
,jx3
);
260 dy23
= _mm_sub_ps(iy2
,jy3
);
261 dz23
= _mm_sub_ps(iz2
,jz3
);
262 dx31
= _mm_sub_ps(ix3
,jx1
);
263 dy31
= _mm_sub_ps(iy3
,jy1
);
264 dz31
= _mm_sub_ps(iz3
,jz1
);
265 dx32
= _mm_sub_ps(ix3
,jx2
);
266 dy32
= _mm_sub_ps(iy3
,jy2
);
267 dz32
= _mm_sub_ps(iz3
,jz2
);
268 dx33
= _mm_sub_ps(ix3
,jx3
);
269 dy33
= _mm_sub_ps(iy3
,jy3
);
270 dz33
= _mm_sub_ps(iz3
,jz3
);
272 /* Calculate squared distance and things based on it */
273 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
274 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
275 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
276 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
277 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
278 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
279 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
280 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
281 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
282 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
284 rinv00
= sse2_invsqrt_f(rsq00
);
285 rinv11
= sse2_invsqrt_f(rsq11
);
286 rinv12
= sse2_invsqrt_f(rsq12
);
287 rinv13
= sse2_invsqrt_f(rsq13
);
288 rinv21
= sse2_invsqrt_f(rsq21
);
289 rinv22
= sse2_invsqrt_f(rsq22
);
290 rinv23
= sse2_invsqrt_f(rsq23
);
291 rinv31
= sse2_invsqrt_f(rsq31
);
292 rinv32
= sse2_invsqrt_f(rsq32
);
293 rinv33
= sse2_invsqrt_f(rsq33
);
295 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
296 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
297 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
298 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
299 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
300 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
301 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
302 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
303 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
305 fjx0
= _mm_setzero_ps();
306 fjy0
= _mm_setzero_ps();
307 fjz0
= _mm_setzero_ps();
308 fjx1
= _mm_setzero_ps();
309 fjy1
= _mm_setzero_ps();
310 fjz1
= _mm_setzero_ps();
311 fjx2
= _mm_setzero_ps();
312 fjy2
= _mm_setzero_ps();
313 fjz2
= _mm_setzero_ps();
314 fjx3
= _mm_setzero_ps();
315 fjy3
= _mm_setzero_ps();
316 fjz3
= _mm_setzero_ps();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 r00
= _mm_mul_ps(rsq00
,rinv00
);
324 /* Calculate table index by multiplying r with table scale and truncate to integer */
325 rt
= _mm_mul_ps(r00
,vftabscale
);
326 vfitab
= _mm_cvttps_epi32(rt
);
327 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
328 vfitab
= _mm_slli_epi32(vfitab
,3);
330 /* CUBIC SPLINE TABLE DISPERSION */
331 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
332 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
333 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
334 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
335 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
336 Heps
= _mm_mul_ps(vfeps
,H
);
337 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
338 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
339 vvdw6
= _mm_mul_ps(c6_00
,VV
);
340 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
341 fvdw6
= _mm_mul_ps(c6_00
,FF
);
343 /* CUBIC SPLINE TABLE REPULSION */
344 vfitab
= _mm_add_epi32(vfitab
,ifour
);
345 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
346 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
347 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
348 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
349 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
350 Heps
= _mm_mul_ps(vfeps
,H
);
351 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
352 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
353 vvdw12
= _mm_mul_ps(c12_00
,VV
);
354 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
355 fvdw12
= _mm_mul_ps(c12_00
,FF
);
356 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
357 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
364 /* Calculate temporary vectorial force */
365 tx
= _mm_mul_ps(fscal
,dx00
);
366 ty
= _mm_mul_ps(fscal
,dy00
);
367 tz
= _mm_mul_ps(fscal
,dz00
);
369 /* Update vectorial force */
370 fix0
= _mm_add_ps(fix0
,tx
);
371 fiy0
= _mm_add_ps(fiy0
,ty
);
372 fiz0
= _mm_add_ps(fiz0
,tz
);
374 fjx0
= _mm_add_ps(fjx0
,tx
);
375 fjy0
= _mm_add_ps(fjy0
,ty
);
376 fjz0
= _mm_add_ps(fjz0
,tz
);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 /* REACTION-FIELD ELECTROSTATICS */
383 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
384 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum
= _mm_add_ps(velecsum
,velec
);
391 /* Calculate temporary vectorial force */
392 tx
= _mm_mul_ps(fscal
,dx11
);
393 ty
= _mm_mul_ps(fscal
,dy11
);
394 tz
= _mm_mul_ps(fscal
,dz11
);
396 /* Update vectorial force */
397 fix1
= _mm_add_ps(fix1
,tx
);
398 fiy1
= _mm_add_ps(fiy1
,ty
);
399 fiz1
= _mm_add_ps(fiz1
,tz
);
401 fjx1
= _mm_add_ps(fjx1
,tx
);
402 fjy1
= _mm_add_ps(fjy1
,ty
);
403 fjz1
= _mm_add_ps(fjz1
,tz
);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 /* REACTION-FIELD ELECTROSTATICS */
410 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
411 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velecsum
= _mm_add_ps(velecsum
,velec
);
418 /* Calculate temporary vectorial force */
419 tx
= _mm_mul_ps(fscal
,dx12
);
420 ty
= _mm_mul_ps(fscal
,dy12
);
421 tz
= _mm_mul_ps(fscal
,dz12
);
423 /* Update vectorial force */
424 fix1
= _mm_add_ps(fix1
,tx
);
425 fiy1
= _mm_add_ps(fiy1
,ty
);
426 fiz1
= _mm_add_ps(fiz1
,tz
);
428 fjx2
= _mm_add_ps(fjx2
,tx
);
429 fjy2
= _mm_add_ps(fjy2
,ty
);
430 fjz2
= _mm_add_ps(fjz2
,tz
);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 /* REACTION-FIELD ELECTROSTATICS */
437 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
438 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velecsum
= _mm_add_ps(velecsum
,velec
);
445 /* Calculate temporary vectorial force */
446 tx
= _mm_mul_ps(fscal
,dx13
);
447 ty
= _mm_mul_ps(fscal
,dy13
);
448 tz
= _mm_mul_ps(fscal
,dz13
);
450 /* Update vectorial force */
451 fix1
= _mm_add_ps(fix1
,tx
);
452 fiy1
= _mm_add_ps(fiy1
,ty
);
453 fiz1
= _mm_add_ps(fiz1
,tz
);
455 fjx3
= _mm_add_ps(fjx3
,tx
);
456 fjy3
= _mm_add_ps(fjy3
,ty
);
457 fjz3
= _mm_add_ps(fjz3
,tz
);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 /* REACTION-FIELD ELECTROSTATICS */
464 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
465 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velecsum
= _mm_add_ps(velecsum
,velec
);
472 /* Calculate temporary vectorial force */
473 tx
= _mm_mul_ps(fscal
,dx21
);
474 ty
= _mm_mul_ps(fscal
,dy21
);
475 tz
= _mm_mul_ps(fscal
,dz21
);
477 /* Update vectorial force */
478 fix2
= _mm_add_ps(fix2
,tx
);
479 fiy2
= _mm_add_ps(fiy2
,ty
);
480 fiz2
= _mm_add_ps(fiz2
,tz
);
482 fjx1
= _mm_add_ps(fjx1
,tx
);
483 fjy1
= _mm_add_ps(fjy1
,ty
);
484 fjz1
= _mm_add_ps(fjz1
,tz
);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
492 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velecsum
= _mm_add_ps(velecsum
,velec
);
499 /* Calculate temporary vectorial force */
500 tx
= _mm_mul_ps(fscal
,dx22
);
501 ty
= _mm_mul_ps(fscal
,dy22
);
502 tz
= _mm_mul_ps(fscal
,dz22
);
504 /* Update vectorial force */
505 fix2
= _mm_add_ps(fix2
,tx
);
506 fiy2
= _mm_add_ps(fiy2
,ty
);
507 fiz2
= _mm_add_ps(fiz2
,tz
);
509 fjx2
= _mm_add_ps(fjx2
,tx
);
510 fjy2
= _mm_add_ps(fjy2
,ty
);
511 fjz2
= _mm_add_ps(fjz2
,tz
);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 /* REACTION-FIELD ELECTROSTATICS */
518 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
519 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velecsum
= _mm_add_ps(velecsum
,velec
);
526 /* Calculate temporary vectorial force */
527 tx
= _mm_mul_ps(fscal
,dx23
);
528 ty
= _mm_mul_ps(fscal
,dy23
);
529 tz
= _mm_mul_ps(fscal
,dz23
);
531 /* Update vectorial force */
532 fix2
= _mm_add_ps(fix2
,tx
);
533 fiy2
= _mm_add_ps(fiy2
,ty
);
534 fiz2
= _mm_add_ps(fiz2
,tz
);
536 fjx3
= _mm_add_ps(fjx3
,tx
);
537 fjy3
= _mm_add_ps(fjy3
,ty
);
538 fjz3
= _mm_add_ps(fjz3
,tz
);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 /* REACTION-FIELD ELECTROSTATICS */
545 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
546 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum
= _mm_add_ps(velecsum
,velec
);
553 /* Calculate temporary vectorial force */
554 tx
= _mm_mul_ps(fscal
,dx31
);
555 ty
= _mm_mul_ps(fscal
,dy31
);
556 tz
= _mm_mul_ps(fscal
,dz31
);
558 /* Update vectorial force */
559 fix3
= _mm_add_ps(fix3
,tx
);
560 fiy3
= _mm_add_ps(fiy3
,ty
);
561 fiz3
= _mm_add_ps(fiz3
,tz
);
563 fjx1
= _mm_add_ps(fjx1
,tx
);
564 fjy1
= _mm_add_ps(fjy1
,ty
);
565 fjz1
= _mm_add_ps(fjz1
,tz
);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 /* REACTION-FIELD ELECTROSTATICS */
572 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
573 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum
= _mm_add_ps(velecsum
,velec
);
580 /* Calculate temporary vectorial force */
581 tx
= _mm_mul_ps(fscal
,dx32
);
582 ty
= _mm_mul_ps(fscal
,dy32
);
583 tz
= _mm_mul_ps(fscal
,dz32
);
585 /* Update vectorial force */
586 fix3
= _mm_add_ps(fix3
,tx
);
587 fiy3
= _mm_add_ps(fiy3
,ty
);
588 fiz3
= _mm_add_ps(fiz3
,tz
);
590 fjx2
= _mm_add_ps(fjx2
,tx
);
591 fjy2
= _mm_add_ps(fjy2
,ty
);
592 fjz2
= _mm_add_ps(fjz2
,tz
);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 /* REACTION-FIELD ELECTROSTATICS */
599 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
600 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velecsum
= _mm_add_ps(velecsum
,velec
);
607 /* Calculate temporary vectorial force */
608 tx
= _mm_mul_ps(fscal
,dx33
);
609 ty
= _mm_mul_ps(fscal
,dy33
);
610 tz
= _mm_mul_ps(fscal
,dz33
);
612 /* Update vectorial force */
613 fix3
= _mm_add_ps(fix3
,tx
);
614 fiy3
= _mm_add_ps(fiy3
,ty
);
615 fiz3
= _mm_add_ps(fiz3
,tz
);
617 fjx3
= _mm_add_ps(fjx3
,tx
);
618 fjy3
= _mm_add_ps(fjy3
,ty
);
619 fjz3
= _mm_add_ps(fjz3
,tz
);
621 fjptrA
= f
+j_coord_offsetA
;
622 fjptrB
= f
+j_coord_offsetB
;
623 fjptrC
= f
+j_coord_offsetC
;
624 fjptrD
= f
+j_coord_offsetD
;
626 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
627 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
628 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
630 /* Inner loop uses 347 flops */
636 /* Get j neighbor index, and coordinate index */
637 jnrlistA
= jjnr
[jidx
];
638 jnrlistB
= jjnr
[jidx
+1];
639 jnrlistC
= jjnr
[jidx
+2];
640 jnrlistD
= jjnr
[jidx
+3];
641 /* Sign of each element will be negative for non-real atoms.
642 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
643 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
645 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
646 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
647 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
648 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
649 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
650 j_coord_offsetA
= DIM
*jnrA
;
651 j_coord_offsetB
= DIM
*jnrB
;
652 j_coord_offsetC
= DIM
*jnrC
;
653 j_coord_offsetD
= DIM
*jnrD
;
655 /* load j atom coordinates */
656 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
657 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
658 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
659 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
661 /* Calculate displacement vector */
662 dx00
= _mm_sub_ps(ix0
,jx0
);
663 dy00
= _mm_sub_ps(iy0
,jy0
);
664 dz00
= _mm_sub_ps(iz0
,jz0
);
665 dx11
= _mm_sub_ps(ix1
,jx1
);
666 dy11
= _mm_sub_ps(iy1
,jy1
);
667 dz11
= _mm_sub_ps(iz1
,jz1
);
668 dx12
= _mm_sub_ps(ix1
,jx2
);
669 dy12
= _mm_sub_ps(iy1
,jy2
);
670 dz12
= _mm_sub_ps(iz1
,jz2
);
671 dx13
= _mm_sub_ps(ix1
,jx3
);
672 dy13
= _mm_sub_ps(iy1
,jy3
);
673 dz13
= _mm_sub_ps(iz1
,jz3
);
674 dx21
= _mm_sub_ps(ix2
,jx1
);
675 dy21
= _mm_sub_ps(iy2
,jy1
);
676 dz21
= _mm_sub_ps(iz2
,jz1
);
677 dx22
= _mm_sub_ps(ix2
,jx2
);
678 dy22
= _mm_sub_ps(iy2
,jy2
);
679 dz22
= _mm_sub_ps(iz2
,jz2
);
680 dx23
= _mm_sub_ps(ix2
,jx3
);
681 dy23
= _mm_sub_ps(iy2
,jy3
);
682 dz23
= _mm_sub_ps(iz2
,jz3
);
683 dx31
= _mm_sub_ps(ix3
,jx1
);
684 dy31
= _mm_sub_ps(iy3
,jy1
);
685 dz31
= _mm_sub_ps(iz3
,jz1
);
686 dx32
= _mm_sub_ps(ix3
,jx2
);
687 dy32
= _mm_sub_ps(iy3
,jy2
);
688 dz32
= _mm_sub_ps(iz3
,jz2
);
689 dx33
= _mm_sub_ps(ix3
,jx3
);
690 dy33
= _mm_sub_ps(iy3
,jy3
);
691 dz33
= _mm_sub_ps(iz3
,jz3
);
693 /* Calculate squared distance and things based on it */
694 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
695 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
696 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
697 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
698 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
699 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
700 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
701 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
702 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
703 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
705 rinv00
= sse2_invsqrt_f(rsq00
);
706 rinv11
= sse2_invsqrt_f(rsq11
);
707 rinv12
= sse2_invsqrt_f(rsq12
);
708 rinv13
= sse2_invsqrt_f(rsq13
);
709 rinv21
= sse2_invsqrt_f(rsq21
);
710 rinv22
= sse2_invsqrt_f(rsq22
);
711 rinv23
= sse2_invsqrt_f(rsq23
);
712 rinv31
= sse2_invsqrt_f(rsq31
);
713 rinv32
= sse2_invsqrt_f(rsq32
);
714 rinv33
= sse2_invsqrt_f(rsq33
);
716 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
717 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
718 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
719 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
720 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
721 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
722 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
723 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
724 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
726 fjx0
= _mm_setzero_ps();
727 fjy0
= _mm_setzero_ps();
728 fjz0
= _mm_setzero_ps();
729 fjx1
= _mm_setzero_ps();
730 fjy1
= _mm_setzero_ps();
731 fjz1
= _mm_setzero_ps();
732 fjx2
= _mm_setzero_ps();
733 fjy2
= _mm_setzero_ps();
734 fjz2
= _mm_setzero_ps();
735 fjx3
= _mm_setzero_ps();
736 fjy3
= _mm_setzero_ps();
737 fjz3
= _mm_setzero_ps();
739 /**************************
740 * CALCULATE INTERACTIONS *
741 **************************/
743 r00
= _mm_mul_ps(rsq00
,rinv00
);
744 r00
= _mm_andnot_ps(dummy_mask
,r00
);
746 /* Calculate table index by multiplying r with table scale and truncate to integer */
747 rt
= _mm_mul_ps(r00
,vftabscale
);
748 vfitab
= _mm_cvttps_epi32(rt
);
749 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
750 vfitab
= _mm_slli_epi32(vfitab
,3);
752 /* CUBIC SPLINE TABLE DISPERSION */
753 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
754 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
755 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
756 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
757 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
758 Heps
= _mm_mul_ps(vfeps
,H
);
759 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
760 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
761 vvdw6
= _mm_mul_ps(c6_00
,VV
);
762 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
763 fvdw6
= _mm_mul_ps(c6_00
,FF
);
765 /* CUBIC SPLINE TABLE REPULSION */
766 vfitab
= _mm_add_epi32(vfitab
,ifour
);
767 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
768 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
769 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
770 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
771 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
772 Heps
= _mm_mul_ps(vfeps
,H
);
773 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
774 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
775 vvdw12
= _mm_mul_ps(c12_00
,VV
);
776 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
777 fvdw12
= _mm_mul_ps(c12_00
,FF
);
778 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
779 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
781 /* Update potential sum for this i atom from the interaction with this j atom. */
782 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
783 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
787 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
789 /* Calculate temporary vectorial force */
790 tx
= _mm_mul_ps(fscal
,dx00
);
791 ty
= _mm_mul_ps(fscal
,dy00
);
792 tz
= _mm_mul_ps(fscal
,dz00
);
794 /* Update vectorial force */
795 fix0
= _mm_add_ps(fix0
,tx
);
796 fiy0
= _mm_add_ps(fiy0
,ty
);
797 fiz0
= _mm_add_ps(fiz0
,tz
);
799 fjx0
= _mm_add_ps(fjx0
,tx
);
800 fjy0
= _mm_add_ps(fjy0
,ty
);
801 fjz0
= _mm_add_ps(fjz0
,tz
);
803 /**************************
804 * CALCULATE INTERACTIONS *
805 **************************/
807 /* REACTION-FIELD ELECTROSTATICS */
808 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
809 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec
= _mm_andnot_ps(dummy_mask
,velec
);
813 velecsum
= _mm_add_ps(velecsum
,velec
);
817 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
819 /* Calculate temporary vectorial force */
820 tx
= _mm_mul_ps(fscal
,dx11
);
821 ty
= _mm_mul_ps(fscal
,dy11
);
822 tz
= _mm_mul_ps(fscal
,dz11
);
824 /* Update vectorial force */
825 fix1
= _mm_add_ps(fix1
,tx
);
826 fiy1
= _mm_add_ps(fiy1
,ty
);
827 fiz1
= _mm_add_ps(fiz1
,tz
);
829 fjx1
= _mm_add_ps(fjx1
,tx
);
830 fjy1
= _mm_add_ps(fjy1
,ty
);
831 fjz1
= _mm_add_ps(fjz1
,tz
);
833 /**************************
834 * CALCULATE INTERACTIONS *
835 **************************/
837 /* REACTION-FIELD ELECTROSTATICS */
838 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
839 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec
= _mm_andnot_ps(dummy_mask
,velec
);
843 velecsum
= _mm_add_ps(velecsum
,velec
);
847 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
849 /* Calculate temporary vectorial force */
850 tx
= _mm_mul_ps(fscal
,dx12
);
851 ty
= _mm_mul_ps(fscal
,dy12
);
852 tz
= _mm_mul_ps(fscal
,dz12
);
854 /* Update vectorial force */
855 fix1
= _mm_add_ps(fix1
,tx
);
856 fiy1
= _mm_add_ps(fiy1
,ty
);
857 fiz1
= _mm_add_ps(fiz1
,tz
);
859 fjx2
= _mm_add_ps(fjx2
,tx
);
860 fjy2
= _mm_add_ps(fjy2
,ty
);
861 fjz2
= _mm_add_ps(fjz2
,tz
);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 /* REACTION-FIELD ELECTROSTATICS */
868 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
869 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec
= _mm_andnot_ps(dummy_mask
,velec
);
873 velecsum
= _mm_add_ps(velecsum
,velec
);
877 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
879 /* Calculate temporary vectorial force */
880 tx
= _mm_mul_ps(fscal
,dx13
);
881 ty
= _mm_mul_ps(fscal
,dy13
);
882 tz
= _mm_mul_ps(fscal
,dz13
);
884 /* Update vectorial force */
885 fix1
= _mm_add_ps(fix1
,tx
);
886 fiy1
= _mm_add_ps(fiy1
,ty
);
887 fiz1
= _mm_add_ps(fiz1
,tz
);
889 fjx3
= _mm_add_ps(fjx3
,tx
);
890 fjy3
= _mm_add_ps(fjy3
,ty
);
891 fjz3
= _mm_add_ps(fjz3
,tz
);
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 /* REACTION-FIELD ELECTROSTATICS */
898 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
899 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
901 /* Update potential sum for this i atom from the interaction with this j atom. */
902 velec
= _mm_andnot_ps(dummy_mask
,velec
);
903 velecsum
= _mm_add_ps(velecsum
,velec
);
907 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
909 /* Calculate temporary vectorial force */
910 tx
= _mm_mul_ps(fscal
,dx21
);
911 ty
= _mm_mul_ps(fscal
,dy21
);
912 tz
= _mm_mul_ps(fscal
,dz21
);
914 /* Update vectorial force */
915 fix2
= _mm_add_ps(fix2
,tx
);
916 fiy2
= _mm_add_ps(fiy2
,ty
);
917 fiz2
= _mm_add_ps(fiz2
,tz
);
919 fjx1
= _mm_add_ps(fjx1
,tx
);
920 fjy1
= _mm_add_ps(fjy1
,ty
);
921 fjz1
= _mm_add_ps(fjz1
,tz
);
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 /* REACTION-FIELD ELECTROSTATICS */
928 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
929 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
931 /* Update potential sum for this i atom from the interaction with this j atom. */
932 velec
= _mm_andnot_ps(dummy_mask
,velec
);
933 velecsum
= _mm_add_ps(velecsum
,velec
);
937 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
939 /* Calculate temporary vectorial force */
940 tx
= _mm_mul_ps(fscal
,dx22
);
941 ty
= _mm_mul_ps(fscal
,dy22
);
942 tz
= _mm_mul_ps(fscal
,dz22
);
944 /* Update vectorial force */
945 fix2
= _mm_add_ps(fix2
,tx
);
946 fiy2
= _mm_add_ps(fiy2
,ty
);
947 fiz2
= _mm_add_ps(fiz2
,tz
);
949 fjx2
= _mm_add_ps(fjx2
,tx
);
950 fjy2
= _mm_add_ps(fjy2
,ty
);
951 fjz2
= _mm_add_ps(fjz2
,tz
);
953 /**************************
954 * CALCULATE INTERACTIONS *
955 **************************/
957 /* REACTION-FIELD ELECTROSTATICS */
958 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
959 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
961 /* Update potential sum for this i atom from the interaction with this j atom. */
962 velec
= _mm_andnot_ps(dummy_mask
,velec
);
963 velecsum
= _mm_add_ps(velecsum
,velec
);
967 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
969 /* Calculate temporary vectorial force */
970 tx
= _mm_mul_ps(fscal
,dx23
);
971 ty
= _mm_mul_ps(fscal
,dy23
);
972 tz
= _mm_mul_ps(fscal
,dz23
);
974 /* Update vectorial force */
975 fix2
= _mm_add_ps(fix2
,tx
);
976 fiy2
= _mm_add_ps(fiy2
,ty
);
977 fiz2
= _mm_add_ps(fiz2
,tz
);
979 fjx3
= _mm_add_ps(fjx3
,tx
);
980 fjy3
= _mm_add_ps(fjy3
,ty
);
981 fjz3
= _mm_add_ps(fjz3
,tz
);
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 /* REACTION-FIELD ELECTROSTATICS */
988 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
989 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec
= _mm_andnot_ps(dummy_mask
,velec
);
993 velecsum
= _mm_add_ps(velecsum
,velec
);
997 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
999 /* Calculate temporary vectorial force */
1000 tx
= _mm_mul_ps(fscal
,dx31
);
1001 ty
= _mm_mul_ps(fscal
,dy31
);
1002 tz
= _mm_mul_ps(fscal
,dz31
);
1004 /* Update vectorial force */
1005 fix3
= _mm_add_ps(fix3
,tx
);
1006 fiy3
= _mm_add_ps(fiy3
,ty
);
1007 fiz3
= _mm_add_ps(fiz3
,tz
);
1009 fjx1
= _mm_add_ps(fjx1
,tx
);
1010 fjy1
= _mm_add_ps(fjy1
,ty
);
1011 fjz1
= _mm_add_ps(fjz1
,tz
);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 /* REACTION-FIELD ELECTROSTATICS */
1018 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
1019 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1021 /* Update potential sum for this i atom from the interaction with this j atom. */
1022 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1023 velecsum
= _mm_add_ps(velecsum
,velec
);
1027 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1029 /* Calculate temporary vectorial force */
1030 tx
= _mm_mul_ps(fscal
,dx32
);
1031 ty
= _mm_mul_ps(fscal
,dy32
);
1032 tz
= _mm_mul_ps(fscal
,dz32
);
1034 /* Update vectorial force */
1035 fix3
= _mm_add_ps(fix3
,tx
);
1036 fiy3
= _mm_add_ps(fiy3
,ty
);
1037 fiz3
= _mm_add_ps(fiz3
,tz
);
1039 fjx2
= _mm_add_ps(fjx2
,tx
);
1040 fjy2
= _mm_add_ps(fjy2
,ty
);
1041 fjz2
= _mm_add_ps(fjz2
,tz
);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 /* REACTION-FIELD ELECTROSTATICS */
1048 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
1049 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1053 velecsum
= _mm_add_ps(velecsum
,velec
);
1057 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1059 /* Calculate temporary vectorial force */
1060 tx
= _mm_mul_ps(fscal
,dx33
);
1061 ty
= _mm_mul_ps(fscal
,dy33
);
1062 tz
= _mm_mul_ps(fscal
,dz33
);
1064 /* Update vectorial force */
1065 fix3
= _mm_add_ps(fix3
,tx
);
1066 fiy3
= _mm_add_ps(fiy3
,ty
);
1067 fiz3
= _mm_add_ps(fiz3
,tz
);
1069 fjx3
= _mm_add_ps(fjx3
,tx
);
1070 fjy3
= _mm_add_ps(fjy3
,ty
);
1071 fjz3
= _mm_add_ps(fjz3
,tz
);
1073 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1074 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1075 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1076 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1078 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1079 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1080 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1082 /* Inner loop uses 348 flops */
1085 /* End of innermost loop */
1087 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1088 f
+i_coord_offset
,fshift
+i_shift_offset
);
1091 /* Update potential energies */
1092 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1093 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1095 /* Increment number of inner iterations */
1096 inneriter
+= j_index_end
- j_index_start
;
1098 /* Outer loop uses 26 flops */
1101 /* Increment number of outer iterations */
1104 /* Update outer/inner flops */
1106 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*348);
1109 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
1110 * Electrostatics interaction: ReactionField
1111 * VdW interaction: CubicSplineTable
1112 * Geometry: Water4-Water4
1113 * Calculate force/pot: Force
1116 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
1117 (t_nblist
* gmx_restrict nlist
,
1118 rvec
* gmx_restrict xx
,
1119 rvec
* gmx_restrict ff
,
1120 struct t_forcerec
* gmx_restrict fr
,
1121 t_mdatoms
* gmx_restrict mdatoms
,
1122 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1123 t_nrnb
* gmx_restrict nrnb
)
1125 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1126 * just 0 for non-waters.
1127 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1128 * jnr indices corresponding to data put in the four positions in the SIMD register.
1130 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1131 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1132 int jnrA
,jnrB
,jnrC
,jnrD
;
1133 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1134 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1135 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1136 real rcutoff_scalar
;
1137 real
*shiftvec
,*fshift
,*x
,*f
;
1138 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1139 real scratch
[4*DIM
];
1140 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1142 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1144 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1146 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1148 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1149 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1150 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1151 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1152 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1153 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1154 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1155 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1156 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1157 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1158 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1159 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1160 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1161 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1162 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1163 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1164 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1165 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1166 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1167 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1170 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1173 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1174 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1176 __m128i ifour
= _mm_set1_epi32(4);
1177 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1179 __m128 dummy_mask
,cutoff_mask
;
1180 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1181 __m128 one
= _mm_set1_ps(1.0);
1182 __m128 two
= _mm_set1_ps(2.0);
1188 jindex
= nlist
->jindex
;
1190 shiftidx
= nlist
->shift
;
1192 shiftvec
= fr
->shift_vec
[0];
1193 fshift
= fr
->fshift
[0];
1194 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1195 charge
= mdatoms
->chargeA
;
1196 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1197 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1198 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1199 nvdwtype
= fr
->ntype
;
1200 vdwparam
= fr
->nbfp
;
1201 vdwtype
= mdatoms
->typeA
;
1203 vftab
= kernel_data
->table_vdw
->data
;
1204 vftabscale
= _mm_set1_ps(kernel_data
->table_vdw
->scale
);
1206 /* Setup water-specific parameters */
1207 inr
= nlist
->iinr
[0];
1208 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1209 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1210 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1211 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1213 jq1
= _mm_set1_ps(charge
[inr
+1]);
1214 jq2
= _mm_set1_ps(charge
[inr
+2]);
1215 jq3
= _mm_set1_ps(charge
[inr
+3]);
1216 vdwjidx0A
= 2*vdwtype
[inr
+0];
1217 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1218 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1219 qq11
= _mm_mul_ps(iq1
,jq1
);
1220 qq12
= _mm_mul_ps(iq1
,jq2
);
1221 qq13
= _mm_mul_ps(iq1
,jq3
);
1222 qq21
= _mm_mul_ps(iq2
,jq1
);
1223 qq22
= _mm_mul_ps(iq2
,jq2
);
1224 qq23
= _mm_mul_ps(iq2
,jq3
);
1225 qq31
= _mm_mul_ps(iq3
,jq1
);
1226 qq32
= _mm_mul_ps(iq3
,jq2
);
1227 qq33
= _mm_mul_ps(iq3
,jq3
);
1229 /* Avoid stupid compiler warnings */
1230 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1231 j_coord_offsetA
= 0;
1232 j_coord_offsetB
= 0;
1233 j_coord_offsetC
= 0;
1234 j_coord_offsetD
= 0;
1239 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1241 scratch
[iidx
] = 0.0;
1244 /* Start outer loop over neighborlists */
1245 for(iidx
=0; iidx
<nri
; iidx
++)
1247 /* Load shift vector for this list */
1248 i_shift_offset
= DIM
*shiftidx
[iidx
];
1250 /* Load limits for loop over neighbors */
1251 j_index_start
= jindex
[iidx
];
1252 j_index_end
= jindex
[iidx
+1];
1254 /* Get outer coordinate index */
1256 i_coord_offset
= DIM
*inr
;
1258 /* Load i particle coords and add shift vector */
1259 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1260 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1262 fix0
= _mm_setzero_ps();
1263 fiy0
= _mm_setzero_ps();
1264 fiz0
= _mm_setzero_ps();
1265 fix1
= _mm_setzero_ps();
1266 fiy1
= _mm_setzero_ps();
1267 fiz1
= _mm_setzero_ps();
1268 fix2
= _mm_setzero_ps();
1269 fiy2
= _mm_setzero_ps();
1270 fiz2
= _mm_setzero_ps();
1271 fix3
= _mm_setzero_ps();
1272 fiy3
= _mm_setzero_ps();
1273 fiz3
= _mm_setzero_ps();
1275 /* Start inner kernel loop */
1276 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1279 /* Get j neighbor index, and coordinate index */
1281 jnrB
= jjnr
[jidx
+1];
1282 jnrC
= jjnr
[jidx
+2];
1283 jnrD
= jjnr
[jidx
+3];
1284 j_coord_offsetA
= DIM
*jnrA
;
1285 j_coord_offsetB
= DIM
*jnrB
;
1286 j_coord_offsetC
= DIM
*jnrC
;
1287 j_coord_offsetD
= DIM
*jnrD
;
1289 /* load j atom coordinates */
1290 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1291 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1292 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1293 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1295 /* Calculate displacement vector */
1296 dx00
= _mm_sub_ps(ix0
,jx0
);
1297 dy00
= _mm_sub_ps(iy0
,jy0
);
1298 dz00
= _mm_sub_ps(iz0
,jz0
);
1299 dx11
= _mm_sub_ps(ix1
,jx1
);
1300 dy11
= _mm_sub_ps(iy1
,jy1
);
1301 dz11
= _mm_sub_ps(iz1
,jz1
);
1302 dx12
= _mm_sub_ps(ix1
,jx2
);
1303 dy12
= _mm_sub_ps(iy1
,jy2
);
1304 dz12
= _mm_sub_ps(iz1
,jz2
);
1305 dx13
= _mm_sub_ps(ix1
,jx3
);
1306 dy13
= _mm_sub_ps(iy1
,jy3
);
1307 dz13
= _mm_sub_ps(iz1
,jz3
);
1308 dx21
= _mm_sub_ps(ix2
,jx1
);
1309 dy21
= _mm_sub_ps(iy2
,jy1
);
1310 dz21
= _mm_sub_ps(iz2
,jz1
);
1311 dx22
= _mm_sub_ps(ix2
,jx2
);
1312 dy22
= _mm_sub_ps(iy2
,jy2
);
1313 dz22
= _mm_sub_ps(iz2
,jz2
);
1314 dx23
= _mm_sub_ps(ix2
,jx3
);
1315 dy23
= _mm_sub_ps(iy2
,jy3
);
1316 dz23
= _mm_sub_ps(iz2
,jz3
);
1317 dx31
= _mm_sub_ps(ix3
,jx1
);
1318 dy31
= _mm_sub_ps(iy3
,jy1
);
1319 dz31
= _mm_sub_ps(iz3
,jz1
);
1320 dx32
= _mm_sub_ps(ix3
,jx2
);
1321 dy32
= _mm_sub_ps(iy3
,jy2
);
1322 dz32
= _mm_sub_ps(iz3
,jz2
);
1323 dx33
= _mm_sub_ps(ix3
,jx3
);
1324 dy33
= _mm_sub_ps(iy3
,jy3
);
1325 dz33
= _mm_sub_ps(iz3
,jz3
);
1327 /* Calculate squared distance and things based on it */
1328 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1329 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1330 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1331 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1332 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1333 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1334 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1335 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1336 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1337 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1339 rinv00
= sse2_invsqrt_f(rsq00
);
1340 rinv11
= sse2_invsqrt_f(rsq11
);
1341 rinv12
= sse2_invsqrt_f(rsq12
);
1342 rinv13
= sse2_invsqrt_f(rsq13
);
1343 rinv21
= sse2_invsqrt_f(rsq21
);
1344 rinv22
= sse2_invsqrt_f(rsq22
);
1345 rinv23
= sse2_invsqrt_f(rsq23
);
1346 rinv31
= sse2_invsqrt_f(rsq31
);
1347 rinv32
= sse2_invsqrt_f(rsq32
);
1348 rinv33
= sse2_invsqrt_f(rsq33
);
1350 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1351 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1352 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1353 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1354 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1355 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1356 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1357 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1358 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1360 fjx0
= _mm_setzero_ps();
1361 fjy0
= _mm_setzero_ps();
1362 fjz0
= _mm_setzero_ps();
1363 fjx1
= _mm_setzero_ps();
1364 fjy1
= _mm_setzero_ps();
1365 fjz1
= _mm_setzero_ps();
1366 fjx2
= _mm_setzero_ps();
1367 fjy2
= _mm_setzero_ps();
1368 fjz2
= _mm_setzero_ps();
1369 fjx3
= _mm_setzero_ps();
1370 fjy3
= _mm_setzero_ps();
1371 fjz3
= _mm_setzero_ps();
1373 /**************************
1374 * CALCULATE INTERACTIONS *
1375 **************************/
1377 r00
= _mm_mul_ps(rsq00
,rinv00
);
1379 /* Calculate table index by multiplying r with table scale and truncate to integer */
1380 rt
= _mm_mul_ps(r00
,vftabscale
);
1381 vfitab
= _mm_cvttps_epi32(rt
);
1382 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1383 vfitab
= _mm_slli_epi32(vfitab
,3);
1385 /* CUBIC SPLINE TABLE DISPERSION */
1386 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1387 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1388 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1389 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1390 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1391 Heps
= _mm_mul_ps(vfeps
,H
);
1392 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1393 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1394 fvdw6
= _mm_mul_ps(c6_00
,FF
);
1396 /* CUBIC SPLINE TABLE REPULSION */
1397 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1398 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1399 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1400 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1401 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1402 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1403 Heps
= _mm_mul_ps(vfeps
,H
);
1404 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1405 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1406 fvdw12
= _mm_mul_ps(c12_00
,FF
);
1407 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
1411 /* Calculate temporary vectorial force */
1412 tx
= _mm_mul_ps(fscal
,dx00
);
1413 ty
= _mm_mul_ps(fscal
,dy00
);
1414 tz
= _mm_mul_ps(fscal
,dz00
);
1416 /* Update vectorial force */
1417 fix0
= _mm_add_ps(fix0
,tx
);
1418 fiy0
= _mm_add_ps(fiy0
,ty
);
1419 fiz0
= _mm_add_ps(fiz0
,tz
);
1421 fjx0
= _mm_add_ps(fjx0
,tx
);
1422 fjy0
= _mm_add_ps(fjy0
,ty
);
1423 fjz0
= _mm_add_ps(fjz0
,tz
);
1425 /**************************
1426 * CALCULATE INTERACTIONS *
1427 **************************/
1429 /* REACTION-FIELD ELECTROSTATICS */
1430 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1434 /* Calculate temporary vectorial force */
1435 tx
= _mm_mul_ps(fscal
,dx11
);
1436 ty
= _mm_mul_ps(fscal
,dy11
);
1437 tz
= _mm_mul_ps(fscal
,dz11
);
1439 /* Update vectorial force */
1440 fix1
= _mm_add_ps(fix1
,tx
);
1441 fiy1
= _mm_add_ps(fiy1
,ty
);
1442 fiz1
= _mm_add_ps(fiz1
,tz
);
1444 fjx1
= _mm_add_ps(fjx1
,tx
);
1445 fjy1
= _mm_add_ps(fjy1
,ty
);
1446 fjz1
= _mm_add_ps(fjz1
,tz
);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 /* REACTION-FIELD ELECTROSTATICS */
1453 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1457 /* Calculate temporary vectorial force */
1458 tx
= _mm_mul_ps(fscal
,dx12
);
1459 ty
= _mm_mul_ps(fscal
,dy12
);
1460 tz
= _mm_mul_ps(fscal
,dz12
);
1462 /* Update vectorial force */
1463 fix1
= _mm_add_ps(fix1
,tx
);
1464 fiy1
= _mm_add_ps(fiy1
,ty
);
1465 fiz1
= _mm_add_ps(fiz1
,tz
);
1467 fjx2
= _mm_add_ps(fjx2
,tx
);
1468 fjy2
= _mm_add_ps(fjy2
,ty
);
1469 fjz2
= _mm_add_ps(fjz2
,tz
);
1471 /**************************
1472 * CALCULATE INTERACTIONS *
1473 **************************/
1475 /* REACTION-FIELD ELECTROSTATICS */
1476 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
1480 /* Calculate temporary vectorial force */
1481 tx
= _mm_mul_ps(fscal
,dx13
);
1482 ty
= _mm_mul_ps(fscal
,dy13
);
1483 tz
= _mm_mul_ps(fscal
,dz13
);
1485 /* Update vectorial force */
1486 fix1
= _mm_add_ps(fix1
,tx
);
1487 fiy1
= _mm_add_ps(fiy1
,ty
);
1488 fiz1
= _mm_add_ps(fiz1
,tz
);
1490 fjx3
= _mm_add_ps(fjx3
,tx
);
1491 fjy3
= _mm_add_ps(fjy3
,ty
);
1492 fjz3
= _mm_add_ps(fjz3
,tz
);
1494 /**************************
1495 * CALCULATE INTERACTIONS *
1496 **************************/
1498 /* REACTION-FIELD ELECTROSTATICS */
1499 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1503 /* Calculate temporary vectorial force */
1504 tx
= _mm_mul_ps(fscal
,dx21
);
1505 ty
= _mm_mul_ps(fscal
,dy21
);
1506 tz
= _mm_mul_ps(fscal
,dz21
);
1508 /* Update vectorial force */
1509 fix2
= _mm_add_ps(fix2
,tx
);
1510 fiy2
= _mm_add_ps(fiy2
,ty
);
1511 fiz2
= _mm_add_ps(fiz2
,tz
);
1513 fjx1
= _mm_add_ps(fjx1
,tx
);
1514 fjy1
= _mm_add_ps(fjy1
,ty
);
1515 fjz1
= _mm_add_ps(fjz1
,tz
);
1517 /**************************
1518 * CALCULATE INTERACTIONS *
1519 **************************/
1521 /* REACTION-FIELD ELECTROSTATICS */
1522 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1526 /* Calculate temporary vectorial force */
1527 tx
= _mm_mul_ps(fscal
,dx22
);
1528 ty
= _mm_mul_ps(fscal
,dy22
);
1529 tz
= _mm_mul_ps(fscal
,dz22
);
1531 /* Update vectorial force */
1532 fix2
= _mm_add_ps(fix2
,tx
);
1533 fiy2
= _mm_add_ps(fiy2
,ty
);
1534 fiz2
= _mm_add_ps(fiz2
,tz
);
1536 fjx2
= _mm_add_ps(fjx2
,tx
);
1537 fjy2
= _mm_add_ps(fjy2
,ty
);
1538 fjz2
= _mm_add_ps(fjz2
,tz
);
1540 /**************************
1541 * CALCULATE INTERACTIONS *
1542 **************************/
1544 /* REACTION-FIELD ELECTROSTATICS */
1545 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1549 /* Calculate temporary vectorial force */
1550 tx
= _mm_mul_ps(fscal
,dx23
);
1551 ty
= _mm_mul_ps(fscal
,dy23
);
1552 tz
= _mm_mul_ps(fscal
,dz23
);
1554 /* Update vectorial force */
1555 fix2
= _mm_add_ps(fix2
,tx
);
1556 fiy2
= _mm_add_ps(fiy2
,ty
);
1557 fiz2
= _mm_add_ps(fiz2
,tz
);
1559 fjx3
= _mm_add_ps(fjx3
,tx
);
1560 fjy3
= _mm_add_ps(fjy3
,ty
);
1561 fjz3
= _mm_add_ps(fjz3
,tz
);
1563 /**************************
1564 * CALCULATE INTERACTIONS *
1565 **************************/
1567 /* REACTION-FIELD ELECTROSTATICS */
1568 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1572 /* Calculate temporary vectorial force */
1573 tx
= _mm_mul_ps(fscal
,dx31
);
1574 ty
= _mm_mul_ps(fscal
,dy31
);
1575 tz
= _mm_mul_ps(fscal
,dz31
);
1577 /* Update vectorial force */
1578 fix3
= _mm_add_ps(fix3
,tx
);
1579 fiy3
= _mm_add_ps(fiy3
,ty
);
1580 fiz3
= _mm_add_ps(fiz3
,tz
);
1582 fjx1
= _mm_add_ps(fjx1
,tx
);
1583 fjy1
= _mm_add_ps(fjy1
,ty
);
1584 fjz1
= _mm_add_ps(fjz1
,tz
);
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 /* REACTION-FIELD ELECTROSTATICS */
1591 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1595 /* Calculate temporary vectorial force */
1596 tx
= _mm_mul_ps(fscal
,dx32
);
1597 ty
= _mm_mul_ps(fscal
,dy32
);
1598 tz
= _mm_mul_ps(fscal
,dz32
);
1600 /* Update vectorial force */
1601 fix3
= _mm_add_ps(fix3
,tx
);
1602 fiy3
= _mm_add_ps(fiy3
,ty
);
1603 fiz3
= _mm_add_ps(fiz3
,tz
);
1605 fjx2
= _mm_add_ps(fjx2
,tx
);
1606 fjy2
= _mm_add_ps(fjy2
,ty
);
1607 fjz2
= _mm_add_ps(fjz2
,tz
);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 /* REACTION-FIELD ELECTROSTATICS */
1614 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1618 /* Calculate temporary vectorial force */
1619 tx
= _mm_mul_ps(fscal
,dx33
);
1620 ty
= _mm_mul_ps(fscal
,dy33
);
1621 tz
= _mm_mul_ps(fscal
,dz33
);
1623 /* Update vectorial force */
1624 fix3
= _mm_add_ps(fix3
,tx
);
1625 fiy3
= _mm_add_ps(fiy3
,ty
);
1626 fiz3
= _mm_add_ps(fiz3
,tz
);
1628 fjx3
= _mm_add_ps(fjx3
,tx
);
1629 fjy3
= _mm_add_ps(fjy3
,ty
);
1630 fjz3
= _mm_add_ps(fjz3
,tz
);
1632 fjptrA
= f
+j_coord_offsetA
;
1633 fjptrB
= f
+j_coord_offsetB
;
1634 fjptrC
= f
+j_coord_offsetC
;
1635 fjptrD
= f
+j_coord_offsetD
;
1637 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1638 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1639 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1641 /* Inner loop uses 294 flops */
1644 if(jidx
<j_index_end
)
1647 /* Get j neighbor index, and coordinate index */
1648 jnrlistA
= jjnr
[jidx
];
1649 jnrlistB
= jjnr
[jidx
+1];
1650 jnrlistC
= jjnr
[jidx
+2];
1651 jnrlistD
= jjnr
[jidx
+3];
1652 /* Sign of each element will be negative for non-real atoms.
1653 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1654 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1656 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1657 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1658 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1659 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1660 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1661 j_coord_offsetA
= DIM
*jnrA
;
1662 j_coord_offsetB
= DIM
*jnrB
;
1663 j_coord_offsetC
= DIM
*jnrC
;
1664 j_coord_offsetD
= DIM
*jnrD
;
1666 /* load j atom coordinates */
1667 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1668 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1669 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1670 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1672 /* Calculate displacement vector */
1673 dx00
= _mm_sub_ps(ix0
,jx0
);
1674 dy00
= _mm_sub_ps(iy0
,jy0
);
1675 dz00
= _mm_sub_ps(iz0
,jz0
);
1676 dx11
= _mm_sub_ps(ix1
,jx1
);
1677 dy11
= _mm_sub_ps(iy1
,jy1
);
1678 dz11
= _mm_sub_ps(iz1
,jz1
);
1679 dx12
= _mm_sub_ps(ix1
,jx2
);
1680 dy12
= _mm_sub_ps(iy1
,jy2
);
1681 dz12
= _mm_sub_ps(iz1
,jz2
);
1682 dx13
= _mm_sub_ps(ix1
,jx3
);
1683 dy13
= _mm_sub_ps(iy1
,jy3
);
1684 dz13
= _mm_sub_ps(iz1
,jz3
);
1685 dx21
= _mm_sub_ps(ix2
,jx1
);
1686 dy21
= _mm_sub_ps(iy2
,jy1
);
1687 dz21
= _mm_sub_ps(iz2
,jz1
);
1688 dx22
= _mm_sub_ps(ix2
,jx2
);
1689 dy22
= _mm_sub_ps(iy2
,jy2
);
1690 dz22
= _mm_sub_ps(iz2
,jz2
);
1691 dx23
= _mm_sub_ps(ix2
,jx3
);
1692 dy23
= _mm_sub_ps(iy2
,jy3
);
1693 dz23
= _mm_sub_ps(iz2
,jz3
);
1694 dx31
= _mm_sub_ps(ix3
,jx1
);
1695 dy31
= _mm_sub_ps(iy3
,jy1
);
1696 dz31
= _mm_sub_ps(iz3
,jz1
);
1697 dx32
= _mm_sub_ps(ix3
,jx2
);
1698 dy32
= _mm_sub_ps(iy3
,jy2
);
1699 dz32
= _mm_sub_ps(iz3
,jz2
);
1700 dx33
= _mm_sub_ps(ix3
,jx3
);
1701 dy33
= _mm_sub_ps(iy3
,jy3
);
1702 dz33
= _mm_sub_ps(iz3
,jz3
);
1704 /* Calculate squared distance and things based on it */
1705 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1706 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1707 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1708 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1709 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1710 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1711 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1712 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1713 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1714 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1716 rinv00
= sse2_invsqrt_f(rsq00
);
1717 rinv11
= sse2_invsqrt_f(rsq11
);
1718 rinv12
= sse2_invsqrt_f(rsq12
);
1719 rinv13
= sse2_invsqrt_f(rsq13
);
1720 rinv21
= sse2_invsqrt_f(rsq21
);
1721 rinv22
= sse2_invsqrt_f(rsq22
);
1722 rinv23
= sse2_invsqrt_f(rsq23
);
1723 rinv31
= sse2_invsqrt_f(rsq31
);
1724 rinv32
= sse2_invsqrt_f(rsq32
);
1725 rinv33
= sse2_invsqrt_f(rsq33
);
1727 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1728 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1729 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1730 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1731 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1732 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1733 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1734 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1735 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1737 fjx0
= _mm_setzero_ps();
1738 fjy0
= _mm_setzero_ps();
1739 fjz0
= _mm_setzero_ps();
1740 fjx1
= _mm_setzero_ps();
1741 fjy1
= _mm_setzero_ps();
1742 fjz1
= _mm_setzero_ps();
1743 fjx2
= _mm_setzero_ps();
1744 fjy2
= _mm_setzero_ps();
1745 fjz2
= _mm_setzero_ps();
1746 fjx3
= _mm_setzero_ps();
1747 fjy3
= _mm_setzero_ps();
1748 fjz3
= _mm_setzero_ps();
1750 /**************************
1751 * CALCULATE INTERACTIONS *
1752 **************************/
1754 r00
= _mm_mul_ps(rsq00
,rinv00
);
1755 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1757 /* Calculate table index by multiplying r with table scale and truncate to integer */
1758 rt
= _mm_mul_ps(r00
,vftabscale
);
1759 vfitab
= _mm_cvttps_epi32(rt
);
1760 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1761 vfitab
= _mm_slli_epi32(vfitab
,3);
1763 /* CUBIC SPLINE TABLE DISPERSION */
1764 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1765 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1766 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1767 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1768 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1769 Heps
= _mm_mul_ps(vfeps
,H
);
1770 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1771 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1772 fvdw6
= _mm_mul_ps(c6_00
,FF
);
1774 /* CUBIC SPLINE TABLE REPULSION */
1775 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1776 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1777 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1778 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1779 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1780 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1781 Heps
= _mm_mul_ps(vfeps
,H
);
1782 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1783 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1784 fvdw12
= _mm_mul_ps(c12_00
,FF
);
1785 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
1789 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1791 /* Calculate temporary vectorial force */
1792 tx
= _mm_mul_ps(fscal
,dx00
);
1793 ty
= _mm_mul_ps(fscal
,dy00
);
1794 tz
= _mm_mul_ps(fscal
,dz00
);
1796 /* Update vectorial force */
1797 fix0
= _mm_add_ps(fix0
,tx
);
1798 fiy0
= _mm_add_ps(fiy0
,ty
);
1799 fiz0
= _mm_add_ps(fiz0
,tz
);
1801 fjx0
= _mm_add_ps(fjx0
,tx
);
1802 fjy0
= _mm_add_ps(fjy0
,ty
);
1803 fjz0
= _mm_add_ps(fjz0
,tz
);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 /* REACTION-FIELD ELECTROSTATICS */
1810 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1814 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1816 /* Calculate temporary vectorial force */
1817 tx
= _mm_mul_ps(fscal
,dx11
);
1818 ty
= _mm_mul_ps(fscal
,dy11
);
1819 tz
= _mm_mul_ps(fscal
,dz11
);
1821 /* Update vectorial force */
1822 fix1
= _mm_add_ps(fix1
,tx
);
1823 fiy1
= _mm_add_ps(fiy1
,ty
);
1824 fiz1
= _mm_add_ps(fiz1
,tz
);
1826 fjx1
= _mm_add_ps(fjx1
,tx
);
1827 fjy1
= _mm_add_ps(fjy1
,ty
);
1828 fjz1
= _mm_add_ps(fjz1
,tz
);
1830 /**************************
1831 * CALCULATE INTERACTIONS *
1832 **************************/
1834 /* REACTION-FIELD ELECTROSTATICS */
1835 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1839 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1841 /* Calculate temporary vectorial force */
1842 tx
= _mm_mul_ps(fscal
,dx12
);
1843 ty
= _mm_mul_ps(fscal
,dy12
);
1844 tz
= _mm_mul_ps(fscal
,dz12
);
1846 /* Update vectorial force */
1847 fix1
= _mm_add_ps(fix1
,tx
);
1848 fiy1
= _mm_add_ps(fiy1
,ty
);
1849 fiz1
= _mm_add_ps(fiz1
,tz
);
1851 fjx2
= _mm_add_ps(fjx2
,tx
);
1852 fjy2
= _mm_add_ps(fjy2
,ty
);
1853 fjz2
= _mm_add_ps(fjz2
,tz
);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 /* REACTION-FIELD ELECTROSTATICS */
1860 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
1864 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1866 /* Calculate temporary vectorial force */
1867 tx
= _mm_mul_ps(fscal
,dx13
);
1868 ty
= _mm_mul_ps(fscal
,dy13
);
1869 tz
= _mm_mul_ps(fscal
,dz13
);
1871 /* Update vectorial force */
1872 fix1
= _mm_add_ps(fix1
,tx
);
1873 fiy1
= _mm_add_ps(fiy1
,ty
);
1874 fiz1
= _mm_add_ps(fiz1
,tz
);
1876 fjx3
= _mm_add_ps(fjx3
,tx
);
1877 fjy3
= _mm_add_ps(fjy3
,ty
);
1878 fjz3
= _mm_add_ps(fjz3
,tz
);
1880 /**************************
1881 * CALCULATE INTERACTIONS *
1882 **************************/
1884 /* REACTION-FIELD ELECTROSTATICS */
1885 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1889 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1891 /* Calculate temporary vectorial force */
1892 tx
= _mm_mul_ps(fscal
,dx21
);
1893 ty
= _mm_mul_ps(fscal
,dy21
);
1894 tz
= _mm_mul_ps(fscal
,dz21
);
1896 /* Update vectorial force */
1897 fix2
= _mm_add_ps(fix2
,tx
);
1898 fiy2
= _mm_add_ps(fiy2
,ty
);
1899 fiz2
= _mm_add_ps(fiz2
,tz
);
1901 fjx1
= _mm_add_ps(fjx1
,tx
);
1902 fjy1
= _mm_add_ps(fjy1
,ty
);
1903 fjz1
= _mm_add_ps(fjz1
,tz
);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 /* REACTION-FIELD ELECTROSTATICS */
1910 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1914 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1916 /* Calculate temporary vectorial force */
1917 tx
= _mm_mul_ps(fscal
,dx22
);
1918 ty
= _mm_mul_ps(fscal
,dy22
);
1919 tz
= _mm_mul_ps(fscal
,dz22
);
1921 /* Update vectorial force */
1922 fix2
= _mm_add_ps(fix2
,tx
);
1923 fiy2
= _mm_add_ps(fiy2
,ty
);
1924 fiz2
= _mm_add_ps(fiz2
,tz
);
1926 fjx2
= _mm_add_ps(fjx2
,tx
);
1927 fjy2
= _mm_add_ps(fjy2
,ty
);
1928 fjz2
= _mm_add_ps(fjz2
,tz
);
1930 /**************************
1931 * CALCULATE INTERACTIONS *
1932 **************************/
1934 /* REACTION-FIELD ELECTROSTATICS */
1935 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1939 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1941 /* Calculate temporary vectorial force */
1942 tx
= _mm_mul_ps(fscal
,dx23
);
1943 ty
= _mm_mul_ps(fscal
,dy23
);
1944 tz
= _mm_mul_ps(fscal
,dz23
);
1946 /* Update vectorial force */
1947 fix2
= _mm_add_ps(fix2
,tx
);
1948 fiy2
= _mm_add_ps(fiy2
,ty
);
1949 fiz2
= _mm_add_ps(fiz2
,tz
);
1951 fjx3
= _mm_add_ps(fjx3
,tx
);
1952 fjy3
= _mm_add_ps(fjy3
,ty
);
1953 fjz3
= _mm_add_ps(fjz3
,tz
);
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 /* REACTION-FIELD ELECTROSTATICS */
1960 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1964 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1966 /* Calculate temporary vectorial force */
1967 tx
= _mm_mul_ps(fscal
,dx31
);
1968 ty
= _mm_mul_ps(fscal
,dy31
);
1969 tz
= _mm_mul_ps(fscal
,dz31
);
1971 /* Update vectorial force */
1972 fix3
= _mm_add_ps(fix3
,tx
);
1973 fiy3
= _mm_add_ps(fiy3
,ty
);
1974 fiz3
= _mm_add_ps(fiz3
,tz
);
1976 fjx1
= _mm_add_ps(fjx1
,tx
);
1977 fjy1
= _mm_add_ps(fjy1
,ty
);
1978 fjz1
= _mm_add_ps(fjz1
,tz
);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 /* REACTION-FIELD ELECTROSTATICS */
1985 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1989 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1991 /* Calculate temporary vectorial force */
1992 tx
= _mm_mul_ps(fscal
,dx32
);
1993 ty
= _mm_mul_ps(fscal
,dy32
);
1994 tz
= _mm_mul_ps(fscal
,dz32
);
1996 /* Update vectorial force */
1997 fix3
= _mm_add_ps(fix3
,tx
);
1998 fiy3
= _mm_add_ps(fiy3
,ty
);
1999 fiz3
= _mm_add_ps(fiz3
,tz
);
2001 fjx2
= _mm_add_ps(fjx2
,tx
);
2002 fjy2
= _mm_add_ps(fjy2
,ty
);
2003 fjz2
= _mm_add_ps(fjz2
,tz
);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 /* REACTION-FIELD ELECTROSTATICS */
2010 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
2014 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2016 /* Calculate temporary vectorial force */
2017 tx
= _mm_mul_ps(fscal
,dx33
);
2018 ty
= _mm_mul_ps(fscal
,dy33
);
2019 tz
= _mm_mul_ps(fscal
,dz33
);
2021 /* Update vectorial force */
2022 fix3
= _mm_add_ps(fix3
,tx
);
2023 fiy3
= _mm_add_ps(fiy3
,ty
);
2024 fiz3
= _mm_add_ps(fiz3
,tz
);
2026 fjx3
= _mm_add_ps(fjx3
,tx
);
2027 fjy3
= _mm_add_ps(fjy3
,ty
);
2028 fjz3
= _mm_add_ps(fjz3
,tz
);
2030 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2031 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2032 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2033 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2035 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2036 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2037 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2039 /* Inner loop uses 295 flops */
2042 /* End of innermost loop */
2044 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2045 f
+i_coord_offset
,fshift
+i_shift_offset
);
2047 /* Increment number of inner iterations */
2048 inneriter
+= j_index_end
- j_index_start
;
2050 /* Outer loop uses 24 flops */
2053 /* Increment number of outer iterations */
2056 /* Update outer/inner flops */
2058 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*295);