2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse4_1_double.h"
49 #include "kernelutil_x86_sse4_1_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_double
53 * Electrostatics interaction: Coulomb
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
116 __m128i ifour
= _mm_set1_epi32(4);
117 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
119 __m128d dummy_mask
,cutoff_mask
;
120 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121 __m128d one
= _mm_set1_pd(1.0);
122 __m128d two
= _mm_set1_pd(2.0);
128 jindex
= nlist
->jindex
;
130 shiftidx
= nlist
->shift
;
132 shiftvec
= fr
->shift_vec
[0];
133 fshift
= fr
->fshift
[0];
134 facel
= _mm_set1_pd(fr
->epsfac
);
135 charge
= mdatoms
->chargeA
;
136 nvdwtype
= fr
->ntype
;
138 vdwtype
= mdatoms
->typeA
;
140 vftab
= kernel_data
->table_vdw
->data
;
141 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
143 /* Setup water-specific parameters */
144 inr
= nlist
->iinr
[0];
145 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
146 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
147 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
148 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
150 jq1
= _mm_set1_pd(charge
[inr
+1]);
151 jq2
= _mm_set1_pd(charge
[inr
+2]);
152 jq3
= _mm_set1_pd(charge
[inr
+3]);
153 vdwjidx0A
= 2*vdwtype
[inr
+0];
154 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
155 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
156 qq11
= _mm_mul_pd(iq1
,jq1
);
157 qq12
= _mm_mul_pd(iq1
,jq2
);
158 qq13
= _mm_mul_pd(iq1
,jq3
);
159 qq21
= _mm_mul_pd(iq2
,jq1
);
160 qq22
= _mm_mul_pd(iq2
,jq2
);
161 qq23
= _mm_mul_pd(iq2
,jq3
);
162 qq31
= _mm_mul_pd(iq3
,jq1
);
163 qq32
= _mm_mul_pd(iq3
,jq2
);
164 qq33
= _mm_mul_pd(iq3
,jq3
);
166 /* Avoid stupid compiler warnings */
174 /* Start outer loop over neighborlists */
175 for(iidx
=0; iidx
<nri
; iidx
++)
177 /* Load shift vector for this list */
178 i_shift_offset
= DIM
*shiftidx
[iidx
];
180 /* Load limits for loop over neighbors */
181 j_index_start
= jindex
[iidx
];
182 j_index_end
= jindex
[iidx
+1];
184 /* Get outer coordinate index */
186 i_coord_offset
= DIM
*inr
;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
190 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
192 fix0
= _mm_setzero_pd();
193 fiy0
= _mm_setzero_pd();
194 fiz0
= _mm_setzero_pd();
195 fix1
= _mm_setzero_pd();
196 fiy1
= _mm_setzero_pd();
197 fiz1
= _mm_setzero_pd();
198 fix2
= _mm_setzero_pd();
199 fiy2
= _mm_setzero_pd();
200 fiz2
= _mm_setzero_pd();
201 fix3
= _mm_setzero_pd();
202 fiy3
= _mm_setzero_pd();
203 fiz3
= _mm_setzero_pd();
205 /* Reset potential sums */
206 velecsum
= _mm_setzero_pd();
207 vvdwsum
= _mm_setzero_pd();
209 /* Start inner kernel loop */
210 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
213 /* Get j neighbor index, and coordinate index */
216 j_coord_offsetA
= DIM
*jnrA
;
217 j_coord_offsetB
= DIM
*jnrB
;
219 /* load j atom coordinates */
220 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
221 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
222 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
224 /* Calculate displacement vector */
225 dx00
= _mm_sub_pd(ix0
,jx0
);
226 dy00
= _mm_sub_pd(iy0
,jy0
);
227 dz00
= _mm_sub_pd(iz0
,jz0
);
228 dx11
= _mm_sub_pd(ix1
,jx1
);
229 dy11
= _mm_sub_pd(iy1
,jy1
);
230 dz11
= _mm_sub_pd(iz1
,jz1
);
231 dx12
= _mm_sub_pd(ix1
,jx2
);
232 dy12
= _mm_sub_pd(iy1
,jy2
);
233 dz12
= _mm_sub_pd(iz1
,jz2
);
234 dx13
= _mm_sub_pd(ix1
,jx3
);
235 dy13
= _mm_sub_pd(iy1
,jy3
);
236 dz13
= _mm_sub_pd(iz1
,jz3
);
237 dx21
= _mm_sub_pd(ix2
,jx1
);
238 dy21
= _mm_sub_pd(iy2
,jy1
);
239 dz21
= _mm_sub_pd(iz2
,jz1
);
240 dx22
= _mm_sub_pd(ix2
,jx2
);
241 dy22
= _mm_sub_pd(iy2
,jy2
);
242 dz22
= _mm_sub_pd(iz2
,jz2
);
243 dx23
= _mm_sub_pd(ix2
,jx3
);
244 dy23
= _mm_sub_pd(iy2
,jy3
);
245 dz23
= _mm_sub_pd(iz2
,jz3
);
246 dx31
= _mm_sub_pd(ix3
,jx1
);
247 dy31
= _mm_sub_pd(iy3
,jy1
);
248 dz31
= _mm_sub_pd(iz3
,jz1
);
249 dx32
= _mm_sub_pd(ix3
,jx2
);
250 dy32
= _mm_sub_pd(iy3
,jy2
);
251 dz32
= _mm_sub_pd(iz3
,jz2
);
252 dx33
= _mm_sub_pd(ix3
,jx3
);
253 dy33
= _mm_sub_pd(iy3
,jy3
);
254 dz33
= _mm_sub_pd(iz3
,jz3
);
256 /* Calculate squared distance and things based on it */
257 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
258 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
259 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
260 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
261 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
262 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
263 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
264 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
265 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
266 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
268 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
269 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
270 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
271 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
272 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
273 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
274 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
275 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
276 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
277 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
279 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
280 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
281 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
282 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
283 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
284 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
285 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
286 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
287 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
289 fjx0
= _mm_setzero_pd();
290 fjy0
= _mm_setzero_pd();
291 fjz0
= _mm_setzero_pd();
292 fjx1
= _mm_setzero_pd();
293 fjy1
= _mm_setzero_pd();
294 fjz1
= _mm_setzero_pd();
295 fjx2
= _mm_setzero_pd();
296 fjy2
= _mm_setzero_pd();
297 fjz2
= _mm_setzero_pd();
298 fjx3
= _mm_setzero_pd();
299 fjy3
= _mm_setzero_pd();
300 fjz3
= _mm_setzero_pd();
302 /**************************
303 * CALCULATE INTERACTIONS *
304 **************************/
306 r00
= _mm_mul_pd(rsq00
,rinv00
);
308 /* Calculate table index by multiplying r with table scale and truncate to integer */
309 rt
= _mm_mul_pd(r00
,vftabscale
);
310 vfitab
= _mm_cvttpd_epi32(rt
);
311 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
312 vfitab
= _mm_slli_epi32(vfitab
,3);
314 /* CUBIC SPLINE TABLE DISPERSION */
315 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
316 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
317 GMX_MM_TRANSPOSE2_PD(Y
,F
);
318 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
319 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
320 GMX_MM_TRANSPOSE2_PD(G
,H
);
321 Heps
= _mm_mul_pd(vfeps
,H
);
322 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
323 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
324 vvdw6
= _mm_mul_pd(c6_00
,VV
);
325 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
326 fvdw6
= _mm_mul_pd(c6_00
,FF
);
328 /* CUBIC SPLINE TABLE REPULSION */
329 vfitab
= _mm_add_epi32(vfitab
,ifour
);
330 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
331 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
332 GMX_MM_TRANSPOSE2_PD(Y
,F
);
333 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
334 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
335 GMX_MM_TRANSPOSE2_PD(G
,H
);
336 Heps
= _mm_mul_pd(vfeps
,H
);
337 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
338 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
339 vvdw12
= _mm_mul_pd(c12_00
,VV
);
340 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
341 fvdw12
= _mm_mul_pd(c12_00
,FF
);
342 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
343 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
350 /* Calculate temporary vectorial force */
351 tx
= _mm_mul_pd(fscal
,dx00
);
352 ty
= _mm_mul_pd(fscal
,dy00
);
353 tz
= _mm_mul_pd(fscal
,dz00
);
355 /* Update vectorial force */
356 fix0
= _mm_add_pd(fix0
,tx
);
357 fiy0
= _mm_add_pd(fiy0
,ty
);
358 fiz0
= _mm_add_pd(fiz0
,tz
);
360 fjx0
= _mm_add_pd(fjx0
,tx
);
361 fjy0
= _mm_add_pd(fjy0
,ty
);
362 fjz0
= _mm_add_pd(fjz0
,tz
);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 /* COULOMB ELECTROSTATICS */
369 velec
= _mm_mul_pd(qq11
,rinv11
);
370 felec
= _mm_mul_pd(velec
,rinvsq11
);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velecsum
= _mm_add_pd(velecsum
,velec
);
377 /* Calculate temporary vectorial force */
378 tx
= _mm_mul_pd(fscal
,dx11
);
379 ty
= _mm_mul_pd(fscal
,dy11
);
380 tz
= _mm_mul_pd(fscal
,dz11
);
382 /* Update vectorial force */
383 fix1
= _mm_add_pd(fix1
,tx
);
384 fiy1
= _mm_add_pd(fiy1
,ty
);
385 fiz1
= _mm_add_pd(fiz1
,tz
);
387 fjx1
= _mm_add_pd(fjx1
,tx
);
388 fjy1
= _mm_add_pd(fjy1
,ty
);
389 fjz1
= _mm_add_pd(fjz1
,tz
);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 /* COULOMB ELECTROSTATICS */
396 velec
= _mm_mul_pd(qq12
,rinv12
);
397 felec
= _mm_mul_pd(velec
,rinvsq12
);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum
= _mm_add_pd(velecsum
,velec
);
404 /* Calculate temporary vectorial force */
405 tx
= _mm_mul_pd(fscal
,dx12
);
406 ty
= _mm_mul_pd(fscal
,dy12
);
407 tz
= _mm_mul_pd(fscal
,dz12
);
409 /* Update vectorial force */
410 fix1
= _mm_add_pd(fix1
,tx
);
411 fiy1
= _mm_add_pd(fiy1
,ty
);
412 fiz1
= _mm_add_pd(fiz1
,tz
);
414 fjx2
= _mm_add_pd(fjx2
,tx
);
415 fjy2
= _mm_add_pd(fjy2
,ty
);
416 fjz2
= _mm_add_pd(fjz2
,tz
);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 /* COULOMB ELECTROSTATICS */
423 velec
= _mm_mul_pd(qq13
,rinv13
);
424 felec
= _mm_mul_pd(velec
,rinvsq13
);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velecsum
= _mm_add_pd(velecsum
,velec
);
431 /* Calculate temporary vectorial force */
432 tx
= _mm_mul_pd(fscal
,dx13
);
433 ty
= _mm_mul_pd(fscal
,dy13
);
434 tz
= _mm_mul_pd(fscal
,dz13
);
436 /* Update vectorial force */
437 fix1
= _mm_add_pd(fix1
,tx
);
438 fiy1
= _mm_add_pd(fiy1
,ty
);
439 fiz1
= _mm_add_pd(fiz1
,tz
);
441 fjx3
= _mm_add_pd(fjx3
,tx
);
442 fjy3
= _mm_add_pd(fjy3
,ty
);
443 fjz3
= _mm_add_pd(fjz3
,tz
);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 /* COULOMB ELECTROSTATICS */
450 velec
= _mm_mul_pd(qq21
,rinv21
);
451 felec
= _mm_mul_pd(velec
,rinvsq21
);
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velecsum
= _mm_add_pd(velecsum
,velec
);
458 /* Calculate temporary vectorial force */
459 tx
= _mm_mul_pd(fscal
,dx21
);
460 ty
= _mm_mul_pd(fscal
,dy21
);
461 tz
= _mm_mul_pd(fscal
,dz21
);
463 /* Update vectorial force */
464 fix2
= _mm_add_pd(fix2
,tx
);
465 fiy2
= _mm_add_pd(fiy2
,ty
);
466 fiz2
= _mm_add_pd(fiz2
,tz
);
468 fjx1
= _mm_add_pd(fjx1
,tx
);
469 fjy1
= _mm_add_pd(fjy1
,ty
);
470 fjz1
= _mm_add_pd(fjz1
,tz
);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 /* COULOMB ELECTROSTATICS */
477 velec
= _mm_mul_pd(qq22
,rinv22
);
478 felec
= _mm_mul_pd(velec
,rinvsq22
);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velecsum
= _mm_add_pd(velecsum
,velec
);
485 /* Calculate temporary vectorial force */
486 tx
= _mm_mul_pd(fscal
,dx22
);
487 ty
= _mm_mul_pd(fscal
,dy22
);
488 tz
= _mm_mul_pd(fscal
,dz22
);
490 /* Update vectorial force */
491 fix2
= _mm_add_pd(fix2
,tx
);
492 fiy2
= _mm_add_pd(fiy2
,ty
);
493 fiz2
= _mm_add_pd(fiz2
,tz
);
495 fjx2
= _mm_add_pd(fjx2
,tx
);
496 fjy2
= _mm_add_pd(fjy2
,ty
);
497 fjz2
= _mm_add_pd(fjz2
,tz
);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
503 /* COULOMB ELECTROSTATICS */
504 velec
= _mm_mul_pd(qq23
,rinv23
);
505 felec
= _mm_mul_pd(velec
,rinvsq23
);
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velecsum
= _mm_add_pd(velecsum
,velec
);
512 /* Calculate temporary vectorial force */
513 tx
= _mm_mul_pd(fscal
,dx23
);
514 ty
= _mm_mul_pd(fscal
,dy23
);
515 tz
= _mm_mul_pd(fscal
,dz23
);
517 /* Update vectorial force */
518 fix2
= _mm_add_pd(fix2
,tx
);
519 fiy2
= _mm_add_pd(fiy2
,ty
);
520 fiz2
= _mm_add_pd(fiz2
,tz
);
522 fjx3
= _mm_add_pd(fjx3
,tx
);
523 fjy3
= _mm_add_pd(fjy3
,ty
);
524 fjz3
= _mm_add_pd(fjz3
,tz
);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 /* COULOMB ELECTROSTATICS */
531 velec
= _mm_mul_pd(qq31
,rinv31
);
532 felec
= _mm_mul_pd(velec
,rinvsq31
);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum
= _mm_add_pd(velecsum
,velec
);
539 /* Calculate temporary vectorial force */
540 tx
= _mm_mul_pd(fscal
,dx31
);
541 ty
= _mm_mul_pd(fscal
,dy31
);
542 tz
= _mm_mul_pd(fscal
,dz31
);
544 /* Update vectorial force */
545 fix3
= _mm_add_pd(fix3
,tx
);
546 fiy3
= _mm_add_pd(fiy3
,ty
);
547 fiz3
= _mm_add_pd(fiz3
,tz
);
549 fjx1
= _mm_add_pd(fjx1
,tx
);
550 fjy1
= _mm_add_pd(fjy1
,ty
);
551 fjz1
= _mm_add_pd(fjz1
,tz
);
553 /**************************
554 * CALCULATE INTERACTIONS *
555 **************************/
557 /* COULOMB ELECTROSTATICS */
558 velec
= _mm_mul_pd(qq32
,rinv32
);
559 felec
= _mm_mul_pd(velec
,rinvsq32
);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velecsum
= _mm_add_pd(velecsum
,velec
);
566 /* Calculate temporary vectorial force */
567 tx
= _mm_mul_pd(fscal
,dx32
);
568 ty
= _mm_mul_pd(fscal
,dy32
);
569 tz
= _mm_mul_pd(fscal
,dz32
);
571 /* Update vectorial force */
572 fix3
= _mm_add_pd(fix3
,tx
);
573 fiy3
= _mm_add_pd(fiy3
,ty
);
574 fiz3
= _mm_add_pd(fiz3
,tz
);
576 fjx2
= _mm_add_pd(fjx2
,tx
);
577 fjy2
= _mm_add_pd(fjy2
,ty
);
578 fjz2
= _mm_add_pd(fjz2
,tz
);
580 /**************************
581 * CALCULATE INTERACTIONS *
582 **************************/
584 /* COULOMB ELECTROSTATICS */
585 velec
= _mm_mul_pd(qq33
,rinv33
);
586 felec
= _mm_mul_pd(velec
,rinvsq33
);
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 velecsum
= _mm_add_pd(velecsum
,velec
);
593 /* Calculate temporary vectorial force */
594 tx
= _mm_mul_pd(fscal
,dx33
);
595 ty
= _mm_mul_pd(fscal
,dy33
);
596 tz
= _mm_mul_pd(fscal
,dz33
);
598 /* Update vectorial force */
599 fix3
= _mm_add_pd(fix3
,tx
);
600 fiy3
= _mm_add_pd(fiy3
,ty
);
601 fiz3
= _mm_add_pd(fiz3
,tz
);
603 fjx3
= _mm_add_pd(fjx3
,tx
);
604 fjy3
= _mm_add_pd(fjy3
,ty
);
605 fjz3
= _mm_add_pd(fjz3
,tz
);
607 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
609 /* Inner loop uses 311 flops */
616 j_coord_offsetA
= DIM
*jnrA
;
618 /* load j atom coordinates */
619 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
620 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
621 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
623 /* Calculate displacement vector */
624 dx00
= _mm_sub_pd(ix0
,jx0
);
625 dy00
= _mm_sub_pd(iy0
,jy0
);
626 dz00
= _mm_sub_pd(iz0
,jz0
);
627 dx11
= _mm_sub_pd(ix1
,jx1
);
628 dy11
= _mm_sub_pd(iy1
,jy1
);
629 dz11
= _mm_sub_pd(iz1
,jz1
);
630 dx12
= _mm_sub_pd(ix1
,jx2
);
631 dy12
= _mm_sub_pd(iy1
,jy2
);
632 dz12
= _mm_sub_pd(iz1
,jz2
);
633 dx13
= _mm_sub_pd(ix1
,jx3
);
634 dy13
= _mm_sub_pd(iy1
,jy3
);
635 dz13
= _mm_sub_pd(iz1
,jz3
);
636 dx21
= _mm_sub_pd(ix2
,jx1
);
637 dy21
= _mm_sub_pd(iy2
,jy1
);
638 dz21
= _mm_sub_pd(iz2
,jz1
);
639 dx22
= _mm_sub_pd(ix2
,jx2
);
640 dy22
= _mm_sub_pd(iy2
,jy2
);
641 dz22
= _mm_sub_pd(iz2
,jz2
);
642 dx23
= _mm_sub_pd(ix2
,jx3
);
643 dy23
= _mm_sub_pd(iy2
,jy3
);
644 dz23
= _mm_sub_pd(iz2
,jz3
);
645 dx31
= _mm_sub_pd(ix3
,jx1
);
646 dy31
= _mm_sub_pd(iy3
,jy1
);
647 dz31
= _mm_sub_pd(iz3
,jz1
);
648 dx32
= _mm_sub_pd(ix3
,jx2
);
649 dy32
= _mm_sub_pd(iy3
,jy2
);
650 dz32
= _mm_sub_pd(iz3
,jz2
);
651 dx33
= _mm_sub_pd(ix3
,jx3
);
652 dy33
= _mm_sub_pd(iy3
,jy3
);
653 dz33
= _mm_sub_pd(iz3
,jz3
);
655 /* Calculate squared distance and things based on it */
656 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
657 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
658 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
659 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
660 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
661 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
662 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
663 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
664 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
665 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
667 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
668 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
669 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
670 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
671 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
672 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
673 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
674 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
675 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
676 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
678 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
679 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
680 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
681 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
682 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
683 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
684 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
685 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
686 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
688 fjx0
= _mm_setzero_pd();
689 fjy0
= _mm_setzero_pd();
690 fjz0
= _mm_setzero_pd();
691 fjx1
= _mm_setzero_pd();
692 fjy1
= _mm_setzero_pd();
693 fjz1
= _mm_setzero_pd();
694 fjx2
= _mm_setzero_pd();
695 fjy2
= _mm_setzero_pd();
696 fjz2
= _mm_setzero_pd();
697 fjx3
= _mm_setzero_pd();
698 fjy3
= _mm_setzero_pd();
699 fjz3
= _mm_setzero_pd();
701 /**************************
702 * CALCULATE INTERACTIONS *
703 **************************/
705 r00
= _mm_mul_pd(rsq00
,rinv00
);
707 /* Calculate table index by multiplying r with table scale and truncate to integer */
708 rt
= _mm_mul_pd(r00
,vftabscale
);
709 vfitab
= _mm_cvttpd_epi32(rt
);
710 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
711 vfitab
= _mm_slli_epi32(vfitab
,3);
713 /* CUBIC SPLINE TABLE DISPERSION */
714 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
715 F
= _mm_setzero_pd();
716 GMX_MM_TRANSPOSE2_PD(Y
,F
);
717 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
718 H
= _mm_setzero_pd();
719 GMX_MM_TRANSPOSE2_PD(G
,H
);
720 Heps
= _mm_mul_pd(vfeps
,H
);
721 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
722 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
723 vvdw6
= _mm_mul_pd(c6_00
,VV
);
724 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
725 fvdw6
= _mm_mul_pd(c6_00
,FF
);
727 /* CUBIC SPLINE TABLE REPULSION */
728 vfitab
= _mm_add_epi32(vfitab
,ifour
);
729 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
730 F
= _mm_setzero_pd();
731 GMX_MM_TRANSPOSE2_PD(Y
,F
);
732 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
733 H
= _mm_setzero_pd();
734 GMX_MM_TRANSPOSE2_PD(G
,H
);
735 Heps
= _mm_mul_pd(vfeps
,H
);
736 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
737 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
738 vvdw12
= _mm_mul_pd(c12_00
,VV
);
739 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
740 fvdw12
= _mm_mul_pd(c12_00
,FF
);
741 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
742 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
746 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
750 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
752 /* Calculate temporary vectorial force */
753 tx
= _mm_mul_pd(fscal
,dx00
);
754 ty
= _mm_mul_pd(fscal
,dy00
);
755 tz
= _mm_mul_pd(fscal
,dz00
);
757 /* Update vectorial force */
758 fix0
= _mm_add_pd(fix0
,tx
);
759 fiy0
= _mm_add_pd(fiy0
,ty
);
760 fiz0
= _mm_add_pd(fiz0
,tz
);
762 fjx0
= _mm_add_pd(fjx0
,tx
);
763 fjy0
= _mm_add_pd(fjy0
,ty
);
764 fjz0
= _mm_add_pd(fjz0
,tz
);
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 /* COULOMB ELECTROSTATICS */
771 velec
= _mm_mul_pd(qq11
,rinv11
);
772 felec
= _mm_mul_pd(velec
,rinvsq11
);
774 /* Update potential sum for this i atom from the interaction with this j atom. */
775 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
776 velecsum
= _mm_add_pd(velecsum
,velec
);
780 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
782 /* Calculate temporary vectorial force */
783 tx
= _mm_mul_pd(fscal
,dx11
);
784 ty
= _mm_mul_pd(fscal
,dy11
);
785 tz
= _mm_mul_pd(fscal
,dz11
);
787 /* Update vectorial force */
788 fix1
= _mm_add_pd(fix1
,tx
);
789 fiy1
= _mm_add_pd(fiy1
,ty
);
790 fiz1
= _mm_add_pd(fiz1
,tz
);
792 fjx1
= _mm_add_pd(fjx1
,tx
);
793 fjy1
= _mm_add_pd(fjy1
,ty
);
794 fjz1
= _mm_add_pd(fjz1
,tz
);
796 /**************************
797 * CALCULATE INTERACTIONS *
798 **************************/
800 /* COULOMB ELECTROSTATICS */
801 velec
= _mm_mul_pd(qq12
,rinv12
);
802 felec
= _mm_mul_pd(velec
,rinvsq12
);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
806 velecsum
= _mm_add_pd(velecsum
,velec
);
810 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
812 /* Calculate temporary vectorial force */
813 tx
= _mm_mul_pd(fscal
,dx12
);
814 ty
= _mm_mul_pd(fscal
,dy12
);
815 tz
= _mm_mul_pd(fscal
,dz12
);
817 /* Update vectorial force */
818 fix1
= _mm_add_pd(fix1
,tx
);
819 fiy1
= _mm_add_pd(fiy1
,ty
);
820 fiz1
= _mm_add_pd(fiz1
,tz
);
822 fjx2
= _mm_add_pd(fjx2
,tx
);
823 fjy2
= _mm_add_pd(fjy2
,ty
);
824 fjz2
= _mm_add_pd(fjz2
,tz
);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 /* COULOMB ELECTROSTATICS */
831 velec
= _mm_mul_pd(qq13
,rinv13
);
832 felec
= _mm_mul_pd(velec
,rinvsq13
);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
836 velecsum
= _mm_add_pd(velecsum
,velec
);
840 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
842 /* Calculate temporary vectorial force */
843 tx
= _mm_mul_pd(fscal
,dx13
);
844 ty
= _mm_mul_pd(fscal
,dy13
);
845 tz
= _mm_mul_pd(fscal
,dz13
);
847 /* Update vectorial force */
848 fix1
= _mm_add_pd(fix1
,tx
);
849 fiy1
= _mm_add_pd(fiy1
,ty
);
850 fiz1
= _mm_add_pd(fiz1
,tz
);
852 fjx3
= _mm_add_pd(fjx3
,tx
);
853 fjy3
= _mm_add_pd(fjy3
,ty
);
854 fjz3
= _mm_add_pd(fjz3
,tz
);
856 /**************************
857 * CALCULATE INTERACTIONS *
858 **************************/
860 /* COULOMB ELECTROSTATICS */
861 velec
= _mm_mul_pd(qq21
,rinv21
);
862 felec
= _mm_mul_pd(velec
,rinvsq21
);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
866 velecsum
= _mm_add_pd(velecsum
,velec
);
870 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
872 /* Calculate temporary vectorial force */
873 tx
= _mm_mul_pd(fscal
,dx21
);
874 ty
= _mm_mul_pd(fscal
,dy21
);
875 tz
= _mm_mul_pd(fscal
,dz21
);
877 /* Update vectorial force */
878 fix2
= _mm_add_pd(fix2
,tx
);
879 fiy2
= _mm_add_pd(fiy2
,ty
);
880 fiz2
= _mm_add_pd(fiz2
,tz
);
882 fjx1
= _mm_add_pd(fjx1
,tx
);
883 fjy1
= _mm_add_pd(fjy1
,ty
);
884 fjz1
= _mm_add_pd(fjz1
,tz
);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 /* COULOMB ELECTROSTATICS */
891 velec
= _mm_mul_pd(qq22
,rinv22
);
892 felec
= _mm_mul_pd(velec
,rinvsq22
);
894 /* Update potential sum for this i atom from the interaction with this j atom. */
895 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
896 velecsum
= _mm_add_pd(velecsum
,velec
);
900 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
902 /* Calculate temporary vectorial force */
903 tx
= _mm_mul_pd(fscal
,dx22
);
904 ty
= _mm_mul_pd(fscal
,dy22
);
905 tz
= _mm_mul_pd(fscal
,dz22
);
907 /* Update vectorial force */
908 fix2
= _mm_add_pd(fix2
,tx
);
909 fiy2
= _mm_add_pd(fiy2
,ty
);
910 fiz2
= _mm_add_pd(fiz2
,tz
);
912 fjx2
= _mm_add_pd(fjx2
,tx
);
913 fjy2
= _mm_add_pd(fjy2
,ty
);
914 fjz2
= _mm_add_pd(fjz2
,tz
);
916 /**************************
917 * CALCULATE INTERACTIONS *
918 **************************/
920 /* COULOMB ELECTROSTATICS */
921 velec
= _mm_mul_pd(qq23
,rinv23
);
922 felec
= _mm_mul_pd(velec
,rinvsq23
);
924 /* Update potential sum for this i atom from the interaction with this j atom. */
925 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
926 velecsum
= _mm_add_pd(velecsum
,velec
);
930 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
932 /* Calculate temporary vectorial force */
933 tx
= _mm_mul_pd(fscal
,dx23
);
934 ty
= _mm_mul_pd(fscal
,dy23
);
935 tz
= _mm_mul_pd(fscal
,dz23
);
937 /* Update vectorial force */
938 fix2
= _mm_add_pd(fix2
,tx
);
939 fiy2
= _mm_add_pd(fiy2
,ty
);
940 fiz2
= _mm_add_pd(fiz2
,tz
);
942 fjx3
= _mm_add_pd(fjx3
,tx
);
943 fjy3
= _mm_add_pd(fjy3
,ty
);
944 fjz3
= _mm_add_pd(fjz3
,tz
);
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 /* COULOMB ELECTROSTATICS */
951 velec
= _mm_mul_pd(qq31
,rinv31
);
952 felec
= _mm_mul_pd(velec
,rinvsq31
);
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
956 velecsum
= _mm_add_pd(velecsum
,velec
);
960 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
962 /* Calculate temporary vectorial force */
963 tx
= _mm_mul_pd(fscal
,dx31
);
964 ty
= _mm_mul_pd(fscal
,dy31
);
965 tz
= _mm_mul_pd(fscal
,dz31
);
967 /* Update vectorial force */
968 fix3
= _mm_add_pd(fix3
,tx
);
969 fiy3
= _mm_add_pd(fiy3
,ty
);
970 fiz3
= _mm_add_pd(fiz3
,tz
);
972 fjx1
= _mm_add_pd(fjx1
,tx
);
973 fjy1
= _mm_add_pd(fjy1
,ty
);
974 fjz1
= _mm_add_pd(fjz1
,tz
);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 /* COULOMB ELECTROSTATICS */
981 velec
= _mm_mul_pd(qq32
,rinv32
);
982 felec
= _mm_mul_pd(velec
,rinvsq32
);
984 /* Update potential sum for this i atom from the interaction with this j atom. */
985 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
986 velecsum
= _mm_add_pd(velecsum
,velec
);
990 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
992 /* Calculate temporary vectorial force */
993 tx
= _mm_mul_pd(fscal
,dx32
);
994 ty
= _mm_mul_pd(fscal
,dy32
);
995 tz
= _mm_mul_pd(fscal
,dz32
);
997 /* Update vectorial force */
998 fix3
= _mm_add_pd(fix3
,tx
);
999 fiy3
= _mm_add_pd(fiy3
,ty
);
1000 fiz3
= _mm_add_pd(fiz3
,tz
);
1002 fjx2
= _mm_add_pd(fjx2
,tx
);
1003 fjy2
= _mm_add_pd(fjy2
,ty
);
1004 fjz2
= _mm_add_pd(fjz2
,tz
);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 /* COULOMB ELECTROSTATICS */
1011 velec
= _mm_mul_pd(qq33
,rinv33
);
1012 felec
= _mm_mul_pd(velec
,rinvsq33
);
1014 /* Update potential sum for this i atom from the interaction with this j atom. */
1015 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1016 velecsum
= _mm_add_pd(velecsum
,velec
);
1020 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1022 /* Calculate temporary vectorial force */
1023 tx
= _mm_mul_pd(fscal
,dx33
);
1024 ty
= _mm_mul_pd(fscal
,dy33
);
1025 tz
= _mm_mul_pd(fscal
,dz33
);
1027 /* Update vectorial force */
1028 fix3
= _mm_add_pd(fix3
,tx
);
1029 fiy3
= _mm_add_pd(fiy3
,ty
);
1030 fiz3
= _mm_add_pd(fiz3
,tz
);
1032 fjx3
= _mm_add_pd(fjx3
,tx
);
1033 fjy3
= _mm_add_pd(fjy3
,ty
);
1034 fjz3
= _mm_add_pd(fjz3
,tz
);
1036 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1038 /* Inner loop uses 311 flops */
1041 /* End of innermost loop */
1043 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1044 f
+i_coord_offset
,fshift
+i_shift_offset
);
1047 /* Update potential energies */
1048 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1049 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1051 /* Increment number of inner iterations */
1052 inneriter
+= j_index_end
- j_index_start
;
1054 /* Outer loop uses 26 flops */
1057 /* Increment number of outer iterations */
1060 /* Update outer/inner flops */
1062 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*311);
1065 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_double
1066 * Electrostatics interaction: Coulomb
1067 * VdW interaction: CubicSplineTable
1068 * Geometry: Water4-Water4
1069 * Calculate force/pot: Force
1072 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_double
1073 (t_nblist
* gmx_restrict nlist
,
1074 rvec
* gmx_restrict xx
,
1075 rvec
* gmx_restrict ff
,
1076 t_forcerec
* gmx_restrict fr
,
1077 t_mdatoms
* gmx_restrict mdatoms
,
1078 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1079 t_nrnb
* gmx_restrict nrnb
)
1081 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1082 * just 0 for non-waters.
1083 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1084 * jnr indices corresponding to data put in the four positions in the SIMD register.
1086 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1087 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1089 int j_coord_offsetA
,j_coord_offsetB
;
1090 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1091 real rcutoff_scalar
;
1092 real
*shiftvec
,*fshift
,*x
,*f
;
1093 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1095 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1097 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1099 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1101 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1102 int vdwjidx0A
,vdwjidx0B
;
1103 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1104 int vdwjidx1A
,vdwjidx1B
;
1105 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1106 int vdwjidx2A
,vdwjidx2B
;
1107 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1108 int vdwjidx3A
,vdwjidx3B
;
1109 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1110 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1111 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1112 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1113 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1114 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1115 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1116 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1117 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1118 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1119 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1120 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1123 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1126 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1127 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1129 __m128i ifour
= _mm_set1_epi32(4);
1130 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1132 __m128d dummy_mask
,cutoff_mask
;
1133 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1134 __m128d one
= _mm_set1_pd(1.0);
1135 __m128d two
= _mm_set1_pd(2.0);
1141 jindex
= nlist
->jindex
;
1143 shiftidx
= nlist
->shift
;
1145 shiftvec
= fr
->shift_vec
[0];
1146 fshift
= fr
->fshift
[0];
1147 facel
= _mm_set1_pd(fr
->epsfac
);
1148 charge
= mdatoms
->chargeA
;
1149 nvdwtype
= fr
->ntype
;
1150 vdwparam
= fr
->nbfp
;
1151 vdwtype
= mdatoms
->typeA
;
1153 vftab
= kernel_data
->table_vdw
->data
;
1154 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
1156 /* Setup water-specific parameters */
1157 inr
= nlist
->iinr
[0];
1158 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1159 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1160 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1161 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1163 jq1
= _mm_set1_pd(charge
[inr
+1]);
1164 jq2
= _mm_set1_pd(charge
[inr
+2]);
1165 jq3
= _mm_set1_pd(charge
[inr
+3]);
1166 vdwjidx0A
= 2*vdwtype
[inr
+0];
1167 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1168 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1169 qq11
= _mm_mul_pd(iq1
,jq1
);
1170 qq12
= _mm_mul_pd(iq1
,jq2
);
1171 qq13
= _mm_mul_pd(iq1
,jq3
);
1172 qq21
= _mm_mul_pd(iq2
,jq1
);
1173 qq22
= _mm_mul_pd(iq2
,jq2
);
1174 qq23
= _mm_mul_pd(iq2
,jq3
);
1175 qq31
= _mm_mul_pd(iq3
,jq1
);
1176 qq32
= _mm_mul_pd(iq3
,jq2
);
1177 qq33
= _mm_mul_pd(iq3
,jq3
);
1179 /* Avoid stupid compiler warnings */
1181 j_coord_offsetA
= 0;
1182 j_coord_offsetB
= 0;
1187 /* Start outer loop over neighborlists */
1188 for(iidx
=0; iidx
<nri
; iidx
++)
1190 /* Load shift vector for this list */
1191 i_shift_offset
= DIM
*shiftidx
[iidx
];
1193 /* Load limits for loop over neighbors */
1194 j_index_start
= jindex
[iidx
];
1195 j_index_end
= jindex
[iidx
+1];
1197 /* Get outer coordinate index */
1199 i_coord_offset
= DIM
*inr
;
1201 /* Load i particle coords and add shift vector */
1202 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1203 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1205 fix0
= _mm_setzero_pd();
1206 fiy0
= _mm_setzero_pd();
1207 fiz0
= _mm_setzero_pd();
1208 fix1
= _mm_setzero_pd();
1209 fiy1
= _mm_setzero_pd();
1210 fiz1
= _mm_setzero_pd();
1211 fix2
= _mm_setzero_pd();
1212 fiy2
= _mm_setzero_pd();
1213 fiz2
= _mm_setzero_pd();
1214 fix3
= _mm_setzero_pd();
1215 fiy3
= _mm_setzero_pd();
1216 fiz3
= _mm_setzero_pd();
1218 /* Start inner kernel loop */
1219 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1222 /* Get j neighbor index, and coordinate index */
1224 jnrB
= jjnr
[jidx
+1];
1225 j_coord_offsetA
= DIM
*jnrA
;
1226 j_coord_offsetB
= DIM
*jnrB
;
1228 /* load j atom coordinates */
1229 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1230 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1231 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1233 /* Calculate displacement vector */
1234 dx00
= _mm_sub_pd(ix0
,jx0
);
1235 dy00
= _mm_sub_pd(iy0
,jy0
);
1236 dz00
= _mm_sub_pd(iz0
,jz0
);
1237 dx11
= _mm_sub_pd(ix1
,jx1
);
1238 dy11
= _mm_sub_pd(iy1
,jy1
);
1239 dz11
= _mm_sub_pd(iz1
,jz1
);
1240 dx12
= _mm_sub_pd(ix1
,jx2
);
1241 dy12
= _mm_sub_pd(iy1
,jy2
);
1242 dz12
= _mm_sub_pd(iz1
,jz2
);
1243 dx13
= _mm_sub_pd(ix1
,jx3
);
1244 dy13
= _mm_sub_pd(iy1
,jy3
);
1245 dz13
= _mm_sub_pd(iz1
,jz3
);
1246 dx21
= _mm_sub_pd(ix2
,jx1
);
1247 dy21
= _mm_sub_pd(iy2
,jy1
);
1248 dz21
= _mm_sub_pd(iz2
,jz1
);
1249 dx22
= _mm_sub_pd(ix2
,jx2
);
1250 dy22
= _mm_sub_pd(iy2
,jy2
);
1251 dz22
= _mm_sub_pd(iz2
,jz2
);
1252 dx23
= _mm_sub_pd(ix2
,jx3
);
1253 dy23
= _mm_sub_pd(iy2
,jy3
);
1254 dz23
= _mm_sub_pd(iz2
,jz3
);
1255 dx31
= _mm_sub_pd(ix3
,jx1
);
1256 dy31
= _mm_sub_pd(iy3
,jy1
);
1257 dz31
= _mm_sub_pd(iz3
,jz1
);
1258 dx32
= _mm_sub_pd(ix3
,jx2
);
1259 dy32
= _mm_sub_pd(iy3
,jy2
);
1260 dz32
= _mm_sub_pd(iz3
,jz2
);
1261 dx33
= _mm_sub_pd(ix3
,jx3
);
1262 dy33
= _mm_sub_pd(iy3
,jy3
);
1263 dz33
= _mm_sub_pd(iz3
,jz3
);
1265 /* Calculate squared distance and things based on it */
1266 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1267 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1268 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1269 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1270 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1271 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1272 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1273 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1274 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1275 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1277 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1278 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1279 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1280 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1281 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1282 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1283 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1284 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1285 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1286 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1288 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1289 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1290 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1291 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1292 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1293 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1294 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1295 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1296 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1298 fjx0
= _mm_setzero_pd();
1299 fjy0
= _mm_setzero_pd();
1300 fjz0
= _mm_setzero_pd();
1301 fjx1
= _mm_setzero_pd();
1302 fjy1
= _mm_setzero_pd();
1303 fjz1
= _mm_setzero_pd();
1304 fjx2
= _mm_setzero_pd();
1305 fjy2
= _mm_setzero_pd();
1306 fjz2
= _mm_setzero_pd();
1307 fjx3
= _mm_setzero_pd();
1308 fjy3
= _mm_setzero_pd();
1309 fjz3
= _mm_setzero_pd();
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 r00
= _mm_mul_pd(rsq00
,rinv00
);
1317 /* Calculate table index by multiplying r with table scale and truncate to integer */
1318 rt
= _mm_mul_pd(r00
,vftabscale
);
1319 vfitab
= _mm_cvttpd_epi32(rt
);
1320 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1321 vfitab
= _mm_slli_epi32(vfitab
,3);
1323 /* CUBIC SPLINE TABLE DISPERSION */
1324 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1325 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1326 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1327 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1328 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1329 GMX_MM_TRANSPOSE2_PD(G
,H
);
1330 Heps
= _mm_mul_pd(vfeps
,H
);
1331 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1332 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1333 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1335 /* CUBIC SPLINE TABLE REPULSION */
1336 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1337 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1338 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1339 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1340 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1341 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1342 GMX_MM_TRANSPOSE2_PD(G
,H
);
1343 Heps
= _mm_mul_pd(vfeps
,H
);
1344 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1345 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1346 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1347 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1351 /* Calculate temporary vectorial force */
1352 tx
= _mm_mul_pd(fscal
,dx00
);
1353 ty
= _mm_mul_pd(fscal
,dy00
);
1354 tz
= _mm_mul_pd(fscal
,dz00
);
1356 /* Update vectorial force */
1357 fix0
= _mm_add_pd(fix0
,tx
);
1358 fiy0
= _mm_add_pd(fiy0
,ty
);
1359 fiz0
= _mm_add_pd(fiz0
,tz
);
1361 fjx0
= _mm_add_pd(fjx0
,tx
);
1362 fjy0
= _mm_add_pd(fjy0
,ty
);
1363 fjz0
= _mm_add_pd(fjz0
,tz
);
1365 /**************************
1366 * CALCULATE INTERACTIONS *
1367 **************************/
1369 /* COULOMB ELECTROSTATICS */
1370 velec
= _mm_mul_pd(qq11
,rinv11
);
1371 felec
= _mm_mul_pd(velec
,rinvsq11
);
1375 /* Calculate temporary vectorial force */
1376 tx
= _mm_mul_pd(fscal
,dx11
);
1377 ty
= _mm_mul_pd(fscal
,dy11
);
1378 tz
= _mm_mul_pd(fscal
,dz11
);
1380 /* Update vectorial force */
1381 fix1
= _mm_add_pd(fix1
,tx
);
1382 fiy1
= _mm_add_pd(fiy1
,ty
);
1383 fiz1
= _mm_add_pd(fiz1
,tz
);
1385 fjx1
= _mm_add_pd(fjx1
,tx
);
1386 fjy1
= _mm_add_pd(fjy1
,ty
);
1387 fjz1
= _mm_add_pd(fjz1
,tz
);
1389 /**************************
1390 * CALCULATE INTERACTIONS *
1391 **************************/
1393 /* COULOMB ELECTROSTATICS */
1394 velec
= _mm_mul_pd(qq12
,rinv12
);
1395 felec
= _mm_mul_pd(velec
,rinvsq12
);
1399 /* Calculate temporary vectorial force */
1400 tx
= _mm_mul_pd(fscal
,dx12
);
1401 ty
= _mm_mul_pd(fscal
,dy12
);
1402 tz
= _mm_mul_pd(fscal
,dz12
);
1404 /* Update vectorial force */
1405 fix1
= _mm_add_pd(fix1
,tx
);
1406 fiy1
= _mm_add_pd(fiy1
,ty
);
1407 fiz1
= _mm_add_pd(fiz1
,tz
);
1409 fjx2
= _mm_add_pd(fjx2
,tx
);
1410 fjy2
= _mm_add_pd(fjy2
,ty
);
1411 fjz2
= _mm_add_pd(fjz2
,tz
);
1413 /**************************
1414 * CALCULATE INTERACTIONS *
1415 **************************/
1417 /* COULOMB ELECTROSTATICS */
1418 velec
= _mm_mul_pd(qq13
,rinv13
);
1419 felec
= _mm_mul_pd(velec
,rinvsq13
);
1423 /* Calculate temporary vectorial force */
1424 tx
= _mm_mul_pd(fscal
,dx13
);
1425 ty
= _mm_mul_pd(fscal
,dy13
);
1426 tz
= _mm_mul_pd(fscal
,dz13
);
1428 /* Update vectorial force */
1429 fix1
= _mm_add_pd(fix1
,tx
);
1430 fiy1
= _mm_add_pd(fiy1
,ty
);
1431 fiz1
= _mm_add_pd(fiz1
,tz
);
1433 fjx3
= _mm_add_pd(fjx3
,tx
);
1434 fjy3
= _mm_add_pd(fjy3
,ty
);
1435 fjz3
= _mm_add_pd(fjz3
,tz
);
1437 /**************************
1438 * CALCULATE INTERACTIONS *
1439 **************************/
1441 /* COULOMB ELECTROSTATICS */
1442 velec
= _mm_mul_pd(qq21
,rinv21
);
1443 felec
= _mm_mul_pd(velec
,rinvsq21
);
1447 /* Calculate temporary vectorial force */
1448 tx
= _mm_mul_pd(fscal
,dx21
);
1449 ty
= _mm_mul_pd(fscal
,dy21
);
1450 tz
= _mm_mul_pd(fscal
,dz21
);
1452 /* Update vectorial force */
1453 fix2
= _mm_add_pd(fix2
,tx
);
1454 fiy2
= _mm_add_pd(fiy2
,ty
);
1455 fiz2
= _mm_add_pd(fiz2
,tz
);
1457 fjx1
= _mm_add_pd(fjx1
,tx
);
1458 fjy1
= _mm_add_pd(fjy1
,ty
);
1459 fjz1
= _mm_add_pd(fjz1
,tz
);
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 /* COULOMB ELECTROSTATICS */
1466 velec
= _mm_mul_pd(qq22
,rinv22
);
1467 felec
= _mm_mul_pd(velec
,rinvsq22
);
1471 /* Calculate temporary vectorial force */
1472 tx
= _mm_mul_pd(fscal
,dx22
);
1473 ty
= _mm_mul_pd(fscal
,dy22
);
1474 tz
= _mm_mul_pd(fscal
,dz22
);
1476 /* Update vectorial force */
1477 fix2
= _mm_add_pd(fix2
,tx
);
1478 fiy2
= _mm_add_pd(fiy2
,ty
);
1479 fiz2
= _mm_add_pd(fiz2
,tz
);
1481 fjx2
= _mm_add_pd(fjx2
,tx
);
1482 fjy2
= _mm_add_pd(fjy2
,ty
);
1483 fjz2
= _mm_add_pd(fjz2
,tz
);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 /* COULOMB ELECTROSTATICS */
1490 velec
= _mm_mul_pd(qq23
,rinv23
);
1491 felec
= _mm_mul_pd(velec
,rinvsq23
);
1495 /* Calculate temporary vectorial force */
1496 tx
= _mm_mul_pd(fscal
,dx23
);
1497 ty
= _mm_mul_pd(fscal
,dy23
);
1498 tz
= _mm_mul_pd(fscal
,dz23
);
1500 /* Update vectorial force */
1501 fix2
= _mm_add_pd(fix2
,tx
);
1502 fiy2
= _mm_add_pd(fiy2
,ty
);
1503 fiz2
= _mm_add_pd(fiz2
,tz
);
1505 fjx3
= _mm_add_pd(fjx3
,tx
);
1506 fjy3
= _mm_add_pd(fjy3
,ty
);
1507 fjz3
= _mm_add_pd(fjz3
,tz
);
1509 /**************************
1510 * CALCULATE INTERACTIONS *
1511 **************************/
1513 /* COULOMB ELECTROSTATICS */
1514 velec
= _mm_mul_pd(qq31
,rinv31
);
1515 felec
= _mm_mul_pd(velec
,rinvsq31
);
1519 /* Calculate temporary vectorial force */
1520 tx
= _mm_mul_pd(fscal
,dx31
);
1521 ty
= _mm_mul_pd(fscal
,dy31
);
1522 tz
= _mm_mul_pd(fscal
,dz31
);
1524 /* Update vectorial force */
1525 fix3
= _mm_add_pd(fix3
,tx
);
1526 fiy3
= _mm_add_pd(fiy3
,ty
);
1527 fiz3
= _mm_add_pd(fiz3
,tz
);
1529 fjx1
= _mm_add_pd(fjx1
,tx
);
1530 fjy1
= _mm_add_pd(fjy1
,ty
);
1531 fjz1
= _mm_add_pd(fjz1
,tz
);
1533 /**************************
1534 * CALCULATE INTERACTIONS *
1535 **************************/
1537 /* COULOMB ELECTROSTATICS */
1538 velec
= _mm_mul_pd(qq32
,rinv32
);
1539 felec
= _mm_mul_pd(velec
,rinvsq32
);
1543 /* Calculate temporary vectorial force */
1544 tx
= _mm_mul_pd(fscal
,dx32
);
1545 ty
= _mm_mul_pd(fscal
,dy32
);
1546 tz
= _mm_mul_pd(fscal
,dz32
);
1548 /* Update vectorial force */
1549 fix3
= _mm_add_pd(fix3
,tx
);
1550 fiy3
= _mm_add_pd(fiy3
,ty
);
1551 fiz3
= _mm_add_pd(fiz3
,tz
);
1553 fjx2
= _mm_add_pd(fjx2
,tx
);
1554 fjy2
= _mm_add_pd(fjy2
,ty
);
1555 fjz2
= _mm_add_pd(fjz2
,tz
);
1557 /**************************
1558 * CALCULATE INTERACTIONS *
1559 **************************/
1561 /* COULOMB ELECTROSTATICS */
1562 velec
= _mm_mul_pd(qq33
,rinv33
);
1563 felec
= _mm_mul_pd(velec
,rinvsq33
);
1567 /* Calculate temporary vectorial force */
1568 tx
= _mm_mul_pd(fscal
,dx33
);
1569 ty
= _mm_mul_pd(fscal
,dy33
);
1570 tz
= _mm_mul_pd(fscal
,dz33
);
1572 /* Update vectorial force */
1573 fix3
= _mm_add_pd(fix3
,tx
);
1574 fiy3
= _mm_add_pd(fiy3
,ty
);
1575 fiz3
= _mm_add_pd(fiz3
,tz
);
1577 fjx3
= _mm_add_pd(fjx3
,tx
);
1578 fjy3
= _mm_add_pd(fjy3
,ty
);
1579 fjz3
= _mm_add_pd(fjz3
,tz
);
1581 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1583 /* Inner loop uses 294 flops */
1586 if(jidx
<j_index_end
)
1590 j_coord_offsetA
= DIM
*jnrA
;
1592 /* load j atom coordinates */
1593 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1594 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1595 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1597 /* Calculate displacement vector */
1598 dx00
= _mm_sub_pd(ix0
,jx0
);
1599 dy00
= _mm_sub_pd(iy0
,jy0
);
1600 dz00
= _mm_sub_pd(iz0
,jz0
);
1601 dx11
= _mm_sub_pd(ix1
,jx1
);
1602 dy11
= _mm_sub_pd(iy1
,jy1
);
1603 dz11
= _mm_sub_pd(iz1
,jz1
);
1604 dx12
= _mm_sub_pd(ix1
,jx2
);
1605 dy12
= _mm_sub_pd(iy1
,jy2
);
1606 dz12
= _mm_sub_pd(iz1
,jz2
);
1607 dx13
= _mm_sub_pd(ix1
,jx3
);
1608 dy13
= _mm_sub_pd(iy1
,jy3
);
1609 dz13
= _mm_sub_pd(iz1
,jz3
);
1610 dx21
= _mm_sub_pd(ix2
,jx1
);
1611 dy21
= _mm_sub_pd(iy2
,jy1
);
1612 dz21
= _mm_sub_pd(iz2
,jz1
);
1613 dx22
= _mm_sub_pd(ix2
,jx2
);
1614 dy22
= _mm_sub_pd(iy2
,jy2
);
1615 dz22
= _mm_sub_pd(iz2
,jz2
);
1616 dx23
= _mm_sub_pd(ix2
,jx3
);
1617 dy23
= _mm_sub_pd(iy2
,jy3
);
1618 dz23
= _mm_sub_pd(iz2
,jz3
);
1619 dx31
= _mm_sub_pd(ix3
,jx1
);
1620 dy31
= _mm_sub_pd(iy3
,jy1
);
1621 dz31
= _mm_sub_pd(iz3
,jz1
);
1622 dx32
= _mm_sub_pd(ix3
,jx2
);
1623 dy32
= _mm_sub_pd(iy3
,jy2
);
1624 dz32
= _mm_sub_pd(iz3
,jz2
);
1625 dx33
= _mm_sub_pd(ix3
,jx3
);
1626 dy33
= _mm_sub_pd(iy3
,jy3
);
1627 dz33
= _mm_sub_pd(iz3
,jz3
);
1629 /* Calculate squared distance and things based on it */
1630 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1631 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1632 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1633 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1634 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1635 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1636 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1637 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1638 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1639 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1641 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1642 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1643 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1644 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1645 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1646 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1647 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1648 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1649 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1650 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1652 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1653 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1654 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1655 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1656 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1657 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1658 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1659 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1660 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1662 fjx0
= _mm_setzero_pd();
1663 fjy0
= _mm_setzero_pd();
1664 fjz0
= _mm_setzero_pd();
1665 fjx1
= _mm_setzero_pd();
1666 fjy1
= _mm_setzero_pd();
1667 fjz1
= _mm_setzero_pd();
1668 fjx2
= _mm_setzero_pd();
1669 fjy2
= _mm_setzero_pd();
1670 fjz2
= _mm_setzero_pd();
1671 fjx3
= _mm_setzero_pd();
1672 fjy3
= _mm_setzero_pd();
1673 fjz3
= _mm_setzero_pd();
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 r00
= _mm_mul_pd(rsq00
,rinv00
);
1681 /* Calculate table index by multiplying r with table scale and truncate to integer */
1682 rt
= _mm_mul_pd(r00
,vftabscale
);
1683 vfitab
= _mm_cvttpd_epi32(rt
);
1684 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1685 vfitab
= _mm_slli_epi32(vfitab
,3);
1687 /* CUBIC SPLINE TABLE DISPERSION */
1688 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1689 F
= _mm_setzero_pd();
1690 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1691 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1692 H
= _mm_setzero_pd();
1693 GMX_MM_TRANSPOSE2_PD(G
,H
);
1694 Heps
= _mm_mul_pd(vfeps
,H
);
1695 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1696 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1697 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1699 /* CUBIC SPLINE TABLE REPULSION */
1700 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1701 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1702 F
= _mm_setzero_pd();
1703 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1704 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1705 H
= _mm_setzero_pd();
1706 GMX_MM_TRANSPOSE2_PD(G
,H
);
1707 Heps
= _mm_mul_pd(vfeps
,H
);
1708 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1709 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1710 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1711 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1715 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1717 /* Calculate temporary vectorial force */
1718 tx
= _mm_mul_pd(fscal
,dx00
);
1719 ty
= _mm_mul_pd(fscal
,dy00
);
1720 tz
= _mm_mul_pd(fscal
,dz00
);
1722 /* Update vectorial force */
1723 fix0
= _mm_add_pd(fix0
,tx
);
1724 fiy0
= _mm_add_pd(fiy0
,ty
);
1725 fiz0
= _mm_add_pd(fiz0
,tz
);
1727 fjx0
= _mm_add_pd(fjx0
,tx
);
1728 fjy0
= _mm_add_pd(fjy0
,ty
);
1729 fjz0
= _mm_add_pd(fjz0
,tz
);
1731 /**************************
1732 * CALCULATE INTERACTIONS *
1733 **************************/
1735 /* COULOMB ELECTROSTATICS */
1736 velec
= _mm_mul_pd(qq11
,rinv11
);
1737 felec
= _mm_mul_pd(velec
,rinvsq11
);
1741 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1743 /* Calculate temporary vectorial force */
1744 tx
= _mm_mul_pd(fscal
,dx11
);
1745 ty
= _mm_mul_pd(fscal
,dy11
);
1746 tz
= _mm_mul_pd(fscal
,dz11
);
1748 /* Update vectorial force */
1749 fix1
= _mm_add_pd(fix1
,tx
);
1750 fiy1
= _mm_add_pd(fiy1
,ty
);
1751 fiz1
= _mm_add_pd(fiz1
,tz
);
1753 fjx1
= _mm_add_pd(fjx1
,tx
);
1754 fjy1
= _mm_add_pd(fjy1
,ty
);
1755 fjz1
= _mm_add_pd(fjz1
,tz
);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 /* COULOMB ELECTROSTATICS */
1762 velec
= _mm_mul_pd(qq12
,rinv12
);
1763 felec
= _mm_mul_pd(velec
,rinvsq12
);
1767 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1769 /* Calculate temporary vectorial force */
1770 tx
= _mm_mul_pd(fscal
,dx12
);
1771 ty
= _mm_mul_pd(fscal
,dy12
);
1772 tz
= _mm_mul_pd(fscal
,dz12
);
1774 /* Update vectorial force */
1775 fix1
= _mm_add_pd(fix1
,tx
);
1776 fiy1
= _mm_add_pd(fiy1
,ty
);
1777 fiz1
= _mm_add_pd(fiz1
,tz
);
1779 fjx2
= _mm_add_pd(fjx2
,tx
);
1780 fjy2
= _mm_add_pd(fjy2
,ty
);
1781 fjz2
= _mm_add_pd(fjz2
,tz
);
1783 /**************************
1784 * CALCULATE INTERACTIONS *
1785 **************************/
1787 /* COULOMB ELECTROSTATICS */
1788 velec
= _mm_mul_pd(qq13
,rinv13
);
1789 felec
= _mm_mul_pd(velec
,rinvsq13
);
1793 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1795 /* Calculate temporary vectorial force */
1796 tx
= _mm_mul_pd(fscal
,dx13
);
1797 ty
= _mm_mul_pd(fscal
,dy13
);
1798 tz
= _mm_mul_pd(fscal
,dz13
);
1800 /* Update vectorial force */
1801 fix1
= _mm_add_pd(fix1
,tx
);
1802 fiy1
= _mm_add_pd(fiy1
,ty
);
1803 fiz1
= _mm_add_pd(fiz1
,tz
);
1805 fjx3
= _mm_add_pd(fjx3
,tx
);
1806 fjy3
= _mm_add_pd(fjy3
,ty
);
1807 fjz3
= _mm_add_pd(fjz3
,tz
);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 /* COULOMB ELECTROSTATICS */
1814 velec
= _mm_mul_pd(qq21
,rinv21
);
1815 felec
= _mm_mul_pd(velec
,rinvsq21
);
1819 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1821 /* Calculate temporary vectorial force */
1822 tx
= _mm_mul_pd(fscal
,dx21
);
1823 ty
= _mm_mul_pd(fscal
,dy21
);
1824 tz
= _mm_mul_pd(fscal
,dz21
);
1826 /* Update vectorial force */
1827 fix2
= _mm_add_pd(fix2
,tx
);
1828 fiy2
= _mm_add_pd(fiy2
,ty
);
1829 fiz2
= _mm_add_pd(fiz2
,tz
);
1831 fjx1
= _mm_add_pd(fjx1
,tx
);
1832 fjy1
= _mm_add_pd(fjy1
,ty
);
1833 fjz1
= _mm_add_pd(fjz1
,tz
);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 /* COULOMB ELECTROSTATICS */
1840 velec
= _mm_mul_pd(qq22
,rinv22
);
1841 felec
= _mm_mul_pd(velec
,rinvsq22
);
1845 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1847 /* Calculate temporary vectorial force */
1848 tx
= _mm_mul_pd(fscal
,dx22
);
1849 ty
= _mm_mul_pd(fscal
,dy22
);
1850 tz
= _mm_mul_pd(fscal
,dz22
);
1852 /* Update vectorial force */
1853 fix2
= _mm_add_pd(fix2
,tx
);
1854 fiy2
= _mm_add_pd(fiy2
,ty
);
1855 fiz2
= _mm_add_pd(fiz2
,tz
);
1857 fjx2
= _mm_add_pd(fjx2
,tx
);
1858 fjy2
= _mm_add_pd(fjy2
,ty
);
1859 fjz2
= _mm_add_pd(fjz2
,tz
);
1861 /**************************
1862 * CALCULATE INTERACTIONS *
1863 **************************/
1865 /* COULOMB ELECTROSTATICS */
1866 velec
= _mm_mul_pd(qq23
,rinv23
);
1867 felec
= _mm_mul_pd(velec
,rinvsq23
);
1871 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1873 /* Calculate temporary vectorial force */
1874 tx
= _mm_mul_pd(fscal
,dx23
);
1875 ty
= _mm_mul_pd(fscal
,dy23
);
1876 tz
= _mm_mul_pd(fscal
,dz23
);
1878 /* Update vectorial force */
1879 fix2
= _mm_add_pd(fix2
,tx
);
1880 fiy2
= _mm_add_pd(fiy2
,ty
);
1881 fiz2
= _mm_add_pd(fiz2
,tz
);
1883 fjx3
= _mm_add_pd(fjx3
,tx
);
1884 fjy3
= _mm_add_pd(fjy3
,ty
);
1885 fjz3
= _mm_add_pd(fjz3
,tz
);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 /* COULOMB ELECTROSTATICS */
1892 velec
= _mm_mul_pd(qq31
,rinv31
);
1893 felec
= _mm_mul_pd(velec
,rinvsq31
);
1897 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1899 /* Calculate temporary vectorial force */
1900 tx
= _mm_mul_pd(fscal
,dx31
);
1901 ty
= _mm_mul_pd(fscal
,dy31
);
1902 tz
= _mm_mul_pd(fscal
,dz31
);
1904 /* Update vectorial force */
1905 fix3
= _mm_add_pd(fix3
,tx
);
1906 fiy3
= _mm_add_pd(fiy3
,ty
);
1907 fiz3
= _mm_add_pd(fiz3
,tz
);
1909 fjx1
= _mm_add_pd(fjx1
,tx
);
1910 fjy1
= _mm_add_pd(fjy1
,ty
);
1911 fjz1
= _mm_add_pd(fjz1
,tz
);
1913 /**************************
1914 * CALCULATE INTERACTIONS *
1915 **************************/
1917 /* COULOMB ELECTROSTATICS */
1918 velec
= _mm_mul_pd(qq32
,rinv32
);
1919 felec
= _mm_mul_pd(velec
,rinvsq32
);
1923 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1925 /* Calculate temporary vectorial force */
1926 tx
= _mm_mul_pd(fscal
,dx32
);
1927 ty
= _mm_mul_pd(fscal
,dy32
);
1928 tz
= _mm_mul_pd(fscal
,dz32
);
1930 /* Update vectorial force */
1931 fix3
= _mm_add_pd(fix3
,tx
);
1932 fiy3
= _mm_add_pd(fiy3
,ty
);
1933 fiz3
= _mm_add_pd(fiz3
,tz
);
1935 fjx2
= _mm_add_pd(fjx2
,tx
);
1936 fjy2
= _mm_add_pd(fjy2
,ty
);
1937 fjz2
= _mm_add_pd(fjz2
,tz
);
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 /* COULOMB ELECTROSTATICS */
1944 velec
= _mm_mul_pd(qq33
,rinv33
);
1945 felec
= _mm_mul_pd(velec
,rinvsq33
);
1949 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1951 /* Calculate temporary vectorial force */
1952 tx
= _mm_mul_pd(fscal
,dx33
);
1953 ty
= _mm_mul_pd(fscal
,dy33
);
1954 tz
= _mm_mul_pd(fscal
,dz33
);
1956 /* Update vectorial force */
1957 fix3
= _mm_add_pd(fix3
,tx
);
1958 fiy3
= _mm_add_pd(fiy3
,ty
);
1959 fiz3
= _mm_add_pd(fiz3
,tz
);
1961 fjx3
= _mm_add_pd(fjx3
,tx
);
1962 fjy3
= _mm_add_pd(fjy3
,ty
);
1963 fjz3
= _mm_add_pd(fjz3
,tz
);
1965 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1967 /* Inner loop uses 294 flops */
1970 /* End of innermost loop */
1972 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1973 f
+i_coord_offset
,fshift
+i_shift_offset
);
1975 /* Increment number of inner iterations */
1976 inneriter
+= j_index_end
- j_index_start
;
1978 /* Outer loop uses 24 flops */
1981 /* Increment number of outer iterations */
1984 /* Update outer/inner flops */
1986 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*294);