/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
 */
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
84 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
90 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
91 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
92 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
93 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
94 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
95 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
96 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
97 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
98 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
99 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
100 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
101 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
102 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
103 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
104 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
107 __m128 ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
108 __m128 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
110 __m128 dummy_mask
,cutoff_mask
;
111 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112 __m128 one
= _mm_set1_ps(1.0);
113 __m128 two
= _mm_set1_ps(2.0);
119 jindex
= nlist
->jindex
;
121 shiftidx
= nlist
->shift
;
123 shiftvec
= fr
->shift_vec
[0];
124 fshift
= fr
->fshift
[0];
125 facel
= _mm_set1_ps(fr
->epsfac
);
126 charge
= mdatoms
->chargeA
;
128 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
129 beta
= _mm_set1_ps(fr
->ic
->ewaldcoeff_q
);
130 beta2
= _mm_mul_ps(beta
,beta
);
131 beta3
= _mm_mul_ps(beta
,beta2
);
132 ewtab
= fr
->ic
->tabq_coul_FDV0
;
133 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
134 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
136 /* Setup water-specific parameters */
137 inr
= nlist
->iinr
[0];
138 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
139 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
140 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
142 jq1
= _mm_set1_ps(charge
[inr
+1]);
143 jq2
= _mm_set1_ps(charge
[inr
+2]);
144 jq3
= _mm_set1_ps(charge
[inr
+3]);
145 qq11
= _mm_mul_ps(iq1
,jq1
);
146 qq12
= _mm_mul_ps(iq1
,jq2
);
147 qq13
= _mm_mul_ps(iq1
,jq3
);
148 qq21
= _mm_mul_ps(iq2
,jq1
);
149 qq22
= _mm_mul_ps(iq2
,jq2
);
150 qq23
= _mm_mul_ps(iq2
,jq3
);
151 qq31
= _mm_mul_ps(iq3
,jq1
);
152 qq32
= _mm_mul_ps(iq3
,jq2
);
153 qq33
= _mm_mul_ps(iq3
,jq3
);
155 /* Avoid stupid compiler warnings */
156 jnrA
= jnrB
= jnrC
= jnrD
= 0;
165 for(iidx
=0;iidx
<4*DIM
;iidx
++)
170 /* Start outer loop over neighborlists */
171 for(iidx
=0; iidx
<nri
; iidx
++)
173 /* Load shift vector for this list */
174 i_shift_offset
= DIM
*shiftidx
[iidx
];
176 /* Load limits for loop over neighbors */
177 j_index_start
= jindex
[iidx
];
178 j_index_end
= jindex
[iidx
+1];
180 /* Get outer coordinate index */
182 i_coord_offset
= DIM
*inr
;
184 /* Load i particle coords and add shift vector */
185 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
186 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
188 fix1
= _mm_setzero_ps();
189 fiy1
= _mm_setzero_ps();
190 fiz1
= _mm_setzero_ps();
191 fix2
= _mm_setzero_ps();
192 fiy2
= _mm_setzero_ps();
193 fiz2
= _mm_setzero_ps();
194 fix3
= _mm_setzero_ps();
195 fiy3
= _mm_setzero_ps();
196 fiz3
= _mm_setzero_ps();
198 /* Reset potential sums */
199 velecsum
= _mm_setzero_ps();
201 /* Start inner kernel loop */
202 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
205 /* Get j neighbor index, and coordinate index */
210 j_coord_offsetA
= DIM
*jnrA
;
211 j_coord_offsetB
= DIM
*jnrB
;
212 j_coord_offsetC
= DIM
*jnrC
;
213 j_coord_offsetD
= DIM
*jnrD
;
215 /* load j atom coordinates */
216 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
217 x
+j_coord_offsetC
+DIM
,x
+j_coord_offsetD
+DIM
,
218 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
220 /* Calculate displacement vector */
221 dx11
= _mm_sub_ps(ix1
,jx1
);
222 dy11
= _mm_sub_ps(iy1
,jy1
);
223 dz11
= _mm_sub_ps(iz1
,jz1
);
224 dx12
= _mm_sub_ps(ix1
,jx2
);
225 dy12
= _mm_sub_ps(iy1
,jy2
);
226 dz12
= _mm_sub_ps(iz1
,jz2
);
227 dx13
= _mm_sub_ps(ix1
,jx3
);
228 dy13
= _mm_sub_ps(iy1
,jy3
);
229 dz13
= _mm_sub_ps(iz1
,jz3
);
230 dx21
= _mm_sub_ps(ix2
,jx1
);
231 dy21
= _mm_sub_ps(iy2
,jy1
);
232 dz21
= _mm_sub_ps(iz2
,jz1
);
233 dx22
= _mm_sub_ps(ix2
,jx2
);
234 dy22
= _mm_sub_ps(iy2
,jy2
);
235 dz22
= _mm_sub_ps(iz2
,jz2
);
236 dx23
= _mm_sub_ps(ix2
,jx3
);
237 dy23
= _mm_sub_ps(iy2
,jy3
);
238 dz23
= _mm_sub_ps(iz2
,jz3
);
239 dx31
= _mm_sub_ps(ix3
,jx1
);
240 dy31
= _mm_sub_ps(iy3
,jy1
);
241 dz31
= _mm_sub_ps(iz3
,jz1
);
242 dx32
= _mm_sub_ps(ix3
,jx2
);
243 dy32
= _mm_sub_ps(iy3
,jy2
);
244 dz32
= _mm_sub_ps(iz3
,jz2
);
245 dx33
= _mm_sub_ps(ix3
,jx3
);
246 dy33
= _mm_sub_ps(iy3
,jy3
);
247 dz33
= _mm_sub_ps(iz3
,jz3
);
249 /* Calculate squared distance and things based on it */
250 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
251 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
252 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
253 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
254 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
255 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
256 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
257 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
258 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
260 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
261 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
262 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
263 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
264 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
265 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
266 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
267 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
268 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
270 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
271 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
272 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
273 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
274 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
275 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
276 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
277 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
278 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
280 fjx1
= _mm_setzero_ps();
281 fjy1
= _mm_setzero_ps();
282 fjz1
= _mm_setzero_ps();
283 fjx2
= _mm_setzero_ps();
284 fjy2
= _mm_setzero_ps();
285 fjz2
= _mm_setzero_ps();
286 fjx3
= _mm_setzero_ps();
287 fjy3
= _mm_setzero_ps();
288 fjz3
= _mm_setzero_ps();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 r11
= _mm_mul_ps(rsq11
,rinv11
);
296 /* EWALD ELECTROSTATICS */
298 /* Analytical PME correction */
299 zeta2
= _mm_mul_ps(beta2
,rsq11
);
300 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
301 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
302 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
303 felec
= _mm_mul_ps(qq11
,felec
);
304 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
305 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv11
);
306 velec
= _mm_mul_ps(qq11
,velec
);
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velecsum
= _mm_add_ps(velecsum
,velec
);
313 /* Update vectorial force */
314 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
315 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
316 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
318 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
319 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
320 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 r12
= _mm_mul_ps(rsq12
,rinv12
);
328 /* EWALD ELECTROSTATICS */
330 /* Analytical PME correction */
331 zeta2
= _mm_mul_ps(beta2
,rsq12
);
332 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
333 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
334 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
335 felec
= _mm_mul_ps(qq12
,felec
);
336 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
337 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv12
);
338 velec
= _mm_mul_ps(qq12
,velec
);
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velecsum
= _mm_add_ps(velecsum
,velec
);
345 /* Update vectorial force */
346 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
347 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
348 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
350 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
351 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
352 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r13
= _mm_mul_ps(rsq13
,rinv13
);
360 /* EWALD ELECTROSTATICS */
362 /* Analytical PME correction */
363 zeta2
= _mm_mul_ps(beta2
,rsq13
);
364 rinv3
= _mm_mul_ps(rinvsq13
,rinv13
);
365 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
366 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
367 felec
= _mm_mul_ps(qq13
,felec
);
368 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
369 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv13
);
370 velec
= _mm_mul_ps(qq13
,velec
);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velecsum
= _mm_add_ps(velecsum
,velec
);
377 /* Update vectorial force */
378 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
379 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
380 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
382 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
383 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
384 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 r21
= _mm_mul_ps(rsq21
,rinv21
);
392 /* EWALD ELECTROSTATICS */
394 /* Analytical PME correction */
395 zeta2
= _mm_mul_ps(beta2
,rsq21
);
396 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
397 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
398 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
399 felec
= _mm_mul_ps(qq21
,felec
);
400 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
401 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv21
);
402 velec
= _mm_mul_ps(qq21
,velec
);
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velecsum
= _mm_add_ps(velecsum
,velec
);
409 /* Update vectorial force */
410 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
411 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
412 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
414 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
415 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
416 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r22
= _mm_mul_ps(rsq22
,rinv22
);
424 /* EWALD ELECTROSTATICS */
426 /* Analytical PME correction */
427 zeta2
= _mm_mul_ps(beta2
,rsq22
);
428 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
429 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
430 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
431 felec
= _mm_mul_ps(qq22
,felec
);
432 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
433 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv22
);
434 velec
= _mm_mul_ps(qq22
,velec
);
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velecsum
= _mm_add_ps(velecsum
,velec
);
441 /* Update vectorial force */
442 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
443 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
444 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
446 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
447 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
448 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 r23
= _mm_mul_ps(rsq23
,rinv23
);
456 /* EWALD ELECTROSTATICS */
458 /* Analytical PME correction */
459 zeta2
= _mm_mul_ps(beta2
,rsq23
);
460 rinv3
= _mm_mul_ps(rinvsq23
,rinv23
);
461 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
462 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
463 felec
= _mm_mul_ps(qq23
,felec
);
464 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
465 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv23
);
466 velec
= _mm_mul_ps(qq23
,velec
);
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum
= _mm_add_ps(velecsum
,velec
);
473 /* Update vectorial force */
474 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
475 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
476 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
478 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
479 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
480 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 r31
= _mm_mul_ps(rsq31
,rinv31
);
488 /* EWALD ELECTROSTATICS */
490 /* Analytical PME correction */
491 zeta2
= _mm_mul_ps(beta2
,rsq31
);
492 rinv3
= _mm_mul_ps(rinvsq31
,rinv31
);
493 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
494 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
495 felec
= _mm_mul_ps(qq31
,felec
);
496 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
497 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv31
);
498 velec
= _mm_mul_ps(qq31
,velec
);
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velecsum
= _mm_add_ps(velecsum
,velec
);
505 /* Update vectorial force */
506 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
507 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
508 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
510 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
511 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
512 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
514 /**************************
515 * CALCULATE INTERACTIONS *
516 **************************/
518 r32
= _mm_mul_ps(rsq32
,rinv32
);
520 /* EWALD ELECTROSTATICS */
522 /* Analytical PME correction */
523 zeta2
= _mm_mul_ps(beta2
,rsq32
);
524 rinv3
= _mm_mul_ps(rinvsq32
,rinv32
);
525 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
526 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
527 felec
= _mm_mul_ps(qq32
,felec
);
528 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
529 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv32
);
530 velec
= _mm_mul_ps(qq32
,velec
);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velecsum
= _mm_add_ps(velecsum
,velec
);
537 /* Update vectorial force */
538 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
539 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
540 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
542 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
543 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
544 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 r33
= _mm_mul_ps(rsq33
,rinv33
);
552 /* EWALD ELECTROSTATICS */
554 /* Analytical PME correction */
555 zeta2
= _mm_mul_ps(beta2
,rsq33
);
556 rinv3
= _mm_mul_ps(rinvsq33
,rinv33
);
557 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
558 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
559 felec
= _mm_mul_ps(qq33
,felec
);
560 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
561 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv33
);
562 velec
= _mm_mul_ps(qq33
,velec
);
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 velecsum
= _mm_add_ps(velecsum
,velec
);
569 /* Update vectorial force */
570 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
571 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
572 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
574 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
575 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
576 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
578 fjptrA
= f
+j_coord_offsetA
;
579 fjptrB
= f
+j_coord_offsetB
;
580 fjptrC
= f
+j_coord_offsetC
;
581 fjptrD
= f
+j_coord_offsetD
;
583 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
+DIM
,fjptrB
+DIM
,fjptrC
+DIM
,fjptrD
+DIM
,
584 fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
586 /* Inner loop uses 261 flops */
592 /* Get j neighbor index, and coordinate index */
593 jnrlistA
= jjnr
[jidx
];
594 jnrlistB
= jjnr
[jidx
+1];
595 jnrlistC
= jjnr
[jidx
+2];
596 jnrlistD
= jjnr
[jidx
+3];
597 /* Sign of each element will be negative for non-real atoms.
598 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
599 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
601 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
602 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
603 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
604 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
605 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
606 j_coord_offsetA
= DIM
*jnrA
;
607 j_coord_offsetB
= DIM
*jnrB
;
608 j_coord_offsetC
= DIM
*jnrC
;
609 j_coord_offsetD
= DIM
*jnrD
;
611 /* load j atom coordinates */
612 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
613 x
+j_coord_offsetC
+DIM
,x
+j_coord_offsetD
+DIM
,
614 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
616 /* Calculate displacement vector */
617 dx11
= _mm_sub_ps(ix1
,jx1
);
618 dy11
= _mm_sub_ps(iy1
,jy1
);
619 dz11
= _mm_sub_ps(iz1
,jz1
);
620 dx12
= _mm_sub_ps(ix1
,jx2
);
621 dy12
= _mm_sub_ps(iy1
,jy2
);
622 dz12
= _mm_sub_ps(iz1
,jz2
);
623 dx13
= _mm_sub_ps(ix1
,jx3
);
624 dy13
= _mm_sub_ps(iy1
,jy3
);
625 dz13
= _mm_sub_ps(iz1
,jz3
);
626 dx21
= _mm_sub_ps(ix2
,jx1
);
627 dy21
= _mm_sub_ps(iy2
,jy1
);
628 dz21
= _mm_sub_ps(iz2
,jz1
);
629 dx22
= _mm_sub_ps(ix2
,jx2
);
630 dy22
= _mm_sub_ps(iy2
,jy2
);
631 dz22
= _mm_sub_ps(iz2
,jz2
);
632 dx23
= _mm_sub_ps(ix2
,jx3
);
633 dy23
= _mm_sub_ps(iy2
,jy3
);
634 dz23
= _mm_sub_ps(iz2
,jz3
);
635 dx31
= _mm_sub_ps(ix3
,jx1
);
636 dy31
= _mm_sub_ps(iy3
,jy1
);
637 dz31
= _mm_sub_ps(iz3
,jz1
);
638 dx32
= _mm_sub_ps(ix3
,jx2
);
639 dy32
= _mm_sub_ps(iy3
,jy2
);
640 dz32
= _mm_sub_ps(iz3
,jz2
);
641 dx33
= _mm_sub_ps(ix3
,jx3
);
642 dy33
= _mm_sub_ps(iy3
,jy3
);
643 dz33
= _mm_sub_ps(iz3
,jz3
);
645 /* Calculate squared distance and things based on it */
646 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
647 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
648 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
649 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
650 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
651 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
652 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
653 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
654 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
656 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
657 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
658 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
659 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
660 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
661 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
662 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
663 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
664 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
666 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
667 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
668 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
669 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
670 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
671 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
672 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
673 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
674 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
676 fjx1
= _mm_setzero_ps();
677 fjy1
= _mm_setzero_ps();
678 fjz1
= _mm_setzero_ps();
679 fjx2
= _mm_setzero_ps();
680 fjy2
= _mm_setzero_ps();
681 fjz2
= _mm_setzero_ps();
682 fjx3
= _mm_setzero_ps();
683 fjy3
= _mm_setzero_ps();
684 fjz3
= _mm_setzero_ps();
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 r11
= _mm_mul_ps(rsq11
,rinv11
);
691 r11
= _mm_andnot_ps(dummy_mask
,r11
);
693 /* EWALD ELECTROSTATICS */
695 /* Analytical PME correction */
696 zeta2
= _mm_mul_ps(beta2
,rsq11
);
697 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
698 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
699 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
700 felec
= _mm_mul_ps(qq11
,felec
);
701 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
702 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv11
);
703 velec
= _mm_mul_ps(qq11
,velec
);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec
= _mm_andnot_ps(dummy_mask
,velec
);
707 velecsum
= _mm_add_ps(velecsum
,velec
);
711 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
713 /* Update vectorial force */
714 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
715 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
716 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
718 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
719 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
720 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
722 /**************************
723 * CALCULATE INTERACTIONS *
724 **************************/
726 r12
= _mm_mul_ps(rsq12
,rinv12
);
727 r12
= _mm_andnot_ps(dummy_mask
,r12
);
729 /* EWALD ELECTROSTATICS */
731 /* Analytical PME correction */
732 zeta2
= _mm_mul_ps(beta2
,rsq12
);
733 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
734 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
735 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
736 felec
= _mm_mul_ps(qq12
,felec
);
737 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
738 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv12
);
739 velec
= _mm_mul_ps(qq12
,velec
);
741 /* Update potential sum for this i atom from the interaction with this j atom. */
742 velec
= _mm_andnot_ps(dummy_mask
,velec
);
743 velecsum
= _mm_add_ps(velecsum
,velec
);
747 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
749 /* Update vectorial force */
750 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
751 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
752 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
754 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
755 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
756 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 r13
= _mm_mul_ps(rsq13
,rinv13
);
763 r13
= _mm_andnot_ps(dummy_mask
,r13
);
765 /* EWALD ELECTROSTATICS */
767 /* Analytical PME correction */
768 zeta2
= _mm_mul_ps(beta2
,rsq13
);
769 rinv3
= _mm_mul_ps(rinvsq13
,rinv13
);
770 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
771 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
772 felec
= _mm_mul_ps(qq13
,felec
);
773 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
774 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv13
);
775 velec
= _mm_mul_ps(qq13
,velec
);
777 /* Update potential sum for this i atom from the interaction with this j atom. */
778 velec
= _mm_andnot_ps(dummy_mask
,velec
);
779 velecsum
= _mm_add_ps(velecsum
,velec
);
783 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
785 /* Update vectorial force */
786 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
787 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
788 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
790 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
791 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
792 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 r21
= _mm_mul_ps(rsq21
,rinv21
);
799 r21
= _mm_andnot_ps(dummy_mask
,r21
);
801 /* EWALD ELECTROSTATICS */
803 /* Analytical PME correction */
804 zeta2
= _mm_mul_ps(beta2
,rsq21
);
805 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
806 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
807 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
808 felec
= _mm_mul_ps(qq21
,felec
);
809 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
810 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv21
);
811 velec
= _mm_mul_ps(qq21
,velec
);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec
= _mm_andnot_ps(dummy_mask
,velec
);
815 velecsum
= _mm_add_ps(velecsum
,velec
);
819 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
821 /* Update vectorial force */
822 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
823 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
824 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
826 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
827 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
828 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
830 /**************************
831 * CALCULATE INTERACTIONS *
832 **************************/
834 r22
= _mm_mul_ps(rsq22
,rinv22
);
835 r22
= _mm_andnot_ps(dummy_mask
,r22
);
837 /* EWALD ELECTROSTATICS */
839 /* Analytical PME correction */
840 zeta2
= _mm_mul_ps(beta2
,rsq22
);
841 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
842 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
843 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
844 felec
= _mm_mul_ps(qq22
,felec
);
845 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
846 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv22
);
847 velec
= _mm_mul_ps(qq22
,velec
);
849 /* Update potential sum for this i atom from the interaction with this j atom. */
850 velec
= _mm_andnot_ps(dummy_mask
,velec
);
851 velecsum
= _mm_add_ps(velecsum
,velec
);
855 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
857 /* Update vectorial force */
858 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
859 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
860 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
862 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
863 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
864 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 r23
= _mm_mul_ps(rsq23
,rinv23
);
871 r23
= _mm_andnot_ps(dummy_mask
,r23
);
873 /* EWALD ELECTROSTATICS */
875 /* Analytical PME correction */
876 zeta2
= _mm_mul_ps(beta2
,rsq23
);
877 rinv3
= _mm_mul_ps(rinvsq23
,rinv23
);
878 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
879 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
880 felec
= _mm_mul_ps(qq23
,felec
);
881 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
882 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv23
);
883 velec
= _mm_mul_ps(qq23
,velec
);
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 velec
= _mm_andnot_ps(dummy_mask
,velec
);
887 velecsum
= _mm_add_ps(velecsum
,velec
);
891 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
893 /* Update vectorial force */
894 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
895 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
896 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
898 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
899 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
900 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
902 /**************************
903 * CALCULATE INTERACTIONS *
904 **************************/
906 r31
= _mm_mul_ps(rsq31
,rinv31
);
907 r31
= _mm_andnot_ps(dummy_mask
,r31
);
909 /* EWALD ELECTROSTATICS */
911 /* Analytical PME correction */
912 zeta2
= _mm_mul_ps(beta2
,rsq31
);
913 rinv3
= _mm_mul_ps(rinvsq31
,rinv31
);
914 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
915 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
916 felec
= _mm_mul_ps(qq31
,felec
);
917 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
918 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv31
);
919 velec
= _mm_mul_ps(qq31
,velec
);
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec
= _mm_andnot_ps(dummy_mask
,velec
);
923 velecsum
= _mm_add_ps(velecsum
,velec
);
927 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
929 /* Update vectorial force */
930 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
931 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
932 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
934 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
935 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
936 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
938 /**************************
939 * CALCULATE INTERACTIONS *
940 **************************/
942 r32
= _mm_mul_ps(rsq32
,rinv32
);
943 r32
= _mm_andnot_ps(dummy_mask
,r32
);
945 /* EWALD ELECTROSTATICS */
947 /* Analytical PME correction */
948 zeta2
= _mm_mul_ps(beta2
,rsq32
);
949 rinv3
= _mm_mul_ps(rinvsq32
,rinv32
);
950 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
951 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
952 felec
= _mm_mul_ps(qq32
,felec
);
953 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
954 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv32
);
955 velec
= _mm_mul_ps(qq32
,velec
);
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 velec
= _mm_andnot_ps(dummy_mask
,velec
);
959 velecsum
= _mm_add_ps(velecsum
,velec
);
963 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
965 /* Update vectorial force */
966 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
967 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
968 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
970 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
971 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
972 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r33
= _mm_mul_ps(rsq33
,rinv33
);
979 r33
= _mm_andnot_ps(dummy_mask
,r33
);
981 /* EWALD ELECTROSTATICS */
983 /* Analytical PME correction */
984 zeta2
= _mm_mul_ps(beta2
,rsq33
);
985 rinv3
= _mm_mul_ps(rinvsq33
,rinv33
);
986 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
987 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
988 felec
= _mm_mul_ps(qq33
,felec
);
989 pmecorrV
= gmx_mm_pmecorrV_ps(zeta2
);
990 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv33
);
991 velec
= _mm_mul_ps(qq33
,velec
);
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec
= _mm_andnot_ps(dummy_mask
,velec
);
995 velecsum
= _mm_add_ps(velecsum
,velec
);
999 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1001 /* Update vectorial force */
1002 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1003 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1004 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1006 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1007 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1008 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1010 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1011 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1012 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1013 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1015 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
+DIM
,fjptrB
+DIM
,fjptrC
+DIM
,fjptrD
+DIM
,
1016 fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1018 /* Inner loop uses 270 flops */
1021 /* End of innermost loop */
1023 gmx_mm_update_iforce_3atom_swizzle_ps(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1024 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1027 /* Update potential energies */
1028 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1030 /* Increment number of inner iterations */
1031 inneriter
+= j_index_end
- j_index_start
;
1033 /* Outer loop uses 19 flops */
1036 /* Increment number of outer iterations */
1039 /* Update outer/inner flops */
1041 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4W4_VF
,outeriter
*19 + inneriter
*270);
1044 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_single
1045 * Electrostatics interaction: Ewald
1046 * VdW interaction: None
1047 * Geometry: Water4-Water4
1048 * Calculate force/pot: Force
1051 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_128_fma_single
1052 (t_nblist
* gmx_restrict nlist
,
1053 rvec
* gmx_restrict xx
,
1054 rvec
* gmx_restrict ff
,
1055 t_forcerec
* gmx_restrict fr
,
1056 t_mdatoms
* gmx_restrict mdatoms
,
1057 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1058 t_nrnb
* gmx_restrict nrnb
)
1060 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1061 * just 0 for non-waters.
1062 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1063 * jnr indices corresponding to data put in the four positions in the SIMD register.
1065 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1066 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1067 int jnrA
,jnrB
,jnrC
,jnrD
;
1068 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1069 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1070 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1071 real rcutoff_scalar
;
1072 real
*shiftvec
,*fshift
,*x
,*f
;
1073 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1074 real scratch
[4*DIM
];
1075 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
1077 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1079 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1081 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1082 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1083 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1084 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1085 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1086 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1087 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1088 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1089 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1090 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1091 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1092 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1093 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1094 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1095 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1096 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1097 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1100 __m128 ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1101 __m128 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1103 __m128 dummy_mask
,cutoff_mask
;
1104 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1105 __m128 one
= _mm_set1_ps(1.0);
1106 __m128 two
= _mm_set1_ps(2.0);
1112 jindex
= nlist
->jindex
;
1114 shiftidx
= nlist
->shift
;
1116 shiftvec
= fr
->shift_vec
[0];
1117 fshift
= fr
->fshift
[0];
1118 facel
= _mm_set1_ps(fr
->epsfac
);
1119 charge
= mdatoms
->chargeA
;
1121 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1122 beta
= _mm_set1_ps(fr
->ic
->ewaldcoeff_q
);
1123 beta2
= _mm_mul_ps(beta
,beta
);
1124 beta3
= _mm_mul_ps(beta
,beta2
);
1125 ewtab
= fr
->ic
->tabq_coul_F
;
1126 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1127 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1129 /* Setup water-specific parameters */
1130 inr
= nlist
->iinr
[0];
1131 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1132 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1133 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1135 jq1
= _mm_set1_ps(charge
[inr
+1]);
1136 jq2
= _mm_set1_ps(charge
[inr
+2]);
1137 jq3
= _mm_set1_ps(charge
[inr
+3]);
1138 qq11
= _mm_mul_ps(iq1
,jq1
);
1139 qq12
= _mm_mul_ps(iq1
,jq2
);
1140 qq13
= _mm_mul_ps(iq1
,jq3
);
1141 qq21
= _mm_mul_ps(iq2
,jq1
);
1142 qq22
= _mm_mul_ps(iq2
,jq2
);
1143 qq23
= _mm_mul_ps(iq2
,jq3
);
1144 qq31
= _mm_mul_ps(iq3
,jq1
);
1145 qq32
= _mm_mul_ps(iq3
,jq2
);
1146 qq33
= _mm_mul_ps(iq3
,jq3
);
1148 /* Avoid stupid compiler warnings */
1149 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1150 j_coord_offsetA
= 0;
1151 j_coord_offsetB
= 0;
1152 j_coord_offsetC
= 0;
1153 j_coord_offsetD
= 0;
1158 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1160 scratch
[iidx
] = 0.0;
1163 /* Start outer loop over neighborlists */
1164 for(iidx
=0; iidx
<nri
; iidx
++)
1166 /* Load shift vector for this list */
1167 i_shift_offset
= DIM
*shiftidx
[iidx
];
1169 /* Load limits for loop over neighbors */
1170 j_index_start
= jindex
[iidx
];
1171 j_index_end
= jindex
[iidx
+1];
1173 /* Get outer coordinate index */
1175 i_coord_offset
= DIM
*inr
;
1177 /* Load i particle coords and add shift vector */
1178 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
1179 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1181 fix1
= _mm_setzero_ps();
1182 fiy1
= _mm_setzero_ps();
1183 fiz1
= _mm_setzero_ps();
1184 fix2
= _mm_setzero_ps();
1185 fiy2
= _mm_setzero_ps();
1186 fiz2
= _mm_setzero_ps();
1187 fix3
= _mm_setzero_ps();
1188 fiy3
= _mm_setzero_ps();
1189 fiz3
= _mm_setzero_ps();
1191 /* Start inner kernel loop */
1192 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1195 /* Get j neighbor index, and coordinate index */
1197 jnrB
= jjnr
[jidx
+1];
1198 jnrC
= jjnr
[jidx
+2];
1199 jnrD
= jjnr
[jidx
+3];
1200 j_coord_offsetA
= DIM
*jnrA
;
1201 j_coord_offsetB
= DIM
*jnrB
;
1202 j_coord_offsetC
= DIM
*jnrC
;
1203 j_coord_offsetD
= DIM
*jnrD
;
1205 /* load j atom coordinates */
1206 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
1207 x
+j_coord_offsetC
+DIM
,x
+j_coord_offsetD
+DIM
,
1208 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1210 /* Calculate displacement vector */
1211 dx11
= _mm_sub_ps(ix1
,jx1
);
1212 dy11
= _mm_sub_ps(iy1
,jy1
);
1213 dz11
= _mm_sub_ps(iz1
,jz1
);
1214 dx12
= _mm_sub_ps(ix1
,jx2
);
1215 dy12
= _mm_sub_ps(iy1
,jy2
);
1216 dz12
= _mm_sub_ps(iz1
,jz2
);
1217 dx13
= _mm_sub_ps(ix1
,jx3
);
1218 dy13
= _mm_sub_ps(iy1
,jy3
);
1219 dz13
= _mm_sub_ps(iz1
,jz3
);
1220 dx21
= _mm_sub_ps(ix2
,jx1
);
1221 dy21
= _mm_sub_ps(iy2
,jy1
);
1222 dz21
= _mm_sub_ps(iz2
,jz1
);
1223 dx22
= _mm_sub_ps(ix2
,jx2
);
1224 dy22
= _mm_sub_ps(iy2
,jy2
);
1225 dz22
= _mm_sub_ps(iz2
,jz2
);
1226 dx23
= _mm_sub_ps(ix2
,jx3
);
1227 dy23
= _mm_sub_ps(iy2
,jy3
);
1228 dz23
= _mm_sub_ps(iz2
,jz3
);
1229 dx31
= _mm_sub_ps(ix3
,jx1
);
1230 dy31
= _mm_sub_ps(iy3
,jy1
);
1231 dz31
= _mm_sub_ps(iz3
,jz1
);
1232 dx32
= _mm_sub_ps(ix3
,jx2
);
1233 dy32
= _mm_sub_ps(iy3
,jy2
);
1234 dz32
= _mm_sub_ps(iz3
,jz2
);
1235 dx33
= _mm_sub_ps(ix3
,jx3
);
1236 dy33
= _mm_sub_ps(iy3
,jy3
);
1237 dz33
= _mm_sub_ps(iz3
,jz3
);
1239 /* Calculate squared distance and things based on it */
1240 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1241 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1242 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1243 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1244 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1245 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1246 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1247 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1248 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1250 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1251 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1252 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1253 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1254 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1255 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1256 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1257 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1258 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1260 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1261 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1262 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1263 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1264 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1265 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1266 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1267 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1268 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1270 fjx1
= _mm_setzero_ps();
1271 fjy1
= _mm_setzero_ps();
1272 fjz1
= _mm_setzero_ps();
1273 fjx2
= _mm_setzero_ps();
1274 fjy2
= _mm_setzero_ps();
1275 fjz2
= _mm_setzero_ps();
1276 fjx3
= _mm_setzero_ps();
1277 fjy3
= _mm_setzero_ps();
1278 fjz3
= _mm_setzero_ps();
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 r11
= _mm_mul_ps(rsq11
,rinv11
);
1286 /* EWALD ELECTROSTATICS */
1288 /* Analytical PME correction */
1289 zeta2
= _mm_mul_ps(beta2
,rsq11
);
1290 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
1291 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1292 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1293 felec
= _mm_mul_ps(qq11
,felec
);
1297 /* Update vectorial force */
1298 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1299 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1300 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1302 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1303 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1304 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1306 /**************************
1307 * CALCULATE INTERACTIONS *
1308 **************************/
1310 r12
= _mm_mul_ps(rsq12
,rinv12
);
1312 /* EWALD ELECTROSTATICS */
1314 /* Analytical PME correction */
1315 zeta2
= _mm_mul_ps(beta2
,rsq12
);
1316 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
1317 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1318 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1319 felec
= _mm_mul_ps(qq12
,felec
);
1323 /* Update vectorial force */
1324 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1325 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1326 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1328 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1329 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1330 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 r13
= _mm_mul_ps(rsq13
,rinv13
);
1338 /* EWALD ELECTROSTATICS */
1340 /* Analytical PME correction */
1341 zeta2
= _mm_mul_ps(beta2
,rsq13
);
1342 rinv3
= _mm_mul_ps(rinvsq13
,rinv13
);
1343 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1344 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1345 felec
= _mm_mul_ps(qq13
,felec
);
1349 /* Update vectorial force */
1350 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1351 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1352 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1354 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1355 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1356 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1358 /**************************
1359 * CALCULATE INTERACTIONS *
1360 **************************/
1362 r21
= _mm_mul_ps(rsq21
,rinv21
);
1364 /* EWALD ELECTROSTATICS */
1366 /* Analytical PME correction */
1367 zeta2
= _mm_mul_ps(beta2
,rsq21
);
1368 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
1369 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1370 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1371 felec
= _mm_mul_ps(qq21
,felec
);
1375 /* Update vectorial force */
1376 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1377 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1378 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1380 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1381 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1382 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1384 /**************************
1385 * CALCULATE INTERACTIONS *
1386 **************************/
1388 r22
= _mm_mul_ps(rsq22
,rinv22
);
1390 /* EWALD ELECTROSTATICS */
1392 /* Analytical PME correction */
1393 zeta2
= _mm_mul_ps(beta2
,rsq22
);
1394 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
1395 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1396 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1397 felec
= _mm_mul_ps(qq22
,felec
);
1401 /* Update vectorial force */
1402 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1403 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1404 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1406 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1407 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1408 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1410 /**************************
1411 * CALCULATE INTERACTIONS *
1412 **************************/
1414 r23
= _mm_mul_ps(rsq23
,rinv23
);
1416 /* EWALD ELECTROSTATICS */
1418 /* Analytical PME correction */
1419 zeta2
= _mm_mul_ps(beta2
,rsq23
);
1420 rinv3
= _mm_mul_ps(rinvsq23
,rinv23
);
1421 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1422 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1423 felec
= _mm_mul_ps(qq23
,felec
);
1427 /* Update vectorial force */
1428 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1429 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1430 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1432 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1433 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1434 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1436 /**************************
1437 * CALCULATE INTERACTIONS *
1438 **************************/
1440 r31
= _mm_mul_ps(rsq31
,rinv31
);
1442 /* EWALD ELECTROSTATICS */
1444 /* Analytical PME correction */
1445 zeta2
= _mm_mul_ps(beta2
,rsq31
);
1446 rinv3
= _mm_mul_ps(rinvsq31
,rinv31
);
1447 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1448 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1449 felec
= _mm_mul_ps(qq31
,felec
);
1453 /* Update vectorial force */
1454 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1455 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1456 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1458 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1459 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1460 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1462 /**************************
1463 * CALCULATE INTERACTIONS *
1464 **************************/
1466 r32
= _mm_mul_ps(rsq32
,rinv32
);
1468 /* EWALD ELECTROSTATICS */
1470 /* Analytical PME correction */
1471 zeta2
= _mm_mul_ps(beta2
,rsq32
);
1472 rinv3
= _mm_mul_ps(rinvsq32
,rinv32
);
1473 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1474 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1475 felec
= _mm_mul_ps(qq32
,felec
);
1479 /* Update vectorial force */
1480 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1481 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1482 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1484 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1485 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1486 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 r33
= _mm_mul_ps(rsq33
,rinv33
);
1494 /* EWALD ELECTROSTATICS */
1496 /* Analytical PME correction */
1497 zeta2
= _mm_mul_ps(beta2
,rsq33
);
1498 rinv3
= _mm_mul_ps(rinvsq33
,rinv33
);
1499 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1500 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1501 felec
= _mm_mul_ps(qq33
,felec
);
1505 /* Update vectorial force */
1506 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1507 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1508 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1510 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1511 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1512 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1514 fjptrA
= f
+j_coord_offsetA
;
1515 fjptrB
= f
+j_coord_offsetB
;
1516 fjptrC
= f
+j_coord_offsetC
;
1517 fjptrD
= f
+j_coord_offsetD
;
1519 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
+DIM
,fjptrB
+DIM
,fjptrC
+DIM
,fjptrD
+DIM
,
1520 fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1522 /* Inner loop uses 252 flops */
1525 if(jidx
<j_index_end
)
1528 /* Get j neighbor index, and coordinate index */
1529 jnrlistA
= jjnr
[jidx
];
1530 jnrlistB
= jjnr
[jidx
+1];
1531 jnrlistC
= jjnr
[jidx
+2];
1532 jnrlistD
= jjnr
[jidx
+3];
1533 /* Sign of each element will be negative for non-real atoms.
1534 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1535 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1537 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1538 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1539 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1540 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1541 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1542 j_coord_offsetA
= DIM
*jnrA
;
1543 j_coord_offsetB
= DIM
*jnrB
;
1544 j_coord_offsetC
= DIM
*jnrC
;
1545 j_coord_offsetD
= DIM
*jnrD
;
1547 /* load j atom coordinates */
1548 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
1549 x
+j_coord_offsetC
+DIM
,x
+j_coord_offsetD
+DIM
,
1550 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1552 /* Calculate displacement vector */
1553 dx11
= _mm_sub_ps(ix1
,jx1
);
1554 dy11
= _mm_sub_ps(iy1
,jy1
);
1555 dz11
= _mm_sub_ps(iz1
,jz1
);
1556 dx12
= _mm_sub_ps(ix1
,jx2
);
1557 dy12
= _mm_sub_ps(iy1
,jy2
);
1558 dz12
= _mm_sub_ps(iz1
,jz2
);
1559 dx13
= _mm_sub_ps(ix1
,jx3
);
1560 dy13
= _mm_sub_ps(iy1
,jy3
);
1561 dz13
= _mm_sub_ps(iz1
,jz3
);
1562 dx21
= _mm_sub_ps(ix2
,jx1
);
1563 dy21
= _mm_sub_ps(iy2
,jy1
);
1564 dz21
= _mm_sub_ps(iz2
,jz1
);
1565 dx22
= _mm_sub_ps(ix2
,jx2
);
1566 dy22
= _mm_sub_ps(iy2
,jy2
);
1567 dz22
= _mm_sub_ps(iz2
,jz2
);
1568 dx23
= _mm_sub_ps(ix2
,jx3
);
1569 dy23
= _mm_sub_ps(iy2
,jy3
);
1570 dz23
= _mm_sub_ps(iz2
,jz3
);
1571 dx31
= _mm_sub_ps(ix3
,jx1
);
1572 dy31
= _mm_sub_ps(iy3
,jy1
);
1573 dz31
= _mm_sub_ps(iz3
,jz1
);
1574 dx32
= _mm_sub_ps(ix3
,jx2
);
1575 dy32
= _mm_sub_ps(iy3
,jy2
);
1576 dz32
= _mm_sub_ps(iz3
,jz2
);
1577 dx33
= _mm_sub_ps(ix3
,jx3
);
1578 dy33
= _mm_sub_ps(iy3
,jy3
);
1579 dz33
= _mm_sub_ps(iz3
,jz3
);
1581 /* Calculate squared distance and things based on it */
1582 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1583 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1584 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1585 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1586 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1587 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1588 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1589 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1590 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1592 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1593 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1594 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1595 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1596 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1597 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1598 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1599 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1600 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1602 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1603 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1604 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1605 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1606 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1607 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1608 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1609 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1610 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1612 fjx1
= _mm_setzero_ps();
1613 fjy1
= _mm_setzero_ps();
1614 fjz1
= _mm_setzero_ps();
1615 fjx2
= _mm_setzero_ps();
1616 fjy2
= _mm_setzero_ps();
1617 fjz2
= _mm_setzero_ps();
1618 fjx3
= _mm_setzero_ps();
1619 fjy3
= _mm_setzero_ps();
1620 fjz3
= _mm_setzero_ps();
1622 /**************************
1623 * CALCULATE INTERACTIONS *
1624 **************************/
1626 r11
= _mm_mul_ps(rsq11
,rinv11
);
1627 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1629 /* EWALD ELECTROSTATICS */
1631 /* Analytical PME correction */
1632 zeta2
= _mm_mul_ps(beta2
,rsq11
);
1633 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
1634 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1635 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1636 felec
= _mm_mul_ps(qq11
,felec
);
1640 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1642 /* Update vectorial force */
1643 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1644 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1645 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1647 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1648 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1649 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 r12
= _mm_mul_ps(rsq12
,rinv12
);
1656 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1658 /* EWALD ELECTROSTATICS */
1660 /* Analytical PME correction */
1661 zeta2
= _mm_mul_ps(beta2
,rsq12
);
1662 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
1663 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1664 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1665 felec
= _mm_mul_ps(qq12
,felec
);
1669 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1671 /* Update vectorial force */
1672 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1673 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1674 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1676 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1677 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1678 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1680 /**************************
1681 * CALCULATE INTERACTIONS *
1682 **************************/
1684 r13
= _mm_mul_ps(rsq13
,rinv13
);
1685 r13
= _mm_andnot_ps(dummy_mask
,r13
);
1687 /* EWALD ELECTROSTATICS */
1689 /* Analytical PME correction */
1690 zeta2
= _mm_mul_ps(beta2
,rsq13
);
1691 rinv3
= _mm_mul_ps(rinvsq13
,rinv13
);
1692 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1693 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1694 felec
= _mm_mul_ps(qq13
,felec
);
1698 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1700 /* Update vectorial force */
1701 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1702 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1703 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1705 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1706 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1707 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 r21
= _mm_mul_ps(rsq21
,rinv21
);
1714 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1716 /* EWALD ELECTROSTATICS */
1718 /* Analytical PME correction */
1719 zeta2
= _mm_mul_ps(beta2
,rsq21
);
1720 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
1721 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1722 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1723 felec
= _mm_mul_ps(qq21
,felec
);
1727 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1729 /* Update vectorial force */
1730 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1731 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1732 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1734 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1735 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1736 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 r22
= _mm_mul_ps(rsq22
,rinv22
);
1743 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1745 /* EWALD ELECTROSTATICS */
1747 /* Analytical PME correction */
1748 zeta2
= _mm_mul_ps(beta2
,rsq22
);
1749 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
1750 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1751 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1752 felec
= _mm_mul_ps(qq22
,felec
);
1756 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1758 /* Update vectorial force */
1759 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1760 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1761 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1763 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1764 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1765 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r23
= _mm_mul_ps(rsq23
,rinv23
);
1772 r23
= _mm_andnot_ps(dummy_mask
,r23
);
1774 /* EWALD ELECTROSTATICS */
1776 /* Analytical PME correction */
1777 zeta2
= _mm_mul_ps(beta2
,rsq23
);
1778 rinv3
= _mm_mul_ps(rinvsq23
,rinv23
);
1779 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1780 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1781 felec
= _mm_mul_ps(qq23
,felec
);
1785 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1787 /* Update vectorial force */
1788 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1789 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1790 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1792 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1793 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1794 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1796 /**************************
1797 * CALCULATE INTERACTIONS *
1798 **************************/
1800 r31
= _mm_mul_ps(rsq31
,rinv31
);
1801 r31
= _mm_andnot_ps(dummy_mask
,r31
);
1803 /* EWALD ELECTROSTATICS */
1805 /* Analytical PME correction */
1806 zeta2
= _mm_mul_ps(beta2
,rsq31
);
1807 rinv3
= _mm_mul_ps(rinvsq31
,rinv31
);
1808 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1809 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1810 felec
= _mm_mul_ps(qq31
,felec
);
1814 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1816 /* Update vectorial force */
1817 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1818 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1819 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1821 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1822 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1823 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1825 /**************************
1826 * CALCULATE INTERACTIONS *
1827 **************************/
1829 r32
= _mm_mul_ps(rsq32
,rinv32
);
1830 r32
= _mm_andnot_ps(dummy_mask
,r32
);
1832 /* EWALD ELECTROSTATICS */
1834 /* Analytical PME correction */
1835 zeta2
= _mm_mul_ps(beta2
,rsq32
);
1836 rinv3
= _mm_mul_ps(rinvsq32
,rinv32
);
1837 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1838 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1839 felec
= _mm_mul_ps(qq32
,felec
);
1843 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1845 /* Update vectorial force */
1846 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1847 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1848 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1850 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1851 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1852 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 r33
= _mm_mul_ps(rsq33
,rinv33
);
1859 r33
= _mm_andnot_ps(dummy_mask
,r33
);
1861 /* EWALD ELECTROSTATICS */
1863 /* Analytical PME correction */
1864 zeta2
= _mm_mul_ps(beta2
,rsq33
);
1865 rinv3
= _mm_mul_ps(rinvsq33
,rinv33
);
1866 pmecorrF
= gmx_mm_pmecorrF_ps(zeta2
);
1867 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1868 felec
= _mm_mul_ps(qq33
,felec
);
1872 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1874 /* Update vectorial force */
1875 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1876 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1877 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1879 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1880 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1881 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1883 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1884 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1885 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1886 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1888 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
+DIM
,fjptrB
+DIM
,fjptrC
+DIM
,fjptrD
+DIM
,
1889 fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1891 /* Inner loop uses 261 flops */
1894 /* End of innermost loop */
1896 gmx_mm_update_iforce_3atom_swizzle_ps(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1897 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1899 /* Increment number of inner iterations */
1900 inneriter
+= j_index_end
- j_index_start
;
1902 /* Outer loop uses 18 flops */
1905 /* Increment number of outer iterations */
1908 /* Update outer/inner flops */
1910 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4W4_F
,outeriter
*18 + inneriter
*261);