2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: None
54 * Geometry: Water3-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int j_coord_offsetA
,j_coord_offsetB
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
81 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 int vdwjidx0A
,vdwjidx0B
;
87 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
88 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
89 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
90 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
91 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
93 __m128d dummy_mask
,cutoff_mask
;
94 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
95 __m128d one
= _mm_set1_pd(1.0);
96 __m128d two
= _mm_set1_pd(2.0);
102 jindex
= nlist
->jindex
;
104 shiftidx
= nlist
->shift
;
106 shiftvec
= fr
->shift_vec
[0];
107 fshift
= fr
->fshift
[0];
108 facel
= _mm_set1_pd(fr
->epsfac
);
109 charge
= mdatoms
->chargeA
;
110 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
111 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
112 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
114 /* Setup water-specific parameters */
115 inr
= nlist
->iinr
[0];
116 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
117 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
118 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
120 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
121 rcutoff_scalar
= fr
->rcoulomb
;
122 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
123 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
125 /* Avoid stupid compiler warnings */
133 /* Start outer loop over neighborlists */
134 for(iidx
=0; iidx
<nri
; iidx
++)
136 /* Load shift vector for this list */
137 i_shift_offset
= DIM
*shiftidx
[iidx
];
139 /* Load limits for loop over neighbors */
140 j_index_start
= jindex
[iidx
];
141 j_index_end
= jindex
[iidx
+1];
143 /* Get outer coordinate index */
145 i_coord_offset
= DIM
*inr
;
147 /* Load i particle coords and add shift vector */
148 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
149 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
151 fix0
= _mm_setzero_pd();
152 fiy0
= _mm_setzero_pd();
153 fiz0
= _mm_setzero_pd();
154 fix1
= _mm_setzero_pd();
155 fiy1
= _mm_setzero_pd();
156 fiz1
= _mm_setzero_pd();
157 fix2
= _mm_setzero_pd();
158 fiy2
= _mm_setzero_pd();
159 fiz2
= _mm_setzero_pd();
161 /* Reset potential sums */
162 velecsum
= _mm_setzero_pd();
164 /* Start inner kernel loop */
165 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
168 /* Get j neighbor index, and coordinate index */
171 j_coord_offsetA
= DIM
*jnrA
;
172 j_coord_offsetB
= DIM
*jnrB
;
174 /* load j atom coordinates */
175 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
178 /* Calculate displacement vector */
179 dx00
= _mm_sub_pd(ix0
,jx0
);
180 dy00
= _mm_sub_pd(iy0
,jy0
);
181 dz00
= _mm_sub_pd(iz0
,jz0
);
182 dx10
= _mm_sub_pd(ix1
,jx0
);
183 dy10
= _mm_sub_pd(iy1
,jy0
);
184 dz10
= _mm_sub_pd(iz1
,jz0
);
185 dx20
= _mm_sub_pd(ix2
,jx0
);
186 dy20
= _mm_sub_pd(iy2
,jy0
);
187 dz20
= _mm_sub_pd(iz2
,jz0
);
189 /* Calculate squared distance and things based on it */
190 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
191 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
192 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
194 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
195 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
196 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
198 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
199 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
200 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
202 /* Load parameters for j particles */
203 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
205 fjx0
= _mm_setzero_pd();
206 fjy0
= _mm_setzero_pd();
207 fjz0
= _mm_setzero_pd();
209 /**************************
210 * CALCULATE INTERACTIONS *
211 **************************/
213 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
216 /* Compute parameters for interactions between i and j atoms */
217 qq00
= _mm_mul_pd(iq0
,jq0
);
219 /* REACTION-FIELD ELECTROSTATICS */
220 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
221 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
223 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
225 /* Update potential sum for this i atom from the interaction with this j atom. */
226 velec
= _mm_and_pd(velec
,cutoff_mask
);
227 velecsum
= _mm_add_pd(velecsum
,velec
);
231 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
233 /* Calculate temporary vectorial force */
234 tx
= _mm_mul_pd(fscal
,dx00
);
235 ty
= _mm_mul_pd(fscal
,dy00
);
236 tz
= _mm_mul_pd(fscal
,dz00
);
238 /* Update vectorial force */
239 fix0
= _mm_add_pd(fix0
,tx
);
240 fiy0
= _mm_add_pd(fiy0
,ty
);
241 fiz0
= _mm_add_pd(fiz0
,tz
);
243 fjx0
= _mm_add_pd(fjx0
,tx
);
244 fjy0
= _mm_add_pd(fjy0
,ty
);
245 fjz0
= _mm_add_pd(fjz0
,tz
);
249 /**************************
250 * CALCULATE INTERACTIONS *
251 **************************/
253 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
256 /* Compute parameters for interactions between i and j atoms */
257 qq10
= _mm_mul_pd(iq1
,jq0
);
259 /* REACTION-FIELD ELECTROSTATICS */
260 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
261 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
263 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
265 /* Update potential sum for this i atom from the interaction with this j atom. */
266 velec
= _mm_and_pd(velec
,cutoff_mask
);
267 velecsum
= _mm_add_pd(velecsum
,velec
);
271 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
273 /* Calculate temporary vectorial force */
274 tx
= _mm_mul_pd(fscal
,dx10
);
275 ty
= _mm_mul_pd(fscal
,dy10
);
276 tz
= _mm_mul_pd(fscal
,dz10
);
278 /* Update vectorial force */
279 fix1
= _mm_add_pd(fix1
,tx
);
280 fiy1
= _mm_add_pd(fiy1
,ty
);
281 fiz1
= _mm_add_pd(fiz1
,tz
);
283 fjx0
= _mm_add_pd(fjx0
,tx
);
284 fjy0
= _mm_add_pd(fjy0
,ty
);
285 fjz0
= _mm_add_pd(fjz0
,tz
);
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
296 /* Compute parameters for interactions between i and j atoms */
297 qq20
= _mm_mul_pd(iq2
,jq0
);
299 /* REACTION-FIELD ELECTROSTATICS */
300 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
301 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
303 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 velec
= _mm_and_pd(velec
,cutoff_mask
);
307 velecsum
= _mm_add_pd(velecsum
,velec
);
311 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
313 /* Calculate temporary vectorial force */
314 tx
= _mm_mul_pd(fscal
,dx20
);
315 ty
= _mm_mul_pd(fscal
,dy20
);
316 tz
= _mm_mul_pd(fscal
,dz20
);
318 /* Update vectorial force */
319 fix2
= _mm_add_pd(fix2
,tx
);
320 fiy2
= _mm_add_pd(fiy2
,ty
);
321 fiz2
= _mm_add_pd(fiz2
,tz
);
323 fjx0
= _mm_add_pd(fjx0
,tx
);
324 fjy0
= _mm_add_pd(fjy0
,ty
);
325 fjz0
= _mm_add_pd(fjz0
,tz
);
329 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
331 /* Inner loop uses 111 flops */
338 j_coord_offsetA
= DIM
*jnrA
;
340 /* load j atom coordinates */
341 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
344 /* Calculate displacement vector */
345 dx00
= _mm_sub_pd(ix0
,jx0
);
346 dy00
= _mm_sub_pd(iy0
,jy0
);
347 dz00
= _mm_sub_pd(iz0
,jz0
);
348 dx10
= _mm_sub_pd(ix1
,jx0
);
349 dy10
= _mm_sub_pd(iy1
,jy0
);
350 dz10
= _mm_sub_pd(iz1
,jz0
);
351 dx20
= _mm_sub_pd(ix2
,jx0
);
352 dy20
= _mm_sub_pd(iy2
,jy0
);
353 dz20
= _mm_sub_pd(iz2
,jz0
);
355 /* Calculate squared distance and things based on it */
356 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
357 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
358 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
360 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
361 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
362 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
364 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
365 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
366 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
368 /* Load parameters for j particles */
369 jq0
= _mm_load_sd(charge
+jnrA
+0);
371 fjx0
= _mm_setzero_pd();
372 fjy0
= _mm_setzero_pd();
373 fjz0
= _mm_setzero_pd();
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
382 /* Compute parameters for interactions between i and j atoms */
383 qq00
= _mm_mul_pd(iq0
,jq0
);
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
387 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
389 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velec
= _mm_and_pd(velec
,cutoff_mask
);
393 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
394 velecsum
= _mm_add_pd(velecsum
,velec
);
398 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
400 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
402 /* Calculate temporary vectorial force */
403 tx
= _mm_mul_pd(fscal
,dx00
);
404 ty
= _mm_mul_pd(fscal
,dy00
);
405 tz
= _mm_mul_pd(fscal
,dz00
);
407 /* Update vectorial force */
408 fix0
= _mm_add_pd(fix0
,tx
);
409 fiy0
= _mm_add_pd(fiy0
,ty
);
410 fiz0
= _mm_add_pd(fiz0
,tz
);
412 fjx0
= _mm_add_pd(fjx0
,tx
);
413 fjy0
= _mm_add_pd(fjy0
,ty
);
414 fjz0
= _mm_add_pd(fjz0
,tz
);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
425 /* Compute parameters for interactions between i and j atoms */
426 qq10
= _mm_mul_pd(iq1
,jq0
);
428 /* REACTION-FIELD ELECTROSTATICS */
429 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
430 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
432 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velec
= _mm_and_pd(velec
,cutoff_mask
);
436 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
437 velecsum
= _mm_add_pd(velecsum
,velec
);
441 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
443 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
445 /* Calculate temporary vectorial force */
446 tx
= _mm_mul_pd(fscal
,dx10
);
447 ty
= _mm_mul_pd(fscal
,dy10
);
448 tz
= _mm_mul_pd(fscal
,dz10
);
450 /* Update vectorial force */
451 fix1
= _mm_add_pd(fix1
,tx
);
452 fiy1
= _mm_add_pd(fiy1
,ty
);
453 fiz1
= _mm_add_pd(fiz1
,tz
);
455 fjx0
= _mm_add_pd(fjx0
,tx
);
456 fjy0
= _mm_add_pd(fjy0
,ty
);
457 fjz0
= _mm_add_pd(fjz0
,tz
);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
468 /* Compute parameters for interactions between i and j atoms */
469 qq20
= _mm_mul_pd(iq2
,jq0
);
471 /* REACTION-FIELD ELECTROSTATICS */
472 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
473 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
475 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec
= _mm_and_pd(velec
,cutoff_mask
);
479 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
480 velecsum
= _mm_add_pd(velecsum
,velec
);
484 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
486 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
488 /* Calculate temporary vectorial force */
489 tx
= _mm_mul_pd(fscal
,dx20
);
490 ty
= _mm_mul_pd(fscal
,dy20
);
491 tz
= _mm_mul_pd(fscal
,dz20
);
493 /* Update vectorial force */
494 fix2
= _mm_add_pd(fix2
,tx
);
495 fiy2
= _mm_add_pd(fiy2
,ty
);
496 fiz2
= _mm_add_pd(fiz2
,tz
);
498 fjx0
= _mm_add_pd(fjx0
,tx
);
499 fjy0
= _mm_add_pd(fjy0
,ty
);
500 fjz0
= _mm_add_pd(fjz0
,tz
);
504 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
506 /* Inner loop uses 111 flops */
509 /* End of innermost loop */
511 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
512 f
+i_coord_offset
,fshift
+i_shift_offset
);
515 /* Update potential energies */
516 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
518 /* Increment number of inner iterations */
519 inneriter
+= j_index_end
- j_index_start
;
521 /* Outer loop uses 19 flops */
524 /* Increment number of outer iterations */
527 /* Update outer/inner flops */
529 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*111);
532 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double
533 * Electrostatics interaction: ReactionField
534 * VdW interaction: None
535 * Geometry: Water3-Particle
536 * Calculate force/pot: Force
539 nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double
540 (t_nblist
* gmx_restrict nlist
,
541 rvec
* gmx_restrict xx
,
542 rvec
* gmx_restrict ff
,
543 t_forcerec
* gmx_restrict fr
,
544 t_mdatoms
* gmx_restrict mdatoms
,
545 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
546 t_nrnb
* gmx_restrict nrnb
)
548 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
549 * just 0 for non-waters.
550 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
551 * jnr indices corresponding to data put in the four positions in the SIMD register.
553 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
554 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
556 int j_coord_offsetA
,j_coord_offsetB
;
557 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
559 real
*shiftvec
,*fshift
,*x
,*f
;
560 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
562 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
564 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
566 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
567 int vdwjidx0A
,vdwjidx0B
;
568 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
569 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
570 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
571 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
572 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
574 __m128d dummy_mask
,cutoff_mask
;
575 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
576 __m128d one
= _mm_set1_pd(1.0);
577 __m128d two
= _mm_set1_pd(2.0);
583 jindex
= nlist
->jindex
;
585 shiftidx
= nlist
->shift
;
587 shiftvec
= fr
->shift_vec
[0];
588 fshift
= fr
->fshift
[0];
589 facel
= _mm_set1_pd(fr
->epsfac
);
590 charge
= mdatoms
->chargeA
;
591 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
592 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
593 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
595 /* Setup water-specific parameters */
596 inr
= nlist
->iinr
[0];
597 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
598 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
599 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
601 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
602 rcutoff_scalar
= fr
->rcoulomb
;
603 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
604 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
606 /* Avoid stupid compiler warnings */
614 /* Start outer loop over neighborlists */
615 for(iidx
=0; iidx
<nri
; iidx
++)
617 /* Load shift vector for this list */
618 i_shift_offset
= DIM
*shiftidx
[iidx
];
620 /* Load limits for loop over neighbors */
621 j_index_start
= jindex
[iidx
];
622 j_index_end
= jindex
[iidx
+1];
624 /* Get outer coordinate index */
626 i_coord_offset
= DIM
*inr
;
628 /* Load i particle coords and add shift vector */
629 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
630 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
632 fix0
= _mm_setzero_pd();
633 fiy0
= _mm_setzero_pd();
634 fiz0
= _mm_setzero_pd();
635 fix1
= _mm_setzero_pd();
636 fiy1
= _mm_setzero_pd();
637 fiz1
= _mm_setzero_pd();
638 fix2
= _mm_setzero_pd();
639 fiy2
= _mm_setzero_pd();
640 fiz2
= _mm_setzero_pd();
642 /* Start inner kernel loop */
643 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
646 /* Get j neighbor index, and coordinate index */
649 j_coord_offsetA
= DIM
*jnrA
;
650 j_coord_offsetB
= DIM
*jnrB
;
652 /* load j atom coordinates */
653 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
656 /* Calculate displacement vector */
657 dx00
= _mm_sub_pd(ix0
,jx0
);
658 dy00
= _mm_sub_pd(iy0
,jy0
);
659 dz00
= _mm_sub_pd(iz0
,jz0
);
660 dx10
= _mm_sub_pd(ix1
,jx0
);
661 dy10
= _mm_sub_pd(iy1
,jy0
);
662 dz10
= _mm_sub_pd(iz1
,jz0
);
663 dx20
= _mm_sub_pd(ix2
,jx0
);
664 dy20
= _mm_sub_pd(iy2
,jy0
);
665 dz20
= _mm_sub_pd(iz2
,jz0
);
667 /* Calculate squared distance and things based on it */
668 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
669 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
670 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
672 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
673 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
674 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
676 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
677 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
678 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
680 /* Load parameters for j particles */
681 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
683 fjx0
= _mm_setzero_pd();
684 fjy0
= _mm_setzero_pd();
685 fjz0
= _mm_setzero_pd();
687 /**************************
688 * CALCULATE INTERACTIONS *
689 **************************/
691 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
694 /* Compute parameters for interactions between i and j atoms */
695 qq00
= _mm_mul_pd(iq0
,jq0
);
697 /* REACTION-FIELD ELECTROSTATICS */
698 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
700 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
704 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
706 /* Calculate temporary vectorial force */
707 tx
= _mm_mul_pd(fscal
,dx00
);
708 ty
= _mm_mul_pd(fscal
,dy00
);
709 tz
= _mm_mul_pd(fscal
,dz00
);
711 /* Update vectorial force */
712 fix0
= _mm_add_pd(fix0
,tx
);
713 fiy0
= _mm_add_pd(fiy0
,ty
);
714 fiz0
= _mm_add_pd(fiz0
,tz
);
716 fjx0
= _mm_add_pd(fjx0
,tx
);
717 fjy0
= _mm_add_pd(fjy0
,ty
);
718 fjz0
= _mm_add_pd(fjz0
,tz
);
722 /**************************
723 * CALCULATE INTERACTIONS *
724 **************************/
726 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
729 /* Compute parameters for interactions between i and j atoms */
730 qq10
= _mm_mul_pd(iq1
,jq0
);
732 /* REACTION-FIELD ELECTROSTATICS */
733 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
735 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
739 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
741 /* Calculate temporary vectorial force */
742 tx
= _mm_mul_pd(fscal
,dx10
);
743 ty
= _mm_mul_pd(fscal
,dy10
);
744 tz
= _mm_mul_pd(fscal
,dz10
);
746 /* Update vectorial force */
747 fix1
= _mm_add_pd(fix1
,tx
);
748 fiy1
= _mm_add_pd(fiy1
,ty
);
749 fiz1
= _mm_add_pd(fiz1
,tz
);
751 fjx0
= _mm_add_pd(fjx0
,tx
);
752 fjy0
= _mm_add_pd(fjy0
,ty
);
753 fjz0
= _mm_add_pd(fjz0
,tz
);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
764 /* Compute parameters for interactions between i and j atoms */
765 qq20
= _mm_mul_pd(iq2
,jq0
);
767 /* REACTION-FIELD ELECTROSTATICS */
768 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
770 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
774 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
776 /* Calculate temporary vectorial force */
777 tx
= _mm_mul_pd(fscal
,dx20
);
778 ty
= _mm_mul_pd(fscal
,dy20
);
779 tz
= _mm_mul_pd(fscal
,dz20
);
781 /* Update vectorial force */
782 fix2
= _mm_add_pd(fix2
,tx
);
783 fiy2
= _mm_add_pd(fiy2
,ty
);
784 fiz2
= _mm_add_pd(fiz2
,tz
);
786 fjx0
= _mm_add_pd(fjx0
,tx
);
787 fjy0
= _mm_add_pd(fjy0
,ty
);
788 fjz0
= _mm_add_pd(fjz0
,tz
);
792 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
794 /* Inner loop uses 93 flops */
801 j_coord_offsetA
= DIM
*jnrA
;
803 /* load j atom coordinates */
804 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
807 /* Calculate displacement vector */
808 dx00
= _mm_sub_pd(ix0
,jx0
);
809 dy00
= _mm_sub_pd(iy0
,jy0
);
810 dz00
= _mm_sub_pd(iz0
,jz0
);
811 dx10
= _mm_sub_pd(ix1
,jx0
);
812 dy10
= _mm_sub_pd(iy1
,jy0
);
813 dz10
= _mm_sub_pd(iz1
,jz0
);
814 dx20
= _mm_sub_pd(ix2
,jx0
);
815 dy20
= _mm_sub_pd(iy2
,jy0
);
816 dz20
= _mm_sub_pd(iz2
,jz0
);
818 /* Calculate squared distance and things based on it */
819 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
820 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
821 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
823 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
824 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
825 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
827 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
828 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
829 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
831 /* Load parameters for j particles */
832 jq0
= _mm_load_sd(charge
+jnrA
+0);
834 fjx0
= _mm_setzero_pd();
835 fjy0
= _mm_setzero_pd();
836 fjz0
= _mm_setzero_pd();
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
845 /* Compute parameters for interactions between i and j atoms */
846 qq00
= _mm_mul_pd(iq0
,jq0
);
848 /* REACTION-FIELD ELECTROSTATICS */
849 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
851 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
855 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
857 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
859 /* Calculate temporary vectorial force */
860 tx
= _mm_mul_pd(fscal
,dx00
);
861 ty
= _mm_mul_pd(fscal
,dy00
);
862 tz
= _mm_mul_pd(fscal
,dz00
);
864 /* Update vectorial force */
865 fix0
= _mm_add_pd(fix0
,tx
);
866 fiy0
= _mm_add_pd(fiy0
,ty
);
867 fiz0
= _mm_add_pd(fiz0
,tz
);
869 fjx0
= _mm_add_pd(fjx0
,tx
);
870 fjy0
= _mm_add_pd(fjy0
,ty
);
871 fjz0
= _mm_add_pd(fjz0
,tz
);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
882 /* Compute parameters for interactions between i and j atoms */
883 qq10
= _mm_mul_pd(iq1
,jq0
);
885 /* REACTION-FIELD ELECTROSTATICS */
886 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
888 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
892 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
894 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
896 /* Calculate temporary vectorial force */
897 tx
= _mm_mul_pd(fscal
,dx10
);
898 ty
= _mm_mul_pd(fscal
,dy10
);
899 tz
= _mm_mul_pd(fscal
,dz10
);
901 /* Update vectorial force */
902 fix1
= _mm_add_pd(fix1
,tx
);
903 fiy1
= _mm_add_pd(fiy1
,ty
);
904 fiz1
= _mm_add_pd(fiz1
,tz
);
906 fjx0
= _mm_add_pd(fjx0
,tx
);
907 fjy0
= _mm_add_pd(fjy0
,ty
);
908 fjz0
= _mm_add_pd(fjz0
,tz
);
912 /**************************
913 * CALCULATE INTERACTIONS *
914 **************************/
916 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
919 /* Compute parameters for interactions between i and j atoms */
920 qq20
= _mm_mul_pd(iq2
,jq0
);
922 /* REACTION-FIELD ELECTROSTATICS */
923 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
925 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
929 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
931 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
933 /* Calculate temporary vectorial force */
934 tx
= _mm_mul_pd(fscal
,dx20
);
935 ty
= _mm_mul_pd(fscal
,dy20
);
936 tz
= _mm_mul_pd(fscal
,dz20
);
938 /* Update vectorial force */
939 fix2
= _mm_add_pd(fix2
,tx
);
940 fiy2
= _mm_add_pd(fiy2
,ty
);
941 fiz2
= _mm_add_pd(fiz2
,tz
);
943 fjx0
= _mm_add_pd(fjx0
,tx
);
944 fjy0
= _mm_add_pd(fjy0
,ty
);
945 fjz0
= _mm_add_pd(fjz0
,tz
);
949 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
951 /* Inner loop uses 93 flops */
954 /* End of innermost loop */
956 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
957 f
+i_coord_offset
,fshift
+i_shift_offset
);
959 /* Increment number of inner iterations */
960 inneriter
+= j_index_end
- j_index_start
;
962 /* Outer loop uses 18 flops */
965 /* Increment number of outer iterations */
968 /* Update outer/inner flops */
970 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*93);