Partial commit of the project to remove all static variables.
[gromacs.git] / src / gmxlib / inner_altivec.c
blob1d0410ac6185463f7263a57b0a48377a92814a78
1 /*
2 * $Id$
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.1
11 * Copyright (c) 1991-2001, University of Groningen, The Netherlands
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * If you want to redistribute modifications, please consider that
18 * scientific software is very special. Version control is crucial -
19 * bugs must be traceable. We will be happy to consider code for
20 * inclusion in the official distribution, but derived work must not
21 * be called official GROMACS. Details are found in the README & COPYING
22 * files - if they are missing, get the official version at www.gromacs.org.
24 * To help us fund GROMACS development, we humbly ask that you cite
25 * the papers on the package - you can find them in the top README file.
27 * For more info, check our website at http://www.gromacs.org
29 * And Hey:
30 * Great Red Owns Many ACres of Sand
33 #include <ppc_altivec.h>
35 #include<stdio.h>
38 void check_altivec(void)
40 vector unsigned short vsr1,vsr2;
41 vector unsigned int tmp;
43 vsr1=vec_mfvscr();
44 tmp=vec_sl(vec_splat_u32(1),vec_splat_u32(8));
45 vsr2=(vector unsigned short)vec_sl(tmp,vec_splat_u32(8));
46 vsr1=vec_or(vsr1,vsr2);
47 vec_mtvscr(vsr1);
52 void inl0100_altivec(
53 int nri,
54 int iinr[],
55 int jindex[],
56 int jjnr[],
57 int shift[],
58 float shiftvec[],
59 float fshift[],
60 int gid[],
61 float pos[],
62 float faction[],
63 int type[],
64 int ntype,
65 float nbfp[],
66 float Vnb[])
68 vector float ix,iy,iz,shvec;
69 vector float fs,nul;
70 vector float dx,dy,dz;
71 vector float vnbtot,c6,c12;
72 vector float fix,fiy,fiz;
73 vector float tmp1,tmp2,tmp3,tmp4;
74 vector float rinvsq,rsq,rinvsix,vnb6,vnb12;
76 int n,k,k0,ii,is3,ii3,nj0,nj1;
77 int jnra,jnrb,jnrc,jnrd;
78 int j3a,j3b,j3c,j3d;
79 int ntiA,tja,tjb,tjc,tjd;
81 nul=vec_zero();
83 for(n=0;n<nri;n++) {
84 is3 = 3*shift[n];
85 shvec = load_xyz(shiftvec+is3);
86 ii = iinr[n];
87 ii3 = 3*ii;
88 ix = load_xyz(pos+ii3);
89 vnbtot = nul;
90 fix = nul;
91 fiy = nul;
92 fiz = nul;
93 ix = vec_add(ix,shvec);
94 nj0 = jindex[n];
95 nj1 = jindex[n+1];
96 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
97 ntiA = 2*ntype*type[ii];
98 for(k=nj0; k<(nj1-3); k+=4) {
99 jnra = jjnr[k];
100 jnrb = jjnr[k+1];
101 jnrc = jjnr[k+2];
102 jnrd = jjnr[k+3];
103 j3a = 3*jnra;
104 j3b = 3*jnrb;
105 j3c = 3*jnrc;
106 j3d = 3*jnrd;
107 transpose_4_to_3(load_xyz(pos+j3a),
108 load_xyz(pos+j3b),
109 load_xyz(pos+j3c),
110 load_xyz(pos+j3d),&dx,&dy,&dz);
111 dx = vec_sub(ix,dx);
112 dy = vec_sub(iy,dy);
113 dz = vec_sub(iz,dz);
114 rsq = vec_madd(dx,dx,nul);
115 rsq = vec_madd(dy,dy,rsq);
116 rsq = vec_madd(dz,dz,rsq);
117 rinvsq = do_recip(rsq);
118 rinvsix = vec_madd(rinvsq,rinvsq,nul);
119 rinvsix = vec_madd(rinvsix,rinvsq,nul);
120 tja = ntiA+2*type[jnra];
121 tjb = ntiA+2*type[jnrb];
122 tjc = ntiA+2*type[jnrc];
123 tjd = ntiA+2*type[jnrd];
124 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
125 vnb6 = vec_madd(c6,rinvsix,nul);
126 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
127 vnbtot = vec_add(vnbtot,vnb12);
128 vnbtot = vec_sub(vnbtot,vnb6);
129 fs = vec_madd(vec_twelve(),vnb12,nul);
130 fs = vec_nmsub(vec_six(),vnb6,fs);
131 fs = vec_madd(fs,rinvsq,nul);
132 fix = vec_madd(fs,dx,fix); /* +=fx */
133 fiy = vec_madd(fs,dy,fiy); /* +=fy */
134 fiz = vec_madd(fs,dz,fiz); /* +=fz */
135 dx = vec_nmsub(dx,fs,nul); /* -fx */
136 dy = vec_nmsub(dy,fs,nul); /* -fy */
137 dz = vec_nmsub(dz,fs,nul); /* -fz */
138 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
139 add_xyz_to_mem(faction+j3a,tmp1);
140 add_xyz_to_mem(faction+j3b,tmp2);
141 add_xyz_to_mem(faction+j3c,tmp3);
142 add_xyz_to_mem(faction+j3d,tmp4);
144 if(k<(nj1-1)) {
145 jnra = jjnr[k];
146 jnrb = jjnr[k+1];
147 j3a = 3*jnra;
148 j3b = 3*jnrb;
149 transpose_2_to_3(load_xyz(pos+j3a),
150 load_xyz(pos+j3b),&dx,&dy,&dz);
151 dx = vec_sub(ix,dx);
152 dy = vec_sub(iy,dy);
153 dz = vec_sub(iz,dz);
154 rsq = vec_madd(dx,dx,nul);
155 rsq = vec_madd(dy,dy,rsq);
156 rsq = vec_madd(dz,dz,rsq);
157 rinvsq = do_recip(rsq);
158 zero_highest_2_elements_in_vector(&rinvsq);
159 rinvsix = vec_madd(rinvsq,rinvsq,nul);
160 rinvsix = vec_madd(rinvsix,rinvsq,nul);
161 tja = ntiA+2*type[jnra];
162 tjb = ntiA+2*type[jnrb];
163 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
164 vnb6 = vec_madd(c6,rinvsix,nul);
165 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
166 vnbtot = vec_add(vnbtot,vnb12);
167 vnbtot = vec_sub(vnbtot,vnb6);
168 fs = vec_madd(vec_twelve(),vnb12,nul);
169 fs = vec_nmsub(vec_six(),vnb6,fs);
170 fs = vec_madd(fs,rinvsq,nul);
171 fix = vec_madd(fs,dx,fix); /* +=fx */
172 fiy = vec_madd(fs,dy,fiy); /* +=fy */
173 fiz = vec_madd(fs,dz,fiz); /* +=fz */
174 dx = vec_nmsub(dx,fs,nul); /* -fx */
175 dy = vec_nmsub(dy,fs,nul); /* -fy */
176 dz = vec_nmsub(dz,fs,nul); /* -fz */
177 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
178 add_xyz_to_mem(faction+j3a,tmp1);
179 add_xyz_to_mem(faction+j3b,tmp2);
180 k += 2;
182 if((nj1-nj0)%2) {
183 jnra = jjnr[k];
184 j3a = 3*jnra;
185 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
186 dx = vec_sub(ix,dx);
187 dy = vec_sub(iy,dy);
188 dz = vec_sub(iz,dz);
189 rsq = vec_madd(dx,dx,nul);
190 rsq = vec_madd(dy,dy,rsq);
191 rsq = vec_madd(dz,dz,rsq);
192 rinvsq = do_recip(rsq);
193 zero_highest_3_elements_in_vector(&rinvsq);
194 rinvsix = vec_madd(rinvsq,rinvsq,nul);
195 rinvsix = vec_madd(rinvsix,rinvsq,nul);
196 tja = ntiA+2*type[jnra];
197 load_1_pair(nbfp+tja,&c6,&c12);
198 vnb6 = vec_madd(c6,rinvsix,nul);
199 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
200 vnbtot = vec_add(vnbtot,vnb12);
201 vnbtot = vec_sub(vnbtot,vnb6);
202 fs = vec_madd(vec_twelve(),vnb12,nul);
203 fs = vec_nmsub(vec_six(),vnb6,fs);
204 fs = vec_madd(fs,rinvsq,nul);
205 fix = vec_madd(fs,dx,fix); /* +=fx */
206 fiy = vec_madd(fs,dy,fiy); /* +=fy */
207 fiz = vec_madd(fs,dz,fiz); /* +=fz */
208 dx = vec_nmsub(dx,fs,nul); /* -fx */
209 dy = vec_nmsub(dy,fs,nul); /* -fy */
210 dz = vec_nmsub(dz,fs,nul); /* -fz */
211 transpose_3_to_1(dx,dy,dz,&tmp1);
212 add_xyz_to_mem(faction+j3a,tmp1);
214 /* update outer data */
215 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
216 tmp1 = vec_add(tmp1,tmp3);
217 tmp2 = vec_add(tmp2,tmp4);
218 tmp1 = vec_add(tmp1,tmp2);
219 add_xyz_to_mem(faction+ii3,tmp1);
220 add_xyz_to_mem(fshift+is3,tmp1);
222 add_vector_to_float(Vnb+gid[n],vnbtot);
227 void inl0300_altivec(
228 int nri,
229 int iinr[],
230 int jindex[],
231 int jjnr[],
232 int shift[],
233 float shiftvec[],
234 float fshift[],
235 int gid[],
236 float pos[],
237 float faction[],
238 int type[],
239 int ntype,
240 float nbfp[],
241 float Vnb[],
242 float tabscale,
243 float VFtab[])
245 vector float ix,iy,iz,shvec;
246 vector float fs,nul,tsc;
247 vector float dx,dy,dz;
248 vector float vnbtot,c6,c12;
249 vector float fix,fiy,fiz;
250 vector float tmp1,tmp2,tmp3,tmp4;
251 vector float rinv,r,rsq;
252 vector float VVd,FFd,VVr,FFr;
254 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
255 int jnra,jnrb,jnrc,jnrd;
256 int j3a,j3b,j3c,j3d;
257 int tja,tjb,tjc,tjd;
259 nul=vec_zero();
260 tsc=load_float_and_splat(&tabscale);
262 for(n=0;n<nri;n++) {
263 is3 = 3*shift[n];
264 shvec = load_xyz(shiftvec+is3);
265 ii = iinr[n];
266 ii3 = 3*ii;
267 ix = load_xyz(pos+ii3);
268 vnbtot = nul;
269 fix = nul;
270 fiy = nul;
271 fiz = nul;
272 ix = vec_add(ix,shvec);
273 nj0 = jindex[n];
274 nj1 = jindex[n+1];
275 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
276 ntiA = 2*ntype*type[ii];
278 for(k=nj0; k<(nj1-3); k+=4) {
279 jnra = jjnr[k];
280 jnrb = jjnr[k+1];
281 jnrc = jjnr[k+2];
282 jnrd = jjnr[k+3];
283 j3a = 3*jnra;
284 j3b = 3*jnrb;
285 j3c = 3*jnrc;
286 j3d = 3*jnrd;
287 transpose_4_to_3(load_xyz(pos+j3a),
288 load_xyz(pos+j3b),
289 load_xyz(pos+j3c),
290 load_xyz(pos+j3d),&dx,&dy,&dz);
291 dx = vec_sub(ix,dx);
292 dy = vec_sub(iy,dy);
293 dz = vec_sub(iz,dz);
294 rsq = vec_madd(dx,dx,nul);
295 rsq = vec_madd(dy,dy,rsq);
296 rsq = vec_madd(dz,dz,rsq);
297 rinv = do_invsqrt(rsq);
298 r = vec_madd(rinv,rsq,nul);
299 tja = ntiA+2*type[jnra];
300 tjb = ntiA+2*type[jnrb];
301 tjc = ntiA+2*type[jnrc];
302 tjd = ntiA+2*type[jnrd];
303 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
304 do_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
305 fs = vec_nmsub(c6,FFd,nul);
306 vnbtot = vec_madd(c6,VVd,vnbtot);
307 fs = vec_nmsub(c12,FFr,fs);
308 vnbtot = vec_madd(c12,VVr,vnbtot);
309 fs = vec_madd(vec_madd(fs,tsc,nul),rinv,nul);
310 fix = vec_madd(fs,dx,fix); /* +=fx */
311 fiy = vec_madd(fs,dy,fiy); /* +=fy */
312 fiz = vec_madd(fs,dz,fiz); /* +=fz */
313 dx = vec_nmsub(dx,fs,nul); /* -fx */
314 dy = vec_nmsub(dy,fs,nul); /* -fy */
315 dz = vec_nmsub(dz,fs,nul); /* -fz */
316 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
317 add_xyz_to_mem(faction+j3a,tmp1);
318 add_xyz_to_mem(faction+j3b,tmp2);
319 add_xyz_to_mem(faction+j3c,tmp3);
320 add_xyz_to_mem(faction+j3d,tmp4);
322 if(k<(nj1-1)) {
323 jnra = jjnr[k];
324 jnrb = jjnr[k+1];
325 j3a = 3*jnra;
326 j3b = 3*jnrb;
327 transpose_2_to_3(load_xyz(pos+j3a),
328 load_xyz(pos+j3b),&dx,&dy,&dz);
329 dx = vec_sub(ix,dx);
330 dy = vec_sub(iy,dy);
331 dz = vec_sub(iz,dz);
332 rsq = vec_madd(dx,dx,nul);
333 rsq = vec_madd(dy,dy,rsq);
334 rsq = vec_madd(dz,dz,rsq);
335 zero_highest_2_elements_in_vector(&rsq);
336 rinv = do_invsqrt(rsq);
337 zero_highest_2_elements_in_vector(&rinv);
338 r = vec_madd(rinv,rsq,nul);
339 tja = ntiA+2*type[jnra];
340 tjb = ntiA+2*type[jnrb];
341 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
342 do_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
343 fs = vec_nmsub(c6,FFd,nul);
344 vnbtot = vec_madd(c6,VVd,vnbtot);
345 fs = vec_nmsub(c12,FFr,fs);
346 vnbtot = vec_madd(c12,VVr,vnbtot);
347 fs = vec_madd(vec_madd(fs,tsc,nul),rinv,nul);
348 fix = vec_madd(fs,dx,fix); /* +=fx */
349 fiy = vec_madd(fs,dy,fiy); /* +=fy */
350 fiz = vec_madd(fs,dz,fiz); /* +=fz */
351 dx = vec_nmsub(dx,fs,nul); /* -fx */
352 dy = vec_nmsub(dy,fs,nul); /* -fy */
353 dz = vec_nmsub(dz,fs,nul); /* -fz */
354 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
355 add_xyz_to_mem(faction+j3a,tmp1);
356 add_xyz_to_mem(faction+j3b,tmp2);
357 k += 2;
359 if((nj1-nj0)%2) {
360 jnra = jjnr[k];
361 j3a = 3*jnra;
362 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
363 dx = vec_sub(ix,dx);
364 dy = vec_sub(iy,dy);
365 dz = vec_sub(iz,dz);
366 rsq = vec_madd(dx,dx,nul);
367 rsq = vec_madd(dy,dy,rsq);
368 rsq = vec_madd(dz,dz,rsq);
369 zero_highest_3_elements_in_vector(&rsq);
370 rinv = do_invsqrt(rsq);
371 zero_highest_3_elements_in_vector(&rinv);
372 r = vec_madd(rinv,rsq,nul);
373 tja = ntiA+2*type[jnra];
374 load_1_pair(nbfp+tja,&c6,&c12);
375 do_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
376 fs = vec_nmsub(c6,FFd,nul);
377 vnbtot = vec_madd(c6,VVd,vnbtot);
378 fs = vec_nmsub(c12,FFr,fs);
379 vnbtot = vec_madd(c12,VVr,vnbtot);
380 fs = vec_madd(vec_madd(fs,tsc,nul),rinv,nul);
381 fix = vec_madd(fs,dx,fix); /* +=fx */
382 fiy = vec_madd(fs,dy,fiy); /* +=fy */
383 fiz = vec_madd(fs,dz,fiz); /* +=fz */
384 dx = vec_nmsub(dx,fs,nul); /* -fx */
385 dy = vec_nmsub(dy,fs,nul); /* -fy */
386 dz = vec_nmsub(dz,fs,nul); /* -fz */
387 transpose_3_to_1(dx,dy,dz,&tmp1);
388 add_xyz_to_mem(faction+j3a,tmp1);
390 /* update outer data */
391 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
392 tmp1 = vec_add(tmp1,tmp3);
393 tmp2 = vec_add(tmp2,tmp4);
394 tmp1 = vec_add(tmp1,tmp2);
395 add_xyz_to_mem(faction+ii3,tmp1);
396 add_xyz_to_mem(fshift+is3,tmp1);
398 add_vector_to_float(Vnb+gid[n],vnbtot);
404 void inl1000_altivec(
405 int nri,
406 int iinr[],
407 int jindex[],
408 int jjnr[],
409 int shift[],
410 float shiftvec[],
411 float fshift[],
412 int gid[],
413 float pos[],
414 float faction[],
415 float charge[],
416 float facel,
417 float Vc[])
419 vector float ix,iy,iz,shvec;
420 vector float vfacel,vcoul,fs,nul;
421 vector float dx,dy,dz;
422 vector float vctot,qq,iq;
423 vector float fix,fiy,fiz;
424 vector float tmp1,tmp2,tmp3,tmp4;
425 vector float rinv,rinvsq,rsq;
427 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
428 int jnra,jnrb,jnrc,jnrd;
429 int j3a,j3b,j3c,j3d;
431 nul=vec_zero();
432 vfacel=load_float_and_splat(&facel);
434 for(n=0;n<nri;n++) {
435 is3 = 3*shift[n];
436 shvec = load_xyz(shiftvec+is3);
437 ii = iinr[n];
438 ii3 = 3*ii;
439 ix = load_xyz(pos+ii3);
440 vctot = nul;
441 fix = nul;
442 fiy = nul;
443 fiz = nul;
444 ix = vec_add(ix,shvec);
445 nj0 = jindex[n];
446 nj1 = jindex[n+1];
447 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
448 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
450 for(k=nj0; k<(nj1-3); k+=4) {
451 jnra = jjnr[k];
452 jnrb = jjnr[k+1];
453 jnrc = jjnr[k+2];
454 jnrd = jjnr[k+3];
455 j3a = 3*jnra;
456 j3b = 3*jnrb;
457 j3c = 3*jnrc;
458 j3d = 3*jnrd;
459 transpose_4_to_3(load_xyz(pos+j3a),
460 load_xyz(pos+j3b),
461 load_xyz(pos+j3c),
462 load_xyz(pos+j3d),&dx,&dy,&dz);
463 dx = vec_sub(ix,dx);
464 dy = vec_sub(iy,dy);
465 dz = vec_sub(iz,dz);
466 rsq = vec_madd(dx,dx,nul);
467 rsq = vec_madd(dy,dy,rsq);
468 rsq = vec_madd(dz,dz,rsq);
469 rinv = do_invsqrt(rsq);
470 rinvsq = vec_madd(rinv,rinv,nul);
471 /* load 4 j charges and multiply by iq */
472 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
473 charge+jnrc,charge+jnrd),iq,nul);
474 vcoul = vec_madd(qq,rinv,nul);
475 fs = vec_madd(vcoul,rinvsq,nul);
476 vctot = vec_add(vctot,vcoul);
477 fix = vec_madd(fs,dx,fix); /* +=fx */
478 fiy = vec_madd(fs,dy,fiy); /* +=fy */
479 fiz = vec_madd(fs,dz,fiz); /* +=fz */
480 dx = vec_nmsub(dx,fs,nul); /* -fx */
481 dy = vec_nmsub(dy,fs,nul); /* -fy */
482 dz = vec_nmsub(dz,fs,nul); /* -fz */
483 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
484 add_xyz_to_mem(faction+j3a,tmp1);
485 add_xyz_to_mem(faction+j3b,tmp2);
486 add_xyz_to_mem(faction+j3c,tmp3);
487 add_xyz_to_mem(faction+j3d,tmp4);
489 if(k<(nj1-1)) {
490 jnra = jjnr[k];
491 jnrb = jjnr[k+1];
492 j3a = 3*jnra;
493 j3b = 3*jnrb;
494 transpose_2_to_3(load_xyz(pos+j3a),
495 load_xyz(pos+j3b),&dx,&dy,&dz);
496 dx = vec_sub(ix,dx);
497 dy = vec_sub(iy,dy);
498 dz = vec_sub(iz,dz);
499 rsq = vec_madd(dx,dx,nul);
500 rsq = vec_madd(dy,dy,rsq);
501 rsq = vec_madd(dz,dz,rsq);
502 rinv = do_invsqrt(rsq);
503 zero_highest_2_elements_in_vector(&rinv);
504 rinvsq = vec_madd(rinv,rinv,nul);
505 /* load 2 j charges and multiply by iq */
506 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
507 vcoul = vec_madd(qq,rinv,nul);
508 fs = vec_madd(vcoul,rinvsq,nul);
509 vctot = vec_add(vctot,vcoul);
510 fix = vec_madd(fs,dx,fix); /* +=fx */
511 fiy = vec_madd(fs,dy,fiy); /* +=fy */
512 fiz = vec_madd(fs,dz,fiz); /* +=fz */
513 dx = vec_nmsub(dx,fs,nul); /* -fx */
514 dy = vec_nmsub(dy,fs,nul); /* -fy */
515 dz = vec_nmsub(dz,fs,nul); /* -fz */
516 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
517 add_xyz_to_mem(faction+j3a,tmp1);
518 add_xyz_to_mem(faction+j3b,tmp2);
519 k += 2;
521 if((nj1-nj0)%2) {
522 jnra = jjnr[k];
523 j3a = 3*jnra;
524 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
525 dx = vec_sub(ix,dx);
526 dy = vec_sub(iy,dy);
527 dz = vec_sub(iz,dz);
528 rsq = vec_madd(dx,dx,nul);
529 rsq = vec_madd(dy,dy,rsq);
530 rsq = vec_madd(dz,dz,rsq);
531 rinv = do_invsqrt(rsq);
532 zero_highest_3_elements_in_vector(&rinv);
533 rinvsq = vec_madd(rinv,rinv,nul);
534 /* load 1 j charge and multiply by iq */
535 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
536 vcoul = vec_madd(qq,rinv,nul);
537 fs = vec_madd(vcoul,rinvsq,nul);
538 vctot = vec_add(vctot,vcoul);
539 fix = vec_madd(fs,dx,fix); /* +=fx */
540 fiy = vec_madd(fs,dy,fiy); /* +=fy */
541 fiz = vec_madd(fs,dz,fiz); /* +=fz */
542 dx = vec_nmsub(dx,fs,nul); /* -fx */
543 dy = vec_nmsub(dy,fs,nul); /* -fy */
544 dz = vec_nmsub(dz,fs,nul); /* -fz */
545 transpose_3_to_1(dx,dy,dz,&tmp1);
546 add_xyz_to_mem(faction+j3a,tmp1);
548 /* update outer data */
549 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
550 tmp1 = vec_add(tmp1,tmp3);
551 tmp2 = vec_add(tmp2,tmp4);
552 tmp1 = vec_add(tmp1,tmp2);
553 add_xyz_to_mem(faction+ii3,tmp1);
554 add_xyz_to_mem(fshift+is3,tmp1);
556 add_vector_to_float(Vc+gid[n],vctot);
562 void inl1100_altivec(
563 int nri,
564 int iinr[],
565 int jindex[],
566 int jjnr[],
567 int shift[],
568 float shiftvec[],
569 float fshift[],
570 int gid[],
571 float pos[],
572 float faction[],
573 float charge[],
574 float facel,
575 float Vc[],
576 int type[],
577 int ntype,
578 float nbfp[],
579 float Vnb[])
581 vector float ix,iy,iz,shvec;
582 vector float vfacel,vcoul,fs,nul;
583 vector float dx,dy,dz;
584 vector float vnbtot,vctot,qq,iq,c6,c12;
585 vector float fix,fiy,fiz;
586 vector float tmp1,tmp2,tmp3,tmp4;
587 vector float rinv,rinvsq,rsq,rinvsix,vnb6,vnb12;
589 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
590 int jnra,jnrb,jnrc,jnrd;
591 int j3a,j3b,j3c,j3d;
592 int tja,tjb,tjc,tjd;
594 nul=vec_zero();
595 vfacel=load_float_and_splat(&facel);
597 for(n=0;n<nri;n++) {
598 is3 = 3*shift[n];
599 shvec = load_xyz(shiftvec+is3);
600 ii = iinr[n];
601 ii3 = 3*ii;
602 ix = load_xyz(pos+ii3);
603 vnbtot = nul;
604 vctot = nul;
605 fix = nul;
606 fiy = nul;
607 fiz = nul;
608 ix = vec_add(ix,shvec);
609 nj0 = jindex[n];
610 nj1 = jindex[n+1];
611 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
612 ntiA = 2*ntype*type[ii];
613 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
615 for(k=nj0; k<(nj1-3); k+=4) {
616 jnra = jjnr[k];
617 jnrb = jjnr[k+1];
618 jnrc = jjnr[k+2];
619 jnrd = jjnr[k+3];
620 j3a = 3*jnra;
621 j3b = 3*jnrb;
622 j3c = 3*jnrc;
623 j3d = 3*jnrd;
624 transpose_4_to_3(load_xyz(pos+j3a),
625 load_xyz(pos+j3b),
626 load_xyz(pos+j3c),
627 load_xyz(pos+j3d),&dx,&dy,&dz);
628 dx = vec_sub(ix,dx);
629 dy = vec_sub(iy,dy);
630 dz = vec_sub(iz,dz);
631 rsq = vec_madd(dx,dx,nul);
632 rsq = vec_madd(dy,dy,rsq);
633 rsq = vec_madd(dz,dz,rsq);
634 rinv = do_invsqrt(rsq);
635 rinvsq = vec_madd(rinv,rinv,nul);
636 rinvsix = vec_madd(rinvsq,rinvsq,nul);
637 rinvsix = vec_madd(rinvsix,rinvsq,nul);
638 tja = ntiA+2*type[jnra];
639 tjb = ntiA+2*type[jnrb];
640 tjc = ntiA+2*type[jnrc];
641 tjd = ntiA+2*type[jnrd];
642 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
643 charge+jnrc,charge+jnrd),iq,nul);
644 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
645 vnb6 = vec_madd(c6,rinvsix,nul);
646 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
647 vcoul = vec_madd(qq,rinv,nul);
648 vnbtot = vec_add(vnbtot,vnb12);
649 vnbtot = vec_sub(vnbtot,vnb6);
650 vctot = vec_add(vctot,vcoul);
651 fs = vec_madd(vec_twelve(),vnb12,vcoul);
652 fs = vec_nmsub(vec_six(),vnb6,fs);
653 fs = vec_madd(fs,rinvsq,nul);
654 fix = vec_madd(fs,dx,fix); /* +=fx */
655 fiy = vec_madd(fs,dy,fiy); /* +=fy */
656 fiz = vec_madd(fs,dz,fiz); /* +=fz */
657 dx = vec_nmsub(dx,fs,nul); /* -fx */
658 dy = vec_nmsub(dy,fs,nul); /* -fy */
659 dz = vec_nmsub(dz,fs,nul); /* -fz */
660 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
661 add_xyz_to_mem(faction+j3a,tmp1);
662 add_xyz_to_mem(faction+j3b,tmp2);
663 add_xyz_to_mem(faction+j3c,tmp3);
664 add_xyz_to_mem(faction+j3d,tmp4);
666 if(k<(nj1-1)) {
667 jnra = jjnr[k];
668 jnrb = jjnr[k+1];
669 j3a = 3*jnra;
670 j3b = 3*jnrb;
671 transpose_2_to_3(load_xyz(pos+j3a),
672 load_xyz(pos+j3b),&dx,&dy,&dz);
673 dx = vec_sub(ix,dx);
674 dy = vec_sub(iy,dy);
675 dz = vec_sub(iz,dz);
676 rsq = vec_madd(dx,dx,nul);
677 rsq = vec_madd(dy,dy,rsq);
678 rsq = vec_madd(dz,dz,rsq);
679 rinv = do_invsqrt(rsq);
680 zero_highest_2_elements_in_vector(&rinv);
681 rinvsq = vec_madd(rinv,rinv,nul);
682 rinvsix = vec_madd(rinvsq,rinvsq,nul);
683 rinvsix = vec_madd(rinvsix,rinvsq,nul);
684 tja = ntiA+2*type[jnra];
685 tjb = ntiA+2*type[jnrb];
686 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
687 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
688 vnb6 = vec_madd(c6,rinvsix,nul);
689 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
690 vcoul = vec_madd(qq,rinv,nul);
691 vnbtot = vec_add(vnbtot,vnb12);
692 vnbtot = vec_sub(vnbtot,vnb6);
693 vctot = vec_add(vctot,vcoul);
694 fs = vec_madd(vec_twelve(),vnb12,vcoul);
695 fs = vec_nmsub(vec_six(),vnb6,fs);
696 fs = vec_madd(fs,rinvsq,nul);
697 fix = vec_madd(fs,dx,fix); /* +=fx */
698 fiy = vec_madd(fs,dy,fiy); /* +=fy */
699 fiz = vec_madd(fs,dz,fiz); /* +=fz */
700 dx = vec_nmsub(dx,fs,nul); /* -fx */
701 dy = vec_nmsub(dy,fs,nul); /* -fy */
702 dz = vec_nmsub(dz,fs,nul); /* -fz */
703 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
704 add_xyz_to_mem(faction+j3a,tmp1);
705 add_xyz_to_mem(faction+j3b,tmp2);
706 k += 2;
708 if((nj1-nj0)%2) {
709 jnra = jjnr[k];
710 j3a = 3*jnra;
711 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
712 dx = vec_sub(ix,dx);
713 dy = vec_sub(iy,dy);
714 dz = vec_sub(iz,dz);
715 rsq = vec_madd(dx,dx,nul);
716 rsq = vec_madd(dy,dy,rsq);
717 rsq = vec_madd(dz,dz,rsq);
718 zero_highest_3_elements_in_vector(&rinv);
719 rinv = do_invsqrt(rsq);
720 rinvsq = vec_madd(rinv,rinv,nul);
721 rinvsix = vec_madd(rinvsq,rinvsq,nul);
722 rinvsix = vec_madd(rinvsix,rinvsq,nul);
723 tja = ntiA+2*type[jnra];
724 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
725 load_1_pair(nbfp+tja,&c6,&c12);
726 vnb6 = vec_madd(c6,rinvsix,nul);
727 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
728 vcoul = vec_madd(qq,rinv,nul);
729 vnbtot = vec_add(vnbtot,vnb12);
730 vnbtot = vec_sub(vnbtot,vnb6);
731 vctot = vec_add(vctot,vcoul);
732 fs = vec_madd(vec_twelve(),vnb12,vcoul);
733 fs = vec_nmsub(vec_six(),vnb6,fs);
734 fs = vec_madd(fs,rinvsq,nul);
735 fix = vec_madd(fs,dx,fix); /* +=fx */
736 fiy = vec_madd(fs,dy,fiy); /* +=fy */
737 fiz = vec_madd(fs,dz,fiz); /* +=fz */
738 dx = vec_nmsub(dx,fs,nul); /* -fx */
739 dy = vec_nmsub(dy,fs,nul); /* -fy */
740 dz = vec_nmsub(dz,fs,nul); /* -fz */
741 transpose_3_to_1(dx,dy,dz,&tmp1);
742 add_xyz_to_mem(faction+j3a,tmp1);
744 /* update outer data */
745 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
746 tmp1 = vec_add(tmp1,tmp3);
747 tmp2 = vec_add(tmp2,tmp4);
748 tmp1 = vec_add(tmp1,tmp2);
750 add_xyz_to_mem(faction+ii3,tmp1);
751 add_xyz_to_mem(fshift+is3,tmp1);
753 add_vector_to_float(Vc+gid[n],vctot);
754 add_vector_to_float(Vnb+gid[n],vnbtot);
761 void inl2000_altivec(
762 int nri,
763 int iinr[],
764 int jindex[],
765 int jjnr[],
766 int shift[],
767 float shiftvec[],
768 float fshift[],
769 int gid[],
770 float pos[],
771 float faction[],
772 float charge[],
773 float facel,
774 float Vc[],
775 float krf,
776 float crf)
778 vector float ix,iy,iz,shvec;
779 vector float vfacel,vkrf,vcrf,krsq,vcoul,fs,nul;
780 vector float dx,dy,dz;
781 vector float vctot,qq,iq;
782 vector float fix,fiy,fiz;
783 vector float tmp1,tmp2,tmp3,tmp4;
784 vector float rinv,rinvsq,rsq;
786 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
787 int jnra,jnrb,jnrc,jnrd;
788 int j3a,j3b,j3c,j3d;
790 nul=vec_zero();
791 vfacel=load_float_and_splat(&facel);
792 vkrf=load_float_and_splat(&krf);
793 vcrf=load_float_and_splat(&crf);
795 for(n=0;n<nri;n++) {
796 is3 = 3*shift[n];
797 shvec = load_xyz(shiftvec+is3);
798 ii = iinr[n];
799 ii3 = 3*ii;
800 ix = load_xyz(pos+ii3);
801 vctot = nul;
802 fix = nul;
803 fiy = nul;
804 fiz = nul;
805 ix = vec_add(ix,shvec);
806 nj0 = jindex[n];
807 nj1 = jindex[n+1];
808 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
809 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
811 for(k=nj0; k<(nj1-3); k+=4) {
812 jnra = jjnr[k];
813 jnrb = jjnr[k+1];
814 jnrc = jjnr[k+2];
815 jnrd = jjnr[k+3];
816 j3a = 3*jnra;
817 j3b = 3*jnrb;
818 j3c = 3*jnrc;
819 j3d = 3*jnrd;
820 transpose_4_to_3(load_xyz(pos+j3a),
821 load_xyz(pos+j3b),
822 load_xyz(pos+j3c),
823 load_xyz(pos+j3d),&dx,&dy,&dz);
824 dx = vec_sub(ix,dx);
825 dy = vec_sub(iy,dy);
826 dz = vec_sub(iz,dz);
827 rsq = vec_madd(dx,dx,nul);
828 rsq = vec_madd(dy,dy,rsq);
829 rsq = vec_madd(dz,dz,rsq);
830 rinv = do_invsqrt(rsq);
831 rinvsq = vec_madd(rinv,rinv,nul);
832 /* load 4 j charges and multiply by iq */
833 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
834 charge+jnrc,charge+jnrd),iq,nul);
835 krsq = vec_madd(vkrf,rsq,nul);
836 vcoul = vec_add(rinv,krsq);
837 vcoul = vec_sub(vcoul,vcrf);
839 fs = vec_nmsub(vec_two(),krsq,rinv);
840 vctot = vec_madd(qq,vcoul,vctot);
841 fs = vec_madd(fs,qq,nul);
842 fs = vec_madd(fs,rinvsq,nul);
844 fix = vec_madd(fs,dx,fix); /* +=fx */
845 fiy = vec_madd(fs,dy,fiy); /* +=fy */
846 fiz = vec_madd(fs,dz,fiz); /* +=fz */
847 dx = vec_nmsub(dx,fs,nul); /* -fx */
848 dy = vec_nmsub(dy,fs,nul); /* -fy */
849 dz = vec_nmsub(dz,fs,nul); /* -fz */
850 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
851 add_xyz_to_mem(faction+j3a,tmp1);
852 add_xyz_to_mem(faction+j3b,tmp2);
853 add_xyz_to_mem(faction+j3c,tmp3);
854 add_xyz_to_mem(faction+j3d,tmp4);
856 if(k<(nj1-1)) {
857 jnra = jjnr[k];
858 jnrb = jjnr[k+1];
859 j3a = 3*jnra;
860 j3b = 3*jnrb;
861 transpose_2_to_3(load_xyz(pos+j3a),
862 load_xyz(pos+j3b),&dx,&dy,&dz);
863 dx = vec_sub(ix,dx);
864 dy = vec_sub(iy,dy);
865 dz = vec_sub(iz,dz);
866 rsq = vec_madd(dx,dx,nul);
867 rsq = vec_madd(dy,dy,rsq);
868 rsq = vec_madd(dz,dz,rsq);
869 zero_highest_2_elements_in_vector(&rsq);
870 rinv = do_invsqrt(rsq);
871 zero_highest_2_elements_in_vector(&rinv);
872 rinvsq = vec_madd(rinv,rinv,nul);
873 /* load 2 j charges and multiply by iq */
874 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
875 krsq = vec_madd(vkrf,rsq,nul);
876 vcoul = vec_add(rinv,krsq);
877 vcoul = vec_sub(vcoul,vcrf);
878 fs = vec_nmsub(vec_two(),krsq,rinv);
879 vctot = vec_madd(qq,vcoul,vctot);
880 fs = vec_madd(fs,qq,nul);
881 fs = vec_madd(fs,rinvsq,nul);
882 fix = vec_madd(fs,dx,fix); /* +=fx */
883 fiy = vec_madd(fs,dy,fiy); /* +=fy */
884 fiz = vec_madd(fs,dz,fiz); /* +=fz */
885 dx = vec_nmsub(dx,fs,nul); /* -fx */
886 dy = vec_nmsub(dy,fs,nul); /* -fy */
887 dz = vec_nmsub(dz,fs,nul); /* -fz */
888 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
889 add_xyz_to_mem(faction+j3a,tmp1);
890 add_xyz_to_mem(faction+j3b,tmp2);
891 k += 2;
893 if((nj1-nj0)%2) {
894 jnra = jjnr[k];
895 j3a = 3*jnra;
896 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
897 dx = vec_sub(ix,dx);
898 dy = vec_sub(iy,dy);
899 dz = vec_sub(iz,dz);
900 rsq = vec_madd(dx,dx,nul);
901 rsq = vec_madd(dy,dy,rsq);
902 rsq = vec_madd(dz,dz,rsq);
903 zero_highest_3_elements_in_vector(&rsq);
904 rinv = do_invsqrt(rsq);
905 zero_highest_3_elements_in_vector(&rinv);
906 rinvsq = vec_madd(rinv,rinv,nul);
907 /* load 1 j charge and multiply by iq */
908 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
909 krsq = vec_madd(vkrf,rsq,nul);
910 vcoul = vec_add(rinv,krsq);
911 vcoul = vec_sub(vcoul,vcrf);
912 fs = vec_nmsub(vec_two(),krsq,rinv);
913 vctot = vec_madd(qq,vcoul,vctot);
914 fs = vec_madd(fs,qq,nul);
915 fs = vec_madd(fs,rinvsq,nul);
916 fix = vec_madd(fs,dx,fix); /* +=fx */
917 fiy = vec_madd(fs,dy,fiy); /* +=fy */
918 fiz = vec_madd(fs,dz,fiz); /* +=fz */
919 dx = vec_nmsub(dx,fs,nul); /* -fx */
920 dy = vec_nmsub(dy,fs,nul); /* -fy */
921 dz = vec_nmsub(dz,fs,nul); /* -fz */
922 transpose_3_to_1(dx,dy,dz,&tmp1);
923 add_xyz_to_mem(faction+j3a,tmp1);
925 /* update outer data */
926 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
927 tmp1 = vec_add(tmp1,tmp3);
928 tmp2 = vec_add(tmp2,tmp4);
929 tmp1 = vec_add(tmp1,tmp2);
930 add_xyz_to_mem(faction+ii3,tmp1);
931 add_xyz_to_mem(fshift+is3,tmp1);
933 add_vector_to_float(Vc+gid[n],vctot);
939 void inl2100_altivec(
940 int nri,
941 int iinr[],
942 int jindex[],
943 int jjnr[],
944 int shift[],
945 float shiftvec[],
946 float fshift[],
947 int gid[],
948 float pos[],
949 float faction[],
950 float charge[],
951 float facel,
952 float Vc[],
953 float krf,
954 float crf,
955 int type[],
956 int ntype,
957 float nbfp[],
958 float Vnb[])
960 vector float ix,iy,iz,shvec;
961 vector float vfacel,vkrf,vcrf,krsq,vcoul,fs,nul;
962 vector float dx,dy,dz;
963 vector float vnbtot,vctot,qq,iq,c6,c12;
964 vector float fix,fiy,fiz;
965 vector float tmp1,tmp2,tmp3,tmp4;
966 vector float rinv,rinvsq,rsq,rinvsix,vnb6,vnb12;
968 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
969 int jnra,jnrb,jnrc,jnrd;
970 int j3a,j3b,j3c,j3d;
971 int tja,tjb,tjc,tjd;
973 nul=vec_zero();
974 vfacel=load_float_and_splat(&facel);
975 vkrf=load_float_and_splat(&krf);
976 vcrf=load_float_and_splat(&crf);
978 for(n=0;n<nri;n++) {
979 is3 = 3*shift[n];
980 shvec = load_xyz(shiftvec+is3);
981 ii = iinr[n];
982 ii3 = 3*ii;
983 ix = load_xyz(pos+ii3);
984 vnbtot = nul;
985 vctot = nul;
986 fix = nul;
987 fiy = nul;
988 fiz = nul;
989 ix = vec_add(ix,shvec);
990 nj0 = jindex[n];
991 nj1 = jindex[n+1];
992 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
993 ntiA = 2*ntype*type[ii];
994 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
996 for(k=nj0; k<(nj1-3); k+=4) {
997 jnra = jjnr[k];
998 jnrb = jjnr[k+1];
999 jnrc = jjnr[k+2];
1000 jnrd = jjnr[k+3];
1001 j3a = 3*jnra;
1002 j3b = 3*jnrb;
1003 j3c = 3*jnrc;
1004 j3d = 3*jnrd;
1005 transpose_4_to_3(load_xyz(pos+j3a),
1006 load_xyz(pos+j3b),
1007 load_xyz(pos+j3c),
1008 load_xyz(pos+j3d),&dx,&dy,&dz);
1009 dx = vec_sub(ix,dx);
1010 dy = vec_sub(iy,dy);
1011 dz = vec_sub(iz,dz);
1012 rsq = vec_madd(dx,dx,nul);
1013 rsq = vec_madd(dy,dy,rsq);
1014 rsq = vec_madd(dz,dz,rsq);
1015 rinv = do_invsqrt(rsq);
1016 rinvsq = vec_madd(rinv,rinv,nul);
1017 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1018 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1019 tja = ntiA+2*type[jnra];
1020 tjb = ntiA+2*type[jnrb];
1021 tjc = ntiA+2*type[jnrc];
1022 tjd = ntiA+2*type[jnrd];
1023 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
1024 charge+jnrc,charge+jnrd),iq,nul);
1025 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
1026 vnb6 = vec_madd(c6,rinvsix,nul);
1027 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
1028 krsq = vec_madd(vkrf,rsq,nul);
1029 vcoul = vec_add(rinv,krsq);
1030 vcoul = vec_sub(vcoul,vcrf);
1031 vctot = vec_madd(qq,vcoul,vctot);
1032 vnbtot = vec_add(vnbtot,vnb12);
1033 vnbtot = vec_sub(vnbtot,vnb6);
1034 fs = vec_nmsub(vec_two(),krsq,rinv); /* rinv-2*krsq */
1035 fs = vec_madd(qq,fs,nul); /* qq*(rinv-2*krsq) */
1036 fs = vec_madd(vec_twelve(),vnb12,fs);
1037 fs = vec_nmsub(vec_six(),vnb6,fs);
1038 fs = vec_madd(fs,rinvsq,nul);
1039 fix = vec_madd(fs,dx,fix); /* +=fx */
1040 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1041 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1042 dx = vec_nmsub(dx,fs,nul); /* -fx */
1043 dy = vec_nmsub(dy,fs,nul); /* -fy */
1044 dz = vec_nmsub(dz,fs,nul); /* -fz */
1045 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
1046 add_xyz_to_mem(faction+j3a,tmp1);
1047 add_xyz_to_mem(faction+j3b,tmp2);
1048 add_xyz_to_mem(faction+j3c,tmp3);
1049 add_xyz_to_mem(faction+j3d,tmp4);
1051 if(k<(nj1-1)) {
1052 jnra = jjnr[k];
1053 jnrb = jjnr[k+1];
1054 j3a = 3*jnra;
1055 j3b = 3*jnrb;
1056 transpose_2_to_3(load_xyz(pos+j3a),
1057 load_xyz(pos+j3b),&dx,&dy,&dz);
1058 dx = vec_sub(ix,dx);
1059 dy = vec_sub(iy,dy);
1060 dz = vec_sub(iz,dz);
1061 rsq = vec_madd(dx,dx,nul);
1062 rsq = vec_madd(dy,dy,rsq);
1063 rsq = vec_madd(dz,dz,rsq);
1064 zero_highest_2_elements_in_vector(&rsq);
1065 rinv = do_invsqrt(rsq);
1066 zero_highest_2_elements_in_vector(&rinv);
1067 rinvsq = vec_madd(rinv,rinv,nul);
1068 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1069 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1070 tja = ntiA+2*type[jnra];
1071 tjb = ntiA+2*type[jnrb];
1072 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
1073 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
1074 vnb6 = vec_madd(c6,rinvsix,nul);
1075 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
1076 krsq = vec_madd(vkrf,rsq,nul);
1077 vcoul = vec_add(rinv,krsq);
1078 vcoul = vec_sub(vcoul,vcrf);
1079 vctot = vec_madd(qq,vcoul,vctot);
1080 vnbtot = vec_add(vnbtot,vnb12);
1081 vnbtot = vec_sub(vnbtot,vnb6);
1082 fs = vec_nmsub(vec_two(),krsq,rinv); /* rinv-2*krsq */
1083 fs = vec_madd(qq,fs,nul); /* qq*(rinv-2*krsq) */
1084 fs = vec_madd(vec_twelve(),vnb12,fs);
1085 fs = vec_nmsub(vec_six(),vnb6,fs);
1086 fs = vec_madd(fs,rinvsq,nul);
1087 fix = vec_madd(fs,dx,fix); /* +=fx */
1088 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1089 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1090 dx = vec_nmsub(dx,fs,nul); /* -fx */
1091 dy = vec_nmsub(dy,fs,nul); /* -fy */
1092 dz = vec_nmsub(dz,fs,nul); /* -fz */
1093 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
1094 add_xyz_to_mem(faction+j3a,tmp1);
1095 add_xyz_to_mem(faction+j3b,tmp2);
1096 k += 2;
1098 if((nj1-nj0)%2) {
1099 jnra = jjnr[k];
1100 j3a = 3*jnra;
1101 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
1102 dx = vec_sub(ix,dx);
1103 dy = vec_sub(iy,dy);
1104 dz = vec_sub(iz,dz);
1105 rsq = vec_madd(dx,dx,nul);
1106 rsq = vec_madd(dy,dy,rsq);
1107 rsq = vec_madd(dz,dz,rsq);
1108 zero_highest_3_elements_in_vector(&rsq);
1109 rinv = do_invsqrt(rsq);
1110 zero_highest_3_elements_in_vector(&rinv);
1111 rinvsq = vec_madd(rinv,rinv,nul);
1112 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1113 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1114 tja = ntiA+2*type[jnra];
1115 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
1116 load_1_pair(nbfp+tja,&c6,&c12);
1117 vnb6 = vec_madd(c6,rinvsix,nul);
1118 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
1119 krsq = vec_madd(vkrf,rsq,nul);
1120 vcoul = vec_add(rinv,krsq);
1121 vcoul = vec_sub(vcoul,vcrf);
1122 vctot = vec_madd(qq,vcoul,vctot);
1123 vnbtot = vec_add(vnbtot,vnb12);
1124 vnbtot = vec_sub(vnbtot,vnb6);
1125 fs = vec_nmsub(vec_two(),krsq,rinv); /* rinv-2*krsq */
1126 fs = vec_madd(qq,fs,nul); /* qq*(rinv-2*krsq) */
1127 fs = vec_madd(vec_twelve(),vnb12,fs);
1128 fs = vec_nmsub(vec_six(),vnb6,fs);
1129 fs = vec_madd(fs,rinvsq,nul);
1130 fix = vec_madd(fs,dx,fix); /* +=fx */
1131 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1132 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1133 dx = vec_nmsub(dx,fs,nul); /* -fx */
1134 dy = vec_nmsub(dy,fs,nul); /* -fy */
1135 dz = vec_nmsub(dz,fs,nul); /* -fz */
1136 transpose_3_to_1(dx,dy,dz,&tmp1);
1137 add_xyz_to_mem(faction+j3a,tmp1);
1139 /* update outer data */
1140 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
1141 tmp1 = vec_add(tmp1,tmp3);
1142 tmp2 = vec_add(tmp2,tmp4);
1143 tmp1 = vec_add(tmp1,tmp2);
1145 add_xyz_to_mem(faction+ii3,tmp1);
1146 add_xyz_to_mem(fshift+is3,tmp1);
1148 add_vector_to_float(Vc+gid[n],vctot);
1149 add_vector_to_float(Vnb+gid[n],vnbtot);
/*
 * inl3000_altivec - AltiVec inner loop using a tabulated Coulomb lookup.
 *
 * For each of the nri outer entries n, the i atom iinr[n] (its position
 * offset by the three floats at shiftvec+3*shift[n]) is paired with the
 * j atoms jjnr[jindex[n]] .. jjnr[jindex[n+1]-1].  j atoms are handled
 * four per pass; what is left over goes through a two-atom and then a
 * one-atom tail.
 *
 * Per pair:
 *   qq     = charge[i]*facel*charge[j]
 *   r      = rinv*rsq
 *   VVc,FFc  obtained from VFtab at r*tabscale
 *   vctot += qq*VVc
 *   fs     = -qq*FFc*tabscale*rinv
 * fs*(dx,dy,dz) is accumulated for the i atom and its negation is added
 * to faction+3*j for every j atom.
 *
 * Written to: faction (i and j atoms), fshift+3*shift[n] (gets the same
 * summed i force), Vc+gid[n] (gets vctot through add_vector_to_float).
 *
 * The helpers called here (load_xyz, do_invsqrt, the transpose and
 * ctable routines, add_xyz_to_mem, ...) are defined outside this part of
 * the file; remarks about them describe what their use here suggests
 * and should be confirmed against their definitions.
 *
 * NOTE(review): vcoul, k0 and ntiA are declared but not referenced in
 * the lines visible here.
 */
1158 void inl3000_altivec(
1159 int nri,
1160 int iinr[],
1161 int jindex[],
1162 int jjnr[],
1163 int shift[],
1164 float shiftvec[],
1165 float fshift[],
1166 int gid[],
1167 float pos[],
1168 float faction[],
1169 float charge[],
1170 float facel,
1171 float Vc[],
1172 float tabscale,
1173 float VFtab[])
1175 vector float ix,iy,iz,shvec;
1176 vector float vfacel,tsc,vcoul,fs,nul;
1177 vector float dx,dy,dz;
1178 vector float vctot,qq,iq;
1179 vector float fix,fiy,fiz;
1180 vector float tmp1,tmp2,tmp3,tmp4;
1181 vector float rinv,r,rsq,VVc,FFc;
1183 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
1184 int jnra,jnrb,jnrc,jnrd;
1185 int j3a,j3b,j3c,j3d;
/* loop-invariant vectors: zero, facel and tabscale */
1187 nul=vec_zero();
1188 vfacel=load_float_and_splat(&facel);
1189 tsc=load_float_and_splat(&tabscale);
/* outer loop: one i atom per entry */
1191 for(n=0;n<nri;n++) {
1192 is3 = 3*shift[n];
1193 shvec = load_xyz(shiftvec+is3);
1194 ii = iinr[n];
1195 ii3 = 3*ii;
1196 ix = load_xyz(pos+ii3);
/* reset the per-i energy and force accumulators */
1197 vctot = nul;
1198 fix = nul;
1199 fiy = nul;
1200 fiz = nul;
1201 ix = vec_add(ix,shvec);
1202 nj0 = jindex[n];
1203 nj1 = jindex[n+1];
/* ix is both input and output: it goes in holding the shifted xyz and
   comes back as one of the three per-coordinate vectors */
1204 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
1205 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
/* main j loop: four neighbours per pass */
1207 for(k=nj0; k<(nj1-3); k+=4) {
1208 jnra = jjnr[k];
1209 jnrb = jjnr[k+1];
1210 jnrc = jjnr[k+2];
1211 jnrd = jjnr[k+3];
1212 j3a = 3*jnra;
1213 j3b = 3*jnrb;
1214 j3c = 3*jnrc;
1215 j3d = 3*jnrd;
1216 transpose_4_to_3(load_xyz(pos+j3a),
1217 load_xyz(pos+j3b),
1218 load_xyz(pos+j3c),
1219 load_xyz(pos+j3d),&dx,&dy,&dz);
/* distance vector i-j and its squared length */
1220 dx = vec_sub(ix,dx);
1221 dy = vec_sub(iy,dy);
1222 dz = vec_sub(iz,dz);
1223 rsq = vec_madd(dx,dx,nul);
1224 rsq = vec_madd(dy,dy,rsq);
1225 rsq = vec_madd(dz,dz,rsq);
1226 rinv = do_invsqrt(rsq);
1227 r = vec_madd(rinv,rsq,nul);
1228 /* load 4 j charges and multiply by iq */
1229 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
1230 charge+jnrc,charge+jnrd),iq,nul);
1231 do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
/* vec_nmsub(a,b,c) gives c-a*b, so this is fs = -qq*FFc */
1232 fs = vec_nmsub(qq,FFc,nul);
1233 vctot = vec_madd(qq,VVc,vctot);
1234 fs = vec_madd(fs,tsc,nul);
1235 fs = vec_madd(fs,rinv,nul);
1236 fix = vec_madd(fs,dx,fix); /* +=fx */
1237 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1238 fiz = vec_madd(fs,dz,fiz); /* +=fz */
/* dx,dy,dz are overwritten with the negated force for the j atoms */
1239 dx = vec_nmsub(dx,fs,nul); /* -fx */
1240 dy = vec_nmsub(dy,fs,nul); /* -fy */
1241 dz = vec_nmsub(dz,fs,nul); /* -fz */
1242 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
1243 add_xyz_to_mem(faction+j3a,tmp1);
1244 add_xyz_to_mem(faction+j3b,tmp2);
1245 add_xyz_to_mem(faction+j3c,tmp3);
1246 add_xyz_to_mem(faction+j3d,tmp4);
/* tail: two or three neighbours are left, take two of them */
1248 if(k<(nj1-1)) {
1249 jnra = jjnr[k];
1250 jnrb = jjnr[k+1];
1251 j3a = 3*jnra;
1252 j3b = 3*jnrb;
1253 transpose_2_to_3(load_xyz(pos+j3a),
1254 load_xyz(pos+j3b),&dx,&dy,&dz);
1255 dx = vec_sub(ix,dx);
1256 dy = vec_sub(iy,dy);
1257 dz = vec_sub(iz,dz);
1258 rsq = vec_madd(dx,dx,nul);
1259 rsq = vec_madd(dy,dy,rsq);
1260 rsq = vec_madd(dz,dz,rsq);
/* only two elements carry real pairs; going by the helper name the
   other two are cleared in rsq and again in rinv - TODO confirm */
1261 zero_highest_2_elements_in_vector(&rsq);
1262 rinv = do_invsqrt(rsq);
1263 zero_highest_2_elements_in_vector(&rinv);
1264 r = vec_madd(rinv,rsq,nul);
1265 /* load 2 j charges and multiply by iq */
1266 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
1267 do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
1268 fs = vec_nmsub(qq,FFc,nul);
1269 vctot = vec_madd(qq,VVc,vctot);
1270 fs = vec_madd(fs,tsc,nul);
1271 fs = vec_madd(fs,rinv,nul);
1272 fix = vec_madd(fs,dx,fix); /* +=fx */
1273 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1274 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1275 dx = vec_nmsub(dx,fs,nul); /* -fx */
1276 dy = vec_nmsub(dy,fs,nul); /* -fy */
1277 dz = vec_nmsub(dz,fs,nul); /* -fz */
1278 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
1279 add_xyz_to_mem(faction+j3a,tmp1);
1280 add_xyz_to_mem(faction+j3b,tmp2);
1281 k += 2;
/* tail: an odd neighbour count leaves one last j atom at index k */
1283 if((nj1-nj0)%2) {
1284 jnra = jjnr[k];
1285 j3a = 3*jnra;
1286 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
1287 dx = vec_sub(ix,dx);
1288 dy = vec_sub(iy,dy);
1289 dz = vec_sub(iz,dz);
1290 rsq = vec_madd(dx,dx,nul);
1291 rsq = vec_madd(dy,dy,rsq);
1292 rsq = vec_madd(dz,dz,rsq);
1293 zero_highest_3_elements_in_vector(&rsq);
1294 rinv = do_invsqrt(rsq);
1295 zero_highest_3_elements_in_vector(&rinv);
1296 r = vec_madd(rinv,rsq,nul);
1297 /* load 1 j charge and multiply by iq */
1298 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
1299 do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
1300 fs = vec_nmsub(qq,FFc,nul);
1301 vctot = vec_madd(qq,VVc,vctot);
1302 fs = vec_madd(fs,tsc,nul);
1303 fs = vec_madd(fs,rinv,nul);
1304 fix = vec_madd(fs,dx,fix); /* +=fx */
1305 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1306 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1307 dx = vec_nmsub(dx,fs,nul); /* -fx */
1308 dy = vec_nmsub(dy,fs,nul); /* -fy */
1309 dz = vec_nmsub(dz,fs,nul); /* -fz */
1310 transpose_3_to_1(dx,dy,dz,&tmp1);
1311 add_xyz_to_mem(faction+j3a,tmp1);
1313 /* update outer data */
/* the four vectors produced by the transpose are summed into tmp1,
   which is then added to both the i atom force and the shift force */
1314 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
1315 tmp1 = vec_add(tmp1,tmp3);
1316 tmp2 = vec_add(tmp2,tmp4);
1317 tmp1 = vec_add(tmp1,tmp2);
1318 add_xyz_to_mem(faction+ii3,tmp1);
1319 add_xyz_to_mem(fshift+is3,tmp1);
1321 add_vector_to_float(Vc+gid[n],vctot);
/*
 * inl3100_altivec - AltiVec inner loop combining a tabulated Coulomb
 * lookup with an analytical c6/c12 (r^-6 / r^-12) term.
 *
 * For each of the nri outer entries n, the i atom iinr[n] (its position
 * offset by the three floats at shiftvec+3*shift[n]) is paired with the
 * j atoms jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], four per pass with a
 * two-atom and a one-atom tail.
 *
 * Per pair:
 *   qq       = charge[i]*facel*charge[j]
 *   c6,c12     read from nbfp at 2*ntype*type[i] + 2*type[j]
 *   rinvsix  = rinvsq*rinvsq*rinvsq
 *   vnb6     = c6*rinvsix,  vnb12 = c12*rinvsix*rinvsix
 *   VVc,FFc    obtained from VFtab at r*tabscale, with r = rinv*rsq
 *   vctot   += qq*VVc
 *   vnbtot  += vnb12 - vnb6
 *   fs       = ((12*vnb12 - 6*vnb6)*rinv - qq*FFc*tabscale)*rinv
 * fs*(dx,dy,dz) is accumulated for the i atom and its negation is added
 * to faction+3*j for every j atom.
 *
 * Written to: faction, fshift+3*shift[n], Vc+gid[n] (vctot) and
 * Vnb+gid[n] (vnbtot).
 *
 * The helpers called here are defined outside this part of the file;
 * remarks about them describe what their use here suggests and should
 * be confirmed against their definitions.
 *
 * NOTE(review): vcoul and k0 are declared but not referenced in the
 * lines visible here.
 */
1327 void inl3100_altivec(
1328 int nri,
1329 int iinr[],
1330 int jindex[],
1331 int jjnr[],
1332 int shift[],
1333 float shiftvec[],
1334 float fshift[],
1335 int gid[],
1336 float pos[],
1337 float faction[],
1338 float charge[],
1339 float facel,
1340 float Vc[],
1341 int type[],
1342 int ntype,
1343 float nbfp[],
1344 float Vnb[],
1345 float tabscale,
1346 float VFtab[])
1348 vector float ix,iy,iz,shvec;
1349 vector float vfacel,vcoul,tsc,fs,fs2,nul;
1350 vector float dx,dy,dz;
1351 vector float vnbtot,vctot,qq,iq,c6,c12,VVc,FFc;
1352 vector float fix,fiy,fiz;
1353 vector float tmp1,tmp2,tmp3,tmp4;
1354 vector float rinv,r,rinvsq,rsq,rinvsix,vnb6,vnb12;
1356 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
1357 int jnra,jnrb,jnrc,jnrd;
1358 int j3a,j3b,j3c,j3d;
1359 int tja,tjb,tjc,tjd;
/* loop-invariant vectors: zero, facel and tabscale */
1361 nul=vec_zero();
1362 vfacel=load_float_and_splat(&facel);
1363 tsc=load_float_and_splat(&tabscale);
/* outer loop: one i atom per entry */
1365 for(n=0;n<nri;n++) {
1366 is3 = 3*shift[n];
1367 shvec = load_xyz(shiftvec+is3);
1368 ii = iinr[n];
1369 ii3 = 3*ii;
1370 ix = load_xyz(pos+ii3);
/* reset the per-i energy and force accumulators */
1371 vnbtot = nul;
1372 vctot = nul;
1373 fix = nul;
1374 fiy = nul;
1375 fiz = nul;
1376 ix = vec_add(ix,shvec);
1377 nj0 = jindex[n];
1378 nj1 = jindex[n+1];
1379 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
/* base offset into nbfp for this i atom's type; each (i,j) type pair
   is addressed as ntiA+2*type[j] below */
1380 ntiA = 2*ntype*type[ii];
1381 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
/* main j loop: four neighbours per pass */
1383 for(k=nj0; k<(nj1-3); k+=4) {
1384 jnra = jjnr[k];
1385 jnrb = jjnr[k+1];
1386 jnrc = jjnr[k+2];
1387 jnrd = jjnr[k+3];
1388 j3a = 3*jnra;
1389 j3b = 3*jnrb;
1390 j3c = 3*jnrc;
1391 j3d = 3*jnrd;
1392 transpose_4_to_3(load_xyz(pos+j3a),
1393 load_xyz(pos+j3b),
1394 load_xyz(pos+j3c),
1395 load_xyz(pos+j3d),&dx,&dy,&dz);
/* distance vector i-j and its squared length */
1396 dx = vec_sub(ix,dx);
1397 dy = vec_sub(iy,dy);
1398 dz = vec_sub(iz,dz);
1399 rsq = vec_madd(dx,dx,nul);
1400 rsq = vec_madd(dy,dy,rsq);
1401 rsq = vec_madd(dz,dz,rsq);
1402 rinv = do_invsqrt(rsq);
1403 rinvsq = vec_madd(rinv,rinv,nul);
1404 r = vec_madd(rinv,rsq,nul);
1405 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1406 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1407 tja = ntiA+2*type[jnra];
1408 tjb = ntiA+2*type[jnrb];
1409 tjc = ntiA+2*type[jnrc];
1410 tjd = ntiA+2*type[jnrd];
1411 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
1412 charge+jnrc,charge+jnrd),iq,nul);
1413 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
1414 do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
1415 fs2 = vec_madd(qq,FFc,nul); /* fijC */
1416 vctot = vec_madd(qq,VVc,vctot);
1417 vnb6 = vec_madd(c6,rinvsix,nul);
1418 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
/* vec_nmsub(a,b,c) gives c-a*b: fs = 12*vnb12 - 6*vnb6, scaled by
   rinv, then the tabulated part fs2*tsc is subtracted and the result
   is scaled by rinv once more */
1419 fs = vec_madd(vec_twelve(),vnb12,nul);
1420 fs = vec_nmsub(vec_six(),vnb6,fs);
1421 fs = vec_madd(fs,rinv,nul);
1422 vnbtot = vec_add(vnbtot,vnb12);
1423 fs = vec_nmsub(fs2,tsc,fs);
1424 fs = vec_madd(fs,rinv,nul);
1425 vnbtot = vec_sub(vnbtot,vnb6);
1426 fix = vec_madd(fs,dx,fix); /* +=fx */
1427 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1428 fiz = vec_madd(fs,dz,fiz); /* +=fz */
/* dx,dy,dz are overwritten with the negated force for the j atoms */
1429 dx = vec_nmsub(dx,fs,nul); /* -fx */
1430 dy = vec_nmsub(dy,fs,nul); /* -fy */
1431 dz = vec_nmsub(dz,fs,nul); /* -fz */
1432 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
1433 add_xyz_to_mem(faction+j3a,tmp1);
1434 add_xyz_to_mem(faction+j3b,tmp2);
1435 add_xyz_to_mem(faction+j3c,tmp3);
1436 add_xyz_to_mem(faction+j3d,tmp4);
/* tail: two or three neighbours are left, take two of them */
1438 if(k<(nj1-1)) {
1439 jnra = jjnr[k];
1440 jnrb = jjnr[k+1];
1441 j3a = 3*jnra;
1442 j3b = 3*jnrb;
1443 transpose_2_to_3(load_xyz(pos+j3a),
1444 load_xyz(pos+j3b),&dx,&dy,&dz);
1445 dx = vec_sub(ix,dx);
1446 dy = vec_sub(iy,dy);
1447 dz = vec_sub(iz,dz);
1448 rsq = vec_madd(dx,dx,nul);
1449 rsq = vec_madd(dy,dy,rsq);
1450 rsq = vec_madd(dz,dz,rsq);
/* only two elements carry real pairs; going by the helper name the
   other two are cleared in rsq and again in rinv - TODO confirm */
1451 zero_highest_2_elements_in_vector(&rsq);
1452 rinv = do_invsqrt(rsq);
1453 zero_highest_2_elements_in_vector(&rinv);
1454 rinvsq = vec_madd(rinv,rinv,nul);
1455 r = vec_madd(rinv,rsq,nul);
1456 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1457 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1458 tja = ntiA+2*type[jnra];
1459 tjb = ntiA+2*type[jnrb];
1460 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
1461 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
1462 do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
1463 fs2 = vec_madd(qq,FFc,nul); /* fijC */
1464 vctot = vec_madd(qq,VVc,vctot);
1465 vnb6 = vec_madd(c6,rinvsix,nul);
1466 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
/* same force expression as in the four-wide loop; only the position of
   the independent vnbtot update differs */
1467 fs = vec_madd(vec_twelve(),vnb12,nul);
1468 fs = vec_nmsub(vec_six(),vnb6,fs);
1469 vnbtot = vec_add(vnbtot,vnb12);
1470 fs = vec_madd(fs,rinv,nul);
1471 fs = vec_nmsub(fs2,tsc,fs);
1472 fs = vec_madd(fs,rinv,nul);
1473 vnbtot = vec_sub(vnbtot,vnb6);
1474 fix = vec_madd(fs,dx,fix); /* +=fx */
1475 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1476 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1477 dx = vec_nmsub(dx,fs,nul); /* -fx */
1478 dy = vec_nmsub(dy,fs,nul); /* -fy */
1479 dz = vec_nmsub(dz,fs,nul); /* -fz */
1480 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
1481 add_xyz_to_mem(faction+j3a,tmp1);
1482 add_xyz_to_mem(faction+j3b,tmp2);
1483 k += 2;
/* tail: an odd neighbour count leaves one last j atom at index k */
1485 if((nj1-nj0)%2) {
1486 jnra = jjnr[k];
1487 j3a = 3*jnra;
1488 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
1489 dx = vec_sub(ix,dx);
1490 dy = vec_sub(iy,dy);
1491 dz = vec_sub(iz,dz);
1492 rsq = vec_madd(dx,dx,nul);
1493 rsq = vec_madd(dy,dy,rsq);
1494 rsq = vec_madd(dz,dz,rsq);
1495 zero_highest_3_elements_in_vector(&rsq);
1496 rinv = do_invsqrt(rsq);
1497 zero_highest_3_elements_in_vector(&rinv);
1498 rinvsq = vec_madd(rinv,rinv,nul);
1499 r = vec_madd(rinv,rsq,nul);
1500 rinvsix = vec_madd(rinvsq,rinvsq,nul);
1501 rinvsix = vec_madd(rinvsix,rinvsq,nul);
1502 tja = ntiA+2*type[jnra];
1503 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
1504 load_1_pair(nbfp+tja,&c6,&c12);
1505 do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
1506 fs2 = vec_madd(qq,FFc,nul); /* fijC */
1507 vctot = vec_madd(qq,VVc,vctot);
1508 vnb6 = vec_madd(c6,rinvsix,nul);
1509 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
1510 fs = vec_madd(vec_twelve(),vnb12,nul);
1511 fs = vec_nmsub(vec_six(),vnb6,fs);
1512 fs = vec_madd(fs,rinv,nul);
1513 vnbtot = vec_add(vnbtot,vnb12);
1514 fs = vec_nmsub(fs2,tsc,fs);
1515 fs = vec_madd(fs,rinv,nul);
1516 vnbtot = vec_sub(vnbtot,vnb6);
1517 fix = vec_madd(fs,dx,fix); /* +=fx */
1518 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1519 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1520 dx = vec_nmsub(dx,fs,nul); /* -fx */
1521 dy = vec_nmsub(dy,fs,nul); /* -fy */
1522 dz = vec_nmsub(dz,fs,nul); /* -fz */
1523 transpose_3_to_1(dx,dy,dz,&tmp1);
1524 add_xyz_to_mem(faction+j3a,tmp1);
1526 /* update outer data */
/* the four vectors produced by the transpose are summed into tmp1,
   which is then added to both the i atom force and the shift force */
1527 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
1528 tmp1 = vec_add(tmp1,tmp3);
1529 tmp2 = vec_add(tmp2,tmp4);
1530 tmp1 = vec_add(tmp1,tmp2);
1532 add_xyz_to_mem(faction+ii3,tmp1);
1533 add_xyz_to_mem(fshift+is3,tmp1);
1535 add_vector_to_float(Vc+gid[n],vctot);
1536 add_vector_to_float(Vnb+gid[n],vnbtot);
/*
 * inl3300_altivec - AltiVec inner loop in which both the Coulomb term
 * and the c6/c12 terms are taken from the table VFtab.
 *
 * For each of the nri outer entries n, the i atom iinr[n] (its position
 * offset by the three floats at shiftvec+3*shift[n]) is paired with the
 * j atoms jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], four per pass with a
 * two-atom and a one-atom tail.
 *
 * Per pair:
 *   qq       = charge[i]*facel*charge[j]
 *   c6,c12     read from nbfp at 2*ntype*type[i] + 2*type[j]
 *   VVc,FFc,VVd,FFd,VVr,FFr  obtained from VFtab at r*tabscale,
 *                            with r = rinv*rsq
 *   vctot   += qq*VVc
 *   vnbtot  += c6*VVd + c12*VVr
 *   fs       = -(qq*FFc + c6*FFd + c12*FFr)*tabscale*rinv
 * fs*(dx,dy,dz) is accumulated for the i atom and its negation is added
 * to faction+3*j for every j atom.
 *
 * Written to: faction, fshift+3*shift[n], Vnb+gid[n] (vnbtot) and
 * Vc+gid[n] (vctot).
 *
 * The helpers called here are defined outside this part of the file;
 * remarks about them describe what their use here suggests and should
 * be confirmed against their definitions.
 *
 * NOTE(review): vcoul and k0 are declared but not referenced in the
 * lines visible here.
 */
1541 void inl3300_altivec(
1542 int nri,
1543 int iinr[],
1544 int jindex[],
1545 int jjnr[],
1546 int shift[],
1547 float shiftvec[],
1548 float fshift[],
1549 int gid[],
1550 float pos[],
1551 float faction[],
1552 float charge[],
1553 float facel,
1554 float Vc[],
1555 int type[],
1556 int ntype,
1557 float nbfp[],
1558 float Vnb[],
1559 float tabscale,
1560 float VFtab[])
1562 vector float ix,iy,iz,shvec;
1563 vector float fs,nul,tsc;
1564 vector float dx,dy,dz,vfacel,vcoul,vctot;
1565 vector float vnbtot,c6,c12,iq,qq;
1566 vector float fix,fiy,fiz;
1567 vector float tmp1,tmp2,tmp3,tmp4;
1568 vector float rinv,r,rsq;
1569 vector float VVc,FFc,VVd,FFd,VVr,FFr;
1571 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
1572 int jnra,jnrb,jnrc,jnrd;
1573 int j3a,j3b,j3c,j3d;
1574 int tja,tjb,tjc,tjd;
/* loop-invariant vectors: zero, tabscale and facel */
1576 nul=vec_zero();
1577 tsc=load_float_and_splat(&tabscale);
1578 vfacel=load_float_and_splat(&facel);
/* outer loop: one i atom per entry */
1580 for(n=0;n<nri;n++) {
1581 is3 = 3*shift[n];
1582 shvec = load_xyz(shiftvec+is3);
1583 ii = iinr[n];
1584 ii3 = 3*ii;
1585 ix = load_xyz(pos+ii3);
/* reset the per-i energy and force accumulators */
1586 vnbtot = nul;
1587 vctot = nul;
1588 fix = nul;
1589 fiy = nul;
1590 fiz = nul;
1591 ix = vec_add(ix,shvec);
1592 nj0 = jindex[n];
1593 nj1 = jindex[n+1];
1594 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
/* base offset into nbfp for this i atom's type */
1595 ntiA = 2*ntype*type[ii];
1596 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
/* main j loop: four neighbours per pass */
1598 for(k=nj0; k<(nj1-3); k+=4) {
1599 jnra = jjnr[k];
1600 jnrb = jjnr[k+1];
1601 jnrc = jjnr[k+2];
1602 jnrd = jjnr[k+3];
1603 j3a = 3*jnra;
1604 j3b = 3*jnrb;
1605 j3c = 3*jnrc;
1606 j3d = 3*jnrd;
1607 transpose_4_to_3(load_xyz(pos+j3a),
1608 load_xyz(pos+j3b),
1609 load_xyz(pos+j3c),
1610 load_xyz(pos+j3d),&dx,&dy,&dz);
/* distance vector i-j and its squared length */
1611 dx = vec_sub(ix,dx);
1612 dy = vec_sub(iy,dy);
1613 dz = vec_sub(iz,dz);
1614 rsq = vec_madd(dx,dx,nul);
1615 rsq = vec_madd(dy,dy,rsq);
1616 rsq = vec_madd(dz,dz,rsq);
1617 rinv = do_invsqrt(rsq);
1618 r = vec_madd(rinv,rsq,nul);
1619 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
1620 charge+jnrc,charge+jnrd),iq,nul);
1621 tja = ntiA+2*type[jnra];
1622 tjb = ntiA+2*type[jnrb];
1623 tjc = ntiA+2*type[jnrc];
1624 tjd = ntiA+2*type[jnrd];
1625 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
1626 do_4_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc,&VVd,&FFd,&VVr,&FFr);
/* energy and force updates are interleaved; vec_nmsub(a,b,c) gives
   c-a*b, so fs collects -(qq*FFc + c6*FFd + c12*FFr) */
1627 vctot = vec_madd(qq,VVc,vctot);
1628 fs = vec_nmsub(qq,FFc,nul);
1629 vnbtot = vec_madd(c6,VVd,vnbtot);
1630 fs = vec_nmsub(c6,FFd,fs);
1631 vnbtot = vec_madd(c12,VVr,vnbtot);
1632 fs = vec_nmsub(c12,FFr,fs);
1633 fs = vec_madd(fs,tsc,nul);
1634 fs = vec_madd(fs,rinv,nul);
1635 fix = vec_madd(fs,dx,fix); /* +=fx */
1636 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1637 fiz = vec_madd(fs,dz,fiz); /* +=fz */
/* dx,dy,dz are overwritten with the negated force for the j atoms */
1638 dx = vec_nmsub(dx,fs,nul); /* -fx */
1639 dy = vec_nmsub(dy,fs,nul); /* -fy */
1640 dz = vec_nmsub(dz,fs,nul); /* -fz */
1641 transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
1642 add_xyz_to_mem(faction+j3a,tmp1);
1643 add_xyz_to_mem(faction+j3b,tmp2);
1644 add_xyz_to_mem(faction+j3c,tmp3);
1645 add_xyz_to_mem(faction+j3d,tmp4);
/* tail: two or three neighbours are left, take two of them */
1647 if(k<(nj1-1)) {
1648 jnra = jjnr[k];
1649 jnrb = jjnr[k+1];
1650 j3a = 3*jnra;
1651 j3b = 3*jnrb;
1652 transpose_2_to_3(load_xyz(pos+j3a),
1653 load_xyz(pos+j3b),&dx,&dy,&dz);
1654 dx = vec_sub(ix,dx);
1655 dy = vec_sub(iy,dy);
1656 dz = vec_sub(iz,dz);
1657 rsq = vec_madd(dx,dx,nul);
1658 rsq = vec_madd(dy,dy,rsq);
1659 rsq = vec_madd(dz,dz,rsq);
/* only two elements carry real pairs; going by the helper name the
   other two are cleared in rsq and again in rinv - TODO confirm */
1660 zero_highest_2_elements_in_vector(&rsq);
1661 rinv = do_invsqrt(rsq);
1662 zero_highest_2_elements_in_vector(&rinv);
1663 r = vec_madd(rinv,rsq,nul);
1664 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
1665 tja = ntiA+2*type[jnra];
1666 tjb = ntiA+2*type[jnrb];
1667 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
1668 do_2_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc,&VVd,&FFd,&VVr,&FFr);
1669 vctot = vec_madd(qq,VVc,vctot);
1670 fs = vec_nmsub(qq,FFc,nul);
1671 vnbtot = vec_madd(c6,VVd,vnbtot);
1672 fs = vec_nmsub(c6,FFd,fs);
1673 vnbtot = vec_madd(c12,VVr,vnbtot);
1674 fs = vec_nmsub(c12,FFr,fs);
1675 fs = vec_madd(fs,tsc,nul);
1676 fs = vec_madd(fs,rinv,nul);
1677 fix = vec_madd(fs,dx,fix); /* +=fx */
1678 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1679 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1680 dx = vec_nmsub(dx,fs,nul); /* -fx */
1681 dy = vec_nmsub(dy,fs,nul); /* -fy */
1682 dz = vec_nmsub(dz,fs,nul); /* -fz */
1683 transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
1684 add_xyz_to_mem(faction+j3a,tmp1);
1685 add_xyz_to_mem(faction+j3b,tmp2);
1686 k += 2;
/* tail: an odd neighbour count leaves one last j atom at index k */
1688 if((nj1-nj0)%2) {
1689 jnra = jjnr[k];
1690 j3a = 3*jnra;
1691 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
1692 dx = vec_sub(ix,dx);
1693 dy = vec_sub(iy,dy);
1694 dz = vec_sub(iz,dz);
1695 rsq = vec_madd(dx,dx,nul);
1696 rsq = vec_madd(dy,dy,rsq);
1697 rsq = vec_madd(dz,dz,rsq);
1698 zero_highest_3_elements_in_vector(&rsq);
1699 rinv = do_invsqrt(rsq);
1700 zero_highest_3_elements_in_vector(&rinv);
1701 r = vec_madd(rinv,rsq,nul);
1702 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
1703 tja = ntiA+2*type[jnra];
1704 load_1_pair(nbfp+tja,&c6,&c12);
1705 do_1_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc,&VVd,&FFd,&VVr,&FFr);
1706 vctot = vec_madd(qq,VVc,vctot);
1707 fs = vec_nmsub(qq,FFc,nul);
1708 vnbtot = vec_madd(c6,VVd,vnbtot);
1709 fs = vec_nmsub(c6,FFd,fs);
1710 vnbtot = vec_madd(c12,VVr,vnbtot);
1711 fs = vec_nmsub(c12,FFr,fs);
1712 fs = vec_madd(fs,tsc,nul);
1713 fs = vec_madd(fs,rinv,nul);
1714 fix = vec_madd(fs,dx,fix); /* +=fx */
1715 fiy = vec_madd(fs,dy,fiy); /* +=fy */
1716 fiz = vec_madd(fs,dz,fiz); /* +=fz */
1717 dx = vec_nmsub(dx,fs,nul); /* -fx */
1718 dy = vec_nmsub(dy,fs,nul); /* -fy */
1719 dz = vec_nmsub(dz,fs,nul); /* -fz */
1720 transpose_3_to_1(dx,dy,dz,&tmp1);
1721 add_xyz_to_mem(faction+j3a,tmp1);
1723 /* update outer data */
/* the four vectors produced by the transpose are summed into tmp1,
   which is then added to both the i atom force and the shift force */
1724 transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
1725 tmp1 = vec_add(tmp1,tmp3);
1726 tmp2 = vec_add(tmp2,tmp4);
1727 tmp1 = vec_add(tmp1,tmp2);
1728 add_xyz_to_mem(faction+ii3,tmp1);
1729 add_xyz_to_mem(fshift+is3,tmp1);
1731 add_vector_to_float(Vnb+gid[n],vnbtot);
1732 add_vector_to_float(Vc+gid[n],vctot);
/*
 * inl1020_altivec - AltiVec inner loop for a three-site i group (sites
 * named O, H1 and H2 here) against single j atoms, plain qq/r Coulomb.
 *
 * For each of the nri outer entries n, the three i-site positions are
 * obtained from pos+3*iinr[n] and shiftvec+3*shift[n] through
 * load_1_water_shift_and_splat, and every site is paired with the j
 * atoms jjnr[jindex[n]] .. jjnr[jindex[n+1]-1].  j atoms are handled
 * four per pass; the remaining three, two or one atoms are handled by
 * one branch of an if / else-if chain.
 *
 * Per site/j pair:
 *   qq     = (iqO or iqH)*charge[j]
 *   vcoul  = qq*rinv,  fs = vcoul*rinvsq
 *   vctot += vcoul
 * fs*d is accumulated per i site; the negated sum over the three sites
 * is added to faction+3*j.
 *
 * Written to: faction, fshift+3*shift[n] (both through
 * update_i_water_forces for the i sites) and Vc+gid[n] (vctot).
 *
 * NOTE(review): iqO and iqH are computed once from charge[iinr[0]] and
 * charge[iinr[0]+1] before the loop, so every outer entry is treated as
 * having those two charges, and iqH is used for both H1 and H2.
 * iinr[0] is read even when nri is 0.
 *
 * NOTE(review): k0 and ntiA are declared but not referenced in the
 * lines visible here.
 *
 * The helpers called here are defined outside this part of the file;
 * remarks about them describe what their use here suggests and should
 * be confirmed against their definitions.
 */
1737 void inl1020_altivec(
1738 int nri,
1739 int iinr[],
1740 int jindex[],
1741 int jjnr[],
1742 int shift[],
1743 float shiftvec[],
1744 float fshift[],
1745 int gid[],
1746 float pos[],
1747 float faction[],
1748 float charge[],
1749 float facel,
1750 float Vc[])
1752 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
1753 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
1754 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
1755 vector float fsO,fsH1,fsH2;
1756 vector float vctot,qqO,qqH,iqO,iqH,jq;
1757 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
1758 vector float tmp1,tmp2,tmp3,tmp4;
1759 vector float rinvO,rinvH1,rinvH2,rinvsqO,rinvsqH1,rinvsqH2,rsqO,rsqH1,rsqH2;
1762 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
1763 int jnra,jnrb,jnrc,jnrd;
1764 int j3a,j3b,j3c,j3d;
/* loop-invariant vectors; the i-site charges come from the first
   entry of iinr only */
1766 nul=vec_zero();
1767 vfacel=load_float_and_splat(&facel);
1768 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
1769 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
/* outer loop: one three-site i group per entry */
1771 for(n=0;n<nri;n++) {
1772 is3 = 3*shift[n];
1773 ii = iinr[n];
1774 ii3 = 3*ii;
1775 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
1776 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
/* reset the per-i energy and the nine force accumulators */
1777 vctot = nul;
1778 fiOx = nul;
1779 fiOy = nul;
1780 fiOz = nul;
1781 fiH1x = nul;
1782 fiH1y = nul;
1783 fiH1z = nul;
1784 fiH2x = nul;
1785 fiH2y = nul;
1786 fiH2z = nul;
1787 nj0 = jindex[n];
1788 nj1 = jindex[n+1];
/* main j loop: four neighbours per pass */
1790 for(k=nj0; k<(nj1-3); k+=4) {
1791 jnra = jjnr[k];
1792 jnrb = jjnr[k+1];
1793 jnrc = jjnr[k+2];
1794 jnrd = jjnr[k+3];
1795 j3a = 3*jnra;
1796 j3b = 3*jnrb;
1797 j3c = 3*jnrc;
1798 j3d = 3*jnrd;
/* the j coordinates land in dH2x/y/z; they are consumed for the O and
   H1 differences first and overwritten by the H2 difference last */
1799 transpose_4_to_3(load_xyz(pos+j3a),
1800 load_xyz(pos+j3b),
1801 load_xyz(pos+j3c),
1802 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
1803 dOx = vec_sub(iOx,dH2x);
1804 dOy = vec_sub(iOy,dH2y);
1805 dOz = vec_sub(iOz,dH2z);
1806 dH1x = vec_sub(iH1x,dH2x);
1807 dH1y = vec_sub(iH1y,dH2y);
1808 dH1z = vec_sub(iH1z,dH2z);
1809 dH2x = vec_sub(iH2x,dH2x);
1810 dH2y = vec_sub(iH2y,dH2y);
1811 dH2z = vec_sub(iH2z,dH2z);
/* squared distances for the three sites, built up one coordinate at a
   time */
1813 rsqO = vec_madd(dOx,dOx,nul);
1814 rsqH1 = vec_madd(dH1x,dH1x,nul);
1815 rsqH2 = vec_madd(dH2x,dH2x,nul);
1816 rsqO = vec_madd(dOy,dOy,rsqO);
1817 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
1818 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
1819 rsqO = vec_madd(dOz,dOz,rsqO);
1820 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
1821 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
1822 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
1823 rinvsqO = vec_madd(rinvO,rinvO,nul);
1824 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
1825 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
1826 /* load 4 j charges and multiply by iq */
1827 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
1828 qqO = vec_madd(iqO,jq,nul);
1829 qqH = vec_madd(iqH,jq,nul);
1830 vcoulO = vec_madd(qqO,rinvO,nul);
1831 vcoulH1 = vec_madd(qqH,rinvH1,nul);
1832 vcoulH2 = vec_madd(qqH,rinvH2,nul);
1833 fsO = vec_madd(vcoulO,rinvsqO,nul);
1834 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
1835 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
1836 vctot = vec_add(vctot,vcoulO);
1837 vcoulH1 = vec_add(vcoulH1,vcoulH2);
1838 vctot = vec_add(vctot,vcoulH1);
/* each fiO update must precede the matching dO overwrite: from here on
   dOx/dOy/dOz are reused to accumulate the negated force on the j atoms
   from all three sites (vec_nmsub(a,b,c) gives c-a*b) */
1840 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
1841 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
1842 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
1843 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
1844 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
1845 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
1846 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
1847 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
1848 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
1849 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
1850 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
1851 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
1852 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
1853 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
1854 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
1855 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
1856 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
1857 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
1859 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
1860 add_xyz_to_mem(faction+j3a,tmp1);
1861 add_xyz_to_mem(faction+j3b,tmp2);
1862 add_xyz_to_mem(faction+j3c,tmp3);
1863 add_xyz_to_mem(faction+j3d,tmp4);
/* tail, three neighbours left: the four-wide transpose is reused with
   nul in place of a fourth atom */
1865 if(k<(nj1-2)) {
1866 jnra = jjnr[k];
1867 jnrb = jjnr[k+1];
1868 jnrc = jjnr[k+2];
1869 j3a = 3*jnra;
1870 j3b = 3*jnrb;
1871 j3c = 3*jnrc;
1872 transpose_4_to_3(load_xyz(pos+j3a),
1873 load_xyz(pos+j3b),
1874 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
1875 dOx = vec_sub(iOx,dH2x);
1876 dOy = vec_sub(iOy,dH2y);
1877 dOz = vec_sub(iOz,dH2z);
1878 dH1x = vec_sub(iH1x,dH2x);
1879 dH1y = vec_sub(iH1y,dH2y);
1880 dH1z = vec_sub(iH1z,dH2z);
1881 dH2x = vec_sub(iH2x,dH2x);
1882 dH2y = vec_sub(iH2y,dH2y);
1883 dH2z = vec_sub(iH2z,dH2z);
1885 rsqO = vec_madd(dOx,dOx,nul);
1886 rsqH1 = vec_madd(dH1x,dH1x,nul);
1887 rsqH2 = vec_madd(dH2x,dH2x,nul);
1888 rsqO = vec_madd(dOy,dOy,rsqO);
1889 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
1890 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
1891 rsqO = vec_madd(dOz,dOz,rsqO);
1892 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
1893 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
1894 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
/* in this kernel only rinv is masked after the inverse square root,
   rsq is left as computed; presumably this clears the element with no
   j atom so it adds nothing to vctot or the forces - TODO confirm */
1895 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
1897 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
1898 rinvsqO = vec_madd(rinvO,rinvO,nul);
1899 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
1900 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
1902 qqO = vec_madd(iqO,jq,nul);
1903 qqH = vec_madd(iqH,jq,nul);
1904 vcoulO = vec_madd(qqO,rinvO,nul);
1905 vcoulH1 = vec_madd(qqH,rinvH1,nul);
1906 vcoulH2 = vec_madd(qqH,rinvH2,nul);
1907 fsO = vec_madd(vcoulO,rinvsqO,nul);
1908 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
1909 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
1910 vctot = vec_add(vctot,vcoulO);
1911 vcoulH1 = vec_add(vcoulH1,vcoulH2);
1912 vctot = vec_add(vctot,vcoulH1);
1914 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
1915 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
1916 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
1917 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
1918 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
1919 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
1920 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
1921 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
1922 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
1923 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
1924 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
1925 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
1926 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
1927 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
1928 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
1929 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
1930 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
1931 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
/* transpose_4_to_3 is called here with the x/y/z force vectors plus nul
   as its four inputs and yields three vectors, one per j atom */
1933 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
1934 add_xyz_to_mem(faction+j3a,tmp1);
1935 add_xyz_to_mem(faction+j3b,tmp2);
1936 add_xyz_to_mem(faction+j3c,tmp3);
1937 } else if(k<(nj1-1)) {
/* tail, two neighbours left */
1938 jnra = jjnr[k];
1939 jnrb = jjnr[k+1];
1940 j3a = 3*jnra;
1941 j3b = 3*jnrb;
1942 transpose_2_to_3(load_xyz(pos+j3a),
1943 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
1944 dOx = vec_sub(iOx,dH2x);
1945 dOy = vec_sub(iOy,dH2y);
1946 dOz = vec_sub(iOz,dH2z);
1947 dH1x = vec_sub(iH1x,dH2x);
1948 dH1y = vec_sub(iH1y,dH2y);
1949 dH1z = vec_sub(iH1z,dH2z);
1950 dH2x = vec_sub(iH2x,dH2x);
1951 dH2y = vec_sub(iH2y,dH2y);
1952 dH2z = vec_sub(iH2z,dH2z);
1954 rsqO = vec_madd(dOx,dOx,nul);
1955 rsqH1 = vec_madd(dH1x,dH1x,nul);
1956 rsqH2 = vec_madd(dH2x,dH2x,nul);
1957 rsqO = vec_madd(dOy,dOy,rsqO);
1958 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
1959 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
1960 rsqO = vec_madd(dOz,dOz,rsqO);
1961 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
1962 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
1963 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
1964 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
1966 rinvsqO = vec_madd(rinvO,rinvO,nul);
1967 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
1968 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
1969 /* load 2 j charges and multiply by iq */
1970 jq=load_2_float(charge+jnra,charge+jnrb);
1971 qqO = vec_madd(iqO,jq,nul);
1972 qqH = vec_madd(iqH,jq,nul);
1973 vcoulO = vec_madd(qqO,rinvO,nul);
1974 vcoulH1 = vec_madd(qqH,rinvH1,nul);
1975 vcoulH2 = vec_madd(qqH,rinvH2,nul);
1976 fsO = vec_madd(vcoulO,rinvsqO,nul);
1977 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
1978 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
1979 vctot = vec_add(vctot,vcoulO);
1980 vcoulH1 = vec_add(vcoulH1,vcoulH2);
1981 vctot = vec_add(vctot,vcoulH1);
1983 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
1984 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
1985 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
1986 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
1987 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
1988 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
1989 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
1990 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
1991 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
1992 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
1993 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
1994 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
1995 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
1996 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
1997 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
1998 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
1999 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2000 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2002 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
2003 add_xyz_to_mem(faction+j3a,tmp1);
2004 add_xyz_to_mem(faction+j3b,tmp2);
2005 } else if(k<nj1) {
/* tail, one neighbour left */
2006 jnra = jjnr[k];
2007 j3a = 3*jnra;
2008 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
2009 dOx = vec_sub(iOx,dH2x);
2010 dOy = vec_sub(iOy,dH2y);
2011 dOz = vec_sub(iOz,dH2z);
2012 dH1x = vec_sub(iH1x,dH2x);
2013 dH1y = vec_sub(iH1y,dH2y);
2014 dH1z = vec_sub(iH1z,dH2z);
2015 dH2x = vec_sub(iH2x,dH2x);
2016 dH2y = vec_sub(iH2y,dH2y);
2017 dH2z = vec_sub(iH2z,dH2z);
2019 rsqO = vec_madd(dOx,dOx,nul);
2020 rsqH1 = vec_madd(dH1x,dH1x,nul);
2021 rsqH2 = vec_madd(dH2x,dH2x,nul);
2022 rsqO = vec_madd(dOy,dOy,rsqO);
2023 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2024 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2025 rsqO = vec_madd(dOz,dOz,rsqO);
2026 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2027 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2028 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2029 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2031 rinvsqO = vec_madd(rinvO,rinvO,nul);
2032 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2033 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2034 /* load 1 j charges and multiply by iq */
2035 jq=load_1_float(charge+jnra);
2036 qqO = vec_madd(iqO,jq,nul);
2037 qqH = vec_madd(iqH,jq,nul);
2038 vcoulO = vec_madd(qqO,rinvO,nul);
2039 vcoulH1 = vec_madd(qqH,rinvH1,nul);
2040 vcoulH2 = vec_madd(qqH,rinvH2,nul);
2041 fsO = vec_madd(vcoulO,rinvsqO,nul);
2042 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
2043 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
2044 vctot = vec_add(vctot,vcoulO);
2045 vcoulH1 = vec_add(vcoulH1,vcoulH2);
2046 vctot = vec_add(vctot,vcoulH1);
2048 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2049 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2050 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2051 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2052 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2053 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2054 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2055 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2056 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2057 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2058 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2059 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2060 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2061 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2062 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2063 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2064 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2065 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2067 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
2068 add_xyz_to_mem(faction+j3a,tmp1);
2070 /* update outer data */
/* the nine i-site force accumulators are handed to
   update_i_water_forces together with faction+ii3 and fshift+is3 */
2071 update_i_water_forces(faction+ii3,fshift+is3,
2072 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
2074 add_vector_to_float(Vc+gid[n],vctot);
2079 void inl1120_altivec(
2080 int nri,
2081 int iinr[],
2082 int jindex[],
2083 int jjnr[],
2084 int shift[],
2085 float shiftvec[],
2086 float fshift[],
2087 int gid[],
2088 float pos[],
2089 float faction[],
2090 float charge[],
2091 float facel,
2092 float Vc[],
2093 int type[],
2094 int ntype,
2095 float nbfp[],
2096 float Vnb[])
2098 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
2099 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
2100 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
2101 vector float vnbtot,c6,c12,rinvsix,vnb6,vnb12;
2102 vector float fsO,fsH1,fsH2;
2103 vector float vctot,qqO,qqH,iqO,iqH,jq;
2104 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
2105 vector float tmp1,tmp2,tmp3,tmp4;
2106 vector float rinvO,rinvH1,rinvH2,rinvsqO,rinvsqH1,rinvsqH2,rsqO,rsqH1,rsqH2;
2109 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
2110 int jnra,jnrb,jnrc,jnrd;
2111 int j3a,j3b,j3c,j3d;
2112 int tja,tjb,tjc,tjd;
2114 nul=vec_zero();
2115 vfacel=load_float_and_splat(&facel);
2116 ii = iinr[0];
2117 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
2118 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
2119 ntiA = 2*ntype*type[ii];
2121 for(n=0;n<nri;n++) {
2122 is3 = 3*shift[n];
2123 ii = iinr[n];
2124 ii3 = 3*ii;
2125 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
2126 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
2127 vctot = nul;
2128 vnbtot = nul;
2129 fiOx = nul;
2130 fiOy = nul;
2131 fiOz = nul;
2132 fiH1x = nul;
2133 fiH1y = nul;
2134 fiH1z = nul;
2135 fiH2x = nul;
2136 fiH2y = nul;
2137 fiH2z = nul;
2138 nj0 = jindex[n];
2139 nj1 = jindex[n+1];
2141 for(k=nj0; k<(nj1-3); k+=4) {
2142 jnra = jjnr[k];
2143 jnrb = jjnr[k+1];
2144 jnrc = jjnr[k+2];
2145 jnrd = jjnr[k+3];
2146 j3a = 3*jnra;
2147 j3b = 3*jnrb;
2148 j3c = 3*jnrc;
2149 j3d = 3*jnrd;
2150 transpose_4_to_3(load_xyz(pos+j3a),
2151 load_xyz(pos+j3b),
2152 load_xyz(pos+j3c),
2153 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
2154 dOx = vec_sub(iOx,dH2x);
2155 dOy = vec_sub(iOy,dH2y);
2156 dOz = vec_sub(iOz,dH2z);
2157 dH1x = vec_sub(iH1x,dH2x);
2158 dH1y = vec_sub(iH1y,dH2y);
2159 dH1z = vec_sub(iH1z,dH2z);
2160 dH2x = vec_sub(iH2x,dH2x);
2161 dH2y = vec_sub(iH2y,dH2y);
2162 dH2z = vec_sub(iH2z,dH2z);
2164 rsqO = vec_madd(dOx,dOx,nul);
2165 rsqH1 = vec_madd(dH1x,dH1x,nul);
2166 rsqH2 = vec_madd(dH2x,dH2x,nul);
2167 rsqO = vec_madd(dOy,dOy,rsqO);
2168 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2169 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2170 rsqO = vec_madd(dOz,dOz,rsqO);
2171 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2172 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2173 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2174 rinvsqO = vec_madd(rinvO,rinvO,nul);
2175 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2176 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2177 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
2178 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
2179 tja = ntiA+2*type[jnra];
2180 tjb = ntiA+2*type[jnrb];
2181 tjc = ntiA+2*type[jnrc];
2182 tjd = ntiA+2*type[jnrd];
2183 /* load 4 j charges and multiply by iq */
2184 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
2185 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
2186 qqO = vec_madd(iqO,jq,nul);
2187 qqH = vec_madd(iqH,jq,nul);
2188 vnb6 = vec_madd(c6,rinvsix,nul);
2189 vcoulO = vec_madd(qqO,rinvO,nul);
2190 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
2191 vcoulH1 = vec_madd(qqH,rinvH1,nul);
2192 vnbtot = vec_add(vnbtot,vnb12);
2193 fsO = vec_madd(vec_twelve(),vnb12,vcoulO);
2194 vcoulH2 = vec_madd(qqH,rinvH2,nul);
2195 vnbtot = vec_sub(vnbtot,vnb6);
2196 fsO = vec_nmsub(vec_six(),vnb6,fsO);
2197 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
2198 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
2199 fsO = vec_madd(fsO,rinvsqO,nul);
2200 vctot = vec_add(vctot,vcoulO);
2201 vcoulH1 = vec_add(vcoulH1,vcoulH2);
2202 vctot = vec_add(vctot,vcoulH1);
2203 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2204 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2205 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2206 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2207 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2208 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2209 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2210 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2211 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2212 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2213 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2214 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2215 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2216 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2217 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2218 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2219 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2220 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2222 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
2223 add_xyz_to_mem(faction+j3a,tmp1);
2224 add_xyz_to_mem(faction+j3b,tmp2);
2225 add_xyz_to_mem(faction+j3c,tmp3);
2226 add_xyz_to_mem(faction+j3d,tmp4);
2228 if(k<(nj1-2)) {
2229 jnra = jjnr[k];
2230 jnrb = jjnr[k+1];
2231 jnrc = jjnr[k+2];
2232 j3a = 3*jnra;
2233 j3b = 3*jnrb;
2234 j3c = 3*jnrc;
2235 transpose_4_to_3(load_xyz(pos+j3a),
2236 load_xyz(pos+j3b),
2237 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
2238 dOx = vec_sub(iOx,dH2x);
2239 dOy = vec_sub(iOy,dH2y);
2240 dOz = vec_sub(iOz,dH2z);
2241 dH1x = vec_sub(iH1x,dH2x);
2242 dH1y = vec_sub(iH1y,dH2y);
2243 dH1z = vec_sub(iH1z,dH2z);
2244 dH2x = vec_sub(iH2x,dH2x);
2245 dH2y = vec_sub(iH2y,dH2y);
2246 dH2z = vec_sub(iH2z,dH2z);
2248 rsqO = vec_madd(dOx,dOx,nul);
2249 rsqH1 = vec_madd(dH1x,dH1x,nul);
2250 rsqH2 = vec_madd(dH2x,dH2x,nul);
2251 rsqO = vec_madd(dOy,dOy,rsqO);
2252 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2253 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2254 rsqO = vec_madd(dOz,dOz,rsqO);
2255 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2256 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2257 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2258 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2260 rinvsqO = vec_madd(rinvO,rinvO,nul);
2261 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2262 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2263 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
2264 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
2265 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
2266 tja = ntiA+2*type[jnra];
2267 tjb = ntiA+2*type[jnrb];
2268 tjc = ntiA+2*type[jnrc];
2269 /* load 3 j charges and multiply by iq */
2270 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
2271 qqO = vec_madd(iqO,jq,nul);
2272 qqH = vec_madd(iqH,jq,nul);
2273 vnb6 = vec_madd(c6,rinvsix,nul);
2274 vcoulO = vec_madd(qqO,rinvO,nul);
2275 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
2276 vcoulH1 = vec_madd(qqH,rinvH1,nul);
2277 vnbtot = vec_add(vnbtot,vnb12);
2278 fsO = vec_madd(vec_twelve(),vnb12,vcoulO);
2279 vcoulH2 = vec_madd(qqH,rinvH2,nul);
2280 vnbtot = vec_sub(vnbtot,vnb6);
2281 fsO = vec_nmsub(vec_six(),vnb6,fsO);
2282 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
2283 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
2284 fsO = vec_madd(fsO,rinvsqO,nul);
2285 vctot = vec_add(vctot,vcoulO);
2286 vcoulH1 = vec_add(vcoulH1,vcoulH2);
2287 vctot = vec_add(vctot,vcoulH1);
2289 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2290 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2291 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2292 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2293 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2294 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2295 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2296 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2297 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2298 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2299 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2300 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2301 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2302 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2303 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2304 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2305 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2306 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2308 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
2309 add_xyz_to_mem(faction+j3a,tmp1);
2310 add_xyz_to_mem(faction+j3b,tmp2);
2311 add_xyz_to_mem(faction+j3c,tmp3);
2312 } else if(k<(nj1-1)) {
2313 jnra = jjnr[k];
2314 jnrb = jjnr[k+1];
2315 j3a = 3*jnra;
2316 j3b = 3*jnrb;
2317 transpose_2_to_3(load_xyz(pos+j3a),
2318 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
2319 dOx = vec_sub(iOx,dH2x);
2320 dOy = vec_sub(iOy,dH2y);
2321 dOz = vec_sub(iOz,dH2z);
2322 dH1x = vec_sub(iH1x,dH2x);
2323 dH1y = vec_sub(iH1y,dH2y);
2324 dH1z = vec_sub(iH1z,dH2z);
2325 dH2x = vec_sub(iH2x,dH2x);
2326 dH2y = vec_sub(iH2y,dH2y);
2327 dH2z = vec_sub(iH2z,dH2z);
2329 rsqO = vec_madd(dOx,dOx,nul);
2330 rsqH1 = vec_madd(dH1x,dH1x,nul);
2331 rsqH2 = vec_madd(dH2x,dH2x,nul);
2332 rsqO = vec_madd(dOy,dOy,rsqO);
2333 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2334 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2335 rsqO = vec_madd(dOz,dOz,rsqO);
2336 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2337 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2338 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2339 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2341 rinvsqO = vec_madd(rinvO,rinvO,nul);
2342 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2343 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2344 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
2345 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
2346 tja = ntiA+2*type[jnra];
2347 tjb = ntiA+2*type[jnrb];
2348 /* load 2 j charges and multiply by iq */
2349 jq=load_2_float(charge+jnra,charge+jnrb);
2350 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
2351 qqO = vec_madd(iqO,jq,nul);
2352 qqH = vec_madd(iqH,jq,nul);
2353 vnb6 = vec_madd(c6,rinvsix,nul);
2354 vcoulO = vec_madd(qqO,rinvO,nul);
2355 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
2356 vcoulH1 = vec_madd(qqH,rinvH1,nul);
2357 vnbtot = vec_add(vnbtot,vnb12);
2358 fsO = vec_madd(vec_twelve(),vnb12,vcoulO);
2359 vcoulH2 = vec_madd(qqH,rinvH2,nul);
2360 vnbtot = vec_sub(vnbtot,vnb6);
2361 fsO = vec_nmsub(vec_six(),vnb6,fsO);
2362 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
2363 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
2364 fsO = vec_madd(fsO,rinvsqO,nul);
2365 vctot = vec_add(vctot,vcoulO);
2366 vcoulH1 = vec_add(vcoulH1,vcoulH2);
2367 vctot = vec_add(vctot,vcoulH1);
2369 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2370 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2371 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2372 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2373 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2374 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2375 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2376 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2377 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2378 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2379 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2380 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2381 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2382 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2383 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2384 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2385 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2386 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2388 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
2389 add_xyz_to_mem(faction+j3a,tmp1);
2390 add_xyz_to_mem(faction+j3b,tmp2);
2391 } else if(k<nj1) {
2392 jnra = jjnr[k];
2393 j3a = 3*jnra;
2394 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
2395 dOx = vec_sub(iOx,dH2x);
2396 dOy = vec_sub(iOy,dH2y);
2397 dOz = vec_sub(iOz,dH2z);
2398 dH1x = vec_sub(iH1x,dH2x);
2399 dH1y = vec_sub(iH1y,dH2y);
2400 dH1z = vec_sub(iH1z,dH2z);
2401 dH2x = vec_sub(iH2x,dH2x);
2402 dH2y = vec_sub(iH2y,dH2y);
2403 dH2z = vec_sub(iH2z,dH2z);
2405 rsqO = vec_madd(dOx,dOx,nul);
2406 rsqH1 = vec_madd(dH1x,dH1x,nul);
2407 rsqH2 = vec_madd(dH2x,dH2x,nul);
2408 rsqO = vec_madd(dOy,dOy,rsqO);
2409 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2410 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2411 rsqO = vec_madd(dOz,dOz,rsqO);
2412 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2413 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2414 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2415 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2417 rinvsqO = vec_madd(rinvO,rinvO,nul);
2418 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2419 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2420 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
2421 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
2422 tja = ntiA+2*type[jnra];
2423 /* load 1 j charges and multiply by iq */
2424 jq=load_1_float(charge+jnra);
2425 load_1_pair(nbfp+tja,&c6,&c12);
2426 qqO = vec_madd(iqO,jq,nul);
2427 qqH = vec_madd(iqH,jq,nul);
2428 vnb6 = vec_madd(c6,rinvsix,nul);
2429 vcoulO = vec_madd(qqO,rinvO,nul);
2430 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
2431 vcoulH1 = vec_madd(qqH,rinvH1,nul);
2432 vnbtot = vec_add(vnbtot,vnb12);
2433 fsO = vec_madd(vec_twelve(),vnb12,vcoulO);
2434 vcoulH2 = vec_madd(qqH,rinvH2,nul);
2435 vnbtot = vec_sub(vnbtot,vnb6);
2436 fsO = vec_nmsub(vec_six(),vnb6,fsO);
2437 fsH1 = vec_madd(vcoulH1,rinvsqH1,nul);
2438 fsH2 = vec_madd(vcoulH2,rinvsqH2,nul);
2439 fsO = vec_madd(fsO,rinvsqO,nul);
2440 vctot = vec_add(vctot,vcoulO);
2441 vcoulH1 = vec_add(vcoulH1,vcoulH2);
2442 vctot = vec_add(vctot,vcoulH1);
2444 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2445 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2446 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2447 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2448 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2449 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2450 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2451 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2452 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2453 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2454 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2455 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2456 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2457 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2458 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2459 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2460 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2461 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2463 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
2464 add_xyz_to_mem(faction+j3a,tmp1);
2466 /* update outer data */
2467 update_i_water_forces(faction+ii3,fshift+is3,
2468 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
2470 add_vector_to_float(Vc+gid[n],vctot);
2471 add_vector_to_float(Vnb+gid[n],vnbtot);
2477 void inl2020_altivec(
2478 int nri,
2479 int iinr[],
2480 int jindex[],
2481 int jjnr[],
2482 int shift[],
2483 float shiftvec[],
2484 float fshift[],
2485 int gid[],
2486 float pos[],
2487 float faction[],
2488 float charge[],
2489 float facel,
2490 float Vc[],
2491 float krf,
2492 float crf)
2494 vector float vkrf,vcrf;
2495 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
2496 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
2497 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
2498 vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2;
2499 vector float vctot,qqO,qqH,iqO,iqH,jq;
2500 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
2501 vector float tmp1,tmp2,tmp3,tmp4;
2502 vector float rinvO,rinvH1,rinvH2,rinvsqO,rinvsqH1,rinvsqH2,rsqO,rsqH1,rsqH2;
2505 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
2506 int jnra,jnrb,jnrc,jnrd;
2507 int j3a,j3b,j3c,j3d;
2509 nul=vec_zero();
2510 vfacel=load_float_and_splat(&facel);
2511 vkrf=load_float_and_splat(&krf);
2512 vcrf=load_float_and_splat(&crf);
2514 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
2515 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
2517 for(n=0;n<nri;n++) {
2518 is3 = 3*shift[n];
2519 ii = iinr[n];
2520 ii3 = 3*ii;
2521 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
2522 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
2523 vctot = nul;
2524 fiOx = nul;
2525 fiOy = nul;
2526 fiOz = nul;
2527 fiH1x = nul;
2528 fiH1y = nul;
2529 fiH1z = nul;
2530 fiH2x = nul;
2531 fiH2y = nul;
2532 fiH2z = nul;
2533 nj0 = jindex[n];
2534 nj1 = jindex[n+1];
2536 for(k=nj0; k<(nj1-3); k+=4) {
2537 jnra = jjnr[k];
2538 jnrb = jjnr[k+1];
2539 jnrc = jjnr[k+2];
2540 jnrd = jjnr[k+3];
2541 j3a = 3*jnra;
2542 j3b = 3*jnrb;
2543 j3c = 3*jnrc;
2544 j3d = 3*jnrd;
2545 transpose_4_to_3(load_xyz(pos+j3a),
2546 load_xyz(pos+j3b),
2547 load_xyz(pos+j3c),
2548 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
2549 dOx = vec_sub(iOx,dH2x);
2550 dOy = vec_sub(iOy,dH2y);
2551 dOz = vec_sub(iOz,dH2z);
2552 dH1x = vec_sub(iH1x,dH2x);
2553 dH1y = vec_sub(iH1y,dH2y);
2554 dH1z = vec_sub(iH1z,dH2z);
2555 dH2x = vec_sub(iH2x,dH2x);
2556 dH2y = vec_sub(iH2y,dH2y);
2557 dH2z = vec_sub(iH2z,dH2z);
2559 rsqO = vec_madd(dOx,dOx,nul);
2560 rsqH1 = vec_madd(dH1x,dH1x,nul);
2561 rsqH2 = vec_madd(dH2x,dH2x,nul);
2562 rsqO = vec_madd(dOy,dOy,rsqO);
2563 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2564 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2565 rsqO = vec_madd(dOz,dOz,rsqO);
2566 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2567 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2568 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2569 rinvsqO = vec_madd(rinvO,rinvO,nul);
2570 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2571 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2572 /* load 4 j charges and multiply by iq */
2573 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
2574 qqO = vec_madd(iqO,jq,nul);
2575 qqH = vec_madd(iqH,jq,nul);
2576 krsqO = vec_madd(vkrf,rsqO,nul);
2577 krsqH1 = vec_madd(vkrf,rsqH1,nul);
2578 krsqH2 = vec_madd(vkrf,rsqH2,nul);
2579 vcoulO = vec_add(rinvO,krsqO);
2580 vcoulH1 = vec_add(rinvH1,krsqH1);
2581 vcoulH2 = vec_add(rinvH2,krsqH2);
2582 vcoulO = vec_sub(vcoulO,vcrf);
2583 vcoulH1 = vec_sub(vcoulH1,vcrf);
2584 vcoulH2 = vec_sub(vcoulH2,vcrf);
2585 vctot = vec_madd(qqO,vcoulO,vctot);
2586 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
2587 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
2588 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
2589 vctot = vec_madd(qqH,vcoulH1,vctot);
2590 fsO = vec_madd(fsO,qqO,nul);
2591 fsH1 = vec_madd(fsH1,qqH,nul);
2592 fsH2 = vec_madd(fsH2,qqH,nul);
2593 vctot = vec_madd(qqH,vcoulH2,vctot);
2594 fsO = vec_madd(fsO,rinvsqO,nul);
2595 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
2596 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
2597 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2598 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2599 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2600 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2601 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2602 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2603 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2604 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2605 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2606 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2607 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2608 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2609 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2610 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2611 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2612 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2613 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2614 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2616 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
2617 add_xyz_to_mem(faction+j3a,tmp1);
2618 add_xyz_to_mem(faction+j3b,tmp2);
2619 add_xyz_to_mem(faction+j3c,tmp3);
2620 add_xyz_to_mem(faction+j3d,tmp4);
2622 if(k<(nj1-2)) {
2623 jnra = jjnr[k];
2624 jnrb = jjnr[k+1];
2625 jnrc = jjnr[k+2];
2626 j3a = 3*jnra;
2627 j3b = 3*jnrb;
2628 j3c = 3*jnrc;
2629 transpose_4_to_3(load_xyz(pos+j3a),
2630 load_xyz(pos+j3b),
2631 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
2632 dOx = vec_sub(iOx,dH2x);
2633 dOy = vec_sub(iOy,dH2y);
2634 dOz = vec_sub(iOz,dH2z);
2635 dH1x = vec_sub(iH1x,dH2x);
2636 dH1y = vec_sub(iH1y,dH2y);
2637 dH1z = vec_sub(iH1z,dH2z);
2638 dH2x = vec_sub(iH2x,dH2x);
2639 dH2y = vec_sub(iH2y,dH2y);
2640 dH2z = vec_sub(iH2z,dH2z);
2642 rsqO = vec_madd(dOx,dOx,nul);
2643 rsqH1 = vec_madd(dH1x,dH1x,nul);
2644 rsqH2 = vec_madd(dH2x,dH2x,nul);
2645 rsqO = vec_madd(dOy,dOy,rsqO);
2646 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2647 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2648 rsqO = vec_madd(dOz,dOz,rsqO);
2649 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2650 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2652 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
2653 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2654 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2656 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
2657 rinvsqO = vec_madd(rinvO,rinvO,nul);
2658 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2659 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2660 /* load 3 j charges and multiply by iq */
2661 qqO = vec_madd(iqO,jq,nul);
2662 qqH = vec_madd(iqH,jq,nul);
2663 krsqO = vec_madd(vkrf,rsqO,nul);
2664 krsqH1 = vec_madd(vkrf,rsqH1,nul);
2665 krsqH2 = vec_madd(vkrf,rsqH2,nul);
2666 vcoulO = vec_add(rinvO,krsqO);
2667 vcoulH1 = vec_add(rinvH1,krsqH1);
2668 vcoulH2 = vec_add(rinvH2,krsqH2);
2669 vcoulO = vec_sub(vcoulO,vcrf);
2670 vcoulH1 = vec_sub(vcoulH1,vcrf);
2671 vcoulH2 = vec_sub(vcoulH2,vcrf);
2672 vctot = vec_madd(qqO,vcoulO,vctot);
2673 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
2674 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
2675 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
2676 vctot = vec_madd(qqH,vcoulH1,vctot);
2677 fsO = vec_madd(fsO,qqO,nul);
2678 fsH1 = vec_madd(fsH1,qqH,nul);
2679 fsH2 = vec_madd(fsH2,qqH,nul);
2680 vctot = vec_madd(qqH,vcoulH2,vctot);
2681 fsO = vec_madd(fsO,rinvsqO,nul);
2682 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
2683 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
2685 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2686 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2687 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2688 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2689 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2690 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2691 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2692 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2693 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2694 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2695 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2696 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2697 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2698 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2699 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2700 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2701 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2702 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2704 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
2705 add_xyz_to_mem(faction+j3a,tmp1);
2706 add_xyz_to_mem(faction+j3b,tmp2);
2707 add_xyz_to_mem(faction+j3c,tmp3);
2708 } else if(k<(nj1-1)) {
2709 jnra = jjnr[k];
2710 jnrb = jjnr[k+1];
2711 j3a = 3*jnra;
2712 j3b = 3*jnrb;
2713 transpose_2_to_3(load_xyz(pos+j3a),
2714 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
2715 dOx = vec_sub(iOx,dH2x);
2716 dOy = vec_sub(iOy,dH2y);
2717 dOz = vec_sub(iOz,dH2z);
2718 dH1x = vec_sub(iH1x,dH2x);
2719 dH1y = vec_sub(iH1y,dH2y);
2720 dH1z = vec_sub(iH1z,dH2z);
2721 dH2x = vec_sub(iH2x,dH2x);
2722 dH2y = vec_sub(iH2y,dH2y);
2723 dH2z = vec_sub(iH2z,dH2z);
2725 rsqO = vec_madd(dOx,dOx,nul);
2726 rsqH1 = vec_madd(dH1x,dH1x,nul);
2727 rsqH2 = vec_madd(dH2x,dH2x,nul);
2728 rsqO = vec_madd(dOy,dOy,rsqO);
2729 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2730 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2731 rsqO = vec_madd(dOz,dOz,rsqO);
2732 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2733 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2735 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
2736 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2737 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2739 rinvsqO = vec_madd(rinvO,rinvO,nul);
2740 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2741 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2742 /* load 2 j charges and multiply by iq */
2743 jq=load_2_float(charge+jnra,charge+jnrb);
2744 qqO = vec_madd(iqO,jq,nul);
2745 qqH = vec_madd(iqH,jq,nul);
2746 krsqO = vec_madd(vkrf,rsqO,nul);
2747 krsqH1 = vec_madd(vkrf,rsqH1,nul);
2748 krsqH2 = vec_madd(vkrf,rsqH2,nul);
2749 vcoulO = vec_add(rinvO,krsqO);
2750 vcoulH1 = vec_add(rinvH1,krsqH1);
2751 vcoulH2 = vec_add(rinvH2,krsqH2);
2752 vcoulO = vec_sub(vcoulO,vcrf);
2753 vcoulH1 = vec_sub(vcoulH1,vcrf);
2754 vcoulH2 = vec_sub(vcoulH2,vcrf);
2755 vctot = vec_madd(qqO,vcoulO,vctot);
2756 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
2757 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
2758 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
2759 vctot = vec_madd(qqH,vcoulH1,vctot);
2760 fsO = vec_madd(fsO,qqO,nul);
2761 fsH1 = vec_madd(fsH1,qqH,nul);
2762 fsH2 = vec_madd(fsH2,qqH,nul);
2763 vctot = vec_madd(qqH,vcoulH2,vctot);
2764 fsO = vec_madd(fsO,rinvsqO,nul);
2765 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
2766 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
2768 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2769 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2770 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2771 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2772 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2773 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2774 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2775 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2776 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2777 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2778 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2779 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2780 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2781 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2782 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2783 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2784 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2785 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2787 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
2788 add_xyz_to_mem(faction+j3a,tmp1);
2789 add_xyz_to_mem(faction+j3b,tmp2);
2790 } else if(k<nj1) {
2791 jnra = jjnr[k];
2792 j3a = 3*jnra;
2793 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
2794 dOx = vec_sub(iOx,dH2x);
2795 dOy = vec_sub(iOy,dH2y);
2796 dOz = vec_sub(iOz,dH2z);
2797 dH1x = vec_sub(iH1x,dH2x);
2798 dH1y = vec_sub(iH1y,dH2y);
2799 dH1z = vec_sub(iH1z,dH2z);
2800 dH2x = vec_sub(iH2x,dH2x);
2801 dH2y = vec_sub(iH2y,dH2y);
2802 dH2z = vec_sub(iH2z,dH2z);
2804 rsqO = vec_madd(dOx,dOx,nul);
2805 rsqH1 = vec_madd(dH1x,dH1x,nul);
2806 rsqH2 = vec_madd(dH2x,dH2x,nul);
2807 rsqO = vec_madd(dOy,dOy,rsqO);
2808 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2809 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2810 rsqO = vec_madd(dOz,dOz,rsqO);
2811 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2812 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2814 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
2815 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2816 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
2818 rinvsqO = vec_madd(rinvO,rinvO,nul);
2819 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2820 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2821 /* load 1 j charges and multiply by iq */
2822 jq=load_1_float(charge+jnra);
2823 qqO = vec_madd(iqO,jq,nul);
2824 qqH = vec_madd(iqH,jq,nul);
2825 krsqO = vec_madd(vkrf,rsqO,nul);
2826 krsqH1 = vec_madd(vkrf,rsqH1,nul);
2827 krsqH2 = vec_madd(vkrf,rsqH2,nul);
2828 vcoulO = vec_add(rinvO,krsqO);
2829 vcoulH1 = vec_add(rinvH1,krsqH1);
2830 vcoulH2 = vec_add(rinvH2,krsqH2);
2831 vcoulO = vec_sub(vcoulO,vcrf);
2832 vcoulH1 = vec_sub(vcoulH1,vcrf);
2833 vcoulH2 = vec_sub(vcoulH2,vcrf);
2834 vctot = vec_madd(qqO,vcoulO,vctot);
2835 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
2836 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
2837 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
2838 vctot = vec_madd(qqH,vcoulH1,vctot);
2839 fsO = vec_madd(fsO,qqO,nul);
2840 fsH1 = vec_madd(fsH1,qqH,nul);
2841 fsH2 = vec_madd(fsH2,qqH,nul);
2842 vctot = vec_madd(qqH,vcoulH2,vctot);
2843 fsO = vec_madd(fsO,rinvsqO,nul);
2844 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
2845 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
2847 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
2848 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
2849 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
2850 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
2851 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
2852 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
2853 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
2854 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
2855 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
2856 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
2857 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
2858 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
2859 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
2860 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
2861 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
2862 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
2863 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
2864 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
2866 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
2867 add_xyz_to_mem(faction+j3a,tmp1);
2869 /* update outer data */
2870 update_i_water_forces(faction+ii3,fshift+is3,
2871 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
2873 add_vector_to_float(Vc+gid[n],vctot);
2879 void inl2120_altivec(
2880 int nri,
2881 int iinr[],
2882 int jindex[],
2883 int jjnr[],
2884 int shift[],
2885 float shiftvec[],
2886 float fshift[],
2887 int gid[],
2888 float pos[],
2889 float faction[],
2890 float charge[],
2891 float facel,
2892 float Vc[],
2893 float krf,
2894 float crf,
2895 int type[],
2896 int ntype,
2897 float nbfp[],
2898 float Vnb[])
2900 vector float vkrf,vcrf;
2901 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
2902 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
2903 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
2904 vector float vnbtot,c6,c12,rinvsix,vnb6,vnb12;
2905 vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2;
2906 vector float vctot,qqO,qqH,iqO,iqH,jq;
2907 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
2908 vector float tmp1,tmp2,tmp3,tmp4;
2909 vector float rinvO,rinvH1,rinvH2,rinvsqO,rinvsqH1,rinvsqH2,rsqO,rsqH1,rsqH2;
2912 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
2913 int jnra,jnrb,jnrc,jnrd;
2914 int j3a,j3b,j3c,j3d;
2915 int tja,tjb,tjc,tjd;
2917 nul=vec_zero();
2918 vfacel=load_float_and_splat(&facel);
2919 vkrf=load_float_and_splat(&krf);
2920 vcrf=load_float_and_splat(&crf);
2921 ii = iinr[0];
2922 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
2923 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
2924 ntiA = 2*ntype*type[ii];
2926 for(n=0;n<nri;n++) {
2927 is3 = 3*shift[n];
2928 ii = iinr[n];
2929 ii3 = 3*ii;
2930 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
2931 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
2932 vctot = nul;
2933 vnbtot = nul;
2934 fiOx = nul;
2935 fiOy = nul;
2936 fiOz = nul;
2937 fiH1x = nul;
2938 fiH1y = nul;
2939 fiH1z = nul;
2940 fiH2x = nul;
2941 fiH2y = nul;
2942 fiH2z = nul;
2943 nj0 = jindex[n];
2944 nj1 = jindex[n+1];
2946 for(k=nj0; k<(nj1-3); k+=4) {
2947 jnra = jjnr[k];
2948 jnrb = jjnr[k+1];
2949 jnrc = jjnr[k+2];
2950 jnrd = jjnr[k+3];
2951 j3a = 3*jnra;
2952 j3b = 3*jnrb;
2953 j3c = 3*jnrc;
2954 j3d = 3*jnrd;
2955 transpose_4_to_3(load_xyz(pos+j3a),
2956 load_xyz(pos+j3b),
2957 load_xyz(pos+j3c),
2958 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
2959 dOx = vec_sub(iOx,dH2x);
2960 dOy = vec_sub(iOy,dH2y);
2961 dOz = vec_sub(iOz,dH2z);
2962 dH1x = vec_sub(iH1x,dH2x);
2963 dH1y = vec_sub(iH1y,dH2y);
2964 dH1z = vec_sub(iH1z,dH2z);
2965 dH2x = vec_sub(iH2x,dH2x);
2966 dH2y = vec_sub(iH2y,dH2y);
2967 dH2z = vec_sub(iH2z,dH2z);
2969 rsqO = vec_madd(dOx,dOx,nul);
2970 rsqH1 = vec_madd(dH1x,dH1x,nul);
2971 rsqH2 = vec_madd(dH2x,dH2x,nul);
2972 rsqO = vec_madd(dOy,dOy,rsqO);
2973 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
2974 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
2975 rsqO = vec_madd(dOz,dOz,rsqO);
2976 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
2977 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
2978 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
2979 rinvsqO = vec_madd(rinvO,rinvO,nul);
2980 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
2981 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
2982 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
2983 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
2984 tja = ntiA+2*type[jnra];
2985 tjb = ntiA+2*type[jnrb];
2986 tjc = ntiA+2*type[jnrc];
2987 tjd = ntiA+2*type[jnrd];
2988 /* load 4 j charges and multiply by iq */
2989 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
2990 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
2991 qqO = vec_madd(iqO,jq,nul);
2992 qqH = vec_madd(iqH,jq,nul);
2993 krsqO = vec_madd(vkrf,rsqO,nul);
2994 krsqH1 = vec_madd(vkrf,rsqH1,nul);
2995 krsqH2 = vec_madd(vkrf,rsqH2,nul);
2996 vnb6 = vec_madd(c6,rinvsix,nul);
2997 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
2998 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
2999 vcoulO = vec_add(rinvO,krsqO);
3000 vcoulH1 = vec_add(rinvH1,krsqH1);
3001 vnbtot = vec_add(vnbtot,vnb12);
3002 fsO = vec_madd(qqO,fsO,nul);
3003 vcoulH2 = vec_add(rinvH2,krsqH2);
3004 vcoulO = vec_sub(vcoulO,vcrf);
3005 vnbtot = vec_sub(vnbtot,vnb6);
3006 fsO = vec_madd(vec_twelve(),vnb12,fsO);
3007 vcoulH1 = vec_sub(vcoulH1,vcrf);
3008 vcoulH2 = vec_sub(vcoulH2,vcrf);
3009 vctot = vec_madd(qqO,vcoulO,vctot);
3010 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3011 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
3012 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
3013 vctot = vec_madd(qqH,vcoulH1,vctot);
3014 fsO = vec_madd(fsO,rinvsqO,nul);
3015 fsH1 = vec_madd(fsH1,qqH,nul);
3016 fsH2 = vec_madd(fsH2,qqH,nul);
3017 vctot = vec_madd(qqH,vcoulH2,vctot);
3018 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
3019 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
3021 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3022 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3023 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3024 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3025 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3026 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3027 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3028 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3029 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3030 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3031 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3032 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3033 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3034 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3035 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3036 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3037 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3038 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3040 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
3041 add_xyz_to_mem(faction+j3a,tmp1);
3042 add_xyz_to_mem(faction+j3b,tmp2);
3043 add_xyz_to_mem(faction+j3c,tmp3);
3044 add_xyz_to_mem(faction+j3d,tmp4);
3046 if(k<(nj1-2)) {
3047 jnra = jjnr[k];
3048 jnrb = jjnr[k+1];
3049 jnrc = jjnr[k+2];
3050 j3a = 3*jnra;
3051 j3b = 3*jnrb;
3052 j3c = 3*jnrc;
3053 transpose_4_to_3(load_xyz(pos+j3a),
3054 load_xyz(pos+j3b),
3055 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
3056 dOx = vec_sub(iOx,dH2x);
3057 dOy = vec_sub(iOy,dH2y);
3058 dOz = vec_sub(iOz,dH2z);
3059 dH1x = vec_sub(iH1x,dH2x);
3060 dH1y = vec_sub(iH1y,dH2y);
3061 dH1z = vec_sub(iH1z,dH2z);
3062 dH2x = vec_sub(iH2x,dH2x);
3063 dH2y = vec_sub(iH2y,dH2y);
3064 dH2z = vec_sub(iH2z,dH2z);
3066 rsqO = vec_madd(dOx,dOx,nul);
3067 rsqH1 = vec_madd(dH1x,dH1x,nul);
3068 rsqH2 = vec_madd(dH2x,dH2x,nul);
3069 rsqO = vec_madd(dOy,dOy,rsqO);
3070 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3071 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3072 rsqO = vec_madd(dOz,dOz,rsqO);
3073 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3074 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3076 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3077 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3078 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3080 rinvsqO = vec_madd(rinvO,rinvO,nul);
3081 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
3082 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
3083 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
3084 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
3085 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
3086 tja = ntiA+2*type[jnra];
3087 tjb = ntiA+2*type[jnrb];
3088 tjc = ntiA+2*type[jnrc];
3089 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
3090 qqO = vec_madd(iqO,jq,nul);
3091 qqH = vec_madd(iqH,jq,nul);
3092 krsqO = vec_madd(vkrf,rsqO,nul);
3093 krsqH1 = vec_madd(vkrf,rsqH1,nul);
3094 krsqH2 = vec_madd(vkrf,rsqH2,nul);
3095 vnb6 = vec_madd(c6,rinvsix,nul);
3096 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
3097 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
3098 vcoulO = vec_add(rinvO,krsqO);
3099 vcoulH1 = vec_add(rinvH1,krsqH1);
3100 vnbtot = vec_add(vnbtot,vnb12);
3101 fsO = vec_madd(qqO,fsO,nul);
3102 vcoulH2 = vec_add(rinvH2,krsqH2);
3103 vcoulO = vec_sub(vcoulO,vcrf);
3104 vnbtot = vec_sub(vnbtot,vnb6);
3105 fsO = vec_madd(vec_twelve(),vnb12,fsO);
3106 vcoulH1 = vec_sub(vcoulH1,vcrf);
3107 vcoulH2 = vec_sub(vcoulH2,vcrf);
3108 vctot = vec_madd(qqO,vcoulO,vctot);
3109 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3110 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
3111 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
3112 vctot = vec_madd(qqH,vcoulH1,vctot);
3113 fsO = vec_madd(fsO,rinvsqO,nul);
3114 fsH1 = vec_madd(fsH1,qqH,nul);
3115 fsH2 = vec_madd(fsH2,qqH,nul);
3116 vctot = vec_madd(qqH,vcoulH2,vctot);
3117 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
3118 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
3120 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3121 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3122 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3123 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3124 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3125 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3126 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3127 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3128 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3129 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3130 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3131 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3132 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3133 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3134 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3135 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3136 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3137 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3139 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
3140 add_xyz_to_mem(faction+j3a,tmp1);
3141 add_xyz_to_mem(faction+j3b,tmp2);
3142 add_xyz_to_mem(faction+j3c,tmp3);
3143 } else if(k<(nj1-1)) {
3144 jnra = jjnr[k];
3145 jnrb = jjnr[k+1];
3146 j3a = 3*jnra;
3147 j3b = 3*jnrb;
3148 transpose_2_to_3(load_xyz(pos+j3a),
3149 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
3150 dOx = vec_sub(iOx,dH2x);
3151 dOy = vec_sub(iOy,dH2y);
3152 dOz = vec_sub(iOz,dH2z);
3153 dH1x = vec_sub(iH1x,dH2x);
3154 dH1y = vec_sub(iH1y,dH2y);
3155 dH1z = vec_sub(iH1z,dH2z);
3156 dH2x = vec_sub(iH2x,dH2x);
3157 dH2y = vec_sub(iH2y,dH2y);
3158 dH2z = vec_sub(iH2z,dH2z);
3160 rsqO = vec_madd(dOx,dOx,nul);
3161 rsqH1 = vec_madd(dH1x,dH1x,nul);
3162 rsqH2 = vec_madd(dH2x,dH2x,nul);
3163 rsqO = vec_madd(dOy,dOy,rsqO);
3164 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3165 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3166 rsqO = vec_madd(dOz,dOz,rsqO);
3167 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3168 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3170 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3171 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3172 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3174 rinvsqO = vec_madd(rinvO,rinvO,nul);
3175 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
3176 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
3177 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
3178 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
3179 tja = ntiA+2*type[jnra];
3180 tjb = ntiA+2*type[jnrb];
3181 /* load 2 j charges and multiply by iq */
3182 jq=load_2_float(charge+jnra,charge+jnrb);
3183 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
3184 qqO = vec_madd(iqO,jq,nul);
3185 qqH = vec_madd(iqH,jq,nul);
3186 krsqO = vec_madd(vkrf,rsqO,nul);
3187 krsqH1 = vec_madd(vkrf,rsqH1,nul);
3188 krsqH2 = vec_madd(vkrf,rsqH2,nul);
3189 vnb6 = vec_madd(c6,rinvsix,nul);
3190 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
3191 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
3192 vcoulO = vec_add(rinvO,krsqO);
3193 vcoulH1 = vec_add(rinvH1,krsqH1);
3194 vnbtot = vec_add(vnbtot,vnb12);
3195 fsO = vec_madd(qqO,fsO,nul);
3196 vcoulH2 = vec_add(rinvH2,krsqH2);
3197 vcoulO = vec_sub(vcoulO,vcrf);
3198 vnbtot = vec_sub(vnbtot,vnb6);
3199 fsO = vec_madd(vec_twelve(),vnb12,fsO);
3200 vcoulH1 = vec_sub(vcoulH1,vcrf);
3201 vcoulH2 = vec_sub(vcoulH2,vcrf);
3202 vctot = vec_madd(qqO,vcoulO,vctot);
3203 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3204 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
3205 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
3206 vctot = vec_madd(qqH,vcoulH1,vctot);
3207 fsO = vec_madd(fsO,rinvsqO,nul);
3208 fsH1 = vec_madd(fsH1,qqH,nul);
3209 fsH2 = vec_madd(fsH2,qqH,nul);
3210 vctot = vec_madd(qqH,vcoulH2,vctot);
3211 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
3212 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
3214 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3215 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3216 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3217 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3218 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3219 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3220 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3221 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3222 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3223 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3224 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3225 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3226 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3227 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3228 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3229 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3230 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3231 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3233 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
3234 add_xyz_to_mem(faction+j3a,tmp1);
3235 add_xyz_to_mem(faction+j3b,tmp2);
3236 } else if(k<nj1) {
3237 jnra = jjnr[k];
3238 j3a = 3*jnra;
3239 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
3240 dOx = vec_sub(iOx,dH2x);
3241 dOy = vec_sub(iOy,dH2y);
3242 dOz = vec_sub(iOz,dH2z);
3243 dH1x = vec_sub(iH1x,dH2x);
3244 dH1y = vec_sub(iH1y,dH2y);
3245 dH1z = vec_sub(iH1z,dH2z);
3246 dH2x = vec_sub(iH2x,dH2x);
3247 dH2y = vec_sub(iH2y,dH2y);
3248 dH2z = vec_sub(iH2z,dH2z);
3250 rsqO = vec_madd(dOx,dOx,nul);
3251 rsqH1 = vec_madd(dH1x,dH1x,nul);
3252 rsqH2 = vec_madd(dH2x,dH2x,nul);
3253 rsqO = vec_madd(dOy,dOy,rsqO);
3254 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3255 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3256 rsqO = vec_madd(dOz,dOz,rsqO);
3257 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3258 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3260 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3261 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3262 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3264 rinvsqO = vec_madd(rinvO,rinvO,nul);
3265 rinvsqH1 = vec_madd(rinvH1,rinvH1,nul);
3266 rinvsqH2 = vec_madd(rinvH2,rinvH2,nul);
3267 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
3268 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
3269 tja = ntiA+2*type[jnra];
3270 /* load 1 j charges and multiply by iq */
3271 jq=load_1_float(charge+jnra);
3272 load_1_pair(nbfp+tja,&c6,&c12);
3273 qqO = vec_madd(iqO,jq,nul);
3274 qqH = vec_madd(iqH,jq,nul);
3275 krsqO = vec_madd(vkrf,rsqO,nul);
3276 krsqH1 = vec_madd(vkrf,rsqH1,nul);
3277 krsqH2 = vec_madd(vkrf,rsqH2,nul);
3278 vnb6 = vec_madd(c6,rinvsix,nul);
3279 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
3280 fsO = vec_nmsub(vec_two(),krsqO,rinvO);
3281 vcoulO = vec_add(rinvO,krsqO);
3282 vcoulH1 = vec_add(rinvH1,krsqH1);
3283 vnbtot = vec_add(vnbtot,vnb12);
3284 fsO = vec_madd(qqO,fsO,nul);
3285 vcoulH2 = vec_add(rinvH2,krsqH2);
3286 vcoulO = vec_sub(vcoulO,vcrf);
3287 vnbtot = vec_sub(vnbtot,vnb6);
3288 fsO = vec_madd(vec_twelve(),vnb12,fsO);
3289 vcoulH1 = vec_sub(vcoulH1,vcrf);
3290 vcoulH2 = vec_sub(vcoulH2,vcrf);
3291 vctot = vec_madd(qqO,vcoulO,vctot);
3292 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3293 fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1);
3294 fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2);
3295 vctot = vec_madd(qqH,vcoulH1,vctot);
3296 fsO = vec_madd(fsO,rinvsqO,nul);
3297 fsH1 = vec_madd(fsH1,qqH,nul);
3298 fsH2 = vec_madd(fsH2,qqH,nul);
3299 vctot = vec_madd(qqH,vcoulH2,vctot);
3300 fsH1 = vec_madd(fsH1,rinvsqH1,nul);
3301 fsH2 = vec_madd(fsH2,rinvsqH2,nul);
3303 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3304 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3305 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3306 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3307 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3308 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3309 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3310 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3311 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3312 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3313 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3314 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3315 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3316 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3317 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3318 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3319 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3320 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3322 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
3323 add_xyz_to_mem(faction+j3a,tmp1);
3325 /* update outer data */
3326 update_i_water_forces(faction+ii3,fshift+is3,
3327 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
3329 add_vector_to_float(Vc+gid[n],vctot);
3330 add_vector_to_float(Vnb+gid[n],vnbtot);
3336 void inl3020_altivec(
3337 int nri,
3338 int iinr[],
3339 int jindex[],
3340 int jjnr[],
3341 int shift[],
3342 float shiftvec[],
3343 float fshift[],
3344 int gid[],
3345 float pos[],
3346 float faction[],
3347 float charge[],
3348 float facel,
3349 float Vc[],
3350 float tabscale,
3351 float VFtab[])
3353 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
3354 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
3355 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
3356 vector float fsO,fsH1,fsH2,tsc,VVcO,FFcO,VVcH1,FFcH1,VVcH2,FFcH2;
3357 vector float vctot,qqO,qqH,iqO,iqH,jq;
3358 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
3359 vector float tmp1,tmp2,tmp3,tmp4;
3360 vector float rinvO,rinvH1,rinvH2,rO,rH1,rH2,rsqO,rsqH1,rsqH2;
3363 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
3364 int jnra,jnrb,jnrc,jnrd;
3365 int j3a,j3b,j3c,j3d;
3367 nul=vec_zero();
3368 vfacel=load_float_and_splat(&facel);
3369 tsc=load_float_and_splat(&tabscale);
3370 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
3371 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
3373 for(n=0;n<nri;n++) {
3374 is3 = 3*shift[n];
3375 ii = iinr[n];
3376 ii3 = 3*ii;
3377 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
3378 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
3379 vctot = nul;
3380 fiOx = nul;
3381 fiOy = nul;
3382 fiOz = nul;
3383 fiH1x = nul;
3384 fiH1y = nul;
3385 fiH1z = nul;
3386 fiH2x = nul;
3387 fiH2y = nul;
3388 fiH2z = nul;
3389 nj0 = jindex[n];
3390 nj1 = jindex[n+1];
3392 for(k=nj0; k<(nj1-3); k+=4) {
3393 jnra = jjnr[k];
3394 jnrb = jjnr[k+1];
3395 jnrc = jjnr[k+2];
3396 jnrd = jjnr[k+3];
3397 j3a = 3*jnra;
3398 j3b = 3*jnrb;
3399 j3c = 3*jnrc;
3400 j3d = 3*jnrd;
3401 transpose_4_to_3(load_xyz(pos+j3a),
3402 load_xyz(pos+j3b),
3403 load_xyz(pos+j3c),
3404 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
3405 dOx = vec_sub(iOx,dH2x);
3406 dOy = vec_sub(iOy,dH2y);
3407 dOz = vec_sub(iOz,dH2z);
3408 dH1x = vec_sub(iH1x,dH2x);
3409 dH1y = vec_sub(iH1y,dH2y);
3410 dH1z = vec_sub(iH1z,dH2z);
3411 dH2x = vec_sub(iH2x,dH2x);
3412 dH2y = vec_sub(iH2y,dH2y);
3413 dH2z = vec_sub(iH2z,dH2z);
3415 rsqO = vec_madd(dOx,dOx,nul);
3416 rsqH1 = vec_madd(dH1x,dH1x,nul);
3417 rsqH2 = vec_madd(dH2x,dH2x,nul);
3418 rsqO = vec_madd(dOy,dOy,rsqO);
3419 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3420 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3421 rsqO = vec_madd(dOz,dOz,rsqO);
3422 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3423 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3424 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3425 rO = vec_madd(rsqO,rinvO,nul);
3426 rH1 = vec_madd(rsqH1,rinvH1,nul);
3427 rH2 = vec_madd(rsqH2,rinvH2,nul);
3429 /* load 4 j charges and multiply by iq */
3430 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
3431 do_4_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3432 do_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3433 do_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3434 qqO = vec_madd(iqO,jq,nul);
3435 qqH = vec_madd(iqH,jq,nul);
3436 vctot = vec_madd(qqO,VVcO,vctot);
3437 fsO = vec_nmsub(qqO,FFcO,nul);
3438 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3439 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3440 vctot = vec_madd(qqH,VVcH1,vctot);
3441 fsO = vec_madd(fsO,tsc,nul);
3442 fsH1 = vec_madd(fsH1,tsc,nul);
3443 fsH2 = vec_madd(fsH2,tsc,nul);
3444 vctot = vec_madd(qqH,VVcH2,vctot);
3445 fsO = vec_madd(fsO,rinvO,nul);
3446 fsH1 = vec_madd(fsH1,rinvH1,nul);
3447 fsH2 = vec_madd(fsH2,rinvH2,nul);
3449 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3450 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3451 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3452 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3453 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3454 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3455 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3456 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3457 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3458 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3459 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3460 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3461 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3462 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3463 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3464 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3465 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3466 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3468 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
3469 add_xyz_to_mem(faction+j3a,tmp1);
3470 add_xyz_to_mem(faction+j3b,tmp2);
3471 add_xyz_to_mem(faction+j3c,tmp3);
3472 add_xyz_to_mem(faction+j3d,tmp4);
3474 if(k<(nj1-2)) {
3475 jnra = jjnr[k];
3476 jnrb = jjnr[k+1];
3477 jnrc = jjnr[k+2];
3478 j3a = 3*jnra;
3479 j3b = 3*jnrb;
3480 j3c = 3*jnrc;
3481 transpose_4_to_3(load_xyz(pos+j3a),
3482 load_xyz(pos+j3b),
3483 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
3484 dOx = vec_sub(iOx,dH2x);
3485 dOy = vec_sub(iOy,dH2y);
3486 dOz = vec_sub(iOz,dH2z);
3487 dH1x = vec_sub(iH1x,dH2x);
3488 dH1y = vec_sub(iH1y,dH2y);
3489 dH1z = vec_sub(iH1z,dH2z);
3490 dH2x = vec_sub(iH2x,dH2x);
3491 dH2y = vec_sub(iH2y,dH2y);
3492 dH2z = vec_sub(iH2z,dH2z);
3494 rsqO = vec_madd(dOx,dOx,nul);
3495 rsqH1 = vec_madd(dH1x,dH1x,nul);
3496 rsqH2 = vec_madd(dH2x,dH2x,nul);
3497 rsqO = vec_madd(dOy,dOy,rsqO);
3498 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3499 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3500 rsqO = vec_madd(dOz,dOz,rsqO);
3501 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3502 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3504 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3505 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3506 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3508 rO = vec_madd(rsqO,rinvO,nul);
3509 rH1 = vec_madd(rsqH1,rinvH1,nul);
3510 rH2 = vec_madd(rsqH2,rinvH2,nul);
3512 /* load 3 j charges and multiply by iq */
3513 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
3514 do_3_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3515 do_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3516 do_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3517 qqO = vec_madd(iqO,jq,nul);
3518 qqH = vec_madd(iqH,jq,nul);
3519 vctot = vec_madd(qqO,VVcO,vctot);
3520 fsO = vec_nmsub(qqO,FFcO,nul);
3521 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3522 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3523 vctot = vec_madd(qqH,VVcH1,vctot);
3524 fsO = vec_madd(fsO,tsc,nul);
3525 fsH1 = vec_madd(fsH1,tsc,nul);
3526 fsH2 = vec_madd(fsH2,tsc,nul);
3527 vctot = vec_madd(qqH,VVcH2,vctot);
3528 fsO = vec_madd(fsO,rinvO,nul);
3529 fsH1 = vec_madd(fsH1,rinvH1,nul);
3530 fsH2 = vec_madd(fsH2,rinvH2,nul);
3532 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3533 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3534 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3535 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3536 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3537 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3538 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3539 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3540 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3541 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3542 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3543 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3544 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3545 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3546 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3547 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3548 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3549 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3551 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
3552 add_xyz_to_mem(faction+j3a,tmp1);
3553 add_xyz_to_mem(faction+j3b,tmp2);
3554 add_xyz_to_mem(faction+j3c,tmp3);
3555 } else if(k<(nj1-1)) {
3556 jnra = jjnr[k];
3557 jnrb = jjnr[k+1];
3558 j3a = 3*jnra;
3559 j3b = 3*jnrb;
3560 transpose_2_to_3(load_xyz(pos+j3a),
3561 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
3562 dOx = vec_sub(iOx,dH2x);
3563 dOy = vec_sub(iOy,dH2y);
3564 dOz = vec_sub(iOz,dH2z);
3565 dH1x = vec_sub(iH1x,dH2x);
3566 dH1y = vec_sub(iH1y,dH2y);
3567 dH1z = vec_sub(iH1z,dH2z);
3568 dH2x = vec_sub(iH2x,dH2x);
3569 dH2y = vec_sub(iH2y,dH2y);
3570 dH2z = vec_sub(iH2z,dH2z);
3572 rsqO = vec_madd(dOx,dOx,nul);
3573 rsqH1 = vec_madd(dH1x,dH1x,nul);
3574 rsqH2 = vec_madd(dH2x,dH2x,nul);
3575 rsqO = vec_madd(dOy,dOy,rsqO);
3576 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3577 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3578 rsqO = vec_madd(dOz,dOz,rsqO);
3579 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3580 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3582 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3583 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3584 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3586 rO = vec_madd(rsqO,rinvO,nul);
3587 rH1 = vec_madd(rsqH1,rinvH1,nul);
3588 rH2 = vec_madd(rsqH2,rinvH2,nul);
3590 /* load 2 j charges and multiply by iq */
3591 jq=load_2_float(charge+jnra,charge+jnrb);
3592 do_2_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3593 do_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3594 do_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3595 qqO = vec_madd(iqO,jq,nul);
3596 qqH = vec_madd(iqH,jq,nul);
3597 vctot = vec_madd(qqO,VVcO,vctot);
3598 fsO = vec_nmsub(qqO,FFcO,nul);
3599 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3600 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3601 vctot = vec_madd(qqH,VVcH1,vctot);
3602 fsO = vec_madd(fsO,tsc,nul);
3603 fsH1 = vec_madd(fsH1,tsc,nul);
3604 fsH2 = vec_madd(fsH2,tsc,nul);
3605 vctot = vec_madd(qqH,VVcH2,vctot);
3606 fsO = vec_madd(fsO,rinvO,nul);
3607 fsH1 = vec_madd(fsH1,rinvH1,nul);
3608 fsH2 = vec_madd(fsH2,rinvH2,nul);
3610 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3611 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3612 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3613 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3614 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3615 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3616 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3617 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3618 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3619 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3620 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3621 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3622 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3623 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3624 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3625 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3626 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3627 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3629 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
3630 add_xyz_to_mem(faction+j3a,tmp1);
3631 add_xyz_to_mem(faction+j3b,tmp2);
3632 } else if(k<nj1) {
3633 jnra = jjnr[k];
3634 j3a = 3*jnra;
3635 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
3636 dOx = vec_sub(iOx,dH2x);
3637 dOy = vec_sub(iOy,dH2y);
3638 dOz = vec_sub(iOz,dH2z);
3639 dH1x = vec_sub(iH1x,dH2x);
3640 dH1y = vec_sub(iH1y,dH2y);
3641 dH1z = vec_sub(iH1z,dH2z);
3642 dH2x = vec_sub(iH2x,dH2x);
3643 dH2y = vec_sub(iH2y,dH2y);
3644 dH2z = vec_sub(iH2z,dH2z);
3646 rsqO = vec_madd(dOx,dOx,nul);
3647 rsqH1 = vec_madd(dH1x,dH1x,nul);
3648 rsqH2 = vec_madd(dH2x,dH2x,nul);
3649 rsqO = vec_madd(dOy,dOy,rsqO);
3650 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3651 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3652 rsqO = vec_madd(dOz,dOz,rsqO);
3653 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3654 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3656 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3657 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3658 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3660 rO = vec_madd(rsqO,rinvO,nul);
3661 rH1 = vec_madd(rsqH1,rinvH1,nul);
3662 rH2 = vec_madd(rsqH2,rinvH2,nul);
3664 /* load 1 j charges and multiply by iq */
3665 jq=load_1_float(charge+jnra);
3666 do_1_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3667 do_1_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3668 do_1_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3669 qqO = vec_madd(iqO,jq,nul);
3670 qqH = vec_madd(iqH,jq,nul);
3671 vctot = vec_madd(qqO,VVcO,vctot);
3672 fsO = vec_nmsub(qqO,FFcO,nul);
3673 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3674 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3675 vctot = vec_madd(qqH,VVcH1,vctot);
3676 fsO = vec_madd(fsO,tsc,nul);
3677 fsH1 = vec_madd(fsH1,tsc,nul);
3678 fsH2 = vec_madd(fsH2,tsc,nul);
3679 vctot = vec_madd(qqH,VVcH2,vctot);
3680 fsO = vec_madd(fsO,rinvO,nul);
3681 fsH1 = vec_madd(fsH1,rinvH1,nul);
3682 fsH2 = vec_madd(fsH2,rinvH2,nul);
3684 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3685 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3686 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3687 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3688 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3689 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3690 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3691 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3692 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3693 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3694 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3695 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3696 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3697 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3698 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3699 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3700 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3701 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3703 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
3704 add_xyz_to_mem(faction+j3a,tmp1);
3706 /* update outer data */
3707 update_i_water_forces(faction+ii3,fshift+is3,
3708 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
3710 add_vector_to_float(Vc+gid[n],vctot);
3716 void inl3120_altivec(
3717 int nri,
3718 int iinr[],
3719 int jindex[],
3720 int jjnr[],
3721 int shift[],
3722 float shiftvec[],
3723 float fshift[],
3724 int gid[],
3725 float pos[],
3726 float faction[],
3727 float charge[],
3728 float facel,
3729 float Vc[],
3730 int type[],
3731 int ntype,
3732 float nbfp[],
3733 float Vnb[],
3734 float tabscale,
3735 float VFtab[])
3737 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
3738 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
3739 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
3740 vector float vnbtot,c6,c12,rinvsix,rinvsqO,vnb6,vnb12;
3741 vector float fsO,fsH1,fsH2,tsc,VVcO,FFcO,VVcH1,FFcH1,VVcH2,FFcH2;
3742 vector float vctot,qqO,qqH,iqO,iqH,jq;
3743 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
3744 vector float tmp1,tmp2,tmp3,tmp4;
3745 vector float rinvO,rinvH1,rinvH2,rO,rH1,rH2,rsqO,rsqH1,rsqH2;
3747 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
3748 int jnra,jnrb,jnrc,jnrd;
3749 int j3a,j3b,j3c,j3d;
3750 int tja,tjb,tjc,tjd;
3752 nul=vec_zero();
3753 vfacel=load_float_and_splat(&facel);
3754 tsc=load_float_and_splat(&tabscale);
3755 ii = iinr[0];
3756 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
3757 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
3758 ntiA = 2*ntype*type[ii];
3760 for(n=0;n<nri;n++) {
3761 is3 = 3*shift[n];
3762 ii = iinr[n];
3763 ii3 = 3*ii;
3764 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
3765 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
3766 vctot = nul;
3767 vnbtot = nul;
3768 fiOx = nul;
3769 fiOy = nul;
3770 fiOz = nul;
3771 fiH1x = nul;
3772 fiH1y = nul;
3773 fiH1z = nul;
3774 fiH2x = nul;
3775 fiH2y = nul;
3776 fiH2z = nul;
3777 nj0 = jindex[n];
3778 nj1 = jindex[n+1];
3780 for(k=nj0; k<(nj1-3); k+=4) {
3781 jnra = jjnr[k];
3782 jnrb = jjnr[k+1];
3783 jnrc = jjnr[k+2];
3784 jnrd = jjnr[k+3];
3785 j3a = 3*jnra;
3786 j3b = 3*jnrb;
3787 j3c = 3*jnrc;
3788 j3d = 3*jnrd;
3789 transpose_4_to_3(load_xyz(pos+j3a),
3790 load_xyz(pos+j3b),
3791 load_xyz(pos+j3c),
3792 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
3793 dOx = vec_sub(iOx,dH2x);
3794 dOy = vec_sub(iOy,dH2y);
3795 dOz = vec_sub(iOz,dH2z);
3796 dH1x = vec_sub(iH1x,dH2x);
3797 dH1y = vec_sub(iH1y,dH2y);
3798 dH1z = vec_sub(iH1z,dH2z);
3799 dH2x = vec_sub(iH2x,dH2x);
3800 dH2y = vec_sub(iH2y,dH2y);
3801 dH2z = vec_sub(iH2z,dH2z);
3803 rsqO = vec_madd(dOx,dOx,nul);
3804 rsqH1 = vec_madd(dH1x,dH1x,nul);
3805 rsqH2 = vec_madd(dH2x,dH2x,nul);
3806 rsqO = vec_madd(dOy,dOy,rsqO);
3807 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3808 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3809 rsqO = vec_madd(dOz,dOz,rsqO);
3810 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3811 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3812 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3813 rinvsqO = vec_madd(rinvO,rinvO,nul);
3814 rO = vec_madd(rsqO,rinvO,nul);
3815 rH1 = vec_madd(rsqH1,rinvH1,nul);
3816 rH2 = vec_madd(rsqH2,rinvH2,nul);
3817 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
3818 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
3819 tja = ntiA+2*type[jnra];
3820 tjb = ntiA+2*type[jnrb];
3821 tjc = ntiA+2*type[jnrc];
3822 tjd = ntiA+2*type[jnrd];
3823 /* load 4 j charges and multiply by iq */
3824 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
3825 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
3826 do_4_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3827 do_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3828 do_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3829 vnb6 = vec_madd(c6,rinvsix,nul);
3830 qqO = vec_madd(iqO,jq,nul);
3831 qqH = vec_madd(iqH,jq,nul);
3832 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
3833 vctot = vec_madd(qqO,VVcO,vctot);
3834 fsO = vec_madd(vec_twelve(),vnb12,nul);
3835 tmp1 = vec_madd(qqO,FFcO,nul);
3836 vnbtot = vec_add(vnbtot,vnb12);
3837 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3838 vnbtot = vec_sub(vnbtot,vnb6);
3839 fsO = vec_madd(fsO,rinvO,nul);
3840 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3841 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3842 fsO = vec_nmsub(tmp1,tsc,fsO);
3843 vctot = vec_madd(qqH,VVcH1,vctot);
3844 fsH1 = vec_madd(fsH1,tsc,nul);
3845 fsH2 = vec_madd(fsH2,tsc,nul);
3846 vctot = vec_madd(qqH,VVcH2,vctot);
3847 fsO = vec_madd(fsO,rinvO,nul);
3848 fsH1 = vec_madd(fsH1,rinvH1,nul);
3849 fsH2 = vec_madd(fsH2,rinvH2,nul);
3851 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3852 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3853 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3854 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3855 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3856 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3857 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3858 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3859 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3860 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3861 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3862 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3863 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3864 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3865 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3866 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3867 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3868 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3870 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
3871 add_xyz_to_mem(faction+j3a,tmp1);
3872 add_xyz_to_mem(faction+j3b,tmp2);
3873 add_xyz_to_mem(faction+j3c,tmp3);
3874 add_xyz_to_mem(faction+j3d,tmp4);
3876 if(k<(nj1-2)) {
3877 jnra = jjnr[k];
3878 jnrb = jjnr[k+1];
3879 jnrc = jjnr[k+2];
3880 j3a = 3*jnra;
3881 j3b = 3*jnrb;
3882 j3c = 3*jnrc;
3883 transpose_4_to_3(load_xyz(pos+j3a),
3884 load_xyz(pos+j3b),
3885 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
3886 dOx = vec_sub(iOx,dH2x);
3887 dOy = vec_sub(iOy,dH2y);
3888 dOz = vec_sub(iOz,dH2z);
3889 dH1x = vec_sub(iH1x,dH2x);
3890 dH1y = vec_sub(iH1y,dH2y);
3891 dH1z = vec_sub(iH1z,dH2z);
3892 dH2x = vec_sub(iH2x,dH2x);
3893 dH2y = vec_sub(iH2y,dH2y);
3894 dH2z = vec_sub(iH2z,dH2z);
3896 rsqO = vec_madd(dOx,dOx,nul);
3897 rsqH1 = vec_madd(dH1x,dH1x,nul);
3898 rsqH2 = vec_madd(dH2x,dH2x,nul);
3899 rsqO = vec_madd(dOy,dOy,rsqO);
3900 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3901 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3902 rsqO = vec_madd(dOz,dOz,rsqO);
3903 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3904 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3906 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3907 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3908 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
3910 rinvsqO = vec_madd(rinvO,rinvO,nul);
3911 rO = vec_madd(rsqO,rinvO,nul);
3912 rH1 = vec_madd(rsqH1,rinvH1,nul);
3913 rH2 = vec_madd(rsqH2,rinvH2,nul);
3914 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
3915 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
3916 tja = ntiA+2*type[jnra];
3917 tjb = ntiA+2*type[jnrb];
3918 tjc = ntiA+2*type[jnrc];
3919 /* load 3 j charges and multiply by iq */
3920 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
3921 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
3922 do_3_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
3923 do_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
3924 do_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
3925 vnb6 = vec_madd(c6,rinvsix,nul);
3926 qqO = vec_madd(iqO,jq,nul);
3927 qqH = vec_madd(iqH,jq,nul);
3928 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
3929 vctot = vec_madd(qqO,VVcO,vctot);
3930 fsO = vec_madd(vec_twelve(),vnb12,nul);
3931 tmp1 = vec_madd(qqO,FFcO,nul);
3932 vnbtot = vec_add(vnbtot,vnb12);
3933 fsO = vec_nmsub(vec_six(),vnb6,fsO);
3934 vnbtot = vec_sub(vnbtot,vnb6);
3935 fsO = vec_madd(fsO,rinvO,nul);
3936 fsH1 = vec_nmsub(qqH,FFcH1,nul);
3937 fsH2 = vec_nmsub(qqH,FFcH2,nul);
3938 fsO = vec_nmsub(tmp1,tsc,fsO);
3939 vctot = vec_madd(qqH,VVcH1,vctot);
3940 fsH1 = vec_madd(fsH1,tsc,nul);
3941 fsH2 = vec_madd(fsH2,tsc,nul);
3942 vctot = vec_madd(qqH,VVcH2,vctot);
3943 fsO = vec_madd(fsO,rinvO,nul);
3944 fsH1 = vec_madd(fsH1,rinvH1,nul);
3945 fsH2 = vec_madd(fsH2,rinvH2,nul);
3947 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
3948 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
3949 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
3950 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
3951 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
3952 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
3953 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
3954 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
3955 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
3956 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
3957 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
3958 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
3959 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
3960 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
3961 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
3962 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
3963 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
3964 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
3966 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
3967 add_xyz_to_mem(faction+j3a,tmp1);
3968 add_xyz_to_mem(faction+j3b,tmp2);
3969 add_xyz_to_mem(faction+j3c,tmp3);
3970 } else if(k<(nj1-1)) {
3971 jnra = jjnr[k];
3972 jnrb = jjnr[k+1];
3973 j3a = 3*jnra;
3974 j3b = 3*jnrb;
3975 transpose_2_to_3(load_xyz(pos+j3a),
3976 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
3977 dOx = vec_sub(iOx,dH2x);
3978 dOy = vec_sub(iOy,dH2y);
3979 dOz = vec_sub(iOz,dH2z);
3980 dH1x = vec_sub(iH1x,dH2x);
3981 dH1y = vec_sub(iH1y,dH2y);
3982 dH1z = vec_sub(iH1z,dH2z);
3983 dH2x = vec_sub(iH2x,dH2x);
3984 dH2y = vec_sub(iH2y,dH2y);
3985 dH2z = vec_sub(iH2z,dH2z);
3987 rsqO = vec_madd(dOx,dOx,nul);
3988 rsqH1 = vec_madd(dH1x,dH1x,nul);
3989 rsqH2 = vec_madd(dH2x,dH2x,nul);
3990 rsqO = vec_madd(dOy,dOy,rsqO);
3991 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
3992 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
3993 rsqO = vec_madd(dOz,dOz,rsqO);
3994 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
3995 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
3997 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
3998 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
3999 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
4001 rinvsqO = vec_madd(rinvO,rinvO,nul);
4002 rO = vec_madd(rsqO,rinvO,nul);
4003 rH1 = vec_madd(rsqH1,rinvH1,nul);
4004 rH2 = vec_madd(rsqH2,rinvH2,nul);
4005 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
4006 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
4007 tja = ntiA+2*type[jnra];
4008 tjb = ntiA+2*type[jnrb];
4009 /* load 2 j charges and multiply by iq */
4010 jq=load_2_float(charge+jnra,charge+jnrb);
4011 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
4012 do_2_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
4013 do_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4014 do_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4015 vnb6 = vec_madd(c6,rinvsix,nul);
4016 qqO = vec_madd(iqO,jq,nul);
4017 qqH = vec_madd(iqH,jq,nul);
4018 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
4019 vctot = vec_madd(qqO,VVcO,vctot);
4020 fsO = vec_madd(vec_twelve(),vnb12,nul);
4021 tmp1 = vec_madd(qqO,FFcO,nul);
4022 vnbtot = vec_add(vnbtot,vnb12);
4023 fsO = vec_nmsub(vec_six(),vnb6,fsO);
4024 vnbtot = vec_sub(vnbtot,vnb6);
4025 fsO = vec_madd(fsO,rinvO,nul);
4026 fsH1 = vec_nmsub(qqH,FFcH1,nul);
4027 fsH2 = vec_nmsub(qqH,FFcH2,nul);
4028 fsO = vec_nmsub(tmp1,tsc,fsO);
4029 vctot = vec_madd(qqH,VVcH1,vctot);
4030 fsH1 = vec_madd(fsH1,tsc,nul);
4031 fsH2 = vec_madd(fsH2,tsc,nul);
4032 vctot = vec_madd(qqH,VVcH2,vctot);
4033 fsO = vec_madd(fsO,rinvO,nul);
4034 fsH1 = vec_madd(fsH1,rinvH1,nul);
4035 fsH2 = vec_madd(fsH2,rinvH2,nul);
4037 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4038 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4039 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4040 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4041 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4042 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4043 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4044 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4045 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4046 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4047 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4048 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4049 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4050 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4051 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4052 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4053 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4054 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4056 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
4057 add_xyz_to_mem(faction+j3a,tmp1);
4058 add_xyz_to_mem(faction+j3b,tmp2);
4059 } else if(k<nj1) {
4060 jnra = jjnr[k];
4061 j3a = 3*jnra;
4062 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
4063 dOx = vec_sub(iOx,dH2x);
4064 dOy = vec_sub(iOy,dH2y);
4065 dOz = vec_sub(iOz,dH2z);
4066 dH1x = vec_sub(iH1x,dH2x);
4067 dH1y = vec_sub(iH1y,dH2y);
4068 dH1z = vec_sub(iH1z,dH2z);
4069 dH2x = vec_sub(iH2x,dH2x);
4070 dH2y = vec_sub(iH2y,dH2y);
4071 dH2z = vec_sub(iH2z,dH2z);
4073 rsqO = vec_madd(dOx,dOx,nul);
4074 rsqH1 = vec_madd(dH1x,dH1x,nul);
4075 rsqH2 = vec_madd(dH2x,dH2x,nul);
4076 rsqO = vec_madd(dOy,dOy,rsqO);
4077 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
4078 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
4079 rsqO = vec_madd(dOz,dOz,rsqO);
4080 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
4081 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
4083 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
4084 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
4085 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
4087 rinvsqO = vec_madd(rinvO,rinvO,nul);
4088 rO = vec_madd(rsqO,rinvO,nul);
4089 rH1 = vec_madd(rsqH1,rinvH1,nul);
4090 rH2 = vec_madd(rsqH2,rinvH2,nul);
4091 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
4092 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
4093 tja = ntiA+2*type[jnra];
4094 /* load 1 j charges and multiply by iq */
4095 jq=load_1_float(charge+jnra);
4096 load_1_pair(nbfp+tja,&c6,&c12);
4097 do_1_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO);
4098 do_1_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4099 do_1_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4100 vnb6 = vec_madd(c6,rinvsix,nul);
4101 qqO = vec_madd(iqO,jq,nul);
4102 qqH = vec_madd(iqH,jq,nul);
4103 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
4104 vctot = vec_madd(qqO,VVcO,vctot);
4105 fsO = vec_madd(vec_twelve(),vnb12,nul);
4106 tmp1 = vec_madd(qqO,FFcO,nul);
4107 vnbtot = vec_add(vnbtot,vnb12);
4108 fsO = vec_nmsub(vec_six(),vnb6,fsO);
4109 vnbtot = vec_sub(vnbtot,vnb6);
4110 fsO = vec_madd(fsO,rinvO,nul);
4111 fsH1 = vec_nmsub(qqH,FFcH1,nul);
4112 fsH2 = vec_nmsub(qqH,FFcH2,nul);
4113 fsO = vec_nmsub(tmp1,tsc,fsO);
4114 vctot = vec_madd(qqH,VVcH1,vctot);
4115 fsH1 = vec_madd(fsH1,tsc,nul);
4116 fsH2 = vec_madd(fsH2,tsc,nul);
4117 vctot = vec_madd(qqH,VVcH2,vctot);
4118 fsO = vec_madd(fsO,rinvO,nul);
4119 fsH1 = vec_madd(fsH1,rinvH1,nul);
4120 fsH2 = vec_madd(fsH2,rinvH2,nul);
4122 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4123 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4124 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4125 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4126 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4127 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4128 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4129 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4130 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4131 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4132 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4133 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4134 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4135 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4136 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4137 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4138 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4139 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4141 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
4142 add_xyz_to_mem(faction+j3a,tmp1);
4144 /* update outer data */
4145 update_i_water_forces(faction+ii3,fshift+is3,
4146 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
4148 add_vector_to_float(Vc+gid[n],vctot);
4149 add_vector_to_float(Vnb+gid[n],vnbtot);
4154 void inl3320_altivec(
4155 int nri,
4156 int iinr[],
4157 int jindex[],
4158 int jjnr[],
4159 int shift[],
4160 float shiftvec[],
4161 float fshift[],
4162 int gid[],
4163 float pos[],
4164 float faction[],
4165 float charge[],
4166 float facel,
4167 float Vc[],
4168 int type[],
4169 int ntype,
4170 float nbfp[],
4171 float Vnb[],
4172 float tabscale,
4173 float VFtab[])
4175 vector float tsc;
4176 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
4177 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
4178 vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
4179 vector float vnbtot,c6,c12;
4180 vector float fsO,fsH1,fsH2;
4181 vector float vctot,qqO,qqH,iqO,iqH,jq;
4182 vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
4183 vector float tmp1,tmp2,tmp3,tmp4;
4184 vector float rinvO,rinvH1,rinvH2,rsqO,rsqH1,rsqH2;
4185 vector float rO,rH1,rH2,VVcO,FFcO,VVcH1,FFcH1,VVcH2,FFcH2,VVd,FFd,VVr,FFr;
4187 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
4188 int jnra,jnrb,jnrc,jnrd;
4189 int j3a,j3b,j3c,j3d;
4190 int tja,tjb,tjc,tjd;
4192 nul=vec_zero();
4193 tsc=load_float_and_splat(&tabscale);
4194 vfacel=load_float_and_splat(&facel);
4196 ii = iinr[0];
4197 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
4198 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
4199 ntiA = 2*ntype*type[ii];
4201 for(n=0;n<nri;n++) {
4202 is3 = 3*shift[n];
4203 ii = iinr[n];
4204 ii3 = 3*ii;
4205 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
4206 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
4207 vctot = nul;
4208 vnbtot = nul;
4209 fiOx = nul;
4210 fiOy = nul;
4211 fiOz = nul;
4212 fiH1x = nul;
4213 fiH1y = nul;
4214 fiH1z = nul;
4215 fiH2x = nul;
4216 fiH2y = nul;
4217 fiH2z = nul;
4218 nj0 = jindex[n];
4219 nj1 = jindex[n+1];
4221 for(k=nj0; k<(nj1-3); k+=4) {
4222 jnra = jjnr[k];
4223 jnrb = jjnr[k+1];
4224 jnrc = jjnr[k+2];
4225 jnrd = jjnr[k+3];
4226 j3a = 3*jnra;
4227 j3b = 3*jnrb;
4228 j3c = 3*jnrc;
4229 j3d = 3*jnrd;
4230 transpose_4_to_3(load_xyz(pos+j3a),
4231 load_xyz(pos+j3b),
4232 load_xyz(pos+j3c),
4233 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
4234 dOx = vec_sub(iOx,dH2x);
4235 dOy = vec_sub(iOy,dH2y);
4236 dOz = vec_sub(iOz,dH2z);
4237 dH1x = vec_sub(iH1x,dH2x);
4238 dH1y = vec_sub(iH1y,dH2y);
4239 dH1z = vec_sub(iH1z,dH2z);
4240 dH2x = vec_sub(iH2x,dH2x);
4241 dH2y = vec_sub(iH2y,dH2y);
4242 dH2z = vec_sub(iH2z,dH2z);
4244 rsqO = vec_madd(dOx,dOx,nul);
4245 rsqH1 = vec_madd(dH1x,dH1x,nul);
4246 rsqH2 = vec_madd(dH2x,dH2x,nul);
4247 rsqO = vec_madd(dOy,dOy,rsqO);
4248 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
4249 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
4250 rsqO = vec_madd(dOz,dOz,rsqO);
4251 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
4252 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
4253 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
4254 rO = vec_madd(rsqO,rinvO,nul);
4255 rH1 = vec_madd(rsqH1,rinvH1,nul);
4256 rH2 = vec_madd(rsqH2,rinvH2,nul);
4257 tja = ntiA+2*type[jnra];
4258 tjb = ntiA+2*type[jnrb];
4259 tjc = ntiA+2*type[jnrc];
4260 tjd = ntiA+2*type[jnrd];
4261 /* load 4 j charges and multiply by iq */
4262 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
4263 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
4264 do_4_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO,&VVd,&FFd,&VVr,&FFr);
4265 do_4_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4266 do_4_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4267 qqO = vec_madd(iqO,jq,nul);
4268 qqH = vec_madd(iqH,jq,nul);
4269 fsO = vec_madd(qqO,FFcO,nul);
4270 vctot = vec_madd(qqO,VVcO,vctot);
4271 vnbtot = vec_madd(c6,VVd,vnbtot);
4272 fsO = vec_madd(c6,FFd,fsO);
4273 fsH1 = vec_madd(qqH,FFcH1,nul);
4274 fsH2 = vec_madd(qqH,FFcH2,nul);
4275 fsO = vec_madd(c12,FFr,fsO);
4276 vctot = vec_madd(qqH,VVcH1,vctot);
4277 vnbtot = vec_madd(c12,VVr,vnbtot);
4278 fsO = vec_nmsub(fsO,tsc,nul);
4279 vctot = vec_madd(qqH,VVcH2,vctot);
4280 fsH1 = vec_nmsub(fsH1,tsc,nul);
4281 fsH2 = vec_nmsub(fsH2,tsc,nul);
4282 fsO = vec_madd(fsO,rinvO,nul);
4283 fsH1 = vec_madd(fsH1,rinvH1,nul);
4284 fsH2 = vec_madd(fsH2,rinvH2,nul);
4286 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4287 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4288 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4289 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4290 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4291 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4292 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4293 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4294 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4295 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4296 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4297 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4298 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4299 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4300 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4301 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4302 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4303 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4305 transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
4306 add_xyz_to_mem(faction+j3a,tmp1);
4307 add_xyz_to_mem(faction+j3b,tmp2);
4308 add_xyz_to_mem(faction+j3c,tmp3);
4309 add_xyz_to_mem(faction+j3d,tmp4);
4311 if(k<(nj1-2)) {
4312 jnra = jjnr[k];
4313 jnrb = jjnr[k+1];
4314 jnrc = jjnr[k+2];
4315 j3a = 3*jnra;
4316 j3b = 3*jnrb;
4317 j3c = 3*jnrc;
4318 transpose_4_to_3(load_xyz(pos+j3a),
4319 load_xyz(pos+j3b),
4320 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
4321 dOx = vec_sub(iOx,dH2x);
4322 dOy = vec_sub(iOy,dH2y);
4323 dOz = vec_sub(iOz,dH2z);
4324 dH1x = vec_sub(iH1x,dH2x);
4325 dH1y = vec_sub(iH1y,dH2y);
4326 dH1z = vec_sub(iH1z,dH2z);
4327 dH2x = vec_sub(iH2x,dH2x);
4328 dH2y = vec_sub(iH2y,dH2y);
4329 dH2z = vec_sub(iH2z,dH2z);
4331 rsqO = vec_madd(dOx,dOx,nul);
4332 rsqH1 = vec_madd(dH1x,dH1x,nul);
4333 rsqH2 = vec_madd(dH2x,dH2x,nul);
4334 rsqO = vec_madd(dOy,dOy,rsqO);
4335 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
4336 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
4337 rsqO = vec_madd(dOz,dOz,rsqO);
4338 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
4339 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
4341 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
4342 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
4343 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
4345 rO = vec_madd(rsqO,rinvO,nul);
4346 rH1 = vec_madd(rsqH1,rinvH1,nul);
4347 rH2 = vec_madd(rsqH2,rinvH2,nul);
4348 tja = ntiA+2*type[jnra];
4349 tjb = ntiA+2*type[jnrb];
4350 tjc = ntiA+2*type[jnrc];
4352 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
4353 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
4354 do_3_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO,&VVd,&FFd,&VVr,&FFr);
4355 do_3_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4356 do_3_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4357 qqO = vec_madd(iqO,jq,nul);
4358 qqH = vec_madd(iqH,jq,nul);
4359 fsO = vec_madd(qqO,FFcO,nul);
4360 vctot = vec_madd(qqO,VVcO,vctot);
4361 vnbtot = vec_madd(c6,VVd,vnbtot);
4362 fsO = vec_madd(c6,FFd,fsO);
4363 fsH1 = vec_madd(qqH,FFcH1,nul);
4364 fsH2 = vec_madd(qqH,FFcH2,nul);
4365 fsO = vec_madd(c12,FFr,fsO);
4366 vctot = vec_madd(qqH,VVcH1,vctot);
4367 vnbtot = vec_madd(c12,VVr,vnbtot);
4368 fsO = vec_nmsub(fsO,tsc,nul);
4369 vctot = vec_madd(qqH,VVcH2,vctot);
4370 fsH1 = vec_nmsub(fsH1,tsc,nul);
4371 fsH2 = vec_nmsub(fsH2,tsc,nul);
4372 fsO = vec_madd(fsO,rinvO,nul);
4373 fsH1 = vec_madd(fsH1,rinvH1,nul);
4374 fsH2 = vec_madd(fsH2,rinvH2,nul);
4376 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4377 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4378 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4379 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4380 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4381 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4382 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4383 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4384 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4385 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4386 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4387 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4388 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4389 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4390 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4391 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4392 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4393 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4395 transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
4396 add_xyz_to_mem(faction+j3a,tmp1);
4397 add_xyz_to_mem(faction+j3b,tmp2);
4398 add_xyz_to_mem(faction+j3c,tmp3);
4399 } else if(k<(nj1-1)) {
4400 jnra = jjnr[k];
4401 jnrb = jjnr[k+1];
4402 j3a = 3*jnra;
4403 j3b = 3*jnrb;
4404 transpose_2_to_3(load_xyz(pos+j3a),
4405 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
4406 dOx = vec_sub(iOx,dH2x);
4407 dOy = vec_sub(iOy,dH2y);
4408 dOz = vec_sub(iOz,dH2z);
4409 dH1x = vec_sub(iH1x,dH2x);
4410 dH1y = vec_sub(iH1y,dH2y);
4411 dH1z = vec_sub(iH1z,dH2z);
4412 dH2x = vec_sub(iH2x,dH2x);
4413 dH2y = vec_sub(iH2y,dH2y);
4414 dH2z = vec_sub(iH2z,dH2z);
4416 rsqO = vec_madd(dOx,dOx,nul);
4417 rsqH1 = vec_madd(dH1x,dH1x,nul);
4418 rsqH2 = vec_madd(dH2x,dH2x,nul);
4419 rsqO = vec_madd(dOy,dOy,rsqO);
4420 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
4421 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
4422 rsqO = vec_madd(dOz,dOz,rsqO);
4423 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
4424 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
4426 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
4427 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
4428 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
4430 rO = vec_madd(rsqO,rinvO,nul);
4431 rH1 = vec_madd(rsqH1,rinvH1,nul);
4432 rH2 = vec_madd(rsqH2,rinvH2,nul);
4433 tja = ntiA+2*type[jnra];
4434 tjb = ntiA+2*type[jnrb];
4435 /* load 2 j charges and multiply by iq */
4436 jq=load_2_float(charge+jnra,charge+jnrb);
4437 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
4438 do_2_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO,&VVd,&FFd,&VVr,&FFr);
4439 do_2_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4440 do_2_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4441 qqO = vec_madd(iqO,jq,nul);
4442 qqH = vec_madd(iqH,jq,nul);
4443 fsO = vec_madd(qqO,FFcO,nul);
4444 vctot = vec_madd(qqO,VVcO,vctot);
4445 vnbtot = vec_madd(c6,VVd,vnbtot);
4446 fsO = vec_madd(c6,FFd,fsO);
4447 fsH1 = vec_madd(qqH,FFcH1,nul);
4448 fsH2 = vec_madd(qqH,FFcH2,nul);
4449 fsO = vec_madd(c12,FFr,fsO);
4450 vctot = vec_madd(qqH,VVcH1,vctot);
4451 vnbtot = vec_madd(c12,VVr,vnbtot);
4452 fsO = vec_nmsub(fsO,tsc,nul);
4453 vctot = vec_madd(qqH,VVcH2,vctot);
4454 fsH1 = vec_nmsub(fsH1,tsc,nul);
4455 fsH2 = vec_nmsub(fsH2,tsc,nul);
4456 fsO = vec_madd(fsO,rinvO,nul);
4457 fsH1 = vec_madd(fsH1,rinvH1,nul);
4458 fsH2 = vec_madd(fsH2,rinvH2,nul);
4460 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4461 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4462 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4463 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4464 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4465 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4466 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4467 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4468 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4469 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4470 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4471 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4472 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4473 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4474 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4475 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4476 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4477 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4479 transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
4480 add_xyz_to_mem(faction+j3a,tmp1);
4481 add_xyz_to_mem(faction+j3b,tmp2);
4482 } else if(k<nj1) {
4483 jnra = jjnr[k];
4484 j3a = 3*jnra;
4485 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
4486 dOx = vec_sub(iOx,dH2x);
4487 dOy = vec_sub(iOy,dH2y);
4488 dOz = vec_sub(iOz,dH2z);
4489 dH1x = vec_sub(iH1x,dH2x);
4490 dH1y = vec_sub(iH1y,dH2y);
4491 dH1z = vec_sub(iH1z,dH2z);
4492 dH2x = vec_sub(iH2x,dH2x);
4493 dH2y = vec_sub(iH2y,dH2y);
4494 dH2z = vec_sub(iH2z,dH2z);
4496 rsqO = vec_madd(dOx,dOx,nul);
4497 rsqH1 = vec_madd(dH1x,dH1x,nul);
4498 rsqH2 = vec_madd(dH2x,dH2x,nul);
4499 rsqO = vec_madd(dOy,dOy,rsqO);
4500 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
4501 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
4502 rsqO = vec_madd(dOz,dOz,rsqO);
4503 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
4504 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
4506 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
4507 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
4508 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
4510 rO = vec_madd(rsqO,rinvO,nul);
4511 rH1 = vec_madd(rsqH1,rinvH1,nul);
4512 rH2 = vec_madd(rsqH2,rinvH2,nul);
4513 tja = ntiA+2*type[jnra];
4514 /* load 1 j charges and multiply by iq */
4515 jq=load_1_float(charge+jnra);
4516 load_1_pair(nbfp+tja,&c6,&c12);
4517 do_1_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO,&VVd,&FFd,&VVr,&FFr);
4518 do_1_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1);
4519 do_1_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2);
4520 qqO = vec_madd(iqO,jq,nul);
4521 qqH = vec_madd(iqH,jq,nul);
4522 fsO = vec_madd(qqO,FFcO,nul);
4523 vctot = vec_madd(qqO,VVcO,vctot);
4524 vnbtot = vec_madd(c6,VVd,vnbtot);
4525 fsO = vec_madd(c6,FFd,fsO);
4526 fsH1 = vec_madd(qqH,FFcH1,nul);
4527 fsH2 = vec_madd(qqH,FFcH2,nul);
4528 fsO = vec_madd(c12,FFr,fsO);
4529 vctot = vec_madd(qqH,VVcH1,vctot);
4530 vnbtot = vec_madd(c12,VVr,vnbtot);
4531 fsO = vec_nmsub(fsO,tsc,nul);
4532 vctot = vec_madd(qqH,VVcH2,vctot);
4533 fsH1 = vec_nmsub(fsH1,tsc,nul);
4534 fsH2 = vec_nmsub(fsH2,tsc,nul);
4535 fsO = vec_madd(fsO,rinvO,nul);
4536 fsH1 = vec_madd(fsH1,rinvH1,nul);
4537 fsH2 = vec_madd(fsH2,rinvH2,nul);
4539 fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */
4540 dOx = vec_nmsub(fsO,dOx,nul); /* -fx */
4541 fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */
4542 dOy = vec_nmsub(fsO,dOy,nul); /* -fy */
4543 fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */
4544 dOz = vec_nmsub(fsO,dOz,nul); /* -fz */
4545 fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
4546 dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
4547 fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
4548 dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
4549 fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
4550 dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
4551 fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
4552 dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
4553 fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
4554 dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
4555 fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
4556 dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */
4558 transpose_3_to_1(dOx,dOy,dOz,&tmp1);
4559 add_xyz_to_mem(faction+j3a,tmp1);
4561 /* update outer data */
4562 update_i_water_forces(faction+ii3,fshift+is3,
4563 fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z);
4565 add_vector_to_float(Vc+gid[n],vctot);
4566 add_vector_to_float(Vnb+gid[n],vnbtot);
4573 void inl1030_altivec(
4574 int nri,
4575 int iinr[],
4576 int jindex[],
4577 int jjnr[],
4578 int shift[],
4579 float shiftvec[],
4580 float fshift[],
4581 int gid[],
4582 float pos[],
4583 float faction[],
4584 float charge[],
4585 float facel,
4586 float Vc[])
4588 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
4589 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
4591 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
4592 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
4593 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
4595 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
4596 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
4597 vector float rinvsq11,rinvsq12,rinvsq13;
4598 vector float rinvsq21,rinvsq22,rinvsq23;
4599 vector float rinvsq31,rinvsq32,rinvsq33;
4600 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33;
4602 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
4603 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33;
4604 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
4605 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
4606 vector float vctot,qqOO,qqOH,qqHH,qO,qH,qqOOt,qqOHt,qqHHt;
4610 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
4611 int jnra,jnrb,jnrc,jnrd;
4612 int j3a,j3b,j3c,j3d;
4614 nul=vec_zero();
4615 vfacel=load_float_and_splat(&facel);
4616 qO = load_float_and_splat(charge+iinr[0]);
4617 qH = load_float_and_splat(charge+iinr[0]+1);
4618 qqOO = vec_madd(qO,qO,nul);
4619 qqOH = vec_madd(qO,qH,nul);
4620 qqHH = vec_madd(qH,qH,nul);
4621 qqOO = vec_madd(qqOO,vfacel,nul);
4622 qqOH = vec_madd(qqOH,vfacel,nul);
4623 qqHH = vec_madd(qqHH,vfacel,nul);
4625 for(n=0;n<nri;n++) {
4626 is3 = 3*shift[n];
4627 ii = iinr[n];
4628 ii3 = 3*ii;
4629 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
4630 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
4631 vctot = nul;
4632 fix1 = nul;
4633 fiy1 = nul;
4634 fiz1 = nul;
4635 fix2 = nul;
4636 fiy2 = nul;
4637 fiz2 = nul;
4638 fix3 = nul;
4639 fiy3 = nul;
4640 fiz3 = nul;
4641 nj0 = jindex[n];
4642 nj1 = jindex[n+1];
4644 for(k=nj0; k<(nj1-3); k+=4) {
4645 jnra = jjnr[k];
4646 jnrb = jjnr[k+1];
4647 jnrc = jjnr[k+2];
4648 jnrd = jjnr[k+3];
4649 j3a = 3*jnra;
4650 j3b = 3*jnrb;
4651 j3c = 3*jnrc;
4652 j3d = 3*jnrd;
4653 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
4654 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
4656 dx11 = vec_sub(ix1,jx1);
4657 dx12 = vec_sub(ix1,jx2);
4658 dx13 = vec_sub(ix1,jx3);
4659 dy11 = vec_sub(iy1,jy1);
4660 dy12 = vec_sub(iy1,jy2);
4661 dy13 = vec_sub(iy1,jy3);
4662 dz11 = vec_sub(iz1,jz1);
4663 dz12 = vec_sub(iz1,jz2);
4664 dz13 = vec_sub(iz1,jz3);
4665 dx21 = vec_sub(ix2,jx1);
4666 dx22 = vec_sub(ix2,jx2);
4667 dx23 = vec_sub(ix2,jx3);
4668 dy21 = vec_sub(iy2,jy1);
4669 dy22 = vec_sub(iy2,jy2);
4670 dy23 = vec_sub(iy2,jy3);
4671 dz21 = vec_sub(iz2,jz1);
4672 dz22 = vec_sub(iz2,jz2);
4673 dz23 = vec_sub(iz2,jz3);
4674 dx31 = vec_sub(ix3,jx1);
4675 dx32 = vec_sub(ix3,jx2);
4676 dx33 = vec_sub(ix3,jx3);
4677 dy31 = vec_sub(iy3,jy1);
4678 dy32 = vec_sub(iy3,jy2);
4679 dy33 = vec_sub(iy3,jy3);
4680 dz31 = vec_sub(iz3,jz1);
4681 dz32 = vec_sub(iz3,jz2);
4682 dz33 = vec_sub(iz3,jz3);
4684 rsq11 = vec_madd(dx11,dx11,nul);
4685 rsq12 = vec_madd(dx12,dx12,nul);
4686 rsq13 = vec_madd(dx13,dx13,nul);
4687 rsq21 = vec_madd(dx21,dx21,nul);
4688 rsq22 = vec_madd(dx22,dx22,nul);
4689 rsq23 = vec_madd(dx23,dx23,nul);
4690 rsq31 = vec_madd(dx31,dx31,nul);
4691 rsq32 = vec_madd(dx32,dx32,nul);
4692 rsq33 = vec_madd(dx33,dx33,nul);
4693 rsq11 = vec_madd(dy11,dy11,rsq11);
4694 rsq12 = vec_madd(dy12,dy12,rsq12);
4695 rsq13 = vec_madd(dy13,dy13,rsq13);
4696 rsq21 = vec_madd(dy21,dy21,rsq21);
4697 rsq22 = vec_madd(dy22,dy22,rsq22);
4698 rsq23 = vec_madd(dy23,dy23,rsq23);
4699 rsq31 = vec_madd(dy31,dy31,rsq31);
4700 rsq32 = vec_madd(dy32,dy32,rsq32);
4701 rsq33 = vec_madd(dy33,dy33,rsq33);
4702 rsq11 = vec_madd(dz11,dz11,rsq11);
4703 rsq12 = vec_madd(dz12,dz12,rsq12);
4704 rsq13 = vec_madd(dz13,dz13,rsq13);
4705 rsq21 = vec_madd(dz21,dz21,rsq21);
4706 rsq22 = vec_madd(dz22,dz22,rsq22);
4707 rsq23 = vec_madd(dz23,dz23,rsq23);
4708 rsq31 = vec_madd(dz31,dz31,rsq31);
4709 rsq32 = vec_madd(dz32,dz32,rsq32);
4710 rsq33 = vec_madd(dz33,dz33,rsq33);
4712 do_9_invsqrt(rsq11,rsq12,rsq13,
4713 rsq21,rsq22,rsq23,
4714 rsq31,rsq32,rsq33,
4715 &rinv11,&rinv12,&rinv13,
4716 &rinv21,&rinv22,&rinv23,
4717 &rinv31,&rinv32,&rinv33);
4719 rinvsq11 = vec_madd(rinv11,rinv11,nul);
4720 rinvsq12 = vec_madd(rinv12,rinv12,nul);
4721 rinvsq13 = vec_madd(rinv13,rinv13,nul);
4722 rinvsq21 = vec_madd(rinv21,rinv21,nul);
4723 rinvsq22 = vec_madd(rinv22,rinv22,nul);
4724 rinvsq23 = vec_madd(rinv23,rinv23,nul);
4725 rinvsq31 = vec_madd(rinv31,rinv31,nul);
4726 rinvsq32 = vec_madd(rinv32,rinv32,nul);
4727 rinvsq33 = vec_madd(rinv33,rinv33,nul);
4729 vc11 = vec_madd(rinv11,qqOO,nul);
4730 vc12 = vec_madd(rinv12,qqOH,nul);
4731 vc13 = vec_madd(rinv13,qqOH,nul);
4732 vc21 = vec_madd(rinv21,qqOH,nul);
4733 vc22 = vec_madd(rinv22,qqHH,nul);
4734 vc23 = vec_madd(rinv23,qqHH,nul);
4735 vc31 = vec_madd(rinv31,qqOH,nul);
4736 vc32 = vec_madd(rinv32,qqHH,nul);
4737 vc33 = vec_madd(rinv33,qqHH,nul);
4739 fs11 = vec_madd(vc11,rinvsq11,nul);
4740 fs12 = vec_madd(vc12,rinvsq12,nul);
4741 fs13 = vec_madd(vc13,rinvsq13,nul);
4742 fs21 = vec_madd(vc21,rinvsq21,nul);
4743 fs22 = vec_madd(vc22,rinvsq22,nul);
4744 fs23 = vec_madd(vc23,rinvsq23,nul);
4745 fs31 = vec_madd(vc31,rinvsq31,nul);
4746 fs32 = vec_madd(vc32,rinvsq32,nul);
4747 fs33 = vec_madd(vc33,rinvsq33,nul);
4749 vctot = vec_add(vctot,vc11);
4750 vc12 = vec_add(vc12,vc13);
4751 vc21 = vec_add(vc21,vc22);
4752 vc23 = vec_add(vc23,vc31);
4753 vc32 = vec_add(vc32,vc33);
4754 vctot = vec_add(vctot,vc12);
4755 vc21 = vec_add(vc21,vc23);
4756 vctot = vec_add(vctot,vc32);
4757 vctot = vec_add(vctot,vc21);
4759 fix1 = vec_madd(fs11,dx11,fix1);
4760 fiy1 = vec_madd(fs11,dy11,fiy1);
4761 fiz1 = vec_madd(fs11,dz11,fiz1);
4762 fix2 = vec_madd(fs21,dx21,fix2);
4763 fiy2 = vec_madd(fs21,dy21,fiy2);
4764 fiz2 = vec_madd(fs21,dz21,fiz2);
4765 fix3 = vec_madd(fs31,dx31,fix3);
4766 fiy3 = vec_madd(fs31,dy31,fiy3);
4767 fiz3 = vec_madd(fs31,dz31,fiz3);
4769 fix1 = vec_madd(fs12,dx12,fix1);
4770 fiy1 = vec_madd(fs12,dy12,fiy1);
4771 fiz1 = vec_madd(fs12,dz12,fiz1);
4772 fix2 = vec_madd(fs22,dx22,fix2);
4773 fiy2 = vec_madd(fs22,dy22,fiy2);
4774 fiz2 = vec_madd(fs22,dz22,fiz2);
4775 fix3 = vec_madd(fs32,dx32,fix3);
4776 fiy3 = vec_madd(fs32,dy32,fiy3);
4777 fiz3 = vec_madd(fs32,dz32,fiz3);
4779 fix1 = vec_madd(fs13,dx13,fix1);
4780 fiy1 = vec_madd(fs13,dy13,fiy1);
4781 fiz1 = vec_madd(fs13,dz13,fiz1);
4782 fix2 = vec_madd(fs23,dx23,fix2);
4783 fiy2 = vec_madd(fs23,dy23,fiy2);
4784 fiz2 = vec_madd(fs23,dz23,fiz2);
4785 fix3 = vec_madd(fs33,dx33,fix3);
4786 fiy3 = vec_madd(fs33,dy33,fiy3);
4787 fiz3 = vec_madd(fs33,dz33,fiz3);
4789 fjx1 = vec_nmsub(fs11,dx11,nul);
4790 fjy1 = vec_nmsub(fs11,dy11,nul);
4791 fjz1 = vec_nmsub(fs11,dz11,nul);
4792 fjx2 = vec_nmsub(fs12,dx12,nul);
4793 fjy2 = vec_nmsub(fs12,dy12,nul);
4794 fjz2 = vec_nmsub(fs12,dz12,nul);
4795 fjx3 = vec_nmsub(fs13,dx13,nul);
4796 fjy3 = vec_nmsub(fs13,dy13,nul);
4797 fjz3 = vec_nmsub(fs13,dz13,nul);
4799 fjx1 = vec_nmsub(fs21,dx21,fjx1);
4800 fjy1 = vec_nmsub(fs21,dy21,fjy1);
4801 fjz1 = vec_nmsub(fs21,dz21,fjz1);
4802 fjx2 = vec_nmsub(fs22,dx22,fjx2);
4803 fjy2 = vec_nmsub(fs22,dy22,fjy2);
4804 fjz2 = vec_nmsub(fs22,dz22,fjz2);
4805 fjx3 = vec_nmsub(fs23,dx23,fjx3);
4806 fjy3 = vec_nmsub(fs23,dy23,fjy3);
4807 fjz3 = vec_nmsub(fs23,dz23,fjz3);
4809 fjx1 = vec_nmsub(fs31,dx31,fjx1);
4810 fjy1 = vec_nmsub(fs31,dy31,fjy1);
4811 fjz1 = vec_nmsub(fs31,dz31,fjz1);
4812 fjx2 = vec_nmsub(fs32,dx32,fjx2);
4813 fjy2 = vec_nmsub(fs32,dy32,fjy2);
4814 fjz2 = vec_nmsub(fs32,dz32,fjz2);
4815 fjx3 = vec_nmsub(fs33,dx33,fjx3);
4816 fjy3 = vec_nmsub(fs33,dy33,fjy3);
4817 fjz3 = vec_nmsub(fs33,dz33,fjz3);
4819 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
4820 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
4822 if(k<(nj1-2)) {
4823 jnra = jjnr[k];
4824 jnrb = jjnr[k+1];
4825 jnrc = jjnr[k+2];
4826 j3a = 3*jnra;
4827 j3b = 3*jnrb;
4828 j3c = 3*jnrc;
4829 load_3_water(pos+j3a,pos+j3b,pos+j3c,
4830 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
4831 qqOOt = vec_sld(qqOO,nul,4);
4832 qqOHt = vec_sld(qqOH,nul,4);
4833 qqHHt = vec_sld(qqHH,nul,4);
4835 dx11 = vec_sub(ix1,jx1);
4836 dx12 = vec_sub(ix1,jx2);
4837 dx13 = vec_sub(ix1,jx3);
4838 dy11 = vec_sub(iy1,jy1);
4839 dy12 = vec_sub(iy1,jy2);
4840 dy13 = vec_sub(iy1,jy3);
4841 dz11 = vec_sub(iz1,jz1);
4842 dz12 = vec_sub(iz1,jz2);
4843 dz13 = vec_sub(iz1,jz3);
4844 dx21 = vec_sub(ix2,jx1);
4845 dx22 = vec_sub(ix2,jx2);
4846 dx23 = vec_sub(ix2,jx3);
4847 dy21 = vec_sub(iy2,jy1);
4848 dy22 = vec_sub(iy2,jy2);
4849 dy23 = vec_sub(iy2,jy3);
4850 dz21 = vec_sub(iz2,jz1);
4851 dz22 = vec_sub(iz2,jz2);
4852 dz23 = vec_sub(iz2,jz3);
4853 dx31 = vec_sub(ix3,jx1);
4854 dx32 = vec_sub(ix3,jx2);
4855 dx33 = vec_sub(ix3,jx3);
4856 dy31 = vec_sub(iy3,jy1);
4857 dy32 = vec_sub(iy3,jy2);
4858 dy33 = vec_sub(iy3,jy3);
4859 dz31 = vec_sub(iz3,jz1);
4860 dz32 = vec_sub(iz3,jz2);
4861 dz33 = vec_sub(iz3,jz3);
4863 rsq11 = vec_madd(dx11,dx11,nul);
4864 rsq12 = vec_madd(dx12,dx12,nul);
4865 rsq13 = vec_madd(dx13,dx13,nul);
4866 rsq21 = vec_madd(dx21,dx21,nul);
4867 rsq22 = vec_madd(dx22,dx22,nul);
4868 rsq23 = vec_madd(dx23,dx23,nul);
4869 rsq31 = vec_madd(dx31,dx31,nul);
4870 rsq32 = vec_madd(dx32,dx32,nul);
4871 rsq33 = vec_madd(dx33,dx33,nul);
4872 rsq11 = vec_madd(dy11,dy11,rsq11);
4873 rsq12 = vec_madd(dy12,dy12,rsq12);
4874 rsq13 = vec_madd(dy13,dy13,rsq13);
4875 rsq21 = vec_madd(dy21,dy21,rsq21);
4876 rsq22 = vec_madd(dy22,dy22,rsq22);
4877 rsq23 = vec_madd(dy23,dy23,rsq23);
4878 rsq31 = vec_madd(dy31,dy31,rsq31);
4879 rsq32 = vec_madd(dy32,dy32,rsq32);
4880 rsq33 = vec_madd(dy33,dy33,rsq33);
4881 rsq11 = vec_madd(dz11,dz11,rsq11);
4882 rsq12 = vec_madd(dz12,dz12,rsq12);
4883 rsq13 = vec_madd(dz13,dz13,rsq13);
4884 rsq21 = vec_madd(dz21,dz21,rsq21);
4885 rsq22 = vec_madd(dz22,dz22,rsq22);
4886 rsq23 = vec_madd(dz23,dz23,rsq23);
4887 rsq31 = vec_madd(dz31,dz31,rsq31);
4888 rsq32 = vec_madd(dz32,dz32,rsq32);
4889 rsq33 = vec_madd(dz33,dz33,rsq33);
4891 do_9_invsqrt(rsq11,rsq12,rsq13,
4892 rsq21,rsq22,rsq23,
4893 rsq31,rsq32,rsq33,
4894 &rinv11,&rinv12,&rinv13,
4895 &rinv21,&rinv22,&rinv23,
4896 &rinv31,&rinv32,&rinv33);
4898 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
4899 &rinv21,&rinv22,&rinv23,
4900 &rinv31,&rinv32,&rinv33);
4902 rinvsq11 = vec_madd(rinv11,rinv11,nul);
4903 rinvsq12 = vec_madd(rinv12,rinv12,nul);
4904 rinvsq13 = vec_madd(rinv13,rinv13,nul);
4905 rinvsq21 = vec_madd(rinv21,rinv21,nul);
4906 rinvsq22 = vec_madd(rinv22,rinv22,nul);
4907 rinvsq23 = vec_madd(rinv23,rinv23,nul);
4908 rinvsq31 = vec_madd(rinv31,rinv31,nul);
4909 rinvsq32 = vec_madd(rinv32,rinv32,nul);
4910 rinvsq33 = vec_madd(rinv33,rinv33,nul);
4912 vc11 = vec_madd(rinv11,qqOOt,nul);
4913 vc12 = vec_madd(rinv12,qqOHt,nul);
4914 vc13 = vec_madd(rinv13,qqOHt,nul);
4915 vc21 = vec_madd(rinv21,qqOHt,nul);
4916 vc22 = vec_madd(rinv22,qqHHt,nul);
4917 vc23 = vec_madd(rinv23,qqHHt,nul);
4918 vc31 = vec_madd(rinv31,qqOHt,nul);
4919 vc32 = vec_madd(rinv32,qqHHt,nul);
4920 vc33 = vec_madd(rinv33,qqHHt,nul);
4922 fs11 = vec_madd(vc11,rinvsq11,nul);
4923 fs12 = vec_madd(vc12,rinvsq12,nul);
4924 fs13 = vec_madd(vc13,rinvsq13,nul);
4925 fs21 = vec_madd(vc21,rinvsq21,nul);
4926 fs22 = vec_madd(vc22,rinvsq22,nul);
4927 fs23 = vec_madd(vc23,rinvsq23,nul);
4928 fs31 = vec_madd(vc31,rinvsq31,nul);
4929 fs32 = vec_madd(vc32,rinvsq32,nul);
4930 fs33 = vec_madd(vc33,rinvsq33,nul);
4932 vctot = vec_add(vctot,vc11);
4933 vc12 = vec_add(vc12,vc13);
4934 vc21 = vec_add(vc21,vc22);
4935 vc23 = vec_add(vc23,vc31);
4936 vc32 = vec_add(vc32,vc33);
4937 vctot = vec_add(vctot,vc12);
4938 vc21 = vec_add(vc21,vc23);
4939 vctot = vec_add(vctot,vc32);
4940 vctot = vec_add(vctot,vc21);
4942 fix1 = vec_madd(fs11,dx11,fix1);
4943 fiy1 = vec_madd(fs11,dy11,fiy1);
4944 fiz1 = vec_madd(fs11,dz11,fiz1);
4945 fix2 = vec_madd(fs21,dx21,fix2);
4946 fiy2 = vec_madd(fs21,dy21,fiy2);
4947 fiz2 = vec_madd(fs21,dz21,fiz2);
4948 fix3 = vec_madd(fs31,dx31,fix3);
4949 fiy3 = vec_madd(fs31,dy31,fiy3);
4950 fiz3 = vec_madd(fs31,dz31,fiz3);
4952 fix1 = vec_madd(fs12,dx12,fix1);
4953 fiy1 = vec_madd(fs12,dy12,fiy1);
4954 fiz1 = vec_madd(fs12,dz12,fiz1);
4955 fix2 = vec_madd(fs22,dx22,fix2);
4956 fiy2 = vec_madd(fs22,dy22,fiy2);
4957 fiz2 = vec_madd(fs22,dz22,fiz2);
4958 fix3 = vec_madd(fs32,dx32,fix3);
4959 fiy3 = vec_madd(fs32,dy32,fiy3);
4960 fiz3 = vec_madd(fs32,dz32,fiz3);
4962 fix1 = vec_madd(fs13,dx13,fix1);
4963 fiy1 = vec_madd(fs13,dy13,fiy1);
4964 fiz1 = vec_madd(fs13,dz13,fiz1);
4965 fix2 = vec_madd(fs23,dx23,fix2);
4966 fiy2 = vec_madd(fs23,dy23,fiy2);
4967 fiz2 = vec_madd(fs23,dz23,fiz2);
4968 fix3 = vec_madd(fs33,dx33,fix3);
4969 fiy3 = vec_madd(fs33,dy33,fiy3);
4970 fiz3 = vec_madd(fs33,dz33,fiz3);
4972 fjx1 = vec_nmsub(fs11,dx11,nul);
4973 fjy1 = vec_nmsub(fs11,dy11,nul);
4974 fjz1 = vec_nmsub(fs11,dz11,nul);
4975 fjx2 = vec_nmsub(fs12,dx12,nul);
4976 fjy2 = vec_nmsub(fs12,dy12,nul);
4977 fjz2 = vec_nmsub(fs12,dz12,nul);
4978 fjx3 = vec_nmsub(fs13,dx13,nul);
4979 fjy3 = vec_nmsub(fs13,dy13,nul);
4980 fjz3 = vec_nmsub(fs13,dz13,nul);
4982 fjx1 = vec_nmsub(fs21,dx21,fjx1);
4983 fjy1 = vec_nmsub(fs21,dy21,fjy1);
4984 fjz1 = vec_nmsub(fs21,dz21,fjz1);
4985 fjx2 = vec_nmsub(fs22,dx22,fjx2);
4986 fjy2 = vec_nmsub(fs22,dy22,fjy2);
4987 fjz2 = vec_nmsub(fs22,dz22,fjz2);
4988 fjx3 = vec_nmsub(fs23,dx23,fjx3);
4989 fjy3 = vec_nmsub(fs23,dy23,fjy3);
4990 fjz3 = vec_nmsub(fs23,dz23,fjz3);
4992 fjx1 = vec_nmsub(fs31,dx31,fjx1);
4993 fjy1 = vec_nmsub(fs31,dy31,fjy1);
4994 fjz1 = vec_nmsub(fs31,dz31,fjz1);
4995 fjx2 = vec_nmsub(fs32,dx32,fjx2);
4996 fjy2 = vec_nmsub(fs32,dy32,fjy2);
4997 fjz2 = vec_nmsub(fs32,dz32,fjz2);
4998 fjx3 = vec_nmsub(fs33,dx33,fjx3);
4999 fjy3 = vec_nmsub(fs33,dy33,fjy3);
5000 fjz3 = vec_nmsub(fs33,dz33,fjz3);
5002 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
5003 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
5004 } else if(k<(nj1-1)) {
5005 jnra = jjnr[k];
5006 jnrb = jjnr[k+1];
5007 j3a = 3*jnra;
5008 j3b = 3*jnrb;
5009 load_2_water(pos+j3a,pos+j3b,
5010 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
5011 qqOOt = vec_sld(qqOO,nul,8);
5012 qqOHt = vec_sld(qqOH,nul,8);
5013 qqHHt = vec_sld(qqHH,nul,8);
5015 dx11 = vec_sub(ix1,jx1);
5016 dx12 = vec_sub(ix1,jx2);
5017 dx13 = vec_sub(ix1,jx3);
5018 dy11 = vec_sub(iy1,jy1);
5019 dy12 = vec_sub(iy1,jy2);
5020 dy13 = vec_sub(iy1,jy3);
5021 dz11 = vec_sub(iz1,jz1);
5022 dz12 = vec_sub(iz1,jz2);
5023 dz13 = vec_sub(iz1,jz3);
5024 dx21 = vec_sub(ix2,jx1);
5025 dx22 = vec_sub(ix2,jx2);
5026 dx23 = vec_sub(ix2,jx3);
5027 dy21 = vec_sub(iy2,jy1);
5028 dy22 = vec_sub(iy2,jy2);
5029 dy23 = vec_sub(iy2,jy3);
5030 dz21 = vec_sub(iz2,jz1);
5031 dz22 = vec_sub(iz2,jz2);
5032 dz23 = vec_sub(iz2,jz3);
5033 dx31 = vec_sub(ix3,jx1);
5034 dx32 = vec_sub(ix3,jx2);
5035 dx33 = vec_sub(ix3,jx3);
5036 dy31 = vec_sub(iy3,jy1);
5037 dy32 = vec_sub(iy3,jy2);
5038 dy33 = vec_sub(iy3,jy3);
5039 dz31 = vec_sub(iz3,jz1);
5040 dz32 = vec_sub(iz3,jz2);
5041 dz33 = vec_sub(iz3,jz3);
5043 rsq11 = vec_madd(dx11,dx11,nul);
5044 rsq12 = vec_madd(dx12,dx12,nul);
5045 rsq13 = vec_madd(dx13,dx13,nul);
5046 rsq21 = vec_madd(dx21,dx21,nul);
5047 rsq22 = vec_madd(dx22,dx22,nul);
5048 rsq23 = vec_madd(dx23,dx23,nul);
5049 rsq31 = vec_madd(dx31,dx31,nul);
5050 rsq32 = vec_madd(dx32,dx32,nul);
5051 rsq33 = vec_madd(dx33,dx33,nul);
5052 rsq11 = vec_madd(dy11,dy11,rsq11);
5053 rsq12 = vec_madd(dy12,dy12,rsq12);
5054 rsq13 = vec_madd(dy13,dy13,rsq13);
5055 rsq21 = vec_madd(dy21,dy21,rsq21);
5056 rsq22 = vec_madd(dy22,dy22,rsq22);
5057 rsq23 = vec_madd(dy23,dy23,rsq23);
5058 rsq31 = vec_madd(dy31,dy31,rsq31);
5059 rsq32 = vec_madd(dy32,dy32,rsq32);
5060 rsq33 = vec_madd(dy33,dy33,rsq33);
5061 rsq11 = vec_madd(dz11,dz11,rsq11);
5062 rsq12 = vec_madd(dz12,dz12,rsq12);
5063 rsq13 = vec_madd(dz13,dz13,rsq13);
5064 rsq21 = vec_madd(dz21,dz21,rsq21);
5065 rsq22 = vec_madd(dz22,dz22,rsq22);
5066 rsq23 = vec_madd(dz23,dz23,rsq23);
5067 rsq31 = vec_madd(dz31,dz31,rsq31);
5068 rsq32 = vec_madd(dz32,dz32,rsq32);
5069 rsq33 = vec_madd(dz33,dz33,rsq33);
5071 do_9_invsqrt(rsq11,rsq12,rsq13,
5072 rsq21,rsq22,rsq23,
5073 rsq31,rsq32,rsq33,
5074 &rinv11,&rinv12,&rinv13,
5075 &rinv21,&rinv22,&rinv23,
5076 &rinv31,&rinv32,&rinv33);
5078 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
5079 &rinv21,&rinv22,&rinv23,
5080 &rinv31,&rinv32,&rinv33);
5082 rinvsq11 = vec_madd(rinv11,rinv11,nul);
5083 rinvsq12 = vec_madd(rinv12,rinv12,nul);
5084 rinvsq13 = vec_madd(rinv13,rinv13,nul);
5085 rinvsq21 = vec_madd(rinv21,rinv21,nul);
5086 rinvsq22 = vec_madd(rinv22,rinv22,nul);
5087 rinvsq23 = vec_madd(rinv23,rinv23,nul);
5088 rinvsq31 = vec_madd(rinv31,rinv31,nul);
5089 rinvsq32 = vec_madd(rinv32,rinv32,nul);
5090 rinvsq33 = vec_madd(rinv33,rinv33,nul);
5092 vc11 = vec_madd(rinv11,qqOOt,nul);
5093 vc12 = vec_madd(rinv12,qqOHt,nul);
5094 vc13 = vec_madd(rinv13,qqOHt,nul);
5095 vc21 = vec_madd(rinv21,qqOHt,nul);
5096 vc22 = vec_madd(rinv22,qqHHt,nul);
5097 vc23 = vec_madd(rinv23,qqHHt,nul);
5098 vc31 = vec_madd(rinv31,qqOHt,nul);
5099 vc32 = vec_madd(rinv32,qqHHt,nul);
5100 vc33 = vec_madd(rinv33,qqHHt,nul);
5102 fs11 = vec_madd(vc11,rinvsq11,nul);
5103 fs12 = vec_madd(vc12,rinvsq12,nul);
5104 fs13 = vec_madd(vc13,rinvsq13,nul);
5105 fs21 = vec_madd(vc21,rinvsq21,nul);
5106 fs22 = vec_madd(vc22,rinvsq22,nul);
5107 fs23 = vec_madd(vc23,rinvsq23,nul);
5108 fs31 = vec_madd(vc31,rinvsq31,nul);
5109 fs32 = vec_madd(vc32,rinvsq32,nul);
5110 fs33 = vec_madd(vc33,rinvsq33,nul);
5112 vctot = vec_add(vctot,vc11);
5113 vc12 = vec_add(vc12,vc13);
5114 vc21 = vec_add(vc21,vc22);
5115 vc23 = vec_add(vc23,vc31);
5116 vc32 = vec_add(vc32,vc33);
5117 vctot = vec_add(vctot,vc12);
5118 vc21 = vec_add(vc21,vc23);
5119 vctot = vec_add(vctot,vc32);
5120 vctot = vec_add(vctot,vc21);
5122 fix1 = vec_madd(fs11,dx11,fix1);
5123 fiy1 = vec_madd(fs11,dy11,fiy1);
5124 fiz1 = vec_madd(fs11,dz11,fiz1);
5125 fix2 = vec_madd(fs21,dx21,fix2);
5126 fiy2 = vec_madd(fs21,dy21,fiy2);
5127 fiz2 = vec_madd(fs21,dz21,fiz2);
5128 fix3 = vec_madd(fs31,dx31,fix3);
5129 fiy3 = vec_madd(fs31,dy31,fiy3);
5130 fiz3 = vec_madd(fs31,dz31,fiz3);
5132 fix1 = vec_madd(fs12,dx12,fix1);
5133 fiy1 = vec_madd(fs12,dy12,fiy1);
5134 fiz1 = vec_madd(fs12,dz12,fiz1);
5135 fix2 = vec_madd(fs22,dx22,fix2);
5136 fiy2 = vec_madd(fs22,dy22,fiy2);
5137 fiz2 = vec_madd(fs22,dz22,fiz2);
5138 fix3 = vec_madd(fs32,dx32,fix3);
5139 fiy3 = vec_madd(fs32,dy32,fiy3);
5140 fiz3 = vec_madd(fs32,dz32,fiz3);
5142 fix1 = vec_madd(fs13,dx13,fix1);
5143 fiy1 = vec_madd(fs13,dy13,fiy1);
5144 fiz1 = vec_madd(fs13,dz13,fiz1);
5145 fix2 = vec_madd(fs23,dx23,fix2);
5146 fiy2 = vec_madd(fs23,dy23,fiy2);
5147 fiz2 = vec_madd(fs23,dz23,fiz2);
5148 fix3 = vec_madd(fs33,dx33,fix3);
5149 fiy3 = vec_madd(fs33,dy33,fiy3);
5150 fiz3 = vec_madd(fs33,dz33,fiz3);
5152 fjx1 = vec_nmsub(fs11,dx11,nul);
5153 fjy1 = vec_nmsub(fs11,dy11,nul);
5154 fjz1 = vec_nmsub(fs11,dz11,nul);
5155 fjx2 = vec_nmsub(fs12,dx12,nul);
5156 fjy2 = vec_nmsub(fs12,dy12,nul);
5157 fjz2 = vec_nmsub(fs12,dz12,nul);
5158 fjx3 = vec_nmsub(fs13,dx13,nul);
5159 fjy3 = vec_nmsub(fs13,dy13,nul);
5160 fjz3 = vec_nmsub(fs13,dz13,nul);
5162 fjx1 = vec_nmsub(fs21,dx21,fjx1);
5163 fjy1 = vec_nmsub(fs21,dy21,fjy1);
5164 fjz1 = vec_nmsub(fs21,dz21,fjz1);
5165 fjx2 = vec_nmsub(fs22,dx22,fjx2);
5166 fjy2 = vec_nmsub(fs22,dy22,fjy2);
5167 fjz2 = vec_nmsub(fs22,dz22,fjz2);
5168 fjx3 = vec_nmsub(fs23,dx23,fjx3);
5169 fjy3 = vec_nmsub(fs23,dy23,fjy3);
5170 fjz3 = vec_nmsub(fs23,dz23,fjz3);
5172 fjx1 = vec_nmsub(fs31,dx31,fjx1);
5173 fjy1 = vec_nmsub(fs31,dy31,fjy1);
5174 fjz1 = vec_nmsub(fs31,dz31,fjz1);
5175 fjx2 = vec_nmsub(fs32,dx32,fjx2);
5176 fjy2 = vec_nmsub(fs32,dy32,fjy2);
5177 fjz2 = vec_nmsub(fs32,dz32,fjz2);
5178 fjx3 = vec_nmsub(fs33,dx33,fjx3);
5179 fjy3 = vec_nmsub(fs33,dy33,fjy3);
5180 fjz3 = vec_nmsub(fs33,dz33,fjz3);
5182 add_force_to_2_water(faction+j3a,faction+j3b,
5183 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
5184 } else if(k<nj1) {
5185 jnra = jjnr[k];
5186 j3a = 3*jnra;
5187 load_1_water(pos+j3a,
5188 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
5189 qqOOt = vec_sld(qqOO,nul,12);
5190 qqOHt = vec_sld(qqOH,nul,12);
5191 qqHHt = vec_sld(qqHH,nul,12);
5193 dx11 = vec_sub(ix1,jx1);
5194 dx12 = vec_sub(ix1,jx2);
5195 dx13 = vec_sub(ix1,jx3);
5196 dy11 = vec_sub(iy1,jy1);
5197 dy12 = vec_sub(iy1,jy2);
5198 dy13 = vec_sub(iy1,jy3);
5199 dz11 = vec_sub(iz1,jz1);
5200 dz12 = vec_sub(iz1,jz2);
5201 dz13 = vec_sub(iz1,jz3);
5202 dx21 = vec_sub(ix2,jx1);
5203 dx22 = vec_sub(ix2,jx2);
5204 dx23 = vec_sub(ix2,jx3);
5205 dy21 = vec_sub(iy2,jy1);
5206 dy22 = vec_sub(iy2,jy2);
5207 dy23 = vec_sub(iy2,jy3);
5208 dz21 = vec_sub(iz2,jz1);
5209 dz22 = vec_sub(iz2,jz2);
5210 dz23 = vec_sub(iz2,jz3);
5211 dx31 = vec_sub(ix3,jx1);
5212 dx32 = vec_sub(ix3,jx2);
5213 dx33 = vec_sub(ix3,jx3);
5214 dy31 = vec_sub(iy3,jy1);
5215 dy32 = vec_sub(iy3,jy2);
5216 dy33 = vec_sub(iy3,jy3);
5217 dz31 = vec_sub(iz3,jz1);
5218 dz32 = vec_sub(iz3,jz2);
5219 dz33 = vec_sub(iz3,jz3);
5221 rsq11 = vec_madd(dx11,dx11,nul);
5222 rsq12 = vec_madd(dx12,dx12,nul);
5223 rsq13 = vec_madd(dx13,dx13,nul);
5224 rsq21 = vec_madd(dx21,dx21,nul);
5225 rsq22 = vec_madd(dx22,dx22,nul);
5226 rsq23 = vec_madd(dx23,dx23,nul);
5227 rsq31 = vec_madd(dx31,dx31,nul);
5228 rsq32 = vec_madd(dx32,dx32,nul);
5229 rsq33 = vec_madd(dx33,dx33,nul);
5230 rsq11 = vec_madd(dy11,dy11,rsq11);
5231 rsq12 = vec_madd(dy12,dy12,rsq12);
5232 rsq13 = vec_madd(dy13,dy13,rsq13);
5233 rsq21 = vec_madd(dy21,dy21,rsq21);
5234 rsq22 = vec_madd(dy22,dy22,rsq22);
5235 rsq23 = vec_madd(dy23,dy23,rsq23);
5236 rsq31 = vec_madd(dy31,dy31,rsq31);
5237 rsq32 = vec_madd(dy32,dy32,rsq32);
5238 rsq33 = vec_madd(dy33,dy33,rsq33);
5239 rsq11 = vec_madd(dz11,dz11,rsq11);
5240 rsq12 = vec_madd(dz12,dz12,rsq12);
5241 rsq13 = vec_madd(dz13,dz13,rsq13);
5242 rsq21 = vec_madd(dz21,dz21,rsq21);
5243 rsq22 = vec_madd(dz22,dz22,rsq22);
5244 rsq23 = vec_madd(dz23,dz23,rsq23);
5245 rsq31 = vec_madd(dz31,dz31,rsq31);
5246 rsq32 = vec_madd(dz32,dz32,rsq32);
5247 rsq33 = vec_madd(dz33,dz33,rsq33);
5249 do_9_invsqrt(rsq11,rsq12,rsq13,
5250 rsq21,rsq22,rsq23,
5251 rsq31,rsq32,rsq33,
5252 &rinv11,&rinv12,&rinv13,
5253 &rinv21,&rinv22,&rinv23,
5254 &rinv31,&rinv32,&rinv33);
5256 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
5257 &rinv21,&rinv22,&rinv23,
5258 &rinv31,&rinv32,&rinv33);
5260 rinvsq11 = vec_madd(rinv11,rinv11,nul);
5261 rinvsq12 = vec_madd(rinv12,rinv12,nul);
5262 rinvsq13 = vec_madd(rinv13,rinv13,nul);
5263 rinvsq21 = vec_madd(rinv21,rinv21,nul);
5264 rinvsq22 = vec_madd(rinv22,rinv22,nul);
5265 rinvsq23 = vec_madd(rinv23,rinv23,nul);
5266 rinvsq31 = vec_madd(rinv31,rinv31,nul);
5267 rinvsq32 = vec_madd(rinv32,rinv32,nul);
5268 rinvsq33 = vec_madd(rinv33,rinv33,nul);
5270 vc11 = vec_madd(rinv11,qqOOt,nul);
5271 vc12 = vec_madd(rinv12,qqOHt,nul);
5272 vc13 = vec_madd(rinv13,qqOHt,nul);
5273 vc21 = vec_madd(rinv21,qqOHt,nul);
5274 vc22 = vec_madd(rinv22,qqHHt,nul);
5275 vc23 = vec_madd(rinv23,qqHHt,nul);
5276 vc31 = vec_madd(rinv31,qqOHt,nul);
5277 vc32 = vec_madd(rinv32,qqHHt,nul);
5278 vc33 = vec_madd(rinv33,qqHHt,nul);
5280 fs11 = vec_madd(vc11,rinvsq11,nul);
5281 fs12 = vec_madd(vc12,rinvsq12,nul);
5282 fs13 = vec_madd(vc13,rinvsq13,nul);
5283 fs21 = vec_madd(vc21,rinvsq21,nul);
5284 fs22 = vec_madd(vc22,rinvsq22,nul);
5285 fs23 = vec_madd(vc23,rinvsq23,nul);
5286 fs31 = vec_madd(vc31,rinvsq31,nul);
5287 fs32 = vec_madd(vc32,rinvsq32,nul);
5288 fs33 = vec_madd(vc33,rinvsq33,nul);
5290 vctot = vec_add(vctot,vc11);
5291 vc12 = vec_add(vc12,vc13);
5292 vc21 = vec_add(vc21,vc22);
5293 vc23 = vec_add(vc23,vc31);
5294 vc32 = vec_add(vc32,vc33);
5295 vctot = vec_add(vctot,vc12);
5296 vc21 = vec_add(vc21,vc23);
5297 vctot = vec_add(vctot,vc32);
5298 vctot = vec_add(vctot,vc21);
5300 fix1 = vec_madd(fs11,dx11,fix1);
5301 fiy1 = vec_madd(fs11,dy11,fiy1);
5302 fiz1 = vec_madd(fs11,dz11,fiz1);
5303 fix2 = vec_madd(fs21,dx21,fix2);
5304 fiy2 = vec_madd(fs21,dy21,fiy2);
5305 fiz2 = vec_madd(fs21,dz21,fiz2);
5306 fix3 = vec_madd(fs31,dx31,fix3);
5307 fiy3 = vec_madd(fs31,dy31,fiy3);
5308 fiz3 = vec_madd(fs31,dz31,fiz3);
5310 fix1 = vec_madd(fs12,dx12,fix1);
5311 fiy1 = vec_madd(fs12,dy12,fiy1);
5312 fiz1 = vec_madd(fs12,dz12,fiz1);
5313 fix2 = vec_madd(fs22,dx22,fix2);
5314 fiy2 = vec_madd(fs22,dy22,fiy2);
5315 fiz2 = vec_madd(fs22,dz22,fiz2);
5316 fix3 = vec_madd(fs32,dx32,fix3);
5317 fiy3 = vec_madd(fs32,dy32,fiy3);
5318 fiz3 = vec_madd(fs32,dz32,fiz3);
5320 fix1 = vec_madd(fs13,dx13,fix1);
5321 fiy1 = vec_madd(fs13,dy13,fiy1);
5322 fiz1 = vec_madd(fs13,dz13,fiz1);
5323 fix2 = vec_madd(fs23,dx23,fix2);
5324 fiy2 = vec_madd(fs23,dy23,fiy2);
5325 fiz2 = vec_madd(fs23,dz23,fiz2);
5326 fix3 = vec_madd(fs33,dx33,fix3);
5327 fiy3 = vec_madd(fs33,dy33,fiy3);
5328 fiz3 = vec_madd(fs33,dz33,fiz3);
5330 fjx1 = vec_nmsub(fs11,dx11,nul);
5331 fjy1 = vec_nmsub(fs11,dy11,nul);
5332 fjz1 = vec_nmsub(fs11,dz11,nul);
5333 fjx2 = vec_nmsub(fs12,dx12,nul);
5334 fjy2 = vec_nmsub(fs12,dy12,nul);
5335 fjz2 = vec_nmsub(fs12,dz12,nul);
5336 fjx3 = vec_nmsub(fs13,dx13,nul);
5337 fjy3 = vec_nmsub(fs13,dy13,nul);
5338 fjz3 = vec_nmsub(fs13,dz13,nul);
5340 fjx1 = vec_nmsub(fs21,dx21,fjx1);
5341 fjy1 = vec_nmsub(fs21,dy21,fjy1);
5342 fjz1 = vec_nmsub(fs21,dz21,fjz1);
5343 fjx2 = vec_nmsub(fs22,dx22,fjx2);
5344 fjy2 = vec_nmsub(fs22,dy22,fjy2);
5345 fjz2 = vec_nmsub(fs22,dz22,fjz2);
5346 fjx3 = vec_nmsub(fs23,dx23,fjx3);
5347 fjy3 = vec_nmsub(fs23,dy23,fjy3);
5348 fjz3 = vec_nmsub(fs23,dz23,fjz3);
5350 fjx1 = vec_nmsub(fs31,dx31,fjx1);
5351 fjy1 = vec_nmsub(fs31,dy31,fjy1);
5352 fjz1 = vec_nmsub(fs31,dz31,fjz1);
5353 fjx2 = vec_nmsub(fs32,dx32,fjx2);
5354 fjy2 = vec_nmsub(fs32,dy32,fjy2);
5355 fjz2 = vec_nmsub(fs32,dz32,fjz2);
5356 fjx3 = vec_nmsub(fs33,dx33,fjx3);
5357 fjy3 = vec_nmsub(fs33,dy33,fjy3);
5358 fjz3 = vec_nmsub(fs33,dz33,fjz3);
5360 add_force_to_1_water(faction+j3a,
5361 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
5363 /* update outer data */
5364 update_i_water_forces(faction+ii3,fshift+is3,
5365 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
5367 add_vector_to_float(Vc+gid[n],vctot);
/*
 * vfloat: overlays one AltiVec `vector float` with an array of four floats,
 * so the same storage can be viewed either as a whole vector (v) or as its
 * individual float elements (f[0]..f[3]).
 *
 * inl1130_altivec declares its scratch area as an array of this union
 * (`vfloat stackdata[52]`) and accesses it through vec_ld/vec_st using byte
 * offsets that are multiples of 16, i.e. one union element per slot.
 * NOTE(review): the vector member presumably also serves to give that array
 * the alignment vec_ld/vec_st expect - confirm against the AltiVec ABI.
 */
5371 typedef union vfloat {
5372 float f[4]; /* element-wise view: four floats */
5373 vector float v; /* whole-vector view of the same storage */
5374 } vfloat;
5376 void inl1130_altivec(
5377 int nri,
5378 int iinr[],
5379 int jindex[],
5380 int jjnr[],
5381 int shift[],
5382 float shiftvec[],
5383 float fshift[],
5384 int gid[],
5385 float pos[],
5386 float faction[],
5387 float charge[],
5388 float facel,
5389 float Vc[],
5390 int type[],
5391 int ntype,
5392 float nbfp[],
5393 float Vnb[])
5395 register vector float v0;
5396 register vector float v1;
5397 register vector float v2;
5398 register vector float v3;
5399 register vector float v4;
5400 register vector float v5;
5401 register vector float v6;
5402 register vector float v7;
5403 register vector float v8;
5404 register vector float v9;
5405 register vector float v10;
5406 register vector float v11;
5407 register vector float v12;
5408 register vector float v13;
5409 register vector float v14;
5410 register vector float v15;
5411 register vector float v16;
5412 register vector float v17;
5413 register vector float v18;
5414 register vector float v19;
5415 register vector float v20;
5416 register vector float v21;
5417 register vector float v22;
5418 register vector float v23;
5419 register vector float v24;
5420 register vector float v25;
5421 register vector float v26;
5422 register vector float v27;
5423 register vector float v28;
5424 register vector float v29;
5425 register vector float v30;
5426 register vector float v31;
5428 vfloat stackdata[52];
5430 int n,k,k0,ii,is3,ii3,nj0,nj1;
5431 int jnra,jnrb,jnrc,jnrd;
5433 int j3a,j3b,j3c,j3d;
5435 /* set non java mode */
5436 v10 = (vector float)vec_mfvscr();
5437 v11 = (vector float)vec_sl(vec_splat_u32(1),vec_splat_u32(8));
5438 v12 = (vector float)vec_sl((vector unsigned int)v11,vec_splat_u32(8));
5439 v10 = (vector float)vec_or((vector unsigned short)v10,(vector unsigned short)v12);
5440 vec_mtvscr((vector unsigned short)v10);
5442 v0 = (vector float)vec_splat_u32(0);
5443 v0 = vec_ctf((vector unsigned int)v0,0); /* load 0 to v0 */
5444 v1 = vec_lde(0,&facel); /* load facel float to a vector */
5445 v2 = (vector float) vec_lvsl(0,&facel);
5446 v1 = vec_perm(v1,v1,(vector unsigned char) v2); /* move it to elem 0 */
5447 v1 = vec_splat(v1,0); /* splat it to all elem */
5449 ii = iinr[0];
5451 v3 = vec_lde(0,charge+ii); /* load qO float to a vector */
5452 v4 = (vector float) vec_lvsl(0,charge+ii);
5453 v3 = vec_perm(v3,v3,(vector unsigned char) v4); /* move it to elem 0 */
5454 v3 = vec_splat(v3,0); /* splat it to all elem */
5456 v5 = vec_lde(0,charge+ii+1); /* load qH float to a vector */
5457 v6 = (vector float) vec_lvsl(0,charge+ii+1);
5458 v5 = vec_perm(v5,v5,(vector unsigned char) v6); /* move it to elem 0 */
5459 v5 = vec_splat(v5,0); /* splat it to all elem */
5461 v4 = vec_madd(v3,v5,v0); /* qqOH */
5462 v3 = vec_madd(v3,v3,v0); /* qqOO */
5463 v5 = vec_madd(v5,v5,v0); /* qqHH */
5464 v4 = vec_madd(v4,v1,v0); /* qqOH * facel */
5465 v3 = vec_madd(v3,v1,v0); /* qqOO * facel */
5466 v5 = vec_madd(v5,v1,v0); /* qqHH * facel */
5468 n = 2*type[ii];
5469 n = (ntype+1)*n;
5471 v1 = vec_ld( 0,nbfp+n); /* c6a c12a - this works since the nbfp array
5472 * is always at least 8-byte aligned and n is even here.
5474 v2 = (vector float) vec_lvsl(0,nbfp+n);
5475 v1 = vec_perm(v1,v1,(vector unsigned char)v2); /* c6 c12 moved to positions 0,1 */
5476 v2 = vec_splat(v1,1); /* c12 in all elements */
5477 v1 = vec_splat(v1,0); /* c6 in all elements */
5479 /* store things to stack before starting outer loop */
5480 vec_st(v3, 0, (float *) stackdata); /* qqOO*facel is in stack pos 0 */
5481 vec_st(v4, 16, (float *) stackdata); /* qqOH*facel is in stack pos 1 */
5482 vec_st(v5, 32, (float *) stackdata); /* qqHH*facel is in stack pos 2 */
5483 vec_st(v1, 48, (float *) stackdata); /* c6 is in stack pos 3 */
5484 vec_st(v2, 64, (float *) stackdata); /* c12 is in stack pos 4 */
5486 for(n=0;n<nri;n++) {
5487 is3 = 3*shift[n];
5488 ii = iinr[n];
5489 ii3 = 3*ii;
5490 /* load shift */
5491 /* load three consecutive shiftvector floats. We never access the fourth element,
5492 * so this is safe even at the end of an array.
5495 v4 = (vector float)vec_lvsl(0, shiftvec+is3);
5496 v1 = vec_lde(0, shiftvec+is3);
5497 v2 = vec_lde(4, shiftvec+is3);
5498 v3 = vec_lde(8, shiftvec+is3);
5499 v1 = vec_perm(v1,v1,(vector unsigned char)v4); /* shX in elem 0 */
5500 v2 = vec_perm(v2,v2,(vector unsigned char)v4); /* shY in elem 1 */
5501 v3 = vec_perm(v3,v3,(vector unsigned char)v4); /* shZ in elem 2 */
5502 v2 = vec_sld(v2,v2,4);
5503 v3 = vec_sld(v3,v3,8);
5504 v1 = vec_mergeh(v1,v3);
5505 v1 = vec_mergeh(v1,v2); /* [ shX shY shZ - ] */
5506 /* load i coordinates */
5507 v2 = (vector float)vec_lvsl(0, pos+ii3);
5508 v3 = vec_ld(0, pos+ii3); /* load water coords into three vectors. */
5509 v4 = vec_ld(16, pos+ii3);/* we do not yet know how it is aligned. */
5510 v5 = vec_ld(32, pos+ii3);
5511 v6 = vec_sld(v1,v1,12); /* - shX shY shZ */
5512 v7 = vec_sld(v6,v1,4); /* shX shY shZ shX */
5513 v8 = vec_sld(v6,v1,8); /* shY shZ shX shY */
5514 v9 = vec_sld(v6,v1,12); /* shZ shX shY shZ */
5515 v3 = vec_perm(v3,v4,(vector unsigned char)v2); /* Ox Oy Oz H1x */
5516 v4 = vec_perm(v4,v5,(vector unsigned char)v2); /* H1y H1z H2x H2y */
5517 v5 = vec_perm(v5,v5,(vector unsigned char)v2); /* H2z - - - */
5518 v3 = vec_add(v3,v7);
5519 v4 = vec_add(v4,v8);
5520 v5 = vec_add(v5,v9);
5521 v6 = vec_splat(v3,0); /* Ox Ox Ox Ox */
5522 v7 = vec_splat(v3,1); /* Oy Oy Oy Oy */
5523 v8 = vec_splat(v3,2); /* Oz Oz Oz Oz */
5524 v9 = vec_splat(v3,3); /* H1x H1x H1x H1x */
5525 v10 = vec_splat(v4,0); /* H1y H1y H1y H1y */
5526 v11 = vec_splat(v4,1); /* H1z H1z H1z H1z */
5527 v12 = vec_splat(v4,2); /* H2x H2x H2x H2x */
5528 v13 = vec_splat(v4,3); /* H2y H2y H2y H2y */
5529 v14 = vec_splat(v5,0); /* H2z H2z H2z H2z */
5530 /* Store i water coordinates to stack */
5531 vec_st(v6, 80, (float *)stackdata); /* i Ox is in stack pos 5 */
5532 vec_st(v7, 96, (float *)stackdata); /* i Oy is in stack pos 6 */
5533 vec_st(v8, 112, (float *)stackdata); /* i Oz is in stack pos 7 */
5534 vec_st(v9, 128, (float *)stackdata); /* i H1x is in stack pos 8 */
5535 vec_st(v10,144, (float *)stackdata); /* i H1y is in stack pos 9 */
5536 vec_st(v11,160, (float *)stackdata); /* i H1z is in stack pos 10 */
5537 vec_st(v12,176, (float *)stackdata); /* i H2x is in stack pos 11 */
5538 vec_st(v13,192, (float *)stackdata); /* i H2y is in stack pos 12 */
5539 vec_st(v14,208, (float *)stackdata); /* i H2z is in stack pos 13 */
5541 nj0 = jindex[n];
5542 nj1 = jindex[n+1];
5543 vec_dst( jjnr + nj1, 0x10010100, 0 );
5544 vec_st(v0, 224, (float *)stackdata); /* zero vctot, in stack pos 14 */
5545 vec_st(v0, 240, (float *)stackdata); /* zero vctot, in stack pos 15 */
5546 vec_st(v0, 256, (float *)stackdata); /* zero fiOx, in stack pos 16 */
5547 vec_st(v0, 272, (float *)stackdata); /* zero fiOy, in stack pos 17 */
5548 vec_st(v0, 288, (float *)stackdata); /* zero fiOz, in stack pos 18 */
5550 vec_st(v0, 304, (float *)stackdata); /* zero fiH1x, in stack pos 19 */
5551 vec_st(v0, 320, (float *)stackdata); /* zero fiH1y, in stack pos 20 */
5552 vec_st(v0, 336, (float *)stackdata); /* zero fiH1z, in stack pos 21 */
5553 vec_st(v0, 352, (float *)stackdata); /* zero fiH2x, in stack pos 22 */
5554 vec_st(v0, 368, (float *)stackdata); /* zero fiH2y, in stack pos 23 */
5555 vec_st(v0, 384, (float *)stackdata); /* zero fiH2z, in stack pos 24 */
5557 for(k=nj0; k<(nj1-3); k+=4) {
5558 jnra = jjnr[k];
5559 jnrb = jjnr[k+1];
5560 jnrc = jjnr[k+2];
5561 jnrd = jjnr[k+3];
5563 vec_dst( jjnr + k + 4, 0x02020020, 0 );
5565 j3a = 3*jnra;
5566 j3b = 3*jnrb;
5567 j3c = 3*jnrc;
5568 j3d = 3*jnrd;
5570 vec_dst( pos+j3a, 0x10010100, 1 );
5572 v1 = (vector float)vec_lvsl(0, pos+j3a);
5573 v8 = (vector float)vec_lvsl(0, pos+j3b);
5574 v15 = (vector float)vec_lvsl(0, pos+j3c);
5575 v22 = (vector float)vec_lvsl(0, pos+j3d);
5576 v2 = vec_ld(0, pos+j3a);
5577 v9 = vec_ld(0, pos+j3b);
5578 v16 = vec_ld(0, pos+j3c);
5579 v23 = vec_ld(0, pos+j3d);
5581 v3 = vec_ld(16, pos+j3a);
5582 v10 = vec_ld(16, pos+j3b);
5583 v17 = vec_ld(16, pos+j3c);
5584 v24 = vec_ld(16, pos+j3d);
5585 v4 = vec_ld(32, pos+j3a);
5586 v11 = vec_ld(32, pos+j3b);
5587 v18 = vec_ld(32, pos+j3c);
5588 v25 = vec_ld(32, pos+j3d);
5589 v5 = vec_perm(v2,v3,(vector unsigned char)v1); /* Oxa Oya Oza H1xa */
5590 v12 = vec_perm(v9,v10,(vector unsigned char)v8); /* Oxb Oyb Ozb H1xb */
5591 v19 = vec_perm(v16,v17,(vector unsigned char)v15); /* Oxc Oyc Ozc H1xc */
5592 v26 = vec_perm(v23,v24,(vector unsigned char)v22); /* Oxd Oyd Ozd H1xd */
5594 v6 = vec_perm(v3,v4,(vector unsigned char)v1); /* H1ya H1za H2xa H2ya */
5595 v13 = vec_perm(v10,v11,(vector unsigned char)v8); /* H1yb H1zb H2xb H2yb */
5596 v20 = vec_perm(v17,v18,(vector unsigned char)v15); /* H1yc H1zc H2xc H2yc */
5597 v27 = vec_perm(v24,v25,(vector unsigned char)v22); /* H1yd H1zd H2xd H2yd */
5599 v7 = vec_perm(v4,v4,(vector unsigned char)v1); /* H2za - - - */
5600 v14 = vec_perm(v11,v11,(vector unsigned char)v8); /* H2zb - - - */
5601 v21 = vec_perm(v18,v18,(vector unsigned char)v15); /* H2zc - - - */
5602 v28 = vec_perm(v25,v25,(vector unsigned char)v22); /* H2zd - - - */
5604 /* permute water coordinates */
5605 v3 = vec_mergeh(v5,v19); /* Oxa Oxc Oya Oyc */
5606 v5 = vec_mergel(v5,v19); /* Oza Ozc H1xa H1xc */
5607 v19 = vec_mergeh(v12,v26); /* Oxb Oxd Oyb Oyd */
5608 v12 = vec_mergel(v12,v26); /* Ozb Ozd H1xb H1xd */
5610 v26 = vec_mergeh(v6,v20); /* H1ya H1yc H1za H1zc */
5611 v16 = vec_mergel(v6,v20); /* H2xa H2xc H2ya H2yc */
5612 v20 = vec_mergeh(v13,v27); /* H1yb H1yd H1zb H1zd */
5613 v13 = vec_mergel(v13,v27); /* H2xb H2xd H2yb H2yd */
5615 v15 = vec_mergeh(v7,v21); /* H2za H2zc - - */
5616 v14 = vec_mergeh(v14,v28); /* H2zb H2zd - - */
5618 v1 = vec_mergeh(v3,v19); /* Oxa Oxb Oxc Oxd */
5619 v29 = vec_ld(128, (float *) stackdata); /* load i H1x */
5620 v2 = vec_mergel(v3,v19); /* Oya Oyb Oyc Oyd */
5621 v30 = vec_ld(144, (float *) stackdata); /* load i H1y */
5622 v3 = vec_mergeh(v5,v12); /* Oza Ozb Ozc Ozd */
5623 v31 = vec_ld(160, (float *) stackdata); /* load i H1z */
5624 v4 = vec_mergel(v5,v12); /* H1xa H1xb H1xc H1xd */
5625 v5 = vec_mergeh(v26,v20); /* H1ya H1yb H1yc H1yd */
5626 v6 = vec_mergel(v26,v20); /* H1za H1zb H1zc H1zd */
5627 v7 = vec_mergeh(v16,v13); /* H2xa H2xb H2xc H2xd */
5628 v8 = vec_mergel(v16,v13); /* H2ya H2yb H2yc H2yd */
5629 v9 = vec_mergeh(v15,v14); /* H2za H2zb H2zc H2zd */
5631 v10 = vec_sub(v29,v1); /* iH1x - jOx */
5632 v13 = vec_sub(v29,v4); /* iH1x - jH1x */
5633 v16 = vec_sub(v29,v7); /* iH1x - jH2x */
5634 v29 = vec_ld(176, (float *) stackdata); /* load i H2x */
5635 v11 = vec_sub(v30,v2); /* iH1y - jOy */
5636 v14 = vec_sub(v30,v5); /* iH1y - jH1y */
5637 v17 = vec_sub(v30,v8); /* iH1y - jH2y */
5638 v30 = vec_ld(192, (float *) stackdata); /* load i H2y */
5639 vec_st(v10, 544, (float *)stackdata); /* dx21 */
5640 vec_st(v13, 592, (float *)stackdata); /* dx22 */
5641 vec_st(v16, 640, (float *)stackdata); /* dx23 */
5642 v12 = vec_sub(v31,v3); /* iH1z - jOz */
5643 v15 = vec_sub(v31,v6); /* iH1z - jH1z */
5644 v18 = vec_sub(v31,v9); /* iH1z - jH2z */
5645 v31 = vec_ld(208, (float *) stackdata); /* load i H2z */
5646 /* v10-v18 now contain iH1-jO, iH1-jH1 and iH1-jH2 distances */
5647 vec_st(v11, 560, (float *)stackdata); /* dy21 */
5648 vec_st(v14, 608, (float *)stackdata); /* dy22 */
5649 vec_st(v17, 656, (float *)stackdata); /* dy23 */
5650 v19 = vec_sub(v29,v1); /* iH2x - jOx */
5651 v22 = vec_sub(v29,v4); /* iH2x - jH1x */
5652 v25 = vec_sub(v29,v7); /* iH2x - jH2x */
5653 vec_st(v12, 576, (float *)stackdata); /* dz21 */
5654 vec_st(v15, 624, (float *)stackdata); /* dz22 */
5655 vec_st(v18, 672, (float *)stackdata); /* dz23 */
5656 v29 = vec_ld(80, (float *) stackdata); /* load i Ox */
5657 v20 = vec_sub(v30,v2); /* iH2y - jOy */
5658 v23 = vec_sub(v30,v5); /* iH2y - jH1y */
5659 v26 = vec_sub(v30,v8); /* iH2y - jH2y */
5660 vec_st(v19, 688, (float *)stackdata); /* dx31 */
5661 vec_st(v22, 736, (float *)stackdata); /* dx32 */
5662 vec_st(v25, 784, (float *)stackdata); /* dx33 */
5663 v30 = vec_ld(96, (float *) stackdata); /* load i Oy */
5664 v21 = vec_sub(v31,v3); /* iH2z - jOz */
5665 v24 = vec_sub(v31,v6); /* iH2z - jH1z */
5666 v27 = vec_sub(v31,v9); /* iH2z - jH2z */
5667 v31 = vec_ld(112, (float *) stackdata); /* load i Oz */
5668 vec_st(v20, 704, (float *)stackdata); /* dy31 */
5669 vec_st(v23, 752, (float *)stackdata); /* dy32 */
5670 vec_st(v26, 800, (float *)stackdata); /* dy33 */
5672 v1 = vec_sub(v29,v1); /* iOx - jOx */
5673 v4 = vec_sub(v29,v4); /* iOx - jH1x */
5674 v7 = vec_sub(v29,v7); /* iOx - jH2x */
5675 vec_st(v21, 720, (float *)stackdata); /* dz31 */
5676 vec_st(v24, 768, (float *)stackdata); /* dz32 */
5677 vec_st(v27, 816, (float *)stackdata); /* dz33 */
5678 v2 = vec_sub(v30,v2); /* iOy - jOy */
5679 v5 = vec_sub(v30,v5); /* iOy - jH1y */
5680 v8 = vec_sub(v30,v8); /* iOy - jH2y */
5681 vec_st(v1, 400, (float *)stackdata); /* dx11 */
5682 vec_st(v4, 448, (float *)stackdata); /* dx12 */
5683 vec_st(v7, 496, (float *)stackdata); /* dx13 */
5684 v3 = vec_sub(v31,v3); /* iOz - jOz */
5685 v6 = vec_sub(v31,v6); /* iOz - jH1z */
5686 v9 = vec_sub(v31,v9); /* iOz - jH2z */
5687 vec_st(v2, 416, (float *)stackdata); /* dy11 */
5688 vec_st(v5, 464, (float *)stackdata); /* dy12 */
5689 vec_st(v8, 512, (float *)stackdata); /* dy13 */
5691 v1 = vec_madd(v1,v1,v0);
5692 v4 = vec_madd(v4,v4,v0);
5693 v7 = vec_madd(v7,v7,v0);
5694 vec_st(v3, 432, (float *)stackdata); /* dz11 */
5695 vec_st(v6, 480, (float *)stackdata); /* dz12 */
5696 vec_st(v9, 528, (float *)stackdata); /* dz13 */
5697 v10 = vec_madd(v10,v10,v0);
5698 v13 = vec_madd(v13,v13,v0);
5699 v16 = vec_madd(v16,v16,v0);
5700 v19 = vec_madd(v19,v19,v0);
5701 v22 = vec_madd(v22,v22,v0);
5702 v25 = vec_madd(v25,v25,v0);
5703 v1 = vec_madd(v2,v2,v1);
5704 v4 = vec_madd(v5,v5,v4);
5705 v7 = vec_madd(v8,v8,v7);
5706 v10 = vec_madd(v11,v11,v10);
5707 v13 = vec_madd(v14,v14,v13);
5708 v16 = vec_madd(v17,v17,v16);
5709 v19 = vec_madd(v20,v20,v19);
5710 v22 = vec_madd(v23,v23,v22);
5711 v25 = vec_madd(v26,v26,v25);
5712 v1 = vec_madd(v3,v3,v1);
5713 v2 = vec_madd(v6,v6,v4);
5714 v3 = vec_madd(v9,v9,v7);
5715 v4 = vec_madd(v12,v12,v10);
5716 v5 = vec_madd(v15,v15,v13);
5717 v6 = vec_madd(v18,v18,v16);
5718 v7 = vec_madd(v21,v21,v19);
5719 v8 = vec_madd(v24,v24,v22);
5720 v9 = vec_madd(v27,v27,v25);
5722 /* v1 = rsq iO-jO
5723 * v2 = rsq iO-jH1
5724 * v3 = rsq iO-jH2
5725 * v4 = rsq iH1-jO
5726 * v5 = rsq iH1-jH1
5727 * v6 = rsq iH1-jH2
5728 * v7 = rsq iH2-jO
5729 * v8 = rsq iH2-jH1
5730 * v9 = rsq iH2-jH2 */
5733 v10 = vec_rsqrte(v1);
5734 v11 = vec_rsqrte(v2);
5735 v12 = vec_rsqrte(v3);
5736 v13 = vec_rsqrte(v4);
5737 v14 = vec_rsqrte(v5);
5738 v15 = vec_rsqrte(v6);
5739 v16 = vec_rsqrte(v7);
5740 v17 = vec_rsqrte(v8);
5741 v18 = vec_rsqrte(v9);
5742 /* create constant 0.5 */
5743 v30 = (vector float) vec_splat_u32(1);
5744 v31 = vec_ctf((vector unsigned int)v30,1); /* 0.5 */
5745 v30 = vec_ctf((vector unsigned int)v30,0); /* 1.0 */
5747 v19 = vec_madd(v10,v10,v0); /* lu*lu */
5748 v20 = vec_madd(v11,v11,v0);
5749 v21 = vec_madd(v12,v12,v0);
5750 v22 = vec_madd(v13,v13,v0);
5751 v23 = vec_madd(v14,v14,v0);
5752 v24 = vec_madd(v15,v15,v0);
5753 v25 = vec_madd(v16,v16,v0);
5754 v26 = vec_madd(v17,v17,v0);
5755 v27 = vec_madd(v18,v18,v0);
5757 v19 = vec_nmsub(v1,v19,v30); /* 1.0 - rsq*lu*lu */
5758 v20 = vec_nmsub(v2,v20,v30);
5759 v21 = vec_nmsub(v3,v21,v30);
5760 v22 = vec_nmsub(v4,v22,v30);
5761 v23 = vec_nmsub(v5,v23,v30);
5762 v24 = vec_nmsub(v6,v24,v30);
5763 v25 = vec_nmsub(v7,v25,v30);
5764 v26 = vec_nmsub(v8,v26,v30);
5765 v27 = vec_nmsub(v9,v27,v30);
5767 v1 = vec_madd(v10,v31,v0);/* lu*0.5*/
5768 v2 = vec_madd(v11,v31,v0);
5769 v3 = vec_madd(v12,v31,v0);
5770 v4 = vec_madd(v13,v31,v0);
5771 v5 = vec_madd(v14,v31,v0);
5772 v6 = vec_madd(v15,v31,v0);
5773 v7 = vec_madd(v16,v31,v0);
5774 v8 = vec_madd(v17,v31,v0);
5775 v9 = vec_madd(v18,v31,v0);
5777 /* The rinv values */
5778 v1 = vec_madd(v1,v19,v10);
5779 v2 = vec_madd(v2,v20,v11);
5780 v3 = vec_madd(v3,v21,v12);
5781 v4 = vec_madd(v4,v22,v13);
5782 v5 = vec_madd(v5,v23,v14);
5783 v6 = vec_madd(v6,v24,v15);
5784 v7 = vec_madd(v7,v25,v16);
5785 v8 = vec_madd(v8,v26,v17);
5786 v9 = vec_madd(v9,v27,v18);
5788 /* load qqOO, qqOH and qqHH to v27,v28,v29 */
5789 v27 = vec_ld(0, (float *) stackdata);
5790 v28 = vec_ld(16, (float *) stackdata);
5791 v29 = vec_ld(32, (float *) stackdata);
5793 vec_dstst( faction+j3a, 0x10010100, 2 );
5795 /* put rinvsq in v10-v18, rinv6_OO in v30 and rinv12_OO in v31 */
5796 /* load c6 to v25 and c12 to v26 */
5797 v25 = vec_ld(48, (float *) stackdata);
5798 v26 = vec_ld(64, (float *) stackdata);
5800 v10 = vec_madd(v1,v1,v0);
5801 v1 = vec_madd(v1,v27,v0); /* rinv11*qqOO */
5802 v11 = vec_madd(v2,v2,v0);
5803 /* load vctot to v23 and vnbtot to v24 */
5804 v23 = vec_ld(224,(float *) stackdata);
5805 v24 = vec_ld(240,(float *) stackdata);
5807 v2 = vec_madd(v2,v28,v0); /* rinv12*qqOH */
5808 v12 = vec_madd(v3,v3,v0);
5809 v30 = vec_madd(v10,v10,v0); /* rinv4 */
5810 v3 = vec_madd(v3,v28,v0); /* rinv13*qqOH */
5811 v13 = vec_madd(v4,v4,v0);
5812 v4 = vec_madd(v4,v28,v0); /* rinv21*qqOH */
5813 v14 = vec_madd(v5,v5,v0);
5815 v23 = vec_add(v23,v1);
5817 v30 = vec_madd(v30,v10,v0); /* rinv6 */
5818 v5 = vec_madd(v5,v29,v0); /* rinv22*qqHH */
5819 v15 = vec_madd(v6,v6,v0);
5820 v6 = vec_madd(v6,v29,v0); /* rinv23*qqHH */
5821 v23 = vec_add(v23,v2);
5822 v16 = vec_madd(v7,v7,v0);
5823 v31 = vec_madd(v30,v30,v0); /* rinv12 */
5824 v25 = vec_madd(v25,v30,v0); /* c6*rinv6 */
5825 /* load 6.0 to v30 */
5826 v30 = (vector float)vec_splat_u32(6);
5827 v30 = vec_ctf((vector unsigned int)v30,0);
5828 v23 = vec_add(v23,v3);
5830 v7 = vec_madd(v7,v28,v0); /* rinv31*qqOH */
5831 v17 = vec_madd(v8,v8,v0);
5832 v8 = vec_madd(v8,v29,v0); /* rinv32*qqHH */
5833 v26 = vec_madd(v26,v31,v0); /* c12*rinv12 */
5834 v23 = vec_add(v23,v4);
5835 /* load 12.0 to v31 */
5836 v31 = (vector float)vec_splat_u32(12);
5837 v31 = vec_ctf((vector unsigned int)v31,0);
5839 v24 = vec_sub(v24,v25); /* add vnb6 to vnbtot */
5840 v18 = vec_madd(v9,v9,v0);
5841 v23 = vec_add(v23,v5);
5842 v9 = vec_madd(v9,v29,v0); /* rinv33*qqHH */
5844 v24 = vec_add(v24,v26);/* add vnb12 to vnbtot */
5846 v31 = vec_madd(v31,v26,v0);
5847 v11 = vec_madd(v11,v2,v0); /* fs12 */
5848 v23 = vec_add(v23,v6);
5849 v12 = vec_madd(v12,v3,v0); /* fs13 */
5850 v13 = vec_madd(v13,v4,v0); /* fs21 */
5851 v31 = vec_nmsub(v30,v25,v31);
5853 v14 = vec_madd(v14,v5,v0); /* fs22 */
5854 v23 = vec_add(v23,v7);
5855 v15 = vec_madd(v15,v6,v0); /* fs23 */
5856 v16 = vec_madd(v16,v7,v0); /* fs31 */
5857 v1 = vec_add(v31,v1);
5858 v17 = vec_madd(v17,v8,v0); /* fs32 */
5859 v23 = vec_add(v23,v8);
5860 v18 = vec_madd(v18,v9,v0); /* fs33 */
5861 v10 = vec_madd(v10,v1,v0);
5863 vec_st(v24,240,(float *)stackdata); /* store vnbtot */
5864 /* calculate vectorial forces and accumulate fj. v10-v18 has fs11-fs33 now. */
5865 /* First load iO-* dx,dy,dz vectors to v1-v9 */
5866 /* and load iO forces to v28,v29,v30 */
5867 /* use v19-v27 to accumulate j water forces */
5868 v28 = vec_ld(256, (float *) stackdata);
5869 v29 = vec_ld(272, (float *) stackdata);
5870 v30 = vec_ld(288, (float *) stackdata);
5872 v1 = vec_ld(400, (float *) stackdata);
5873 v2 = vec_ld(416, (float *) stackdata);
5874 v23 = vec_add(v23,v9); /* incr. vctot */
5875 v3 = vec_ld(432, (float *) stackdata);
5876 v4 = vec_ld(448, (float *) stackdata);
5877 v5 = vec_ld(464, (float *) stackdata);
5878 v6 = vec_ld(480, (float *) stackdata);
5879 vec_st(v23,224,(float *)stackdata); /* store vctot back to stack */
5880 v7 = vec_ld(496, (float *) stackdata);
5881 v8 = vec_ld(512, (float *) stackdata);
5882 v9 = vec_ld(528, (float *) stackdata);
5884 v28 = vec_madd(v10,v1,v28);
5885 v19 = vec_nmsub(v10,v1,v0);
5886 v29 = vec_madd(v10,v2,v29);
5887 v20 = vec_nmsub(v10,v2,v0);
5888 v30 = vec_madd(v10,v3,v30);
5889 v21 = vec_nmsub(v10,v3,v0);
5891 v28 = vec_madd(v11,v4,v28);
5892 v22 = vec_nmsub(v11,v4,v0);
5893 v29 = vec_madd(v11,v5,v29);
5894 v23 = vec_nmsub(v11,v5,v0);
5895 v30 = vec_madd(v11,v6,v30);
5896 v24 = vec_nmsub(v11,v6,v0);
5898 v28 = vec_madd(v12,v7,v28);
5899 v25 = vec_nmsub(v12,v7,v0);
5900 v29 = vec_madd(v12,v8,v29);
5901 v26 = vec_nmsub(v12,v8,v0);
5902 v30 = vec_madd(v12,v9,v30);
5903 v27 = vec_nmsub(v12,v9,v0);
5905 /* store these i forces, and repeat the procedure for the iH1-* force */
5906 vec_st(v28,256,(float *)stackdata);
5907 vec_st(v29,272,(float *)stackdata);
5908 vec_st(v30,288,(float *)stackdata);
5910 v28 = vec_ld(304,(float *) stackdata);
5911 v29 = vec_ld(320,(float *) stackdata);
5912 v30 = vec_ld(336,(float *) stackdata);
5913 /* load new vectorial distances */
5914 v1 = vec_ld(544, (float *) stackdata);
5915 v2 = vec_ld(560, (float *) stackdata);
5916 v3 = vec_ld(576, (float *) stackdata);
5917 v4 = vec_ld(592, (float *) stackdata);
5918 v5 = vec_ld(608, (float *) stackdata);
5919 v6 = vec_ld(624, (float *) stackdata);
5920 v7 = vec_ld(640, (float *) stackdata);
5921 v8 = vec_ld(656, (float *) stackdata);
5922 v9 = vec_ld(672, (float *) stackdata);
5924 v28 = vec_madd(v13,v1,v28);
5925 v19 = vec_nmsub(v13,v1,v19);
5926 v29 = vec_madd(v13,v2,v29);
5927 v20 = vec_nmsub(v13,v2,v20);
5928 v30 = vec_madd(v13,v3,v30);
5929 v21 = vec_nmsub(v13,v3,v21);
5931 v28 = vec_madd(v14,v4,v28);
5932 v22 = vec_nmsub(v14,v4,v22);
5933 v29 = vec_madd(v14,v5,v29);
5934 v23 = vec_nmsub(v14,v5,v23);
5935 v30 = vec_madd(v14,v6,v30);
5936 v24 = vec_nmsub(v14,v6,v24);
5938 v28 = vec_madd(v15,v7,v28);
5939 v25 = vec_nmsub(v15,v7,v25);
5940 v29 = vec_madd(v15,v8,v29);
5941 v26 = vec_nmsub(v15,v8,v26);
5942 v30 = vec_madd(v15,v9,v30);
5943 v27 = vec_nmsub(v15,v9,v27);
5945 /* store these i forces, and repeat the procedure for the iH2-* force */
5946 vec_st(v28,304,(float *)stackdata);
5947 vec_st(v29,320,(float *)stackdata);
5948 vec_st(v30,336,(float *)stackdata);
5949 v28 = vec_ld(352,(float *) stackdata);
5950 v29 = vec_ld(368,(float *) stackdata);
5951 v30 = vec_ld(384,(float *) stackdata);
5952 /* load new vectorial distances */
5953 v1 = vec_ld(688, (float *) stackdata);
5954 v2 = vec_ld(704, (float *) stackdata);
5955 v3 = vec_ld(720, (float *) stackdata);
5956 v4 = vec_ld(736, (float *) stackdata);
5957 v5 = vec_ld(752, (float *) stackdata);
5958 v6 = vec_ld(768, (float *) stackdata);
5959 v7 = vec_ld(784, (float *) stackdata);
5960 v8 = vec_ld(800, (float *) stackdata);
5961 v9 = vec_ld(816, (float *) stackdata);
5963 v28 = vec_madd(v16,v1,v28);
5964 v19 = vec_nmsub(v16,v1,v19);
5965 v29 = vec_madd(v16,v2,v29);
5966 v20 = vec_nmsub(v16,v2,v20);
5967 v30 = vec_madd(v16,v3,v30);
5968 v21 = vec_nmsub(v16,v3,v21);
5970 v28 = vec_madd(v17,v4,v28);
5971 v22 = vec_nmsub(v17,v4,v22);
5972 v29 = vec_madd(v17,v5,v29);
5973 v23 = vec_nmsub(v17,v5,v23);
5974 v30 = vec_madd(v17,v6,v30);
5975 v24 = vec_nmsub(v17,v6,v24);
5977 v28 = vec_madd(v18,v7,v28);
5978 v25 = vec_nmsub(v18,v7,v25);
5979 v29 = vec_madd(v18,v8,v29);
5980 v26 = vec_nmsub(v18,v8,v26);
5981 v30 = vec_madd(v18,v9,v30);
5982 v27 = vec_nmsub(v18,v9,v27);
5984 /* store these i forces */
5985 vec_st(v28,352,(float *)stackdata);
5986 vec_st(v29,368,(float *)stackdata);
5987 vec_st(v30,384,(float *)stackdata);
5989 /* j forces present in v19-v27 */
5991 v1 = vec_mergeh(v19,v21); /* Oxa Oza Oxb Ozb */
5992 v19 = vec_mergel(v19,v21); /* Oxc Ozc Oxd Ozd */
5993 v21 = vec_mergeh(v20,v22); /* Oya H1xa Oyb H1xb */
5994 v20 = vec_mergel(v20,v22); /* Oyc H1xc Oyd H1xd */
5995 v22 = vec_mergeh(v23,v25); /* H1ya H2xa H1yb H2xb */
5996 v23 = vec_mergel(v23,v25); /* H1yc H2xc H1yd H2xd */
5997 v25 = vec_mergeh(v24,v26); /* H1za H2ya H1zb H2yb */
5998 v24 = vec_mergel(v24,v26); /* H1zc H2yc H1zd H2yd */
6000 v26 = vec_mergeh(v27,v0); /* H2za 0 H2zb 0 */
6001 v27 = vec_mergel(v27,v0); /* H2zc 0 H2zd 0 */
6003 v2 = vec_mergeh(v1,v21); /* Oxa Oya Oza H1xa */
6004 v21 = vec_mergel(v1,v21); /* Oxb Oyb Ozb H1xb */
6005 v1 = vec_mergeh(v19,v20); /* Oxc Oyc Ozc H1xc */
6006 v19 = vec_mergel(v19,v20); /* Oxd Oyd Ozd H1xd */
6007 v20 = vec_mergeh(v22,v25); /* H1ya H1za H2xa H2ya */
6008 v22 = vec_mergel(v22,v25); /* H1yb H1zb H2xb H2yb */
6009 v25 = vec_mergeh(v23,v24); /* H1yc H1zc H2xc H2yc */
6010 v23 = vec_mergel(v23,v24); /* H1yd H1zd H2xd H2yd */
6011 v24 = vec_mergeh(v26,v0); /* H2za 0 0 0 */
6012 v26 = vec_mergel(v26,v0); /* H2zb 0 0 0 */
6013 v3 = vec_mergeh(v27,v0); /* H2zc 0 0 0 */
6014 v27 = vec_mergel(v27,v0); /* H2zd 0 0 0 */
6016 v29 = (vector float)vec_splat_s32(-1);
6017 /* move into position, load and add */
6018 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3a );
6019 v31 = (vector float)vec_lvsr( 0, (int *) faction+j3c );
6020 v4 = vec_ld( 0, faction+j3a);
6021 v5 = vec_ld( 0, faction+j3c);
6023 v6 = vec_ld( 16, faction+j3a);
6024 v7 = vec_ld( 16, faction+j3c);
6025 v8 = vec_ld( 32, faction+j3a);
6026 v9 = vec_ld( 32, faction+j3c);
6027 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
6028 v11 = vec_perm(v0,v29,(vector unsigned char)v31);
6030 v12 = vec_perm(v0,v2,(vector unsigned char)v30);
6031 v13 = vec_perm(v0,v1,(vector unsigned char)v31);
6032 v4 = vec_add(v12,v4);
6033 v5 = vec_add(v13,v5);
6035 v14 = vec_perm(v2,v20,(vector unsigned char)v30);
6036 v15 = vec_perm(v1,v25,(vector unsigned char)v31);
6037 v2 = vec_add(v14,v6);
6038 v1 = vec_add(v15,v7);
6040 v16 = vec_perm(v20,v24,(vector unsigned char)v30);
6041 v17 = vec_perm(v25,v3,(vector unsigned char)v31);
6042 v20 = vec_add(v16,v8);
6043 v25 = vec_add(v17,v9);
6045 v12 = vec_sel(v4,v4,(vector unsigned int)v10);
6046 v13 = vec_sel(v5,v5,(vector unsigned int)v11);
6047 vec_st(v12, 0, faction+j3a);
6048 vec_st(v13, 0, faction+j3c);
6050 v10 = vec_sld(v0,v10,12);
6051 v11 = vec_sld(v0,v11,12);
6053 vec_st(v2, 16, faction+j3a);
6054 vec_st(v1, 16, faction+j3c);
6056 v12 = vec_sel(v20,v8,(vector unsigned int)v10);
6057 v13 = vec_sel(v25,v9,(vector unsigned int)v11);
6059 vec_st(v12, 32, faction+j3a);
6060 vec_st(v13, 32, faction+j3c);
6062 /* Finished 1 & 3 - now do 2 & 4 */
6064 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3b );
6065 v31 = (vector float)vec_lvsr( 0, (int *) faction+j3d );
6067 v4 = vec_ld( 0, faction+j3b);
6068 v5 = vec_ld( 0, faction+j3d);
6069 v6 = vec_ld( 16, faction+j3b);
6070 v7 = vec_ld( 16, faction+j3d);
6071 v8 = vec_ld( 32, faction+j3b);
6072 v9 = vec_ld( 32, faction+j3d);
6073 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
6074 v11 = vec_perm(v0,v29,(vector unsigned char)v31);
6076 v12 = vec_perm(v0,v21,(vector unsigned char)v30);
6077 v13 = vec_perm(v0,v19,(vector unsigned char)v31);
6078 v24 = vec_add(v12,v4);
6079 v25 = vec_add(v13,v5);
6081 v12 = vec_perm(v21,v22,(vector unsigned char)v30);
6082 v13 = vec_perm(v19,v23,(vector unsigned char)v31);
6083 v21 = vec_add(v12,v6);
6084 v19 = vec_add(v13,v7);
6086 v12 = vec_perm(v22,v26,(vector unsigned char)v30);
6087 v13 = vec_perm(v23,v27,(vector unsigned char)v31);
6088 v22 = vec_add(v12,v8);
6089 v23 = vec_add(v13,v9);
6091 v12 = vec_sel(v4,v24,(vector unsigned int)v10);
6092 v13 = vec_sel(v5,v25,(vector unsigned int)v11);
6093 vec_st(v12, 0, faction+j3b);
6094 vec_st(v13, 0, faction+j3d);
6095 v10 = vec_sld(v0,v10,12);
6096 v11 = vec_sld(v0,v11,12);
6098 vec_st(v21, 16, faction+j3b);
6099 vec_st(v19, 16, faction+j3d);
6101 v12 = vec_sel(v22,v8,(vector unsigned int)v10);
6102 v13 = vec_sel(v23,v9,(vector unsigned int)v11);
6103 vec_st(v12, 32, faction+j3b);
6104 vec_st(v13, 32, faction+j3d);
6106 if(k<(nj1-2)) {
6107 jnra = jjnr[k];
6108 jnrb = jjnr[k+1];
6109 jnrc = jjnr[k+2];
6110 j3a = 3*jnra;
6111 j3b = 3*jnrb;
6112 j3c = 3*jnrc;
6114 v1 = (vector float)vec_lvsl(0, pos+j3a);
6115 v8 = (vector float)vec_lvsl(0, pos+j3b);
6116 v15 = (vector float)vec_lvsl(0, pos+j3c);
6118 v2 = vec_ld(0, pos+j3a);
6119 v9 = vec_ld(0, pos+j3b);
6120 v16 = vec_ld(0, pos+j3c);
6121 v3 = vec_ld(16, pos+j3a);
6122 v10 = vec_ld(16, pos+j3b);
6123 v17 = vec_ld(16, pos+j3c);
6124 v4 = vec_ld(32, pos+j3a);
6125 v11 = vec_ld(32, pos+j3b);
6126 v18 = vec_ld(32, pos+j3c);
6127 v5 = vec_perm(v2,v3,(vector unsigned char)v1); /* Oxa Oya Oza H1xa */
6128 v12 = vec_perm(v9,v10,(vector unsigned char)v8); /* Oxb Oyb Ozb H1xb */
6129 v19 = vec_perm(v16,v17,(vector unsigned char)v15); /* Oxc Oyc Ozc H1xc */
6131 v6 = vec_perm(v3,v4,(vector unsigned char)v1); /* H1ya H1za H2xa H2ya */
6132 v13 = vec_perm(v10,v11,(vector unsigned char)v8); /* H1yb H1zb H2xb H2yb */
6133 v20 = vec_perm(v17,v18,(vector unsigned char)v15); /* H1yc H1zc H2xc H2yc */
6135 v7 = vec_perm(v4,v4,(vector unsigned char)v1); /* H2za - - - */
6136 v14 = vec_perm(v11,v11,(vector unsigned char)v8); /* H2zb - - - */
6137 v21 = vec_perm(v18,v18,(vector unsigned char)v15); /* H2zc - - - */
6139 /* permute water coordinates */
6140 v3 = vec_mergeh(v5,v19); /* Oxa Oxc Oya Oyc */
6141 v5 = vec_mergel(v5,v19); /* Oza Ozc H1xa H1xc */
6142 v19 = vec_mergeh(v12,v0); /* Oxb - Oyb - */
6143 v12 = vec_mergel(v12,v0); /* Ozb - H1xb - */
6145 v26 = vec_mergeh(v6,v20); /* H1ya H1yc H1za H1zc */
6146 v16 = vec_mergel(v6,v20); /* H2xa H2xc H2ya H2yc */
6147 v20 = vec_mergeh(v13,v0); /* H1yb - H1zb - */
6148 v13 = vec_mergel(v13,v0); /* H2xb - H2yb - */
6150 v15 = vec_mergeh(v7,v21); /* H2za H2zc - - */
6152 v1 = vec_mergeh(v3,v19); /* Oxa Oxb Oxc - */
6153 v29 = vec_ld(128, (float *) stackdata); /* load i H1x */
6154 v2 = vec_mergel(v3,v19); /* Oya Oyb Oyc - */
6155 v30 = vec_ld(144, (float *) stackdata); /* load i H1y */
6156 v3 = vec_mergeh(v5,v12); /* Oza Ozb Ozc - */
6157 v31 = vec_ld(160, (float *) stackdata); /* load i H1z */
6158 v4 = vec_mergel(v5,v12); /* H1xa H1xb H1xc - */
6159 v5 = vec_mergeh(v26,v20); /* H1ya H1yb H1yc - */
6160 v6 = vec_mergel(v26,v20); /* H1za H1zb H1zc - */
6161 v7 = vec_mergeh(v16,v13); /* H2xa H2xb H2xc - */
6162 v8 = vec_mergel(v16,v13); /* H2ya H2yb H2yc - */
6163 v9 = vec_mergeh(v15,v14); /* H2za H2zb H2zc - */
6165 v10 = vec_sub(v29,v1); /* iH1x - jOx */
6166 v13 = vec_sub(v29,v4); /* iH1x - jH1x */
6167 v16 = vec_sub(v29,v7); /* iH1x - jH2x */
6168 v29 = vec_ld(176, (float *) stackdata); /* load i H2x */
6169 v11 = vec_sub(v30,v2); /* iH1y - jOy */
6170 v14 = vec_sub(v30,v5); /* iH1y - jH1y */
6171 v17 = vec_sub(v30,v8); /* iH1y - jH2y */
6172 v30 = vec_ld(192, (float *) stackdata); /* load i H2y */
6173 vec_st(v10, 544, (float *)stackdata); /* dx21 */
6174 vec_st(v13, 592, (float *)stackdata); /* dx22 */
6175 vec_st(v16, 640, (float *)stackdata); /* dx23 */
6176 v12 = vec_sub(v31,v3); /* iH1z - jOz */
6177 v15 = vec_sub(v31,v6); /* iH1z - jH1z */
6178 v18 = vec_sub(v31,v9); /* iH1z - jH2z */
6179 v31 = vec_ld(208, (float *) stackdata); /* load i H2z */
6180 /* v10-v18 now contain iH1-jO, iH1-jH1 and iH1-jH2 distances */
6181 vec_st(v11, 560, (float *)stackdata); /* dy21 */
6182 vec_st(v14, 608, (float *)stackdata); /* dy22 */
6183 vec_st(v17, 656, (float *)stackdata); /* dy23 */
6184 v19 = vec_sub(v29,v1); /* iH2x - jOx */
6185 v22 = vec_sub(v29,v4); /* iH2x - jH1x */
6186 v25 = vec_sub(v29,v7); /* iH2x - jH2x */
6187 vec_st(v12, 576, (float *)stackdata); /* dz21 */
6188 vec_st(v15, 624, (float *)stackdata); /* dz22 */
6189 vec_st(v18, 672, (float *)stackdata); /* dz23 */
6190 v29 = vec_ld(80, (float *) stackdata); /* load i Ox */
6191 v20 = vec_sub(v30,v2); /* iH2y - jOy */
6192 v23 = vec_sub(v30,v5); /* iH2y - jH1y */
6193 v26 = vec_sub(v30,v8); /* iH2y - jH2y */
6194 vec_st(v19, 688, (float *)stackdata); /* dx31 */
6195 vec_st(v22, 736, (float *)stackdata); /* dx32 */
6196 vec_st(v25, 784, (float *)stackdata); /* dx33 */
6197 v30 = vec_ld(96, (float *) stackdata); /* load i Oy */
6198 v21 = vec_sub(v31,v3); /* iH2z - jOz */
6199 v24 = vec_sub(v31,v6); /* iH2z - jH1z */
6200 v27 = vec_sub(v31,v9); /* iH2z - jH2z */
6201 v31 = vec_ld(112, (float *) stackdata); /* load i Oz */
6202 vec_st(v20, 704, (float *)stackdata); /* dy31 */
6203 vec_st(v23, 752, (float *)stackdata); /* dy32 */
6204 vec_st(v26, 800, (float *)stackdata); /* dy33 */
6206 v1 = vec_sub(v29,v1); /* iOx - jOx */
6207 v4 = vec_sub(v29,v4); /* iOx - jH1x */
6208 v7 = vec_sub(v29,v7); /* iOx - jH2x */
6209 vec_st(v21, 720, (float *)stackdata); /* dz31 */
6210 vec_st(v24, 768, (float *)stackdata); /* dz32 */
6211 vec_st(v27, 816, (float *)stackdata); /* dz33 */
6212 v2 = vec_sub(v30,v2); /* iOy - jOy */
6213 v5 = vec_sub(v30,v5); /* iOy - jH1y */
6214 v8 = vec_sub(v30,v8); /* iOy - jH2y */
6215 vec_st(v1, 400, (float *)stackdata); /* dx11 */
6216 vec_st(v4, 448, (float *)stackdata); /* dx12 */
6217 vec_st(v7, 496, (float *)stackdata); /* dx13 */
6218 v3 = vec_sub(v31,v3); /* iOz - jOz */
6219 v6 = vec_sub(v31,v6); /* iOz - jH1z */
6220 v9 = vec_sub(v31,v9); /* iOz - jH2z */
6221 vec_st(v2, 416, (float *)stackdata); /* dy11 */
6222 vec_st(v5, 464, (float *)stackdata); /* dy12 */
6223 vec_st(v8, 512, (float *)stackdata); /* dy13 */
6225 v1 = vec_madd(v1,v1,v0);
6226 v4 = vec_madd(v4,v4,v0);
6227 v7 = vec_madd(v7,v7,v0);
6228 vec_st(v3, 432, (float *)stackdata); /* dz11 */
6229 vec_st(v6, 480, (float *)stackdata); /* dz12 */
6230 vec_st(v9, 528, (float *)stackdata); /* dz13 */
6231 v10 = vec_madd(v10,v10,v0);
6232 v13 = vec_madd(v13,v13,v0);
6233 v16 = vec_madd(v16,v16,v0);
6234 v19 = vec_madd(v19,v19,v0);
6235 v22 = vec_madd(v22,v22,v0);
6236 v25 = vec_madd(v25,v25,v0);
6237 v1 = vec_madd(v2,v2,v1);
6238 v4 = vec_madd(v5,v5,v4);
6239 v7 = vec_madd(v8,v8,v7);
6240 v10 = vec_madd(v11,v11,v10);
6241 v13 = vec_madd(v14,v14,v13);
6242 v16 = vec_madd(v17,v17,v16);
6243 v19 = vec_madd(v20,v20,v19);
6244 v22 = vec_madd(v23,v23,v22);
6245 v25 = vec_madd(v26,v26,v25);
6246 v1 = vec_madd(v3,v3,v1);
6247 v2 = vec_madd(v6,v6,v4);
6248 v3 = vec_madd(v9,v9,v7);
6249 v4 = vec_madd(v12,v12,v10);
6250 v5 = vec_madd(v15,v15,v13);
6251 v6 = vec_madd(v18,v18,v16);
6252 v7 = vec_madd(v21,v21,v19);
6253 v8 = vec_madd(v24,v24,v22);
6254 v9 = vec_madd(v27,v27,v25);
6256 /* v1 = rsq iO-jO
6257 * v2 = rsq iO-jH1
6258 * v3 = rsq iO-jH2
6259 * v4 = rsq iH1-jO
6260 * v5 = rsq iH1-jH1
6261 * v6 = rsq iH1-jH2
6262 * v7 = rsq iH2-jO
6263 * v8 = rsq iH2-jH1
6264 * v9 = rsq iH2-jH2 */
6267 v10 = vec_rsqrte(v1);
6268 v11 = vec_rsqrte(v2);
6269 v12 = vec_rsqrte(v3);
6270 v13 = vec_rsqrte(v4);
6271 v14 = vec_rsqrte(v5);
6272 v15 = vec_rsqrte(v6);
6273 v16 = vec_rsqrte(v7);
6274 v17 = vec_rsqrte(v8);
6275 v18 = vec_rsqrte(v9);
6277 /* create constant 0.5 */
6278 v30 = (vector float) vec_splat_u32(1);
6279 v31 = vec_ctf((vector unsigned int)v30,1); /* 0.5 */
6280 v30 = vec_ctf((vector unsigned int)v30,0); /* 1.0 */
6282 v19 = vec_madd(v10,v10,v0); /* lu*lu */
6283 v20 = vec_madd(v11,v11,v0);
6284 v21 = vec_madd(v12,v12,v0);
6285 v22 = vec_madd(v13,v13,v0);
6286 v23 = vec_madd(v14,v14,v0);
6287 v24 = vec_madd(v15,v15,v0);
6288 v25 = vec_madd(v16,v16,v0);
6289 v26 = vec_madd(v17,v17,v0);
6290 v27 = vec_madd(v18,v18,v0);
6292 v19 = vec_nmsub(v1,v19,v30); /* 1.0 - rsq*lu*lu */
6293 v20 = vec_nmsub(v2,v20,v30);
6294 v21 = vec_nmsub(v3,v21,v30);
6295 v22 = vec_nmsub(v4,v22,v30);
6296 v23 = vec_nmsub(v5,v23,v30);
6297 v24 = vec_nmsub(v6,v24,v30);
6298 v25 = vec_nmsub(v7,v25,v30);
6299 v26 = vec_nmsub(v8,v26,v30);
6300 v27 = vec_nmsub(v9,v27,v30);
6302 v1 = vec_madd(v10,v31,v0);/* lu*0.5*/
6303 v2 = vec_madd(v11,v31,v0);
6304 v3 = vec_madd(v12,v31,v0);
6305 v4 = vec_madd(v13,v31,v0);
6306 v5 = vec_madd(v14,v31,v0);
6307 v6 = vec_madd(v15,v31,v0);
6308 v7 = vec_madd(v16,v31,v0);
6309 v8 = vec_madd(v17,v31,v0);
6310 v9 = vec_madd(v18,v31,v0);
6312 /* The rinv values */
6313 v1 = vec_madd(v1,v19,v10);
6314 v2 = vec_madd(v2,v20,v11);
6315 v3 = vec_madd(v3,v21,v12);
6316 v4 = vec_madd(v4,v22,v13);
6317 v5 = vec_madd(v5,v23,v14);
6318 v6 = vec_madd(v6,v24,v15);
6319 v7 = vec_madd(v7,v25,v16);
6320 v8 = vec_madd(v8,v26,v17);
6321 v9 = vec_madd(v9,v27,v18);
6323 v10 = (vector float)vec_splat_s32(-1);
6324 v10 = vec_sld(v0,v10,4);
6326 v1 = (vector float)vec_sel((vector unsigned int)v1,(vector unsigned int)v0,(vector unsigned int)v10);
6327 v2 = (vector float)vec_sel((vector unsigned int)v2,(vector unsigned int)v0,(vector unsigned int)v10);
6328 v3 = (vector float)vec_sel((vector unsigned int)v3,(vector unsigned int)v0,(vector unsigned int)v10);
6329 v4 = (vector float)vec_sel((vector unsigned int)v4,(vector unsigned int)v0,(vector unsigned int)v10);
6330 v5 = (vector float)vec_sel((vector unsigned int)v5,(vector unsigned int)v0,(vector unsigned int)v10);
6331 v6 = (vector float)vec_sel((vector unsigned int)v6,(vector unsigned int)v0,(vector unsigned int)v10);
6332 v7 = (vector float)vec_sel((vector unsigned int)v7,(vector unsigned int)v0,(vector unsigned int)v10);
6333 v8 = (vector float)vec_sel((vector unsigned int)v8,(vector unsigned int)v0,(vector unsigned int)v10);
6334 v9 = (vector float)vec_sel((vector unsigned int)v9,(vector unsigned int)v0,(vector unsigned int)v10);
6336 /* load qqOO, qqOH and qqHH to v27,v28,v29 */
6337 v27 = vec_ld(0, (float *) stackdata);
6338 v28 = vec_ld(16, (float *) stackdata);
6339 v29 = vec_ld(32, (float *) stackdata);
6342 vec_dstst( faction+j3a, 0x10010100, 2 );
6344 v27 = vec_sld(v27,v0,4);
6345 v28 = vec_sld(v28,v0,4);
6346 v29 = vec_sld(v29,v0,4);
6348 /* put rinvsq in v10-v18, rinv6_OO in v30 and rinv12_OO in v31 */
6349 /* load c6 to v25 and c12 to v26 */
6350 v25 = vec_ld(48, (float *) stackdata);
6351 v26 = vec_ld(64, (float *) stackdata);
6353 v10 = vec_madd(v1,v1,v0);
6354 v1 = vec_madd(v1,v27,v0); /* rinv11*qqOO */
6355 v11 = vec_madd(v2,v2,v0);
6356 /* load vctot to v23 and vnbtot to v24 */
6357 v23 = vec_ld(224,(float *) stackdata);
6358 v24 = vec_ld(240,(float *) stackdata);
6360 v25 = vec_sld(v25,v0,4);
6361 v26 = vec_sld(v26,v0,4);
6363 v2 = vec_madd(v2,v28,v0); /* rinv12*qqOH */
6364 v12 = vec_madd(v3,v3,v0);
6365 v30 = vec_madd(v10,v10,v0); /* rinv4 */
6366 v3 = vec_madd(v3,v28,v0); /* rinv13*qqOH */
6367 v13 = vec_madd(v4,v4,v0);
6368 v4 = vec_madd(v4,v28,v0); /* rinv21*qqOH */
6369 v14 = vec_madd(v5,v5,v0);
6371 v23 = vec_add(v23,v1);
6373 v30 = vec_madd(v30,v10,v0); /* rinv6 */
6374 v5 = vec_madd(v5,v29,v0); /* rinv22*qqHH */
6375 v15 = vec_madd(v6,v6,v0);
6376 v6 = vec_madd(v6,v29,v0); /* rinv23*qqHH */
6377 v23 = vec_add(v23,v2);
6378 v16 = vec_madd(v7,v7,v0);
6379 v31 = vec_madd(v30,v30,v0); /* rinv12 */
6380 v25 = vec_madd(v25,v30,v0); /* c6*rinv6 */
6381 /* load 6.0 to v30 */
6382 v30 = (vector float)vec_splat_u32(6);
6383 v30 = vec_ctf((vector unsigned int)v30,0);
6384 v23 = vec_add(v23,v3);
6386 v7 = vec_madd(v7,v28,v0); /* rinv31*qqOH */
6387 v17 = vec_madd(v8,v8,v0);
6388 v8 = vec_madd(v8,v29,v0); /* rinv32*qqHH */
6389 v26 = vec_madd(v26,v31,v0); /* c12*rinv12 */
6390 v23 = vec_add(v23,v4);
6391 /* load 12.0 to v31 */
6392 v31 = (vector float)vec_splat_u32(12);
6393 v31 = vec_ctf((vector unsigned int)v31,0);
6396 v24 = vec_sub(v24,v25); /* add vnb6 to vnbtot */
6397 v18 = vec_madd(v9,v9,v0);
6398 v23 = vec_add(v23,v5);
6399 v9 = vec_madd(v9,v29,v0); /* rinv33*qqHH */
6400 v24 = vec_add(v24,v26);/* add vnb12 to vnbtot */
6402 v31 = vec_madd(v31,v26,v0);
6403 v11 = vec_madd(v11,v2,v0); /* fs12 */
6404 v23 = vec_add(v23,v6);
6405 v12 = vec_madd(v12,v3,v0); /* fs13 */
6406 v13 = vec_madd(v13,v4,v0); /* fs21 */
6407 v31 = vec_nmsub(v30,v25,v31);
6409 v14 = vec_madd(v14,v5,v0); /* fs22 */
6410 v23 = vec_add(v23,v7);
6411 v15 = vec_madd(v15,v6,v0); /* fs23 */
6412 v16 = vec_madd(v16,v7,v0); /* fs31 */
6413 v1 = vec_add(v31,v1);
6414 v17 = vec_madd(v17,v8,v0); /* fs32 */
6415 v23 = vec_add(v23,v8);
6416 v18 = vec_madd(v18,v9,v0); /* fs33 */
6417 v10 = vec_madd(v10,v1,v0);
6419 vec_st(v24,240,(float *)stackdata); /* store vnbtot */
6420 /* calculate vectorial forces and accumulate fj. v10-v18 has fs11-fs33 now. */
6421 /* First load iO-* dx,dy,dz vectors to v1-v9 */
6422 /* and load iO forces to v28,v29,v30 */
6423 /* use v19-v27 to accumulate j water forces */
6424 v28 = vec_ld(256, (float *) stackdata);
6425 v29 = vec_ld(272, (float *) stackdata);
6426 v30 = vec_ld(288, (float *) stackdata);
6428 v1 = vec_ld(400, (float *) stackdata);
6429 v2 = vec_ld(416, (float *) stackdata);
6430 v23 = vec_add(v23,v9); /* incr. vctot */
6431 v3 = vec_ld(432, (float *) stackdata);
6432 v4 = vec_ld(448, (float *) stackdata);
6433 v5 = vec_ld(464, (float *) stackdata);
6434 v6 = vec_ld(480, (float *) stackdata);
6435 vec_st(v23,224,(float *)stackdata); /* store vctot back to stack */
6436 v7 = vec_ld(496, (float *) stackdata);
6437 v8 = vec_ld(512, (float *) stackdata);
6438 v9 = vec_ld(528, (float *) stackdata);
6440 v28 = vec_madd(v10,v1,v28);
6441 v19 = vec_nmsub(v10,v1,v0);
6442 v29 = vec_madd(v10,v2,v29);
6443 v20 = vec_nmsub(v10,v2,v0);
6444 v30 = vec_madd(v10,v3,v30);
6445 v21 = vec_nmsub(v10,v3,v0);
6447 v28 = vec_madd(v11,v4,v28);
6448 v22 = vec_nmsub(v11,v4,v0);
6449 v29 = vec_madd(v11,v5,v29);
6450 v23 = vec_nmsub(v11,v5,v0);
6451 v30 = vec_madd(v11,v6,v30);
6452 v24 = vec_nmsub(v11,v6,v0);
6454 v28 = vec_madd(v12,v7,v28);
6455 v25 = vec_nmsub(v12,v7,v0);
6456 v29 = vec_madd(v12,v8,v29);
6457 v26 = vec_nmsub(v12,v8,v0);
6458 v30 = vec_madd(v12,v9,v30);
6459 v27 = vec_nmsub(v12,v9,v0);
6461 /* store these i forces, and repeat the procedure for the iH1-* force */
6462 vec_st(v28,256,(float *)stackdata);
6463 vec_st(v29,272,(float *)stackdata);
6464 vec_st(v30,288,(float *)stackdata);
6466 v28 = vec_ld(304,(float *) stackdata);
6467 v29 = vec_ld(320,(float *) stackdata);
6468 v30 = vec_ld(336,(float *) stackdata);
6469 /* load new vectorial distances */
6470 v1 = vec_ld(544, (float *) stackdata);
6471 v2 = vec_ld(560, (float *) stackdata);
6472 v3 = vec_ld(576, (float *) stackdata);
6473 v4 = vec_ld(592, (float *) stackdata);
6474 v5 = vec_ld(608, (float *) stackdata);
6475 v6 = vec_ld(624, (float *) stackdata);
6476 v7 = vec_ld(640, (float *) stackdata);
6477 v8 = vec_ld(656, (float *) stackdata);
6478 v9 = vec_ld(672, (float *) stackdata);
6480 v28 = vec_madd(v13,v1,v28);
6481 v19 = vec_nmsub(v13,v1,v19);
6482 v29 = vec_madd(v13,v2,v29);
6483 v20 = vec_nmsub(v13,v2,v20);
6484 v30 = vec_madd(v13,v3,v30);
6485 v21 = vec_nmsub(v13,v3,v21);
6487 v28 = vec_madd(v14,v4,v28);
6488 v22 = vec_nmsub(v14,v4,v22);
6489 v29 = vec_madd(v14,v5,v29);
6490 v23 = vec_nmsub(v14,v5,v23);
6491 v30 = vec_madd(v14,v6,v30);
6492 v24 = vec_nmsub(v14,v6,v24);
6494 v28 = vec_madd(v15,v7,v28);
6495 v25 = vec_nmsub(v15,v7,v25);
6496 v29 = vec_madd(v15,v8,v29);
6497 v26 = vec_nmsub(v15,v8,v26);
6498 v30 = vec_madd(v15,v9,v30);
6499 v27 = vec_nmsub(v15,v9,v27);
6501 /* store these i forces, and repeat the procedure for the iH2-* force */
6502 vec_st(v28,304,(float *)stackdata);
6503 vec_st(v29,320,(float *)stackdata);
6504 vec_st(v30,336,(float *)stackdata);
6505 v28 = vec_ld(352,(float *) stackdata);
6506 v29 = vec_ld(368,(float *) stackdata);
6507 v30 = vec_ld(384,(float *) stackdata);
6508 /* load new vectorial distances */
6509 v1 = vec_ld(688, (float *) stackdata);
6510 v2 = vec_ld(704, (float *) stackdata);
6511 v3 = vec_ld(720, (float *) stackdata);
6512 v4 = vec_ld(736, (float *) stackdata);
6513 v5 = vec_ld(752, (float *) stackdata);
6514 v6 = vec_ld(768, (float *) stackdata);
6515 v7 = vec_ld(784, (float *) stackdata);
6516 v8 = vec_ld(800, (float *) stackdata);
6517 v9 = vec_ld(816, (float *) stackdata);
6519 v28 = vec_madd(v16,v1,v28);
6520 v19 = vec_nmsub(v16,v1,v19);
6521 v29 = vec_madd(v16,v2,v29);
6522 v20 = vec_nmsub(v16,v2,v20);
6523 v30 = vec_madd(v16,v3,v30);
6524 v21 = vec_nmsub(v16,v3,v21);
6526 v28 = vec_madd(v17,v4,v28);
6527 v22 = vec_nmsub(v17,v4,v22);
6528 v29 = vec_madd(v17,v5,v29);
6529 v23 = vec_nmsub(v17,v5,v23);
6530 v30 = vec_madd(v17,v6,v30);
6531 v24 = vec_nmsub(v17,v6,v24);
6533 v28 = vec_madd(v18,v7,v28);
6534 v25 = vec_nmsub(v18,v7,v25);
6535 v29 = vec_madd(v18,v8,v29);
6536 v26 = vec_nmsub(v18,v8,v26);
6537 v30 = vec_madd(v18,v9,v30);
6538 v27 = vec_nmsub(v18,v9,v27);
6540 /* store these i forces */
6541 vec_st(v28,352,(float *)stackdata);
6542 vec_st(v29,368,(float *)stackdata);
6543 vec_st(v30,384,(float *)stackdata);
6545 /* j forces present in v19-v27 */
6547 v1 = vec_mergeh(v19,v21); /* Oxa Oza Oxb Ozb */
6548 v19 = vec_mergel(v19,v21); /* Oxc Ozc - - */
6549 v21 = vec_mergeh(v20,v22); /* Oya H1xa Oyb H1xb */
6550 v20 = vec_mergel(v20,v22); /* Oyc H1xc - - */
6551 v22 = vec_mergeh(v23,v25); /* H1ya H2xa H1yb H2xb */
6552 v23 = vec_mergel(v23,v25); /* H1yc H2xc - - */
6553 v25 = vec_mergeh(v24,v26); /* H1za H2ya H1zb H2yb */
6554 v24 = vec_mergel(v24,v26); /* H1zc H2yc - - */
6556 v26 = vec_mergeh(v27,v0); /* H2za 0 H2zb 0 */
6557 v27 = vec_mergel(v27,v0); /* H2zc 0 - 0 */
6559 v2 = vec_mergeh(v1,v21); /* Oxa Oya Oza H1xa */
6560 v21 = vec_mergel(v1,v21); /* Oxb Oyb Ozb H1xb */
6561 v1 = vec_mergeh(v19,v20); /* Oxc Oyc Ozc H1xc */
6562 v20 = vec_mergeh(v22,v25); /* H1ya H1za H2xa H2ya */
6563 v22 = vec_mergel(v22,v25); /* H1yb H1zb H2xb H2yb */
6564 v25 = vec_mergeh(v23,v24); /* H1yc H1zc H2xc H2yc */
6565 v24 = vec_mergeh(v26,v0); /* H2za 0 0 0 */
6566 v26 = vec_mergel(v26,v0); /* H2zb 0 0 0 */
6567 v3 = vec_mergeh(v27,v0); /* H2zc 0 0 0 */
6569 v29 = (vector float)vec_splat_s32(-1);
6570 /* move into position, load and add */
6571 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3a );
6572 v31 = (vector float)vec_lvsr( 0, (int *) faction+j3c );
6573 v4 = vec_ld( 0, faction+j3a);
6574 v5 = vec_ld( 0, faction+j3c);
6576 v6 = vec_ld( 16, faction+j3a);
6577 v7 = vec_ld( 16, faction+j3c);
6578 v8 = vec_ld( 32, faction+j3a);
6579 v9 = vec_ld( 32, faction+j3c);
6580 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
6581 v11 = vec_perm(v0,v29,(vector unsigned char)v31);
6583 v12 = vec_perm(v0,v2,(vector unsigned char)v30);
6584 v13 = vec_perm(v0,v1,(vector unsigned char)v31);
6585 v4 = vec_add(v12,v4);
6586 v5 = vec_add(v13,v5);
6588 v14 = vec_perm(v2,v20,(vector unsigned char)v30);
6589 v15 = vec_perm(v1,v25,(vector unsigned char)v31);
6590 v2 = vec_add(v14,v6);
6591 v1 = vec_add(v15,v7);
6593 v16 = vec_perm(v20,v24,(vector unsigned char)v30);
6594 v17 = vec_perm(v25,v3,(vector unsigned char)v31);
6595 v20 = vec_add(v16,v8);
6596 v25 = vec_add(v17,v9);
6598 v12 = vec_sel(v4,v4,(vector unsigned int)v10);
6599 v13 = vec_sel(v5,v5,(vector unsigned int)v11);
6600 vec_st(v12, 0, faction+j3a);
6601 vec_st(v13, 0, faction+j3c);
6603 v10 = vec_sld(v0,v10,12);
6604 v11 = vec_sld(v0,v11,12);
6606 vec_st(v2, 16, faction+j3a);
6607 vec_st(v1, 16, faction+j3c);
6609 v12 = vec_sel(v20,v8,(vector unsigned int)v10);
6610 v13 = vec_sel(v25,v9,(vector unsigned int)v11);
6612 vec_st(v12, 32, faction+j3a);
6613 vec_st(v13, 32, faction+j3c);
6615 /* Finished 1 & 3 - now do 2 */
6617 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3b );
6619 v4 = vec_ld( 0, faction+j3b);
6620 v6 = vec_ld( 16, faction+j3b);
6621 v8 = vec_ld( 32, faction+j3b);
6622 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
6624 v12 = vec_perm(v0,v21,(vector unsigned char)v30);
6625 v24 = vec_add(v12,v4);
6627 v12 = vec_perm(v21,v22,(vector unsigned char)v30);
6628 v21 = vec_add(v12,v6);
6630 v12 = vec_perm(v22,v26,(vector unsigned char)v30);
6631 v22 = vec_add(v12,v8);
6633 v12 = vec_sel(v4,v24,(vector unsigned int)v10);
6634 vec_st(v12, 0, faction+j3b);
6635 v10 = vec_sld(v0,v10,12);
6637 vec_st(v21, 16, faction+j3b);
6639 v12 = vec_sel(v22,v8,(vector unsigned int)v10);
6640 vec_st(v12, 32, faction+j3b);
6642 } else if(k<(nj1-1)) {
6643 jnra = jjnr[k];
6644 jnrb = jjnr[k+1];
6645 j3a = 3*jnra;
6646 j3b = 3*jnrb;
6648 v1 = (vector float)vec_lvsl(0, pos+j3a);
6649 v8 = (vector float)vec_lvsl(0, pos+j3b);
6651 v2 = vec_ld(0, pos+j3a);
6652 v9 = vec_ld(0, pos+j3b);
6653 v3 = vec_ld(16, pos+j3a);
6654 v10 = vec_ld(16, pos+j3b);
6655 v4 = vec_ld(32, pos+j3a);
6656 v11 = vec_ld(32, pos+j3b);
6657 v5 = vec_perm(v2,v3,(vector unsigned char)v1); /* Oxa Oya Oza H1xa */
6658 v12 = vec_perm(v9,v10,(vector unsigned char)v8); /* Oxb Oyb Ozb H1xb */
6660 v6 = vec_perm(v3,v4,(vector unsigned char)v1); /* H1ya H1za H2xa H2ya */
6661 v13 = vec_perm(v10,v11,(vector unsigned char)v8); /* H1yb H1zb H2xb H2yb */
6663 v7 = vec_perm(v4,v4,(vector unsigned char)v1); /* H2za - - - */
6664 v14 = vec_perm(v11,v11,(vector unsigned char)v8); /* H2zb - - - */
6666 /* permute water coordinates */
6667 v1 = vec_mergeh(v5,v12); /* Oxa Oxb Oya Oyb */
6668 v3 = vec_mergel(v5,v12); /* Oza Ozb H1xa H1xb */
6669 v5 = vec_mergeh(v6,v13); /* H1ya H1yb H1za H1zb */
6670 v9 = vec_mergeh(v7,v14); /* H2za H2zb - - */
6671 v7 = vec_mergel(v6,v13); /* H2xa H2xb H2ya H2yb */
6673 v29 = vec_ld(128, (float *) stackdata); /* load i H1x */
6674 v2 = vec_sld(v1,v1,8); /* Oya Oyb - - */
6675 v30 = vec_ld(144, (float *) stackdata); /* load i H1y */
6676 v4 = vec_sld(v3,v3,8); /* H1xa H1xb - - */
6677 v31 = vec_ld(160, (float *) stackdata); /* load i H1z */
6678 v6 = vec_sld(v5,v5,8); /* H1za H1zb - - */
6679 v8 = vec_sld(v7,v7,8); /* H2ya H2yb - - */
6682 v10 = vec_sub(v29,v1); /* iH1x - jOx */
6683 v13 = vec_sub(v29,v4); /* iH1x - jH1x */
6684 v16 = vec_sub(v29,v7); /* iH1x - jH2x */
6685 v29 = vec_ld(176, (float *) stackdata); /* load i H2x */
6686 v11 = vec_sub(v30,v2); /* iH1y - jOy */
6687 v14 = vec_sub(v30,v5); /* iH1y - jH1y */
6688 v17 = vec_sub(v30,v8); /* iH1y - jH2y */
6689 v30 = vec_ld(192, (float *) stackdata); /* load i H2y */
6690 vec_st(v10, 544, (float *)stackdata); /* dx21 */
6691 vec_st(v13, 592, (float *)stackdata); /* dx22 */
6692 vec_st(v16, 640, (float *)stackdata); /* dx23 */
6693 v12 = vec_sub(v31,v3); /* iH1z - jOz */
6694 v15 = vec_sub(v31,v6); /* iH1z - jH1z */
6695 v18 = vec_sub(v31,v9); /* iH1z - jH2z */
6696 v31 = vec_ld(208, (float *) stackdata); /* load i H2z */
6697 /* v10-v18 now contain iH1-jO, iH1-jH1 and iH1-jH2 distances */
6698 vec_st(v11, 560, (float *)stackdata); /* dy21 */
6699 vec_st(v14, 608, (float *)stackdata); /* dy22 */
6700 vec_st(v17, 656, (float *)stackdata); /* dy23 */
6701 v19 = vec_sub(v29,v1); /* iH2x - jOx */
6702 v22 = vec_sub(v29,v4); /* iH2x - jH1x */
6703 v25 = vec_sub(v29,v7); /* iH2x - jH2x */
6704 vec_st(v12, 576, (float *)stackdata); /* dz21 */
6705 vec_st(v15, 624, (float *)stackdata); /* dz22 */
6706 vec_st(v18, 672, (float *)stackdata); /* dz23 */
6707 v29 = vec_ld(80, (float *) stackdata); /* load i Ox */
6708 v20 = vec_sub(v30,v2); /* iH2y - jOy */
6709 v23 = vec_sub(v30,v5); /* iH2y - jH1y */
6710 v26 = vec_sub(v30,v8); /* iH2y - jH2y */
6711 vec_st(v19, 688, (float *)stackdata); /* dx31 */
6712 vec_st(v22, 736, (float *)stackdata); /* dx32 */
6713 vec_st(v25, 784, (float *)stackdata); /* dx33 */
6714 v30 = vec_ld(96, (float *) stackdata); /* load i Oy */
6715 v21 = vec_sub(v31,v3); /* iH2z - jOz */
6716 v24 = vec_sub(v31,v6); /* iH2z - jH1z */
6717 v27 = vec_sub(v31,v9); /* iH2z - jH2z */
6718 v31 = vec_ld(112, (float *) stackdata); /* load i Oz */
6719 vec_st(v20, 704, (float *)stackdata); /* dy31 */
6720 vec_st(v23, 752, (float *)stackdata); /* dy32 */
6721 vec_st(v26, 800, (float *)stackdata); /* dy33 */
6723 v1 = vec_sub(v29,v1); /* iOx - jOx */
6724 v4 = vec_sub(v29,v4); /* iOx - jH1x */
6725 v7 = vec_sub(v29,v7); /* iOx - jH2x */
6726 vec_st(v21, 720, (float *)stackdata); /* dz31 */
6727 vec_st(v24, 768, (float *)stackdata); /* dz32 */
6728 vec_st(v27, 816, (float *)stackdata); /* dz33 */
6729 v2 = vec_sub(v30,v2); /* iOy - jOy */
6730 v5 = vec_sub(v30,v5); /* iOy - jH1y */
6731 v8 = vec_sub(v30,v8); /* iOy - jH2y */
6732 vec_st(v1, 400, (float *)stackdata); /* dx11 */
6733 vec_st(v4, 448, (float *)stackdata); /* dx12 */
6734 vec_st(v7, 496, (float *)stackdata); /* dx13 */
6735 v3 = vec_sub(v31,v3); /* iOz - jOz */
6736 v6 = vec_sub(v31,v6); /* iOz - jH1z */
6737 v9 = vec_sub(v31,v9); /* iOz - jH2z */
6738 vec_st(v2, 416, (float *)stackdata); /* dy11 */
6739 vec_st(v5, 464, (float *)stackdata); /* dy12 */
6740 vec_st(v8, 512, (float *)stackdata); /* dy13 */
6742 v1 = vec_madd(v1,v1,v0);
6743 v4 = vec_madd(v4,v4,v0);
6744 v7 = vec_madd(v7,v7,v0);
6745 vec_st(v3, 432, (float *)stackdata); /* dz11 */
6746 vec_st(v6, 480, (float *)stackdata); /* dz12 */
6747 vec_st(v9, 528, (float *)stackdata); /* dz13 */
6748 v10 = vec_madd(v10,v10,v0);
6749 v13 = vec_madd(v13,v13,v0);
6750 v16 = vec_madd(v16,v16,v0);
6751 v19 = vec_madd(v19,v19,v0);
6752 v22 = vec_madd(v22,v22,v0);
6753 v25 = vec_madd(v25,v25,v0);
6754 v1 = vec_madd(v2,v2,v1);
6755 v4 = vec_madd(v5,v5,v4);
6756 v7 = vec_madd(v8,v8,v7);
6757 v10 = vec_madd(v11,v11,v10);
6758 v13 = vec_madd(v14,v14,v13);
6759 v16 = vec_madd(v17,v17,v16);
6760 v19 = vec_madd(v20,v20,v19);
6761 v22 = vec_madd(v23,v23,v22);
6762 v25 = vec_madd(v26,v26,v25);
6763 v1 = vec_madd(v3,v3,v1);
6764 v2 = vec_madd(v6,v6,v4);
6765 v3 = vec_madd(v9,v9,v7);
6766 v4 = vec_madd(v12,v12,v10);
6767 v5 = vec_madd(v15,v15,v13);
6768 v6 = vec_madd(v18,v18,v16);
6769 v7 = vec_madd(v21,v21,v19);
6770 v8 = vec_madd(v24,v24,v22);
6771 v9 = vec_madd(v27,v27,v25);
6773 * v1 = rsq iO-jO
6774 * v2 = rsq iO-jH1
6775 * v3 = rsq iO-jH2
6776 * v4 = rsq iH1-jO
6777 * v5 = rsq iH1-jH1
6778 * v6 = rsq iH1-jH2
6779 * v7 = rsq iH2-jO
6780 * v8 = rsq iH2-jH1
6781 * v9 = rsq iH2-jH2
6784 v10 = vec_rsqrte(v1);
6785 v11 = vec_rsqrte(v2);
6786 v12 = vec_rsqrte(v3);
6787 v13 = vec_rsqrte(v4);
6788 v14 = vec_rsqrte(v5);
6789 v15 = vec_rsqrte(v6);
6790 v16 = vec_rsqrte(v7);
6791 v17 = vec_rsqrte(v8);
6792 v18 = vec_rsqrte(v9);
6793 /* create constant 0.5 */
6794 v30 = (vector float) vec_splat_u32(1);
6795 v31 = vec_ctf((vector unsigned int)v30,1); /* 0.5 */
6796 v30 = vec_ctf((vector unsigned int)v30,0); /* 1.0 */
6798 v19 = vec_madd(v10,v10,v0); /* lu*lu */
6799 v20 = vec_madd(v11,v11,v0);
6800 v21 = vec_madd(v12,v12,v0);
6801 v22 = vec_madd(v13,v13,v0);
6802 v23 = vec_madd(v14,v14,v0);
6803 v24 = vec_madd(v15,v15,v0);
6804 v25 = vec_madd(v16,v16,v0);
6805 v26 = vec_madd(v17,v17,v0);
6806 v27 = vec_madd(v18,v18,v0);
6808 v19 = vec_nmsub(v1,v19,v30); /* 1.0 - rsq*lu*lu */
6809 v20 = vec_nmsub(v2,v20,v30);
6810 v21 = vec_nmsub(v3,v21,v30);
6811 v22 = vec_nmsub(v4,v22,v30);
6812 v23 = vec_nmsub(v5,v23,v30);
6813 v24 = vec_nmsub(v6,v24,v30);
6814 v25 = vec_nmsub(v7,v25,v30);
6815 v26 = vec_nmsub(v8,v26,v30);
6816 v27 = vec_nmsub(v9,v27,v30);
6818 v1 = vec_madd(v10,v31,v0);/* lu*0.5*/
6819 v2 = vec_madd(v11,v31,v0);
6820 v3 = vec_madd(v12,v31,v0);
6821 v4 = vec_madd(v13,v31,v0);
6822 v5 = vec_madd(v14,v31,v0);
6823 v6 = vec_madd(v15,v31,v0);
6824 v7 = vec_madd(v16,v31,v0);
6825 v8 = vec_madd(v17,v31,v0);
6826 v9 = vec_madd(v18,v31,v0);
6828 /* The rinv values */
6829 v1 = vec_madd(v1,v19,v10);
6830 v2 = vec_madd(v2,v20,v11);
6831 v3 = vec_madd(v3,v21,v12);
6832 v4 = vec_madd(v4,v22,v13);
6833 v5 = vec_madd(v5,v23,v14);
6834 v6 = vec_madd(v6,v24,v15);
6835 v7 = vec_madd(v7,v25,v16);
6836 v8 = vec_madd(v8,v26,v17);
6837 v9 = vec_madd(v9,v27,v18);
6839 v10 = (vector float)vec_splat_s32(-1);
6840 v10 = vec_sld(v0,v10,8);
6842 v1 = (vector float)vec_sel((vector unsigned int)v1,(vector unsigned int)v0,(vector unsigned int)v10);
6843 v2 = (vector float)vec_sel((vector unsigned int)v2,(vector unsigned int)v0,(vector unsigned int)v10);
6844 v3 = (vector float)vec_sel((vector unsigned int)v3,(vector unsigned int)v0,(vector unsigned int)v10);
6845 v4 = (vector float)vec_sel((vector unsigned int)v4,(vector unsigned int)v0,(vector unsigned int)v10);
6846 v5 = (vector float)vec_sel((vector unsigned int)v5,(vector unsigned int)v0,(vector unsigned int)v10);
6847 v6 = (vector float)vec_sel((vector unsigned int)v6,(vector unsigned int)v0,(vector unsigned int)v10);
6848 v7 = (vector float)vec_sel((vector unsigned int)v7,(vector unsigned int)v0,(vector unsigned int)v10);
6849 v8 = (vector float)vec_sel((vector unsigned int)v8,(vector unsigned int)v0,(vector unsigned int)v10);
6850 v9 = (vector float)vec_sel((vector unsigned int)v9,(vector unsigned int)v0,(vector unsigned int)v10);
6852 /* load qqOO, qqOH and qqHH to v27,v28,v29 */
6853 v27 = vec_ld(0, (float *) stackdata);
6854 v28 = vec_ld(16, (float *) stackdata);
6855 v29 = vec_ld(32, (float *) stackdata);
6857 vec_dstst( faction+j3a, 0x10010100, 2 );
6859 /* put rinvsq in v10-v18, rinv6_OO in v30 and rinv12_OO in v31 */
6860 /* load c6 to v25 and c12 to v26 */
6861 v25 = vec_ld(48, (float *) stackdata);
6862 v26 = vec_ld(64, (float *) stackdata);
6864 v10 = vec_madd(v1,v1,v0);
6865 v1 = vec_madd(v1,v27,v0); /* rinv11*qqOO */
6866 v11 = vec_madd(v2,v2,v0);
6867 /* load vctot to v23 and vnbtot to v24 */
6868 v23 = vec_ld(224,(float *) stackdata);
6869 v24 = vec_ld(240,(float *) stackdata);
6871 v2 = vec_madd(v2,v28,v0); /* rinv12*qqOH */
6872 v12 = vec_madd(v3,v3,v0);
6873 v30 = vec_madd(v10,v10,v0); /* rinv4 */
6874 v3 = vec_madd(v3,v28,v0); /* rinv13*qqOH */
6875 v13 = vec_madd(v4,v4,v0);
6876 v4 = vec_madd(v4,v28,v0); /* rinv21*qqOH */
6877 v14 = vec_madd(v5,v5,v0);
6879 v23 = vec_add(v23,v1);
6881 v30 = vec_madd(v30,v10,v0); /* rinv6 */
6882 v5 = vec_madd(v5,v29,v0); /* rinv22*qqHH */
6883 v15 = vec_madd(v6,v6,v0);
6884 v6 = vec_madd(v6,v29,v0); /* rinv23*qqHH */
6885 v23 = vec_add(v23,v2);
6886 v16 = vec_madd(v7,v7,v0);
6887 v31 = vec_madd(v30,v30,v0); /* rinv12 */
6888 v25 = vec_madd(v25,v30,v0); /* c6*rinv6 */
6889 /* load 6.0 to v30 */
6890 v30 = (vector float)vec_splat_u32(6);
6891 v30 = vec_ctf((vector unsigned int)v30,0);
6892 v23 = vec_add(v23,v3);
6894 v7 = vec_madd(v7,v28,v0); /* rinv31*qqOH */
6895 v17 = vec_madd(v8,v8,v0);
6896 v8 = vec_madd(v8,v29,v0); /* rinv32*qqHH */
6897 v26 = vec_madd(v26,v31,v0); /* c12*rinv12 */
6898 v23 = vec_add(v23,v4);
6899 /* load 12.0 to v31 */
6900 v31 = (vector float)vec_splat_u32(12);
6901 v31 = vec_ctf((vector unsigned int)v31,0);
6903 v24 = vec_sub(v24,v25); /* add vnb6 to vnbtot */
6904 v18 = vec_madd(v9,v9,v0);
6905 v23 = vec_add(v23,v5);
6906 v9 = vec_madd(v9,v29,v0); /* rinv33*qqHH */
6908 v24 = vec_add(v24,v26);/* add vnb12 to vnbtot */
6910 v31 = vec_madd(v31,v26,v0);
6911 v11 = vec_madd(v11,v2,v0); /* fs12 */
6912 v23 = vec_add(v23,v6);
6913 v12 = vec_madd(v12,v3,v0); /* fs13 */
6914 v13 = vec_madd(v13,v4,v0); /* fs21 */
6915 v31 = vec_nmsub(v30,v25,v31);
6917 v14 = vec_madd(v14,v5,v0); /* fs22 */
6918 v23 = vec_add(v23,v7);
6919 v15 = vec_madd(v15,v6,v0); /* fs23 */
6920 v16 = vec_madd(v16,v7,v0); /* fs31 */
6921 v1 = vec_add(v31,v1);
6922 v17 = vec_madd(v17,v8,v0); /* fs32 */
6923 v23 = vec_add(v23,v8);
6924 v18 = vec_madd(v18,v9,v0); /* fs33 */
6925 v10 = vec_madd(v10,v1,v0);
6927 vec_st(v24,240,(float *)stackdata); /* store vnbtot */
6928 /* calculate vectorial forces and accumulate fj. v10-v18 has fs11-fs33 now. */
6929 /* First load iO-* dx,dy,dz vectors to v1-v9 */
6930 /* and load iO forces to v28,v29,v30 */
6931 /* use v19-v27 to accumulate j water forces */
6932 v28 = vec_ld(256, (float *) stackdata);
6933 v29 = vec_ld(272, (float *) stackdata);
6934 v30 = vec_ld(288, (float *) stackdata);
6936 v1 = vec_ld(400, (float *) stackdata);
6937 v2 = vec_ld(416, (float *) stackdata);
6938 v23 = vec_add(v23,v9); /* incr. vctot */
6939 v3 = vec_ld(432, (float *) stackdata);
6940 v4 = vec_ld(448, (float *) stackdata);
6941 v5 = vec_ld(464, (float *) stackdata);
6942 v6 = vec_ld(480, (float *) stackdata);
6943 vec_st(v23,224,(float *)stackdata); /* store vctot back to stack */
6944 v7 = vec_ld(496, (float *) stackdata);
6945 v8 = vec_ld(512, (float *) stackdata);
6946 v9 = vec_ld(528, (float *) stackdata);
6948 v28 = vec_madd(v10,v1,v28);
6949 v19 = vec_nmsub(v10,v1,v0);
6950 v29 = vec_madd(v10,v2,v29);
6951 v20 = vec_nmsub(v10,v2,v0);
6952 v30 = vec_madd(v10,v3,v30);
6953 v21 = vec_nmsub(v10,v3,v0);
6955 v28 = vec_madd(v11,v4,v28);
6956 v22 = vec_nmsub(v11,v4,v0);
6957 v29 = vec_madd(v11,v5,v29);
6958 v23 = vec_nmsub(v11,v5,v0);
6959 v30 = vec_madd(v11,v6,v30);
6960 v24 = vec_nmsub(v11,v6,v0);
6962 v28 = vec_madd(v12,v7,v28);
6963 v25 = vec_nmsub(v12,v7,v0);
6964 v29 = vec_madd(v12,v8,v29);
6965 v26 = vec_nmsub(v12,v8,v0);
6966 v30 = vec_madd(v12,v9,v30);
6967 v27 = vec_nmsub(v12,v9,v0);
6969 /* store these i forces, and repeat the procedure for the iH1-* force */
6970 vec_st(v28,256,(float *)stackdata);
6971 vec_st(v29,272,(float *)stackdata);
6972 vec_st(v30,288,(float *)stackdata);
6974 v28 = vec_ld(304,(float *) stackdata);
6975 v29 = vec_ld(320,(float *) stackdata);
6976 v30 = vec_ld(336,(float *) stackdata);
6977 /* load new vectorial distances */
6978 v1 = vec_ld(544, (float *) stackdata);
6979 v2 = vec_ld(560, (float *) stackdata);
6980 v3 = vec_ld(576, (float *) stackdata);
6981 v4 = vec_ld(592, (float *) stackdata);
6982 v5 = vec_ld(608, (float *) stackdata);
6983 v6 = vec_ld(624, (float *) stackdata);
6984 v7 = vec_ld(640, (float *) stackdata);
6985 v8 = vec_ld(656, (float *) stackdata);
6986 v9 = vec_ld(672, (float *) stackdata);
6988 v28 = vec_madd(v13,v1,v28);
6989 v19 = vec_nmsub(v13,v1,v19);
6990 v29 = vec_madd(v13,v2,v29);
6991 v20 = vec_nmsub(v13,v2,v20);
6992 v30 = vec_madd(v13,v3,v30);
6993 v21 = vec_nmsub(v13,v3,v21);
6995 v28 = vec_madd(v14,v4,v28);
6996 v22 = vec_nmsub(v14,v4,v22);
6997 v29 = vec_madd(v14,v5,v29);
6998 v23 = vec_nmsub(v14,v5,v23);
6999 v30 = vec_madd(v14,v6,v30);
7000 v24 = vec_nmsub(v14,v6,v24);
7002 v28 = vec_madd(v15,v7,v28);
7003 v25 = vec_nmsub(v15,v7,v25);
7004 v29 = vec_madd(v15,v8,v29);
7005 v26 = vec_nmsub(v15,v8,v26);
7006 v30 = vec_madd(v15,v9,v30);
7007 v27 = vec_nmsub(v15,v9,v27);
7009 /* store these i forces, and repeat the procedure for the iH2-* force */
7010 vec_st(v28,304,(float *)stackdata);
7011 vec_st(v29,320,(float *)stackdata);
7012 vec_st(v30,336,(float *)stackdata);
7013 v28 = vec_ld(352,(float *) stackdata);
7014 v29 = vec_ld(368,(float *) stackdata);
7015 v30 = vec_ld(384,(float *) stackdata);
7016 /* load new vectorial distances */
7017 v1 = vec_ld(688, (float *) stackdata);
7018 v2 = vec_ld(704, (float *) stackdata);
7019 v3 = vec_ld(720, (float *) stackdata);
7020 v4 = vec_ld(736, (float *) stackdata);
7021 v5 = vec_ld(752, (float *) stackdata);
7022 v6 = vec_ld(768, (float *) stackdata);
7023 v7 = vec_ld(784, (float *) stackdata);
7024 v8 = vec_ld(800, (float *) stackdata);
7025 v9 = vec_ld(816, (float *) stackdata);
7027 v28 = vec_madd(v16,v1,v28);
7028 v19 = vec_nmsub(v16,v1,v19);
7029 v29 = vec_madd(v16,v2,v29);
7030 v20 = vec_nmsub(v16,v2,v20);
7031 v30 = vec_madd(v16,v3,v30);
7032 v21 = vec_nmsub(v16,v3,v21);
7034 v28 = vec_madd(v17,v4,v28);
7035 v22 = vec_nmsub(v17,v4,v22);
7036 v29 = vec_madd(v17,v5,v29);
7037 v23 = vec_nmsub(v17,v5,v23);
7038 v30 = vec_madd(v17,v6,v30);
7039 v24 = vec_nmsub(v17,v6,v24);
7041 v28 = vec_madd(v18,v7,v28);
7042 v25 = vec_nmsub(v18,v7,v25);
7043 v29 = vec_madd(v18,v8,v29);
7044 v26 = vec_nmsub(v18,v8,v26);
7045 v30 = vec_madd(v18,v9,v30);
7046 v27 = vec_nmsub(v18,v9,v27);
7048 /* store these i forces */
7049 vec_st(v28,352,(float *)stackdata);
7050 vec_st(v29,368,(float *)stackdata);
7051 vec_st(v30,384,(float *)stackdata);
7053 /* j forces present in v19-v27 */
7055 v1 = vec_mergeh(v19,v21); /* Oxa Oza Oxb Ozb */
7056 v21 = vec_mergeh(v20,v22); /* Oya H1xa Oyb H1xb */
7057 v22 = vec_mergeh(v23,v25); /* H1ya H2xa H1yb H2xb */
7058 v25 = vec_mergeh(v24,v26); /* H1za H2ya H1zb H2yb */
7060 v26 = vec_mergeh(v27,v0); /* H2za 0 H2zb 0 */
7062 v2 = vec_mergeh(v1,v21); /* Oxa Oya Oza H1xa */
7063 v21 = vec_mergel(v1,v21); /* Oxb Oyb Ozb H1xb */
7064 v20 = vec_mergeh(v22,v25); /* H1ya H1za H2xa H2ya */
7065 v22 = vec_mergel(v22,v25); /* H1yb H1zb H2xb H2yb */
7066 v24 = vec_mergeh(v26,v0); /* H2za 0 0 0 */
7067 v26 = vec_mergel(v26,v0); /* H2zb 0 0 0 */
7069 v29 = (vector float)vec_splat_s32(-1);
7070 /* move into position, load and add */
7071 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3a );
7072 v4 = vec_ld( 0, faction+j3a);
7074 v6 = vec_ld( 16, faction+j3a);
7075 v8 = vec_ld( 32, faction+j3a);
7076 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
7078 v12 = vec_perm(v0,v2,(vector unsigned char)v30);
7079 v4 = vec_add(v12,v4);
7081 v14 = vec_perm(v2,v20,(vector unsigned char)v30);
7082 v2 = vec_add(v14,v6);
7084 v16 = vec_perm(v20,v24,(vector unsigned char)v30);
7085 v20 = vec_add(v16,v8);
7087 v12 = vec_sel(v4,v4,(vector unsigned int)v10);
7088 vec_st(v12, 0, faction+j3a);
7090 v10 = vec_sld(v0,v10,12);
7092 vec_st(v2, 16, faction+j3a);
7094 v12 = vec_sel(v20,v8,(vector unsigned int)v10);
7096 vec_st(v12, 32, faction+j3a);
7098 /* Finished 1 - now do 2 */
7100 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3b );
7101 v4 = vec_ld( 0, faction+j3b);
7102 v6 = vec_ld( 16, faction+j3b);
7103 v8 = vec_ld( 32, faction+j3b);
7104 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
7106 v12 = vec_perm(v0,v21,(vector unsigned char)v30);
7107 v24 = vec_add(v12,v4);
7109 v12 = vec_perm(v21,v22,(vector unsigned char)v30);
7110 v21 = vec_add(v12,v6);
7112 v12 = vec_perm(v22,v26,(vector unsigned char)v30);
7113 v22 = vec_add(v12,v8);
7115 v12 = vec_sel(v4,v24,(vector unsigned int)v10);
7116 vec_st(v12, 0, faction+j3b);
7117 v10 = vec_sld(v0,v10,12);
7119 vec_st(v21, 16, faction+j3b);
7121 v12 = vec_sel(v22,v8,(vector unsigned int)v10);
7122 vec_st(v12, 32, faction+j3b);
7124 } else if(k<nj1) {
7125 jnra = jjnr[k];
7126 j3a = 3*jnra;
7128 v10 = (vector float)vec_lvsl(0, pos+j3a);
7130 v2 = vec_ld(0, pos+j3a);
7131 v3 = vec_ld(16, pos+j3a);
7132 v4 = vec_ld(32, pos+j3a);
7133 v1 = vec_perm(v2,v3,(vector unsigned char)v10); /* Oxa Oya Oza H1xa */
7134 v5 = vec_perm(v3,v4,(vector unsigned char)v10); /* H1ya H1za H2xa H2ya */
7135 v9 = vec_perm(v4,v4,(vector unsigned char)v10); /* H2za - - - */
7137 /* permute water coordinates */
7138 /* just splat things... never mind that we fill all cells :-) */
7139 v29 = vec_ld(128, (float *) stackdata); /* load i H1x */
7140 v2 = vec_splat(v1,1);
7141 v30 = vec_ld(144, (float *) stackdata); /* load i H1y */
7142 v3 = vec_splat(v1,2);
7143 v31 = vec_ld(160, (float *) stackdata); /* load i H1z */
7144 v4 = vec_splat(v1,3);
7145 v6 = vec_splat(v5,1);
7146 v7 = vec_splat(v5,2);
7147 v8 = vec_splat(v5,3);
7149 v10 = vec_sub(v29,v1); /* iH1x - jOx */
7150 v13 = vec_sub(v29,v4); /* iH1x - jH1x */
7151 v16 = vec_sub(v29,v7); /* iH1x - jH2x */
7152 v29 = vec_ld(176, (float *) stackdata); /* load i H2x */
7153 v11 = vec_sub(v30,v2); /* iH1y - jOy */
7154 v14 = vec_sub(v30,v5); /* iH1y - jH1y */
7155 v17 = vec_sub(v30,v8); /* iH1y - jH2y */
7156 v30 = vec_ld(192, (float *) stackdata); /* load i H2y */
7157 vec_st(v10, 544, (float *)stackdata); /* dx21 */
7158 vec_st(v13, 592, (float *)stackdata); /* dx22 */
7159 vec_st(v16, 640, (float *)stackdata); /* dx23 */
7160 v12 = vec_sub(v31,v3); /* iH1z - jOz */
7161 v15 = vec_sub(v31,v6); /* iH1z - jH1z */
7162 v18 = vec_sub(v31,v9); /* iH1z - jH2z */
7163 v31 = vec_ld(208, (float *) stackdata); /* load i H2z */
7164 /* v10-v18 now contain iH1-jO, iH1-jH1 and iH1-jH2 distances */
7165 vec_st(v11, 560, (float *)stackdata); /* dy21 */
7166 vec_st(v14, 608, (float *)stackdata); /* dy22 */
7167 vec_st(v17, 656, (float *)stackdata); /* dy23 */
7168 v19 = vec_sub(v29,v1); /* iH2x - jOx */
7169 v22 = vec_sub(v29,v4); /* iH2x - jH1x */
7170 v25 = vec_sub(v29,v7); /* iH2x - jH2x */
7171 vec_st(v12, 576, (float *)stackdata); /* dz21 */
7172 vec_st(v15, 624, (float *)stackdata); /* dz22 */
7173 vec_st(v18, 672, (float *)stackdata); /* dz23 */
7174 v29 = vec_ld(80, (float *) stackdata); /* load i Ox */
7175 v20 = vec_sub(v30,v2); /* iH2y - jOy */
7176 v23 = vec_sub(v30,v5); /* iH2y - jH1y */
7177 v26 = vec_sub(v30,v8); /* iH2y - jH2y */
7178 vec_st(v19, 688, (float *)stackdata); /* dx31 */
7179 vec_st(v22, 736, (float *)stackdata); /* dx32 */
7180 vec_st(v25, 784, (float *)stackdata); /* dx33 */
7181 v30 = vec_ld(96, (float *) stackdata); /* load i Oy */
7182 v21 = vec_sub(v31,v3); /* iH2z - jOz */
7183 v24 = vec_sub(v31,v6); /* iH2z - jH1z */
7184 v27 = vec_sub(v31,v9); /* iH2z - jH2z */
7185 v31 = vec_ld(112, (float *) stackdata); /* load i Oz */
7186 vec_st(v20, 704, (float *)stackdata); /* dy31 */
7187 vec_st(v23, 752, (float *)stackdata); /* dy32 */
7188 vec_st(v26, 800, (float *)stackdata); /* dy33 */
7190 v1 = vec_sub(v29,v1); /* iOx - jOx */
7191 v4 = vec_sub(v29,v4); /* iOx - jH1x */
7192 v7 = vec_sub(v29,v7); /* iOx - jH2x */
7193 vec_st(v21, 720, (float *)stackdata); /* dz31 */
7194 vec_st(v24, 768, (float *)stackdata); /* dz32 */
7195 vec_st(v27, 816, (float *)stackdata); /* dz33 */
7196 v2 = vec_sub(v30,v2); /* iOy - jOy */
7197 v5 = vec_sub(v30,v5); /* iOy - jH1y */
7198 v8 = vec_sub(v30,v8); /* iOy - jH2y */
7199 vec_st(v1, 400, (float *)stackdata); /* dx11 */
7200 vec_st(v4, 448, (float *)stackdata); /* dx12 */
7201 vec_st(v7, 496, (float *)stackdata); /* dx13 */
7202 v3 = vec_sub(v31,v3); /* iOz - jOz */
7203 v6 = vec_sub(v31,v6); /* iOz - jH1z */
7204 v9 = vec_sub(v31,v9); /* iOz - jH2z */
7205 vec_st(v2, 416, (float *)stackdata); /* dy11 */
7206 vec_st(v5, 464, (float *)stackdata); /* dy12 */
7207 vec_st(v8, 512, (float *)stackdata); /* dy13 */
7209 v1 = vec_madd(v1,v1,v0);
7210 v4 = vec_madd(v4,v4,v0);
7211 v7 = vec_madd(v7,v7,v0);
7212 vec_st(v3, 432, (float *)stackdata); /* dz11 */
7213 vec_st(v6, 480, (float *)stackdata); /* dz12 */
7214 vec_st(v9, 528, (float *)stackdata); /* dz13 */
7215 v10 = vec_madd(v10,v10,v0);
7216 v13 = vec_madd(v13,v13,v0);
7217 v16 = vec_madd(v16,v16,v0);
7218 v19 = vec_madd(v19,v19,v0);
7219 v22 = vec_madd(v22,v22,v0);
7220 v25 = vec_madd(v25,v25,v0);
7221 v1 = vec_madd(v2,v2,v1);
7222 v4 = vec_madd(v5,v5,v4);
7223 v7 = vec_madd(v8,v8,v7);
7224 v10 = vec_madd(v11,v11,v10);
7225 v13 = vec_madd(v14,v14,v13);
7226 v16 = vec_madd(v17,v17,v16);
7227 v19 = vec_madd(v20,v20,v19);
7228 v22 = vec_madd(v23,v23,v22);
7229 v25 = vec_madd(v26,v26,v25);
7230 v1 = vec_madd(v3,v3,v1);
7231 v2 = vec_madd(v6,v6,v4);
7232 v3 = vec_madd(v9,v9,v7);
7233 v4 = vec_madd(v12,v12,v10);
7234 v5 = vec_madd(v15,v15,v13);
7235 v6 = vec_madd(v18,v18,v16);
7236 v7 = vec_madd(v21,v21,v19);
7237 v8 = vec_madd(v24,v24,v22);
7238 v9 = vec_madd(v27,v27,v25);
7240 * v1 = rsq iO-jO
7241 * v2 = rsq iO-jH1
7242 * v3 = rsq iO-jH2
7243 * v4 = rsq iH1-jO
7244 * v5 = rsq iH1-jH1
7245 * v6 = rsq iH1-jH2
7246 * v7 = rsq iH2-jO
7247 * v8 = rsq iH2-jH1
7248 * v9 = rsq iH2-jH2
7251 v10 = vec_rsqrte(v1);
7252 v11 = vec_rsqrte(v2);
7253 v12 = vec_rsqrte(v3);
7254 v13 = vec_rsqrte(v4);
7255 v14 = vec_rsqrte(v5);
7256 v15 = vec_rsqrte(v6);
7257 v16 = vec_rsqrte(v7);
7258 v17 = vec_rsqrte(v8);
7259 v18 = vec_rsqrte(v9);
7260 /* create constant 0.5 */
7261 v30 = (vector float) vec_splat_u32(1);
7262 v31 = vec_ctf((vector unsigned int)v30,1); /* 0.5 */
7263 v30 = vec_ctf((vector unsigned int)v30,0); /* 1.0 */
7265 v19 = vec_madd(v10,v10,v0); /* lu*lu */
7266 v20 = vec_madd(v11,v11,v0);
7267 v21 = vec_madd(v12,v12,v0);
7268 v22 = vec_madd(v13,v13,v0);
7269 v23 = vec_madd(v14,v14,v0);
7270 v24 = vec_madd(v15,v15,v0);
7271 v25 = vec_madd(v16,v16,v0);
7272 v26 = vec_madd(v17,v17,v0);
7273 v27 = vec_madd(v18,v18,v0);
7275 v19 = vec_nmsub(v1,v19,v30); /* 1.0 - rsq*lu*lu */
7276 v20 = vec_nmsub(v2,v20,v30);
7277 v21 = vec_nmsub(v3,v21,v30);
7278 v22 = vec_nmsub(v4,v22,v30);
7279 v23 = vec_nmsub(v5,v23,v30);
7280 v24 = vec_nmsub(v6,v24,v30);
7281 v25 = vec_nmsub(v7,v25,v30);
7282 v26 = vec_nmsub(v8,v26,v30);
7283 v27 = vec_nmsub(v9,v27,v30);
7285 v1 = vec_madd(v10,v31,v0);/* lu*0.5*/
7286 v2 = vec_madd(v11,v31,v0);
7287 v3 = vec_madd(v12,v31,v0);
7288 v4 = vec_madd(v13,v31,v0);
7289 v5 = vec_madd(v14,v31,v0);
7290 v6 = vec_madd(v15,v31,v0);
7291 v7 = vec_madd(v16,v31,v0);
7292 v8 = vec_madd(v17,v31,v0);
7293 v9 = vec_madd(v18,v31,v0);
7295 /* The rinv values */
7296 v1 = vec_madd(v1,v19,v10);
7297 v2 = vec_madd(v2,v20,v11);
7298 v3 = vec_madd(v3,v21,v12);
7299 v4 = vec_madd(v4,v22,v13);
7300 v5 = vec_madd(v5,v23,v14);
7301 v6 = vec_madd(v6,v24,v15);
7302 v7 = vec_madd(v7,v25,v16);
7303 v8 = vec_madd(v8,v26,v17);
7304 v9 = vec_madd(v9,v27,v18);
7306 v10 = (vector float)vec_splat_s32(-1);
7307 v10 = vec_sld(v0,v10,12);
7309 v1 = (vector float)vec_sel((vector unsigned int)v1,(vector unsigned int)v0,(vector unsigned int)v10);
7310 v2 = (vector float)vec_sel((vector unsigned int)v2,(vector unsigned int)v0,(vector unsigned int)v10);
7311 v3 = (vector float)vec_sel((vector unsigned int)v3,(vector unsigned int)v0,(vector unsigned int)v10);
7312 v4 = (vector float)vec_sel((vector unsigned int)v4,(vector unsigned int)v0,(vector unsigned int)v10);
7313 v5 = (vector float)vec_sel((vector unsigned int)v5,(vector unsigned int)v0,(vector unsigned int)v10);
7314 v6 = (vector float)vec_sel((vector unsigned int)v6,(vector unsigned int)v0,(vector unsigned int)v10);
7315 v7 = (vector float)vec_sel((vector unsigned int)v7,(vector unsigned int)v0,(vector unsigned int)v10);
7316 v8 = (vector float)vec_sel((vector unsigned int)v8,(vector unsigned int)v0,(vector unsigned int)v10);
7317 v9 = (vector float)vec_sel((vector unsigned int)v9,(vector unsigned int)v0,(vector unsigned int)v10);
7319 /* load qqOO, qqOH and qqHH to v27,v28,v29 */
7320 v27 = vec_ld(0, (float *) stackdata);
7321 v28 = vec_ld(16, (float *) stackdata);
7322 v29 = vec_ld(32, (float *) stackdata);
7323 vec_dstst( faction+j3a, 0x10010100, 2 );
7325 /* put rinvsq in v10-v18, rinv6_OO in v30 and rinv12_OO in v31 */
7326 /* load c6 to v25 and c12 to v26 */
7327 v25 = vec_ld(48, (float *) stackdata);
7328 v26 = vec_ld(64, (float *) stackdata);
7330 v10 = vec_madd(v1,v1,v0);
7331 v1 = vec_madd(v1,v27,v0); /* rinv11*qqOO */
7332 v11 = vec_madd(v2,v2,v0);
7333 /* load vctot to v23 and vnbtot to v24 */
7334 v23 = vec_ld(224,(float *) stackdata);
7335 v24 = vec_ld(240,(float *) stackdata);
7337 v2 = vec_madd(v2,v28,v0); /* rinv12*qqOH */
7338 v12 = vec_madd(v3,v3,v0);
7339 v30 = vec_madd(v10,v10,v0); /* rinv4 */
7340 v3 = vec_madd(v3,v28,v0); /* rinv13*qqOH */
7341 v13 = vec_madd(v4,v4,v0);
7342 v4 = vec_madd(v4,v28,v0); /* rinv21*qqOH */
7343 v14 = vec_madd(v5,v5,v0);
7345 v23 = vec_add(v23,v1);
7347 v30 = vec_madd(v30,v10,v0); /* rinv6 */
7348 v5 = vec_madd(v5,v29,v0); /* rinv22*qqHH */
7349 v15 = vec_madd(v6,v6,v0);
7350 v6 = vec_madd(v6,v29,v0); /* rinv23*qqHH */
7351 v23 = vec_add(v23,v2);
7352 v16 = vec_madd(v7,v7,v0);
7353 v31 = vec_madd(v30,v30,v0); /* rinv12 */
7354 v25 = vec_madd(v25,v30,v0); /* c6*rinv6 */
7355 /* load 6.0 to v30 */
7356 v30 = (vector float)vec_splat_u32(6);
7357 v30 = vec_ctf((vector unsigned int)v30,0);
7358 v23 = vec_add(v23,v3);
7360 v7 = vec_madd(v7,v28,v0); /* rinv31*qqOH */
7361 v17 = vec_madd(v8,v8,v0);
7362 v8 = vec_madd(v8,v29,v0); /* rinv32*qqHH */
7363 v26 = vec_madd(v26,v31,v0); /* c12*rinv12 */
7364 v23 = vec_add(v23,v4);
7365 /* load 12.0 to v31 */
7366 v31 = (vector float)vec_splat_u32(12);
7367 v31 = vec_ctf((vector unsigned int)v31,0);
7369 v24 = vec_sub(v24,v25); /* add vnb6 to vnbtot */
7370 v18 = vec_madd(v9,v9,v0);
7371 v23 = vec_add(v23,v5);
7372 v9 = vec_madd(v9,v29,v0); /* rinv33*qqHH */
7374 v24 = vec_add(v24,v26);/* add vnb12 to vnbtot */
7376 v31 = vec_madd(v31,v26,v0);
7377 v11 = vec_madd(v11,v2,v0); /* fs12 */
7378 v23 = vec_add(v23,v6);
7379 v12 = vec_madd(v12,v3,v0); /* fs13 */
7380 v13 = vec_madd(v13,v4,v0); /* fs21 */
7381 v31 = vec_nmsub(v30,v25,v31);
7383 v14 = vec_madd(v14,v5,v0); /* fs22 */
7384 v23 = vec_add(v23,v7);
7385 v15 = vec_madd(v15,v6,v0); /* fs23 */
7386 v16 = vec_madd(v16,v7,v0); /* fs31 */
7387 v1 = vec_add(v31,v1);
7388 v17 = vec_madd(v17,v8,v0); /* fs32 */
7389 v23 = vec_add(v23,v8);
7390 v18 = vec_madd(v18,v9,v0); /* fs33 */
7391 v10 = vec_madd(v10,v1,v0);
7393 vec_st(v24,240,(float *)stackdata); /* store vnbtot */
7394 /* calculate vectorial forces and accumulate fj. v10-v18 has fs11-fs33 now. */
7395 /* First load iO-* dx,dy,dz vectors to v1-v9 */
7396 /* and load iO forces to v28,v29,v30 */
7397 /* use v19-v27 to accumulate j water forces */
7398 v28 = vec_ld(256, (float *) stackdata);
7399 v29 = vec_ld(272, (float *) stackdata);
7400 v30 = vec_ld(288, (float *) stackdata);
7402 v1 = vec_ld(400, (float *) stackdata);
7403 v2 = vec_ld(416, (float *) stackdata);
7404 v23 = vec_add(v23,v9); /* incr. vctot */
7405 v3 = vec_ld(432, (float *) stackdata);
7406 v4 = vec_ld(448, (float *) stackdata);
7407 v5 = vec_ld(464, (float *) stackdata);
7408 v6 = vec_ld(480, (float *) stackdata);
7409 vec_st(v23,224,(float *)stackdata); /* store vctot back to stack */
7410 v7 = vec_ld(496, (float *) stackdata);
7411 v8 = vec_ld(512, (float *) stackdata);
7412 v9 = vec_ld(528, (float *) stackdata);
7414 v28 = vec_madd(v10,v1,v28);
7415 v19 = vec_nmsub(v10,v1,v0);
7416 v29 = vec_madd(v10,v2,v29);
7417 v20 = vec_nmsub(v10,v2,v0);
7418 v30 = vec_madd(v10,v3,v30);
7419 v21 = vec_nmsub(v10,v3,v0);
7421 v28 = vec_madd(v11,v4,v28);
7422 v22 = vec_nmsub(v11,v4,v0);
7423 v29 = vec_madd(v11,v5,v29);
7424 v23 = vec_nmsub(v11,v5,v0);
7425 v30 = vec_madd(v11,v6,v30);
7426 v24 = vec_nmsub(v11,v6,v0);
7428 v28 = vec_madd(v12,v7,v28);
7429 v25 = vec_nmsub(v12,v7,v0);
7430 v29 = vec_madd(v12,v8,v29);
7431 v26 = vec_nmsub(v12,v8,v0);
7432 v30 = vec_madd(v12,v9,v30);
7433 v27 = vec_nmsub(v12,v9,v0);
7435 /* store these i forces, and repeat the procedure for the iH1-* force */
7436 vec_st(v28,256,(float *)stackdata);
7437 vec_st(v29,272,(float *)stackdata);
7438 vec_st(v30,288,(float *)stackdata);
7440 v28 = vec_ld(304,(float *) stackdata);
7441 v29 = vec_ld(320,(float *) stackdata);
7442 v30 = vec_ld(336,(float *) stackdata);
7443 /* load new vectorial distances */
7444 v1 = vec_ld(544, (float *) stackdata);
7445 v2 = vec_ld(560, (float *) stackdata);
7446 v3 = vec_ld(576, (float *) stackdata);
7447 v4 = vec_ld(592, (float *) stackdata);
7448 v5 = vec_ld(608, (float *) stackdata);
7449 v6 = vec_ld(624, (float *) stackdata);
7450 v7 = vec_ld(640, (float *) stackdata);
7451 v8 = vec_ld(656, (float *) stackdata);
7452 v9 = vec_ld(672, (float *) stackdata);
7454 v28 = vec_madd(v13,v1,v28);
7455 v19 = vec_nmsub(v13,v1,v19);
7456 v29 = vec_madd(v13,v2,v29);
7457 v20 = vec_nmsub(v13,v2,v20);
7458 v30 = vec_madd(v13,v3,v30);
7459 v21 = vec_nmsub(v13,v3,v21);
7461 v28 = vec_madd(v14,v4,v28);
7462 v22 = vec_nmsub(v14,v4,v22);
7463 v29 = vec_madd(v14,v5,v29);
7464 v23 = vec_nmsub(v14,v5,v23);
7465 v30 = vec_madd(v14,v6,v30);
7466 v24 = vec_nmsub(v14,v6,v24);
7468 v28 = vec_madd(v15,v7,v28);
7469 v25 = vec_nmsub(v15,v7,v25);
7470 v29 = vec_madd(v15,v8,v29);
7471 v26 = vec_nmsub(v15,v8,v26);
7472 v30 = vec_madd(v15,v9,v30);
7473 v27 = vec_nmsub(v15,v9,v27);
7475 /* store these i forces, and repeat the procedure for the iH2-* force */
7476 vec_st(v28,304,(float *)stackdata);
7477 vec_st(v29,320,(float *)stackdata);
7478 vec_st(v30,336,(float *)stackdata);
7479 v28 = vec_ld(352,(float *) stackdata);
7480 v29 = vec_ld(368,(float *) stackdata);
7481 v30 = vec_ld(384,(float *) stackdata);
7482 /* load new vectorial distances */
7483 v1 = vec_ld(688, (float *) stackdata);
7484 v2 = vec_ld(704, (float *) stackdata);
7485 v3 = vec_ld(720, (float *) stackdata);
7486 v4 = vec_ld(736, (float *) stackdata);
7487 v5 = vec_ld(752, (float *) stackdata);
7488 v6 = vec_ld(768, (float *) stackdata);
7489 v7 = vec_ld(784, (float *) stackdata);
7490 v8 = vec_ld(800, (float *) stackdata);
7491 v9 = vec_ld(816, (float *) stackdata);
7493 v28 = vec_madd(v16,v1,v28);
7494 v19 = vec_nmsub(v16,v1,v19);
7495 v29 = vec_madd(v16,v2,v29);
7496 v20 = vec_nmsub(v16,v2,v20);
7497 v30 = vec_madd(v16,v3,v30);
7498 v21 = vec_nmsub(v16,v3,v21);
7500 v28 = vec_madd(v17,v4,v28);
7501 v22 = vec_nmsub(v17,v4,v22);
7502 v29 = vec_madd(v17,v5,v29);
7503 v23 = vec_nmsub(v17,v5,v23);
7504 v30 = vec_madd(v17,v6,v30);
7505 v24 = vec_nmsub(v17,v6,v24);
7507 v28 = vec_madd(v18,v7,v28);
7508 v25 = vec_nmsub(v18,v7,v25);
7509 v29 = vec_madd(v18,v8,v29);
7510 v26 = vec_nmsub(v18,v8,v26);
7511 v30 = vec_madd(v18,v9,v30);
7512 v27 = vec_nmsub(v18,v9,v27);
7514 /* store these i forces */
7515 vec_st(v28,352,(float *)stackdata);
7516 vec_st(v29,368,(float *)stackdata);
7517 vec_st(v30,384,(float *)stackdata);
7519 /* j forces present in v19-v27 */
7521 v1 = vec_mergeh(v19,v21); /* Oxa Oza - - */
7522 v21 = vec_mergeh(v20,v22); /* Oya H1xa - - */
7523 v22 = vec_mergeh(v23,v25); /* H1ya H2xa - - */
7524 v25 = vec_mergeh(v24,v26); /* H1za H2ya - - */
7526 v26 = vec_mergeh(v27,v0); /* H2za 0 - 0 */
7528 v2 = vec_mergeh(v1,v21); /* Oxa Oya Oza H1xa */
7529 v20 = vec_mergeh(v22,v25); /* H1ya H1za H2xa H2ya */
7530 v24 = vec_mergeh(v26,v0); /* H2za 0 0 0 */
7532 v29 = (vector float)vec_splat_s32(-1);
7534 /* move into position, load and add */
7535 v30 = (vector float)vec_lvsr( 0, (int *) faction+j3a );
7536 v4 = vec_ld( 0, faction+j3a);
7538 v6 = vec_ld( 16, faction+j3a);
7539 v8 = vec_ld( 32, faction+j3a);
7540 v10 = vec_perm(v0,v29,(vector unsigned char)v30);
7542 v12 = vec_perm(v0,v2,(vector unsigned char)v30);
7543 v4 = vec_add(v12,v4);
7545 v14 = vec_perm(v2,v20,(vector unsigned char)v30);
7546 v2 = vec_add(v14,v6);
7548 v16 = vec_perm(v20,v24,(vector unsigned char)v30);
7549 v20 = vec_add(v16,v8);
7551 v12 = vec_sel(v4,v4,(vector unsigned int)v10);
7552 vec_st(v12, 0, faction+j3a);
7554 v10 = vec_sld(v0,v10,12);
7556 vec_st(v2, 16, faction+j3a);
7558 v12 = vec_sel(v20,v8,(vector unsigned int)v10);
7560 vec_st(v12, 32, faction+j3a);
7564 v1 = (vector float)vec_lvsr(0,faction+ii3);
7565 v5 = (vector float)vec_splat_s32(-1);
7566 v2 = vec_ld( 0, faction+ii3);
7567 v3 = vec_ld(16, faction+ii3);
7568 v4 = vec_ld(32, faction+ii3);
7569 v5 = vec_perm(v0, v5,(vector unsigned char)v1); /* mask */
7570 /* load forces from stack */
7571 v6 = vec_ld(256, (float *) stackdata); /* Ox */
7572 v7 = vec_ld(272, (float *) stackdata); /* Oy */
7573 v8 = vec_ld(288, (float *) stackdata); /* Oz */
7574 v9 = vec_ld(304, (float *) stackdata); /* H1x */
7575 v10 = vec_ld(320, (float *) stackdata); /* H1y */
7576 v11 = vec_ld(336, (float *) stackdata); /* H1z */
7577 v12 = vec_ld(352, (float *) stackdata); /* H2x */
7578 v13 = vec_ld(368, (float *) stackdata); /* H2y */
7579 v14 = vec_ld(384, (float *) stackdata); /* H2z */
7581 /* accumulate the forces */
7582 v15 = vec_sld(v6,v6,8);
7583 v16 = vec_sld(v7,v7,8);
7584 v17 = vec_sld(v8,v8,8);
7585 v18 = vec_sld(v9,v9,8);
7586 v19 = vec_sld(v10,v10,8);
7587 v20 = vec_sld(v11,v11,8);
7588 v21 = vec_sld(v12,v12,8);
7589 v22 = vec_sld(v13,v13,8);
7590 v23 = vec_sld(v14,v14,8);
7592 v6 = vec_add(v6,v15); /* Ox Ox' - - */
7593 v7 = vec_add(v7,v16); /* Oy Oy' - - */
7594 v8 = vec_add(v8,v17); /* Oz Oz' - - */
7595 v9 = vec_add(v9,v18); /* H1x H1x' - - */
7596 v10 = vec_add(v10,v19); /* H1y H1y' - - */
7597 v11 = vec_add(v11,v20); /* H1z H1z' - - */
7598 v12 = vec_add(v12,v21); /* H2x H2x' - - */
7599 v13 = vec_add(v13,v22); /* H2y H2y' - - */
7600 v14 = vec_add(v14,v23); /* H2z H2z' - - */
7602 v6 = vec_mergeh(v6,v8); /* Ox Oz Ox' Oz' */
7603 v7 = vec_mergeh(v7,v9); /* Oy H1x Oy' H1x' */
7604 v10 = vec_mergeh(v10,v12); /* H1y H2x H1y' H2x' */
7605 v11 = vec_mergeh(v11,v13); /* H1z H2y H1z' H2y' */
7606 v14 = vec_mergeh(v14,v0); /* H2z 0 H2z' 0 */
7608 v15 = vec_sld(v6,v6,8);
7609 v16 = vec_sld(v7,v7,8);
7610 v17 = vec_sld(v10,v10,8);
7611 v18 = vec_sld(v11,v11,8);
7612 v19 = vec_sld(v14,v14,8);
7614 v6 = vec_add(v6,v15); /* Ox Oz - - */
7615 v7 = vec_add(v7,v16); /* Oy H1x - - */
7616 v10 = vec_add(v10,v17);/* H1y H2x - - */
7617 v11 = vec_add(v11,v18);/* H1z H2y - - */
7618 v14 = vec_add(v14,v19);/* H2z 0 - 0 */
7620 v6 = vec_mergeh(v6,v7); /* Ox Oy Oz H1x */
7621 v10 = vec_mergeh(v10,v11); /* H1y H1z H2x H2y */
7622 v14 = vec_mergeh(v14,v0); /* H2z 0 0 0 */
7624 v7 = vec_sld(v0,v6,12); /* 0 Ox Oy Oz */
7625 v8 = vec_sld(v6,v10,8); /* - H1x H1y H1z */
7626 v9 = vec_sld(v10,v14,4); /* - H2x H2y H2z */
7628 v12 = vec_perm(v0,v6,(vector unsigned char)v1); /* The part to add to v2 */
7629 v13 = vec_perm(v6,v10,(vector unsigned char)v1); /* The part to add to v3 */
7630 v14 = vec_perm(v10,v14,(vector unsigned char)v1); /* The part to add to v4 */
7632 v12 = vec_add(v2,v12);
7633 v13 = vec_add(v3,v13);
7634 v14 = vec_add(v4,v14);
7636 v12 = vec_sel(v2,v12,(vector unsigned int)v5);
7637 v5 = vec_sld(v0,v5,12);
7638 v14 = vec_sel(v14,v4,(vector unsigned int)v5);
7640 /* store */
7641 vec_st(v12, 0, faction+ii3);
7642 vec_st(v13,16, faction+ii3);
7643 vec_st(v14,32, faction+ii3);
7645 /* accumulate for shift */
7646 v7 = vec_add(v7,v8);
7647 v7 = vec_add(v7,v9);
7648 v7 = vec_sld(v7,v0,4); /* x y z 0 */
7650 /* add v7 to the memory location fshift+is3 */
7651 v15 = vec_lde(0, fshift+is3);
7652 v16 = vec_lde(4, fshift+is3);
7653 v17 = vec_lde(8, fshift+is3);
7654 v18 = (vector float)vec_splat(v7,0);
7655 v19 = (vector float)vec_splat(v7,1);
7656 v20 = (vector float)vec_splat(v7,2);
7657 v15 = vec_add(v15,v18);
7658 v16 = vec_add(v16,v19);
7659 v17 = vec_add(v17,v20);
7660 vec_ste(v15,0,fshift+is3);
7661 vec_ste(v16,4,fshift+is3);
7662 vec_ste(v17,8,fshift+is3);
7664 /* update potential energies */
7665 v1 = vec_ld(224,(float *) stackdata); /* load vctot */
7666 v2 = vec_ld(240,(float *) stackdata); /* load vnbtot */
7667 v3 = vec_sld(v1,v1,8);
7668 v4 = vec_sld(v2,v2,8);
7669 v1 = vec_add(v1,v3);
7670 v2 = vec_add(v2,v4);
7671 v3 = vec_sld(v1,v1,4);
7672 v4 = vec_sld(v2,v2,4);
7673 v1 = vec_add(v1,v3);
7674 v2 = vec_add(v2,v4);
7675 /* all 4 positions in v1, v2 contain the sum now */
7676 v3 = vec_lde(0, Vc+gid[n]);
7677 v4 = vec_lde(0, Vnb+gid[n]);
7678 v3 = vec_add(v1,v3);
7679 v4 = vec_add(v2,v4);
7680 vec_ste(v3,0,Vc+gid[n]);
7681 vec_ste(v4,0,Vnb+gid[n]);
7688 void inl2030_altivec(
7689 int nri,
7690 int iinr[],
7691 int jindex[],
7692 int jjnr[],
7693 int shift[],
7694 float shiftvec[],
7695 float fshift[],
7696 int gid[],
7697 float pos[],
7698 float faction[],
7699 float charge[],
7700 float facel,
7701 float Vc[],
7702 float krf,
7703 float crf)
7705 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
7706 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
7708 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
7709 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
7710 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
7712 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
7713 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
7714 vector float rinvsq11,rinvsq12,rinvsq13;
7715 vector float rinvsq21,rinvsq22,rinvsq23;
7716 vector float rinvsq31,rinvsq32,rinvsq33;
7717 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33;
7719 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
7720 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33;
7721 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
7722 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
7723 vector float vctot,qqOO,qqOH,qqHH,qO,qH,vkrf,vcrf;
7724 vector float krsq11,krsq12,krsq13,krsq21,krsq22,krsq23,krsq31,krsq32,krsq33;
7725 vector float qqOOt,qqOHt,qqHHt;
7727 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
7728 int jnra,jnrb,jnrc,jnrd;
7729 int j3a,j3b,j3c,j3d;
7731 nul=vec_zero();
7732 vfacel=load_float_and_splat(&facel);
7733 vkrf=load_float_and_splat(&krf);
7734 vcrf=load_float_and_splat(&crf);
7735 qO = load_float_and_splat(charge+iinr[0]);
7736 qH = load_float_and_splat(charge+iinr[0]+1);
7737 qqOO = vec_madd(qO,qO,nul);
7738 qqOH = vec_madd(qO,qH,nul);
7739 qqHH = vec_madd(qH,qH,nul);
7740 qqOO = vec_madd(qqOO,vfacel,nul);
7741 qqOH = vec_madd(qqOH,vfacel,nul);
7742 qqHH = vec_madd(qqHH,vfacel,nul);
7744 for(n=0;n<nri;n++) {
7745 is3 = 3*shift[n];
7746 ii = iinr[n];
7747 ii3 = 3*ii;
7748 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
7749 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
7750 vctot = nul;
7751 fix1 = nul;
7752 fiy1 = nul;
7753 fiz1 = nul;
7754 fix2 = nul;
7755 fiy2 = nul;
7756 fiz2 = nul;
7757 fix3 = nul;
7758 fiy3 = nul;
7759 fiz3 = nul;
7760 nj0 = jindex[n];
7761 nj1 = jindex[n+1];
7763 for(k=nj0; k<(nj1-3); k+=4) {
7764 jnra = jjnr[k];
7765 jnrb = jjnr[k+1];
7766 jnrc = jjnr[k+2];
7767 jnrd = jjnr[k+3];
7768 j3a = 3*jnra;
7769 j3b = 3*jnrb;
7770 j3c = 3*jnrc;
7771 j3d = 3*jnrd;
7772 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
7773 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
7774 dx11 = vec_sub(ix1,jx1);
7775 dx12 = vec_sub(ix1,jx2);
7776 dx13 = vec_sub(ix1,jx3);
7777 dy11 = vec_sub(iy1,jy1);
7778 dy12 = vec_sub(iy1,jy2);
7779 dy13 = vec_sub(iy1,jy3);
7780 dz11 = vec_sub(iz1,jz1);
7781 dz12 = vec_sub(iz1,jz2);
7782 dz13 = vec_sub(iz1,jz3);
7783 dx21 = vec_sub(ix2,jx1);
7784 dx22 = vec_sub(ix2,jx2);
7785 dx23 = vec_sub(ix2,jx3);
7786 dy21 = vec_sub(iy2,jy1);
7787 dy22 = vec_sub(iy2,jy2);
7788 dy23 = vec_sub(iy2,jy3);
7789 dz21 = vec_sub(iz2,jz1);
7790 dz22 = vec_sub(iz2,jz2);
7791 dz23 = vec_sub(iz2,jz3);
7792 dx31 = vec_sub(ix3,jx1);
7793 dx32 = vec_sub(ix3,jx2);
7794 dx33 = vec_sub(ix3,jx3);
7795 dy31 = vec_sub(iy3,jy1);
7796 dy32 = vec_sub(iy3,jy2);
7797 dy33 = vec_sub(iy3,jy3);
7798 dz31 = vec_sub(iz3,jz1);
7799 dz32 = vec_sub(iz3,jz2);
7800 dz33 = vec_sub(iz3,jz3);
7802 rsq11 = vec_madd(dx11,dx11,nul);
7803 rsq12 = vec_madd(dx12,dx12,nul);
7804 rsq13 = vec_madd(dx13,dx13,nul);
7805 rsq21 = vec_madd(dx21,dx21,nul);
7806 rsq22 = vec_madd(dx22,dx22,nul);
7807 rsq23 = vec_madd(dx23,dx23,nul);
7808 rsq31 = vec_madd(dx31,dx31,nul);
7809 rsq32 = vec_madd(dx32,dx32,nul);
7810 rsq33 = vec_madd(dx33,dx33,nul);
7811 rsq11 = vec_madd(dy11,dy11,rsq11);
7812 rsq12 = vec_madd(dy12,dy12,rsq12);
7813 rsq13 = vec_madd(dy13,dy13,rsq13);
7814 rsq21 = vec_madd(dy21,dy21,rsq21);
7815 rsq22 = vec_madd(dy22,dy22,rsq22);
7816 rsq23 = vec_madd(dy23,dy23,rsq23);
7817 rsq31 = vec_madd(dy31,dy31,rsq31);
7818 rsq32 = vec_madd(dy32,dy32,rsq32);
7819 rsq33 = vec_madd(dy33,dy33,rsq33);
7820 rsq11 = vec_madd(dz11,dz11,rsq11);
7821 rsq12 = vec_madd(dz12,dz12,rsq12);
7822 rsq13 = vec_madd(dz13,dz13,rsq13);
7823 rsq21 = vec_madd(dz21,dz21,rsq21);
7824 rsq22 = vec_madd(dz22,dz22,rsq22);
7825 rsq23 = vec_madd(dz23,dz23,rsq23);
7826 rsq31 = vec_madd(dz31,dz31,rsq31);
7827 rsq32 = vec_madd(dz32,dz32,rsq32);
7828 rsq33 = vec_madd(dz33,dz33,rsq33);
7830 do_9_invsqrt(rsq11,rsq12,rsq13,
7831 rsq21,rsq22,rsq23,
7832 rsq31,rsq32,rsq33,
7833 &rinv11,&rinv12,&rinv13,
7834 &rinv21,&rinv22,&rinv23,
7835 &rinv31,&rinv32,&rinv33);
7837 krsq11 = vec_madd(vkrf,rsq11,nul);
7838 krsq12 = vec_madd(vkrf,rsq12,nul);
7839 krsq13 = vec_madd(vkrf,rsq13,nul);
7840 krsq21 = vec_madd(vkrf,rsq21,nul);
7841 krsq22 = vec_madd(vkrf,rsq22,nul);
7842 krsq23 = vec_madd(vkrf,rsq23,nul);
7843 krsq31 = vec_madd(vkrf,rsq31,nul);
7844 krsq32 = vec_madd(vkrf,rsq32,nul);
7845 krsq33 = vec_madd(vkrf,rsq33,nul);
7847 rinvsq11 = vec_madd(rinv11,rinv11,nul);
7848 rinvsq12 = vec_madd(rinv12,rinv12,nul);
7849 rinvsq13 = vec_madd(rinv13,rinv13,nul);
7850 rinvsq21 = vec_madd(rinv21,rinv21,nul);
7851 rinvsq22 = vec_madd(rinv22,rinv22,nul);
7852 rinvsq23 = vec_madd(rinv23,rinv23,nul);
7853 rinvsq31 = vec_madd(rinv31,rinv31,nul);
7854 rinvsq32 = vec_madd(rinv32,rinv32,nul);
7855 rinvsq33 = vec_madd(rinv33,rinv33,nul);
7857 vc11 = vec_add(rinv11,krsq11);
7858 vc12 = vec_add(rinv12,krsq12);
7859 vc13 = vec_add(rinv13,krsq13);
7860 vc21 = vec_add(rinv21,krsq21);
7861 vc22 = vec_add(rinv22,krsq22);
7862 vc23 = vec_add(rinv23,krsq23);
7863 vc31 = vec_add(rinv31,krsq31);
7864 vc32 = vec_add(rinv32,krsq32);
7865 vc33 = vec_add(rinv33,krsq33);
7867 vc11 = vec_sub(vc11,vcrf);
7868 vc12 = vec_sub(vc12,vcrf);
7869 vc13 = vec_sub(vc13,vcrf);
7870 vc21 = vec_sub(vc21,vcrf);
7871 vc22 = vec_sub(vc22,vcrf);
7872 vc23 = vec_sub(vc23,vcrf);
7873 vc31 = vec_sub(vc31,vcrf);
7874 vc32 = vec_sub(vc32,vcrf);
7875 vc33 = vec_sub(vc33,vcrf);
7877 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
7878 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
7879 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
7880 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
7881 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
7882 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
7883 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
7884 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
7885 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
7887 fs11 = vec_madd(fs11,qqOO,nul);
7888 fs12 = vec_madd(fs12,qqOH,nul);
7889 fs13 = vec_madd(fs13,qqOH,nul);
7890 fs21 = vec_madd(fs21,qqOH,nul);
7891 fs22 = vec_madd(fs22,qqHH,nul);
7892 fs23 = vec_madd(fs23,qqHH,nul);
7893 fs31 = vec_madd(fs31,qqOH,nul);
7894 fs32 = vec_madd(fs32,qqHH,nul);
7895 fs33 = vec_madd(fs33,qqHH,nul);
7897 fs11 = vec_madd(fs11,rinvsq11,nul);
7898 fs12 = vec_madd(fs12,rinvsq12,nul);
7899 fs13 = vec_madd(fs13,rinvsq13,nul);
7900 fs21 = vec_madd(fs21,rinvsq21,nul);
7901 fs22 = vec_madd(fs22,rinvsq22,nul);
7902 fs23 = vec_madd(fs23,rinvsq23,nul);
7903 fs31 = vec_madd(fs31,rinvsq31,nul);
7904 fs32 = vec_madd(fs32,rinvsq32,nul);
7905 fs33 = vec_madd(fs33,rinvsq33,nul);
7907 vctot = vec_madd(qqOO,vc11,vctot);
7908 vctot = vec_madd(qqOH,vc12,vctot);
7909 vctot = vec_madd(qqOH,vc13,vctot);
7910 vctot = vec_madd(qqOH,vc21,vctot);
7911 vctot = vec_madd(qqHH,vc22,vctot);
7912 vctot = vec_madd(qqHH,vc23,vctot);
7913 vctot = vec_madd(qqOH,vc31,vctot);
7914 vctot = vec_madd(qqHH,vc32,vctot);
7915 vctot = vec_madd(qqHH,vc33,vctot);
7917 fix1 = vec_madd(fs11,dx11,fix1);
7918 fiy1 = vec_madd(fs11,dy11,fiy1);
7919 fiz1 = vec_madd(fs11,dz11,fiz1);
7920 fix2 = vec_madd(fs21,dx21,fix2);
7921 fiy2 = vec_madd(fs21,dy21,fiy2);
7922 fiz2 = vec_madd(fs21,dz21,fiz2);
7923 fix3 = vec_madd(fs31,dx31,fix3);
7924 fiy3 = vec_madd(fs31,dy31,fiy3);
7925 fiz3 = vec_madd(fs31,dz31,fiz3);
7927 fix1 = vec_madd(fs12,dx12,fix1);
7928 fiy1 = vec_madd(fs12,dy12,fiy1);
7929 fiz1 = vec_madd(fs12,dz12,fiz1);
7930 fix2 = vec_madd(fs22,dx22,fix2);
7931 fiy2 = vec_madd(fs22,dy22,fiy2);
7932 fiz2 = vec_madd(fs22,dz22,fiz2);
7933 fix3 = vec_madd(fs32,dx32,fix3);
7934 fiy3 = vec_madd(fs32,dy32,fiy3);
7935 fiz3 = vec_madd(fs32,dz32,fiz3);
7937 fix1 = vec_madd(fs13,dx13,fix1);
7938 fiy1 = vec_madd(fs13,dy13,fiy1);
7939 fiz1 = vec_madd(fs13,dz13,fiz1);
7940 fix2 = vec_madd(fs23,dx23,fix2);
7941 fiy2 = vec_madd(fs23,dy23,fiy2);
7942 fiz2 = vec_madd(fs23,dz23,fiz2);
7943 fix3 = vec_madd(fs33,dx33,fix3);
7944 fiy3 = vec_madd(fs33,dy33,fiy3);
7945 fiz3 = vec_madd(fs33,dz33,fiz3);
7947 fjx1 = vec_nmsub(fs11,dx11,nul);
7948 fjy1 = vec_nmsub(fs11,dy11,nul);
7949 fjz1 = vec_nmsub(fs11,dz11,nul);
7950 fjx2 = vec_nmsub(fs12,dx12,nul);
7951 fjy2 = vec_nmsub(fs12,dy12,nul);
7952 fjz2 = vec_nmsub(fs12,dz12,nul);
7953 fjx3 = vec_nmsub(fs13,dx13,nul);
7954 fjy3 = vec_nmsub(fs13,dy13,nul);
7955 fjz3 = vec_nmsub(fs13,dz13,nul);
7957 fjx1 = vec_nmsub(fs21,dx21,fjx1);
7958 fjy1 = vec_nmsub(fs21,dy21,fjy1);
7959 fjz1 = vec_nmsub(fs21,dz21,fjz1);
7960 fjx2 = vec_nmsub(fs22,dx22,fjx2);
7961 fjy2 = vec_nmsub(fs22,dy22,fjy2);
7962 fjz2 = vec_nmsub(fs22,dz22,fjz2);
7963 fjx3 = vec_nmsub(fs23,dx23,fjx3);
7964 fjy3 = vec_nmsub(fs23,dy23,fjy3);
7965 fjz3 = vec_nmsub(fs23,dz23,fjz3);
7967 fjx1 = vec_nmsub(fs31,dx31,fjx1);
7968 fjy1 = vec_nmsub(fs31,dy31,fjy1);
7969 fjz1 = vec_nmsub(fs31,dz31,fjz1);
7970 fjx2 = vec_nmsub(fs32,dx32,fjx2);
7971 fjy2 = vec_nmsub(fs32,dy32,fjy2);
7972 fjz2 = vec_nmsub(fs32,dz32,fjz2);
7973 fjx3 = vec_nmsub(fs33,dx33,fjx3);
7974 fjy3 = vec_nmsub(fs33,dy33,fjy3);
7975 fjz3 = vec_nmsub(fs33,dz33,fjz3);
7977 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
7978 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
7980 if(k<(nj1-2)) {
7981 jnra = jjnr[k];
7982 jnrb = jjnr[k+1];
7983 jnrc = jjnr[k+2];
7984 j3a = 3*jnra;
7985 j3b = 3*jnrb;
7986 j3c = 3*jnrc;
7987 load_3_water(pos+j3a,pos+j3b,pos+j3c,
7988 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
7989 qqOOt = vec_sld(qqOO,nul,4);
7990 qqOHt = vec_sld(qqOH,nul,4);
7991 qqHHt = vec_sld(qqHH,nul,4);
7993 dx11 = vec_sub(ix1,jx1);
7994 dx12 = vec_sub(ix1,jx2);
7995 dx13 = vec_sub(ix1,jx3);
7996 dy11 = vec_sub(iy1,jy1);
7997 dy12 = vec_sub(iy1,jy2);
7998 dy13 = vec_sub(iy1,jy3);
7999 dz11 = vec_sub(iz1,jz1);
8000 dz12 = vec_sub(iz1,jz2);
8001 dz13 = vec_sub(iz1,jz3);
8002 dx21 = vec_sub(ix2,jx1);
8003 dx22 = vec_sub(ix2,jx2);
8004 dx23 = vec_sub(ix2,jx3);
8005 dy21 = vec_sub(iy2,jy1);
8006 dy22 = vec_sub(iy2,jy2);
8007 dy23 = vec_sub(iy2,jy3);
8008 dz21 = vec_sub(iz2,jz1);
8009 dz22 = vec_sub(iz2,jz2);
8010 dz23 = vec_sub(iz2,jz3);
8011 dx31 = vec_sub(ix3,jx1);
8012 dx32 = vec_sub(ix3,jx2);
8013 dx33 = vec_sub(ix3,jx3);
8014 dy31 = vec_sub(iy3,jy1);
8015 dy32 = vec_sub(iy3,jy2);
8016 dy33 = vec_sub(iy3,jy3);
8017 dz31 = vec_sub(iz3,jz1);
8018 dz32 = vec_sub(iz3,jz2);
8019 dz33 = vec_sub(iz3,jz3);
8021 rsq11 = vec_madd(dx11,dx11,nul);
8022 rsq12 = vec_madd(dx12,dx12,nul);
8023 rsq13 = vec_madd(dx13,dx13,nul);
8024 rsq21 = vec_madd(dx21,dx21,nul);
8025 rsq22 = vec_madd(dx22,dx22,nul);
8026 rsq23 = vec_madd(dx23,dx23,nul);
8027 rsq31 = vec_madd(dx31,dx31,nul);
8028 rsq32 = vec_madd(dx32,dx32,nul);
8029 rsq33 = vec_madd(dx33,dx33,nul);
8030 rsq11 = vec_madd(dy11,dy11,rsq11);
8031 rsq12 = vec_madd(dy12,dy12,rsq12);
8032 rsq13 = vec_madd(dy13,dy13,rsq13);
8033 rsq21 = vec_madd(dy21,dy21,rsq21);
8034 rsq22 = vec_madd(dy22,dy22,rsq22);
8035 rsq23 = vec_madd(dy23,dy23,rsq23);
8036 rsq31 = vec_madd(dy31,dy31,rsq31);
8037 rsq32 = vec_madd(dy32,dy32,rsq32);
8038 rsq33 = vec_madd(dy33,dy33,rsq33);
8039 rsq11 = vec_madd(dz11,dz11,rsq11);
8040 rsq12 = vec_madd(dz12,dz12,rsq12);
8041 rsq13 = vec_madd(dz13,dz13,rsq13);
8042 rsq21 = vec_madd(dz21,dz21,rsq21);
8043 rsq22 = vec_madd(dz22,dz22,rsq22);
8044 rsq23 = vec_madd(dz23,dz23,rsq23);
8045 rsq31 = vec_madd(dz31,dz31,rsq31);
8046 rsq32 = vec_madd(dz32,dz32,rsq32);
8047 rsq33 = vec_madd(dz33,dz33,rsq33);
8049 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
8050 &rsq21,&rsq22,&rsq23,
8051 &rsq31,&rsq32,&rsq33);
8053 do_9_invsqrt(rsq11,rsq12,rsq13,
8054 rsq21,rsq22,rsq23,
8055 rsq31,rsq32,rsq33,
8056 &rinv11,&rinv12,&rinv13,
8057 &rinv21,&rinv22,&rinv23,
8058 &rinv31,&rinv32,&rinv33);
8060 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
8061 &rinv21,&rinv22,&rinv23,
8062 &rinv31,&rinv32,&rinv33);
8064 krsq11 = vec_madd(vkrf,rsq11,nul);
8065 krsq12 = vec_madd(vkrf,rsq12,nul);
8066 krsq13 = vec_madd(vkrf,rsq13,nul);
8067 krsq21 = vec_madd(vkrf,rsq21,nul);
8068 krsq22 = vec_madd(vkrf,rsq22,nul);
8069 krsq23 = vec_madd(vkrf,rsq23,nul);
8070 krsq31 = vec_madd(vkrf,rsq31,nul);
8071 krsq32 = vec_madd(vkrf,rsq32,nul);
8072 krsq33 = vec_madd(vkrf,rsq33,nul);
8074 rinvsq11 = vec_madd(rinv11,rinv11,nul);
8075 rinvsq12 = vec_madd(rinv12,rinv12,nul);
8076 rinvsq13 = vec_madd(rinv13,rinv13,nul);
8077 rinvsq21 = vec_madd(rinv21,rinv21,nul);
8078 rinvsq22 = vec_madd(rinv22,rinv22,nul);
8079 rinvsq23 = vec_madd(rinv23,rinv23,nul);
8080 rinvsq31 = vec_madd(rinv31,rinv31,nul);
8081 rinvsq32 = vec_madd(rinv32,rinv32,nul);
8082 rinvsq33 = vec_madd(rinv33,rinv33,nul);
8084 vc11 = vec_add(rinv11,krsq11);
8085 vc12 = vec_add(rinv12,krsq12);
8086 vc13 = vec_add(rinv13,krsq13);
8087 vc21 = vec_add(rinv21,krsq21);
8088 vc22 = vec_add(rinv22,krsq22);
8089 vc23 = vec_add(rinv23,krsq23);
8090 vc31 = vec_add(rinv31,krsq31);
8091 vc32 = vec_add(rinv32,krsq32);
8092 vc33 = vec_add(rinv33,krsq33);
8094 vc11 = vec_sub(vc11,vcrf);
8095 vc12 = vec_sub(vc12,vcrf);
8096 vc13 = vec_sub(vc13,vcrf);
8097 vc21 = vec_sub(vc21,vcrf);
8098 vc22 = vec_sub(vc22,vcrf);
8099 vc23 = vec_sub(vc23,vcrf);
8100 vc31 = vec_sub(vc31,vcrf);
8101 vc32 = vec_sub(vc32,vcrf);
8102 vc33 = vec_sub(vc33,vcrf);
8104 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
8105 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
8106 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
8107 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
8108 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
8109 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
8110 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
8111 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
8112 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
8114 fs11 = vec_madd(fs11,qqOOt,nul);
8115 fs12 = vec_madd(fs12,qqOHt,nul);
8116 fs13 = vec_madd(fs13,qqOHt,nul);
8117 fs21 = vec_madd(fs21,qqOHt,nul);
8118 fs22 = vec_madd(fs22,qqHHt,nul);
8119 fs23 = vec_madd(fs23,qqHHt,nul);
8120 fs31 = vec_madd(fs31,qqOHt,nul);
8121 fs32 = vec_madd(fs32,qqHHt,nul);
8122 fs33 = vec_madd(fs33,qqHHt,nul);
8124 fs11 = vec_madd(fs11,rinvsq11,nul);
8125 fs12 = vec_madd(fs12,rinvsq12,nul);
8126 fs13 = vec_madd(fs13,rinvsq13,nul);
8127 fs21 = vec_madd(fs21,rinvsq21,nul);
8128 fs22 = vec_madd(fs22,rinvsq22,nul);
8129 fs23 = vec_madd(fs23,rinvsq23,nul);
8130 fs31 = vec_madd(fs31,rinvsq31,nul);
8131 fs32 = vec_madd(fs32,rinvsq32,nul);
8132 fs33 = vec_madd(fs33,rinvsq33,nul);
8134 vctot = vec_madd(qqOOt,vc11,vctot);
8135 vctot = vec_madd(qqOHt,vc12,vctot);
8136 vctot = vec_madd(qqOHt,vc13,vctot);
8137 vctot = vec_madd(qqOHt,vc21,vctot);
8138 vctot = vec_madd(qqHHt,vc22,vctot);
8139 vctot = vec_madd(qqHHt,vc23,vctot);
8140 vctot = vec_madd(qqOHt,vc31,vctot);
8141 vctot = vec_madd(qqHHt,vc32,vctot);
8142 vctot = vec_madd(qqHHt,vc33,vctot);
8144 fix1 = vec_madd(fs11,dx11,fix1);
8145 fiy1 = vec_madd(fs11,dy11,fiy1);
8146 fiz1 = vec_madd(fs11,dz11,fiz1);
8147 fix2 = vec_madd(fs21,dx21,fix2);
8148 fiy2 = vec_madd(fs21,dy21,fiy2);
8149 fiz2 = vec_madd(fs21,dz21,fiz2);
8150 fix3 = vec_madd(fs31,dx31,fix3);
8151 fiy3 = vec_madd(fs31,dy31,fiy3);
8152 fiz3 = vec_madd(fs31,dz31,fiz3);
8154 fix1 = vec_madd(fs12,dx12,fix1);
8155 fiy1 = vec_madd(fs12,dy12,fiy1);
8156 fiz1 = vec_madd(fs12,dz12,fiz1);
8157 fix2 = vec_madd(fs22,dx22,fix2);
8158 fiy2 = vec_madd(fs22,dy22,fiy2);
8159 fiz2 = vec_madd(fs22,dz22,fiz2);
8160 fix3 = vec_madd(fs32,dx32,fix3);
8161 fiy3 = vec_madd(fs32,dy32,fiy3);
8162 fiz3 = vec_madd(fs32,dz32,fiz3);
8164 fix1 = vec_madd(fs13,dx13,fix1);
8165 fiy1 = vec_madd(fs13,dy13,fiy1);
8166 fiz1 = vec_madd(fs13,dz13,fiz1);
8167 fix2 = vec_madd(fs23,dx23,fix2);
8168 fiy2 = vec_madd(fs23,dy23,fiy2);
8169 fiz2 = vec_madd(fs23,dz23,fiz2);
8170 fix3 = vec_madd(fs33,dx33,fix3);
8171 fiy3 = vec_madd(fs33,dy33,fiy3);
8172 fiz3 = vec_madd(fs33,dz33,fiz3);
8174 fjx1 = vec_nmsub(fs11,dx11,nul);
8175 fjy1 = vec_nmsub(fs11,dy11,nul);
8176 fjz1 = vec_nmsub(fs11,dz11,nul);
8177 fjx2 = vec_nmsub(fs12,dx12,nul);
8178 fjy2 = vec_nmsub(fs12,dy12,nul);
8179 fjz2 = vec_nmsub(fs12,dz12,nul);
8180 fjx3 = vec_nmsub(fs13,dx13,nul);
8181 fjy3 = vec_nmsub(fs13,dy13,nul);
8182 fjz3 = vec_nmsub(fs13,dz13,nul);
8184 fjx1 = vec_nmsub(fs21,dx21,fjx1);
8185 fjy1 = vec_nmsub(fs21,dy21,fjy1);
8186 fjz1 = vec_nmsub(fs21,dz21,fjz1);
8187 fjx2 = vec_nmsub(fs22,dx22,fjx2);
8188 fjy2 = vec_nmsub(fs22,dy22,fjy2);
8189 fjz2 = vec_nmsub(fs22,dz22,fjz2);
8190 fjx3 = vec_nmsub(fs23,dx23,fjx3);
8191 fjy3 = vec_nmsub(fs23,dy23,fjy3);
8192 fjz3 = vec_nmsub(fs23,dz23,fjz3);
8194 fjx1 = vec_nmsub(fs31,dx31,fjx1);
8195 fjy1 = vec_nmsub(fs31,dy31,fjy1);
8196 fjz1 = vec_nmsub(fs31,dz31,fjz1);
8197 fjx2 = vec_nmsub(fs32,dx32,fjx2);
8198 fjy2 = vec_nmsub(fs32,dy32,fjy2);
8199 fjz2 = vec_nmsub(fs32,dz32,fjz2);
8200 fjx3 = vec_nmsub(fs33,dx33,fjx3);
8201 fjy3 = vec_nmsub(fs33,dy33,fjy3);
8202 fjz3 = vec_nmsub(fs33,dz33,fjz3);
8204 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
8205 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
8206 } else if(k<(nj1-1)) {
8207 jnra = jjnr[k];
8208 jnrb = jjnr[k+1];
8209 j3a = 3*jnra;
8210 j3b = 3*jnrb;
8211 load_2_water(pos+j3a,pos+j3b,
8212 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
8213 qqOOt = vec_sld(qqOO,nul,8);
8214 qqOHt = vec_sld(qqOH,nul,8);
8215 qqHHt = vec_sld(qqHH,nul,8);
8217 dx11 = vec_sub(ix1,jx1);
8218 dx12 = vec_sub(ix1,jx2);
8219 dx13 = vec_sub(ix1,jx3);
8220 dy11 = vec_sub(iy1,jy1);
8221 dy12 = vec_sub(iy1,jy2);
8222 dy13 = vec_sub(iy1,jy3);
8223 dz11 = vec_sub(iz1,jz1);
8224 dz12 = vec_sub(iz1,jz2);
8225 dz13 = vec_sub(iz1,jz3);
8226 dx21 = vec_sub(ix2,jx1);
8227 dx22 = vec_sub(ix2,jx2);
8228 dx23 = vec_sub(ix2,jx3);
8229 dy21 = vec_sub(iy2,jy1);
8230 dy22 = vec_sub(iy2,jy2);
8231 dy23 = vec_sub(iy2,jy3);
8232 dz21 = vec_sub(iz2,jz1);
8233 dz22 = vec_sub(iz2,jz2);
8234 dz23 = vec_sub(iz2,jz3);
8235 dx31 = vec_sub(ix3,jx1);
8236 dx32 = vec_sub(ix3,jx2);
8237 dx33 = vec_sub(ix3,jx3);
8238 dy31 = vec_sub(iy3,jy1);
8239 dy32 = vec_sub(iy3,jy2);
8240 dy33 = vec_sub(iy3,jy3);
8241 dz31 = vec_sub(iz3,jz1);
8242 dz32 = vec_sub(iz3,jz2);
8243 dz33 = vec_sub(iz3,jz3);
8245 rsq11 = vec_madd(dx11,dx11,nul);
8246 rsq12 = vec_madd(dx12,dx12,nul);
8247 rsq13 = vec_madd(dx13,dx13,nul);
8248 rsq21 = vec_madd(dx21,dx21,nul);
8249 rsq22 = vec_madd(dx22,dx22,nul);
8250 rsq23 = vec_madd(dx23,dx23,nul);
8251 rsq31 = vec_madd(dx31,dx31,nul);
8252 rsq32 = vec_madd(dx32,dx32,nul);
8253 rsq33 = vec_madd(dx33,dx33,nul);
8254 rsq11 = vec_madd(dy11,dy11,rsq11);
8255 rsq12 = vec_madd(dy12,dy12,rsq12);
8256 rsq13 = vec_madd(dy13,dy13,rsq13);
8257 rsq21 = vec_madd(dy21,dy21,rsq21);
8258 rsq22 = vec_madd(dy22,dy22,rsq22);
8259 rsq23 = vec_madd(dy23,dy23,rsq23);
8260 rsq31 = vec_madd(dy31,dy31,rsq31);
8261 rsq32 = vec_madd(dy32,dy32,rsq32);
8262 rsq33 = vec_madd(dy33,dy33,rsq33);
8263 rsq11 = vec_madd(dz11,dz11,rsq11);
8264 rsq12 = vec_madd(dz12,dz12,rsq12);
8265 rsq13 = vec_madd(dz13,dz13,rsq13);
8266 rsq21 = vec_madd(dz21,dz21,rsq21);
8267 rsq22 = vec_madd(dz22,dz22,rsq22);
8268 rsq23 = vec_madd(dz23,dz23,rsq23);
8269 rsq31 = vec_madd(dz31,dz31,rsq31);
8270 rsq32 = vec_madd(dz32,dz32,rsq32);
8271 rsq33 = vec_madd(dz33,dz33,rsq33);
8273 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
8274 &rsq21,&rsq22,&rsq23,
8275 &rsq31,&rsq32,&rsq33);
8277 do_9_invsqrt(rsq11,rsq12,rsq13,
8278 rsq21,rsq22,rsq23,
8279 rsq31,rsq32,rsq33,
8280 &rinv11,&rinv12,&rinv13,
8281 &rinv21,&rinv22,&rinv23,
8282 &rinv31,&rinv32,&rinv33);
8284 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
8285 &rinv21,&rinv22,&rinv23,
8286 &rinv31,&rinv32,&rinv33);
8288 krsq11 = vec_madd(vkrf,rsq11,nul);
8289 krsq12 = vec_madd(vkrf,rsq12,nul);
8290 krsq13 = vec_madd(vkrf,rsq13,nul);
8291 krsq21 = vec_madd(vkrf,rsq21,nul);
8292 krsq22 = vec_madd(vkrf,rsq22,nul);
8293 krsq23 = vec_madd(vkrf,rsq23,nul);
8294 krsq31 = vec_madd(vkrf,rsq31,nul);
8295 krsq32 = vec_madd(vkrf,rsq32,nul);
8296 krsq33 = vec_madd(vkrf,rsq33,nul);
8298 rinvsq11 = vec_madd(rinv11,rinv11,nul);
8299 rinvsq12 = vec_madd(rinv12,rinv12,nul);
8300 rinvsq13 = vec_madd(rinv13,rinv13,nul);
8301 rinvsq21 = vec_madd(rinv21,rinv21,nul);
8302 rinvsq22 = vec_madd(rinv22,rinv22,nul);
8303 rinvsq23 = vec_madd(rinv23,rinv23,nul);
8304 rinvsq31 = vec_madd(rinv31,rinv31,nul);
8305 rinvsq32 = vec_madd(rinv32,rinv32,nul);
8306 rinvsq33 = vec_madd(rinv33,rinv33,nul);
8308 vc11 = vec_add(rinv11,krsq11);
8309 vc12 = vec_add(rinv12,krsq12);
8310 vc13 = vec_add(rinv13,krsq13);
8311 vc21 = vec_add(rinv21,krsq21);
8312 vc22 = vec_add(rinv22,krsq22);
8313 vc23 = vec_add(rinv23,krsq23);
8314 vc31 = vec_add(rinv31,krsq31);
8315 vc32 = vec_add(rinv32,krsq32);
8316 vc33 = vec_add(rinv33,krsq33);
8318 vc11 = vec_sub(vc11,vcrf);
8319 vc12 = vec_sub(vc12,vcrf);
8320 vc13 = vec_sub(vc13,vcrf);
8321 vc21 = vec_sub(vc21,vcrf);
8322 vc22 = vec_sub(vc22,vcrf);
8323 vc23 = vec_sub(vc23,vcrf);
8324 vc31 = vec_sub(vc31,vcrf);
8325 vc32 = vec_sub(vc32,vcrf);
8326 vc33 = vec_sub(vc33,vcrf);
8328 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
8329 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
8330 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
8331 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
8332 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
8333 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
8334 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
8335 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
8336 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
8338 fs11 = vec_madd(fs11,qqOOt,nul);
8339 fs12 = vec_madd(fs12,qqOHt,nul);
8340 fs13 = vec_madd(fs13,qqOHt,nul);
8341 fs21 = vec_madd(fs21,qqOHt,nul);
8342 fs22 = vec_madd(fs22,qqHHt,nul);
8343 fs23 = vec_madd(fs23,qqHHt,nul);
8344 fs31 = vec_madd(fs31,qqOHt,nul);
8345 fs32 = vec_madd(fs32,qqHHt,nul);
8346 fs33 = vec_madd(fs33,qqHHt,nul);
8348 fs11 = vec_madd(fs11,rinvsq11,nul);
8349 fs12 = vec_madd(fs12,rinvsq12,nul);
8350 fs13 = vec_madd(fs13,rinvsq13,nul);
8351 fs21 = vec_madd(fs21,rinvsq21,nul);
8352 fs22 = vec_madd(fs22,rinvsq22,nul);
8353 fs23 = vec_madd(fs23,rinvsq23,nul);
8354 fs31 = vec_madd(fs31,rinvsq31,nul);
8355 fs32 = vec_madd(fs32,rinvsq32,nul);
8356 fs33 = vec_madd(fs33,rinvsq33,nul);
8358 vctot = vec_madd(qqOOt,vc11,vctot);
8359 vctot = vec_madd(qqOHt,vc12,vctot);
8360 vctot = vec_madd(qqOHt,vc13,vctot);
8361 vctot = vec_madd(qqOHt,vc21,vctot);
8362 vctot = vec_madd(qqHHt,vc22,vctot);
8363 vctot = vec_madd(qqHHt,vc23,vctot);
8364 vctot = vec_madd(qqOHt,vc31,vctot);
8365 vctot = vec_madd(qqHHt,vc32,vctot);
8366 vctot = vec_madd(qqHHt,vc33,vctot);
8368 fix1 = vec_madd(fs11,dx11,fix1);
8369 fiy1 = vec_madd(fs11,dy11,fiy1);
8370 fiz1 = vec_madd(fs11,dz11,fiz1);
8371 fix2 = vec_madd(fs21,dx21,fix2);
8372 fiy2 = vec_madd(fs21,dy21,fiy2);
8373 fiz2 = vec_madd(fs21,dz21,fiz2);
8374 fix3 = vec_madd(fs31,dx31,fix3);
8375 fiy3 = vec_madd(fs31,dy31,fiy3);
8376 fiz3 = vec_madd(fs31,dz31,fiz3);
8378 fix1 = vec_madd(fs12,dx12,fix1);
8379 fiy1 = vec_madd(fs12,dy12,fiy1);
8380 fiz1 = vec_madd(fs12,dz12,fiz1);
8381 fix2 = vec_madd(fs22,dx22,fix2);
8382 fiy2 = vec_madd(fs22,dy22,fiy2);
8383 fiz2 = vec_madd(fs22,dz22,fiz2);
8384 fix3 = vec_madd(fs32,dx32,fix3);
8385 fiy3 = vec_madd(fs32,dy32,fiy3);
8386 fiz3 = vec_madd(fs32,dz32,fiz3);
8388 fix1 = vec_madd(fs13,dx13,fix1);
8389 fiy1 = vec_madd(fs13,dy13,fiy1);
8390 fiz1 = vec_madd(fs13,dz13,fiz1);
8391 fix2 = vec_madd(fs23,dx23,fix2);
8392 fiy2 = vec_madd(fs23,dy23,fiy2);
8393 fiz2 = vec_madd(fs23,dz23,fiz2);
8394 fix3 = vec_madd(fs33,dx33,fix3);
8395 fiy3 = vec_madd(fs33,dy33,fiy3);
8396 fiz3 = vec_madd(fs33,dz33,fiz3);
8398 fjx1 = vec_nmsub(fs11,dx11,nul);
8399 fjy1 = vec_nmsub(fs11,dy11,nul);
8400 fjz1 = vec_nmsub(fs11,dz11,nul);
8401 fjx2 = vec_nmsub(fs12,dx12,nul);
8402 fjy2 = vec_nmsub(fs12,dy12,nul);
8403 fjz2 = vec_nmsub(fs12,dz12,nul);
8404 fjx3 = vec_nmsub(fs13,dx13,nul);
8405 fjy3 = vec_nmsub(fs13,dy13,nul);
8406 fjz3 = vec_nmsub(fs13,dz13,nul);
8408 fjx1 = vec_nmsub(fs21,dx21,fjx1);
8409 fjy1 = vec_nmsub(fs21,dy21,fjy1);
8410 fjz1 = vec_nmsub(fs21,dz21,fjz1);
8411 fjx2 = vec_nmsub(fs22,dx22,fjx2);
8412 fjy2 = vec_nmsub(fs22,dy22,fjy2);
8413 fjz2 = vec_nmsub(fs22,dz22,fjz2);
8414 fjx3 = vec_nmsub(fs23,dx23,fjx3);
8415 fjy3 = vec_nmsub(fs23,dy23,fjy3);
8416 fjz3 = vec_nmsub(fs23,dz23,fjz3);
8418 fjx1 = vec_nmsub(fs31,dx31,fjx1);
8419 fjy1 = vec_nmsub(fs31,dy31,fjy1);
8420 fjz1 = vec_nmsub(fs31,dz31,fjz1);
8421 fjx2 = vec_nmsub(fs32,dx32,fjx2);
8422 fjy2 = vec_nmsub(fs32,dy32,fjy2);
8423 fjz2 = vec_nmsub(fs32,dz32,fjz2);
8424 fjx3 = vec_nmsub(fs33,dx33,fjx3);
8425 fjy3 = vec_nmsub(fs33,dy33,fjy3);
8426 fjz3 = vec_nmsub(fs33,dz33,fjz3);
8428 add_force_to_2_water(faction+j3a,faction+j3b,
8429 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
8430 } else if(k<nj1) {
8431 jnra = jjnr[k];
8432 j3a = 3*jnra;
8433 load_1_water(pos+j3a,
8434 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
8435 qqOOt = vec_sld(qqOO,nul,12);
8436 qqOHt = vec_sld(qqOH,nul,12);
8437 qqHHt = vec_sld(qqHH,nul,12);
8439 dx11 = vec_sub(ix1,jx1);
8440 dx12 = vec_sub(ix1,jx2);
8441 dx13 = vec_sub(ix1,jx3);
8442 dy11 = vec_sub(iy1,jy1);
8443 dy12 = vec_sub(iy1,jy2);
8444 dy13 = vec_sub(iy1,jy3);
8445 dz11 = vec_sub(iz1,jz1);
8446 dz12 = vec_sub(iz1,jz2);
8447 dz13 = vec_sub(iz1,jz3);
8448 dx21 = vec_sub(ix2,jx1);
8449 dx22 = vec_sub(ix2,jx2);
8450 dx23 = vec_sub(ix2,jx3);
8451 dy21 = vec_sub(iy2,jy1);
8452 dy22 = vec_sub(iy2,jy2);
8453 dy23 = vec_sub(iy2,jy3);
8454 dz21 = vec_sub(iz2,jz1);
8455 dz22 = vec_sub(iz2,jz2);
8456 dz23 = vec_sub(iz2,jz3);
8457 dx31 = vec_sub(ix3,jx1);
8458 dx32 = vec_sub(ix3,jx2);
8459 dx33 = vec_sub(ix3,jx3);
8460 dy31 = vec_sub(iy3,jy1);
8461 dy32 = vec_sub(iy3,jy2);
8462 dy33 = vec_sub(iy3,jy3);
8463 dz31 = vec_sub(iz3,jz1);
8464 dz32 = vec_sub(iz3,jz2);
8465 dz33 = vec_sub(iz3,jz3);
8467 rsq11 = vec_madd(dx11,dx11,nul);
8468 rsq12 = vec_madd(dx12,dx12,nul);
8469 rsq13 = vec_madd(dx13,dx13,nul);
8470 rsq21 = vec_madd(dx21,dx21,nul);
8471 rsq22 = vec_madd(dx22,dx22,nul);
8472 rsq23 = vec_madd(dx23,dx23,nul);
8473 rsq31 = vec_madd(dx31,dx31,nul);
8474 rsq32 = vec_madd(dx32,dx32,nul);
8475 rsq33 = vec_madd(dx33,dx33,nul);
8476 rsq11 = vec_madd(dy11,dy11,rsq11);
8477 rsq12 = vec_madd(dy12,dy12,rsq12);
8478 rsq13 = vec_madd(dy13,dy13,rsq13);
8479 rsq21 = vec_madd(dy21,dy21,rsq21);
8480 rsq22 = vec_madd(dy22,dy22,rsq22);
8481 rsq23 = vec_madd(dy23,dy23,rsq23);
8482 rsq31 = vec_madd(dy31,dy31,rsq31);
8483 rsq32 = vec_madd(dy32,dy32,rsq32);
8484 rsq33 = vec_madd(dy33,dy33,rsq33);
8485 rsq11 = vec_madd(dz11,dz11,rsq11);
8486 rsq12 = vec_madd(dz12,dz12,rsq12);
8487 rsq13 = vec_madd(dz13,dz13,rsq13);
8488 rsq21 = vec_madd(dz21,dz21,rsq21);
8489 rsq22 = vec_madd(dz22,dz22,rsq22);
8490 rsq23 = vec_madd(dz23,dz23,rsq23);
8491 rsq31 = vec_madd(dz31,dz31,rsq31);
8492 rsq32 = vec_madd(dz32,dz32,rsq32);
8493 rsq33 = vec_madd(dz33,dz33,rsq33);
8495 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
8496 &rsq21,&rsq22,&rsq23,
8497 &rsq31,&rsq32,&rsq33);
8499 do_9_invsqrt(rsq11,rsq12,rsq13,
8500 rsq21,rsq22,rsq23,
8501 rsq31,rsq32,rsq33,
8502 &rinv11,&rinv12,&rinv13,
8503 &rinv21,&rinv22,&rinv23,
8504 &rinv31,&rinv32,&rinv33);
8506 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
8507 &rinv21,&rinv22,&rinv23,
8508 &rinv31,&rinv32,&rinv33);
8510 krsq11 = vec_madd(vkrf,rsq11,nul);
8511 krsq12 = vec_madd(vkrf,rsq12,nul);
8512 krsq13 = vec_madd(vkrf,rsq13,nul);
8513 krsq21 = vec_madd(vkrf,rsq21,nul);
8514 krsq22 = vec_madd(vkrf,rsq22,nul);
8515 krsq23 = vec_madd(vkrf,rsq23,nul);
8516 krsq31 = vec_madd(vkrf,rsq31,nul);
8517 krsq32 = vec_madd(vkrf,rsq32,nul);
8518 krsq33 = vec_madd(vkrf,rsq33,nul);
8520 rinvsq11 = vec_madd(rinv11,rinv11,nul);
8521 rinvsq12 = vec_madd(rinv12,rinv12,nul);
8522 rinvsq13 = vec_madd(rinv13,rinv13,nul);
8523 rinvsq21 = vec_madd(rinv21,rinv21,nul);
8524 rinvsq22 = vec_madd(rinv22,rinv22,nul);
8525 rinvsq23 = vec_madd(rinv23,rinv23,nul);
8526 rinvsq31 = vec_madd(rinv31,rinv31,nul);
8527 rinvsq32 = vec_madd(rinv32,rinv32,nul);
8528 rinvsq33 = vec_madd(rinv33,rinv33,nul);
8530 vc11 = vec_add(rinv11,krsq11);
8531 vc12 = vec_add(rinv12,krsq12);
8532 vc13 = vec_add(rinv13,krsq13);
8533 vc21 = vec_add(rinv21,krsq21);
8534 vc22 = vec_add(rinv22,krsq22);
8535 vc23 = vec_add(rinv23,krsq23);
8536 vc31 = vec_add(rinv31,krsq31);
8537 vc32 = vec_add(rinv32,krsq32);
8538 vc33 = vec_add(rinv33,krsq33);
8540 vc11 = vec_sub(vc11,vcrf);
8541 vc12 = vec_sub(vc12,vcrf);
8542 vc13 = vec_sub(vc13,vcrf);
8543 vc21 = vec_sub(vc21,vcrf);
8544 vc22 = vec_sub(vc22,vcrf);
8545 vc23 = vec_sub(vc23,vcrf);
8546 vc31 = vec_sub(vc31,vcrf);
8547 vc32 = vec_sub(vc32,vcrf);
8548 vc33 = vec_sub(vc33,vcrf);
8550 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
8551 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
8552 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
8553 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
8554 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
8555 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
8556 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
8557 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
8558 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
8560 fs11 = vec_madd(fs11,qqOOt,nul);
8561 fs12 = vec_madd(fs12,qqOHt,nul);
8562 fs13 = vec_madd(fs13,qqOHt,nul);
8563 fs21 = vec_madd(fs21,qqOHt,nul);
8564 fs22 = vec_madd(fs22,qqHHt,nul);
8565 fs23 = vec_madd(fs23,qqHHt,nul);
8566 fs31 = vec_madd(fs31,qqOHt,nul);
8567 fs32 = vec_madd(fs32,qqHHt,nul);
8568 fs33 = vec_madd(fs33,qqHHt,nul);
8570 fs11 = vec_madd(fs11,rinvsq11,nul);
8571 fs12 = vec_madd(fs12,rinvsq12,nul);
8572 fs13 = vec_madd(fs13,rinvsq13,nul);
8573 fs21 = vec_madd(fs21,rinvsq21,nul);
8574 fs22 = vec_madd(fs22,rinvsq22,nul);
8575 fs23 = vec_madd(fs23,rinvsq23,nul);
8576 fs31 = vec_madd(fs31,rinvsq31,nul);
8577 fs32 = vec_madd(fs32,rinvsq32,nul);
8578 fs33 = vec_madd(fs33,rinvsq33,nul);
8580 vctot = vec_madd(qqOOt,vc11,vctot);
8581 vctot = vec_madd(qqOHt,vc12,vctot);
8582 vctot = vec_madd(qqOHt,vc13,vctot);
8583 vctot = vec_madd(qqOHt,vc21,vctot);
8584 vctot = vec_madd(qqHHt,vc22,vctot);
8585 vctot = vec_madd(qqHHt,vc23,vctot);
8586 vctot = vec_madd(qqOHt,vc31,vctot);
8587 vctot = vec_madd(qqHHt,vc32,vctot);
8588 vctot = vec_madd(qqHHt,vc33,vctot);
8590 fix1 = vec_madd(fs11,dx11,fix1);
8591 fiy1 = vec_madd(fs11,dy11,fiy1);
8592 fiz1 = vec_madd(fs11,dz11,fiz1);
8593 fix2 = vec_madd(fs21,dx21,fix2);
8594 fiy2 = vec_madd(fs21,dy21,fiy2);
8595 fiz2 = vec_madd(fs21,dz21,fiz2);
8596 fix3 = vec_madd(fs31,dx31,fix3);
8597 fiy3 = vec_madd(fs31,dy31,fiy3);
8598 fiz3 = vec_madd(fs31,dz31,fiz3);
8600 fix1 = vec_madd(fs12,dx12,fix1);
8601 fiy1 = vec_madd(fs12,dy12,fiy1);
8602 fiz1 = vec_madd(fs12,dz12,fiz1);
8603 fix2 = vec_madd(fs22,dx22,fix2);
8604 fiy2 = vec_madd(fs22,dy22,fiy2);
8605 fiz2 = vec_madd(fs22,dz22,fiz2);
8606 fix3 = vec_madd(fs32,dx32,fix3);
8607 fiy3 = vec_madd(fs32,dy32,fiy3);
8608 fiz3 = vec_madd(fs32,dz32,fiz3);
8610 fix1 = vec_madd(fs13,dx13,fix1);
8611 fiy1 = vec_madd(fs13,dy13,fiy1);
8612 fiz1 = vec_madd(fs13,dz13,fiz1);
8613 fix2 = vec_madd(fs23,dx23,fix2);
8614 fiy2 = vec_madd(fs23,dy23,fiy2);
8615 fiz2 = vec_madd(fs23,dz23,fiz2);
8616 fix3 = vec_madd(fs33,dx33,fix3);
8617 fiy3 = vec_madd(fs33,dy33,fiy3);
8618 fiz3 = vec_madd(fs33,dz33,fiz3);
8620 fjx1 = vec_nmsub(fs11,dx11,nul);
8621 fjy1 = vec_nmsub(fs11,dy11,nul);
8622 fjz1 = vec_nmsub(fs11,dz11,nul);
8623 fjx2 = vec_nmsub(fs12,dx12,nul);
8624 fjy2 = vec_nmsub(fs12,dy12,nul);
8625 fjz2 = vec_nmsub(fs12,dz12,nul);
8626 fjx3 = vec_nmsub(fs13,dx13,nul);
8627 fjy3 = vec_nmsub(fs13,dy13,nul);
8628 fjz3 = vec_nmsub(fs13,dz13,nul);
8630 fjx1 = vec_nmsub(fs21,dx21,fjx1);
8631 fjy1 = vec_nmsub(fs21,dy21,fjy1);
8632 fjz1 = vec_nmsub(fs21,dz21,fjz1);
8633 fjx2 = vec_nmsub(fs22,dx22,fjx2);
8634 fjy2 = vec_nmsub(fs22,dy22,fjy2);
8635 fjz2 = vec_nmsub(fs22,dz22,fjz2);
8636 fjx3 = vec_nmsub(fs23,dx23,fjx3);
8637 fjy3 = vec_nmsub(fs23,dy23,fjy3);
8638 fjz3 = vec_nmsub(fs23,dz23,fjz3);
8640 fjx1 = vec_nmsub(fs31,dx31,fjx1);
8641 fjy1 = vec_nmsub(fs31,dy31,fjy1);
8642 fjz1 = vec_nmsub(fs31,dz31,fjz1);
8643 fjx2 = vec_nmsub(fs32,dx32,fjx2);
8644 fjy2 = vec_nmsub(fs32,dy32,fjy2);
8645 fjz2 = vec_nmsub(fs32,dz32,fjz2);
8646 fjx3 = vec_nmsub(fs33,dx33,fjx3);
8647 fjy3 = vec_nmsub(fs33,dy33,fjy3);
8648 fjz3 = vec_nmsub(fs33,dz33,fjz3);
8650 add_force_to_1_water(faction+j3a,
8651 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
8653 /* update outer data */
8654 update_i_water_forces(faction+ii3,fshift+is3,
8655 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
8657 add_vector_to_float(Vc+gid[n],vctot);
8663 void inl2130_altivec(
8664 int nri,
8665 int iinr[],
8666 int jindex[],
8667 int jjnr[],
8668 int shift[],
8669 float shiftvec[],
8670 float fshift[],
8671 int gid[],
8672 float pos[],
8673 float faction[],
8674 float charge[],
8675 float facel,
8676 float Vc[],
8677 float krf,
8678 float crf,
8679 int type[],
8680 int ntype,
8681 float nbfp[],
8682 float Vnb[])
8684 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
8685 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
8687 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
8688 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
8689 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
8691 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
8692 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
8693 vector float rinvsq11,rinvsq12,rinvsq13;
8694 vector float rinvsq21,rinvsq22,rinvsq23;
8695 vector float rinvsq31,rinvsq32,rinvsq33;
8696 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33,vkrf,vcrf;
8697 vector float krsq11,krsq12,krsq13,krsq21,krsq22,krsq23,krsq31,krsq32,krsq33;
8699 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
8700 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33;
8701 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
8702 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
8703 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12,rinvsix;
8704 vector float vnb6,vnb12,vnbtot,qqOOt,qqOHt,qqHHt,c6t,c12t;
8706 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
8707 int jnra,jnrb,jnrc,jnrd,tp,tj;
8708 int j3a,j3b,j3c,j3d;
8710 nul=vec_zero();
8711 vfacel=load_float_and_splat(&facel);
8712 vkrf=load_float_and_splat(&krf);
8713 vcrf=load_float_and_splat(&crf);
8714 ii = iinr[0];
8715 qO = load_float_and_splat(charge+ii);
8716 qH = load_float_and_splat(charge+ii+1);
8717 qqOO = vec_madd(qO,qO,nul);
8718 qqOH = vec_madd(qO,qH,nul);
8719 qqHH = vec_madd(qH,qH,nul);
8720 qqOO = vec_madd(qqOO,vfacel,nul);
8721 qqOH = vec_madd(qqOH,vfacel,nul);
8722 qqHH = vec_madd(qqHH,vfacel,nul);
8723 tp = 2*type[ii];
8724 tj = (ntype+1)*tp;
8725 load_1_pair(nbfp+tj,&c6,&c12);
8726 c6 = vec_splat(c6,0);
8727 c12 = vec_splat(c12,0);
8729 for(n=0;n<nri;n++) {
8730 is3 = 3*shift[n];
8731 ii = iinr[n];
8732 ii3 = 3*ii;
8733 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
8734 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
8735 vctot = nul;
8736 vnbtot = nul;
8737 fix1 = nul;
8738 fiy1 = nul;
8739 fiz1 = nul;
8740 fix2 = nul;
8741 fiy2 = nul;
8742 fiz2 = nul;
8743 fix3 = nul;
8744 fiy3 = nul;
8745 fiz3 = nul;
8746 nj0 = jindex[n];
8747 nj1 = jindex[n+1];
8749 for(k=nj0; k<(nj1-3); k+=4) {
8750 jnra = jjnr[k];
8751 jnrb = jjnr[k+1];
8752 jnrc = jjnr[k+2];
8753 jnrd = jjnr[k+3];
8754 j3a = 3*jnra;
8755 j3b = 3*jnrb;
8756 j3c = 3*jnrc;
8757 j3d = 3*jnrd;
8758 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
8759 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
8760 dx11 = vec_sub(ix1,jx1);
8761 dx12 = vec_sub(ix1,jx2);
8762 dx13 = vec_sub(ix1,jx3);
8763 dy11 = vec_sub(iy1,jy1);
8764 dy12 = vec_sub(iy1,jy2);
8765 dy13 = vec_sub(iy1,jy3);
8766 dz11 = vec_sub(iz1,jz1);
8767 dz12 = vec_sub(iz1,jz2);
8768 dz13 = vec_sub(iz1,jz3);
8769 dx21 = vec_sub(ix2,jx1);
8770 dx22 = vec_sub(ix2,jx2);
8771 dx23 = vec_sub(ix2,jx3);
8772 dy21 = vec_sub(iy2,jy1);
8773 dy22 = vec_sub(iy2,jy2);
8774 dy23 = vec_sub(iy2,jy3);
8775 dz21 = vec_sub(iz2,jz1);
8776 dz22 = vec_sub(iz2,jz2);
8777 dz23 = vec_sub(iz2,jz3);
8778 dx31 = vec_sub(ix3,jx1);
8779 dx32 = vec_sub(ix3,jx2);
8780 dx33 = vec_sub(ix3,jx3);
8781 dy31 = vec_sub(iy3,jy1);
8782 dy32 = vec_sub(iy3,jy2);
8783 dy33 = vec_sub(iy3,jy3);
8784 dz31 = vec_sub(iz3,jz1);
8785 dz32 = vec_sub(iz3,jz2);
8786 dz33 = vec_sub(iz3,jz3);
8788 rsq11 = vec_madd(dx11,dx11,nul);
8789 rsq12 = vec_madd(dx12,dx12,nul);
8790 rsq13 = vec_madd(dx13,dx13,nul);
8791 rsq21 = vec_madd(dx21,dx21,nul);
8792 rsq22 = vec_madd(dx22,dx22,nul);
8793 rsq23 = vec_madd(dx23,dx23,nul);
8794 rsq31 = vec_madd(dx31,dx31,nul);
8795 rsq32 = vec_madd(dx32,dx32,nul);
8796 rsq33 = vec_madd(dx33,dx33,nul);
8797 rsq11 = vec_madd(dy11,dy11,rsq11);
8798 rsq12 = vec_madd(dy12,dy12,rsq12);
8799 rsq13 = vec_madd(dy13,dy13,rsq13);
8800 rsq21 = vec_madd(dy21,dy21,rsq21);
8801 rsq22 = vec_madd(dy22,dy22,rsq22);
8802 rsq23 = vec_madd(dy23,dy23,rsq23);
8803 rsq31 = vec_madd(dy31,dy31,rsq31);
8804 rsq32 = vec_madd(dy32,dy32,rsq32);
8805 rsq33 = vec_madd(dy33,dy33,rsq33);
8806 rsq11 = vec_madd(dz11,dz11,rsq11);
8807 rsq12 = vec_madd(dz12,dz12,rsq12);
8808 rsq13 = vec_madd(dz13,dz13,rsq13);
8809 rsq21 = vec_madd(dz21,dz21,rsq21);
8810 rsq22 = vec_madd(dz22,dz22,rsq22);
8811 rsq23 = vec_madd(dz23,dz23,rsq23);
8812 rsq31 = vec_madd(dz31,dz31,rsq31);
8813 rsq32 = vec_madd(dz32,dz32,rsq32);
8814 rsq33 = vec_madd(dz33,dz33,rsq33);
8816 do_9_invsqrt(rsq11,rsq12,rsq13,
8817 rsq21,rsq22,rsq23,
8818 rsq31,rsq32,rsq33,
8819 &rinv11,&rinv12,&rinv13,
8820 &rinv21,&rinv22,&rinv23,
8821 &rinv31,&rinv32,&rinv33);
8823 krsq11 = vec_madd(vkrf,rsq11,nul);
8824 krsq12 = vec_madd(vkrf,rsq12,nul);
8825 krsq13 = vec_madd(vkrf,rsq13,nul);
8826 krsq21 = vec_madd(vkrf,rsq21,nul);
8827 krsq22 = vec_madd(vkrf,rsq22,nul);
8828 krsq23 = vec_madd(vkrf,rsq23,nul);
8829 krsq31 = vec_madd(vkrf,rsq31,nul);
8830 krsq32 = vec_madd(vkrf,rsq32,nul);
8831 krsq33 = vec_madd(vkrf,rsq33,nul);
8833 rinvsq11 = vec_madd(rinv11,rinv11,nul);
8834 rinvsq12 = vec_madd(rinv12,rinv12,nul);
8835 rinvsq13 = vec_madd(rinv13,rinv13,nul);
8836 rinvsq21 = vec_madd(rinv21,rinv21,nul);
8837 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
8838 rinvsq22 = vec_madd(rinv22,rinv22,nul);
8839 rinvsq23 = vec_madd(rinv23,rinv23,nul);
8840 rinvsq31 = vec_madd(rinv31,rinv31,nul);
8841 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
8842 rinvsq32 = vec_madd(rinv32,rinv32,nul);
8843 rinvsq33 = vec_madd(rinv33,rinv33,nul);
8845 vnb6 = vec_madd(c6,rinvsix,nul);
8846 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
8847 vnbtot = vec_add(vnbtot,vnb12);
8848 vnbtot = vec_sub(vnbtot,vnb6);
8850 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
8851 vc11 = vec_add(rinv11,krsq11);
8852 vc12 = vec_add(rinv12,krsq12);
8853 vc13 = vec_add(rinv13,krsq13);
8854 vc21 = vec_add(rinv21,krsq21);
8855 vc22 = vec_add(rinv22,krsq22);
8856 vc23 = vec_add(rinv23,krsq23);
8857 vc31 = vec_add(rinv31,krsq31);
8858 vc32 = vec_add(rinv32,krsq32);
8859 vc33 = vec_add(rinv33,krsq33);
8861 fs11 = vec_madd(qqOO,fs11,nul);
8862 vc11 = vec_sub(vc11,vcrf);
8863 vc12 = vec_sub(vc12,vcrf);
8864 vc13 = vec_sub(vc13,vcrf);
8865 vc21 = vec_sub(vc21,vcrf);
8866 vc22 = vec_sub(vc22,vcrf);
8867 vc23 = vec_sub(vc23,vcrf);
8868 vc31 = vec_sub(vc31,vcrf);
8869 vc32 = vec_sub(vc32,vcrf);
8870 vc33 = vec_sub(vc33,vcrf);
8872 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
8873 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
8874 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
8875 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
8876 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
8877 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
8878 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
8879 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
8880 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
8882 fs11 = vec_madd(vec_twelve(),vnb12,fs11);
8883 fs12 = vec_madd(fs12,qqOH,nul);
8884 fs13 = vec_madd(fs13,qqOH,nul);
8885 fs21 = vec_madd(fs21,qqOH,nul);
8886 fs22 = vec_madd(fs22,qqHH,nul);
8887 fs23 = vec_madd(fs23,qqHH,nul);
8888 fs31 = vec_madd(fs31,qqOH,nul);
8889 fs32 = vec_madd(fs32,qqHH,nul);
8890 fs33 = vec_madd(fs33,qqHH,nul);
8892 fs11 = vec_madd(fs11,rinvsq11,nul);
8893 fs12 = vec_madd(fs12,rinvsq12,nul);
8894 fs13 = vec_madd(fs13,rinvsq13,nul);
8895 fs21 = vec_madd(fs21,rinvsq21,nul);
8896 fs22 = vec_madd(fs22,rinvsq22,nul);
8897 fs23 = vec_madd(fs23,rinvsq23,nul);
8898 fs31 = vec_madd(fs31,rinvsq31,nul);
8899 fs32 = vec_madd(fs32,rinvsq32,nul);
8900 fs33 = vec_madd(fs33,rinvsq33,nul);
8902 vctot = vec_madd(qqOO,vc11,vctot);
8903 vctot = vec_madd(qqOH,vc12,vctot);
8904 vctot = vec_madd(qqOH,vc13,vctot);
8905 vctot = vec_madd(qqOH,vc21,vctot);
8906 vctot = vec_madd(qqHH,vc22,vctot);
8907 vctot = vec_madd(qqHH,vc23,vctot);
8908 vctot = vec_madd(qqOH,vc31,vctot);
8909 vctot = vec_madd(qqHH,vc32,vctot);
8910 vctot = vec_madd(qqHH,vc33,vctot);
8912 fix1 = vec_madd(fs11,dx11,fix1);
8913 fiy1 = vec_madd(fs11,dy11,fiy1);
8914 fiz1 = vec_madd(fs11,dz11,fiz1);
8915 fix2 = vec_madd(fs21,dx21,fix2);
8916 fiy2 = vec_madd(fs21,dy21,fiy2);
8917 fiz2 = vec_madd(fs21,dz21,fiz2);
8918 fix3 = vec_madd(fs31,dx31,fix3);
8919 fiy3 = vec_madd(fs31,dy31,fiy3);
8920 fiz3 = vec_madd(fs31,dz31,fiz3);
8922 fix1 = vec_madd(fs12,dx12,fix1);
8923 fiy1 = vec_madd(fs12,dy12,fiy1);
8924 fiz1 = vec_madd(fs12,dz12,fiz1);
8925 fix2 = vec_madd(fs22,dx22,fix2);
8926 fiy2 = vec_madd(fs22,dy22,fiy2);
8927 fiz2 = vec_madd(fs22,dz22,fiz2);
8928 fix3 = vec_madd(fs32,dx32,fix3);
8929 fiy3 = vec_madd(fs32,dy32,fiy3);
8930 fiz3 = vec_madd(fs32,dz32,fiz3);
8932 fix1 = vec_madd(fs13,dx13,fix1);
8933 fiy1 = vec_madd(fs13,dy13,fiy1);
8934 fiz1 = vec_madd(fs13,dz13,fiz1);
8935 fix2 = vec_madd(fs23,dx23,fix2);
8936 fiy2 = vec_madd(fs23,dy23,fiy2);
8937 fiz2 = vec_madd(fs23,dz23,fiz2);
8938 fix3 = vec_madd(fs33,dx33,fix3);
8939 fiy3 = vec_madd(fs33,dy33,fiy3);
8940 fiz3 = vec_madd(fs33,dz33,fiz3);
8942 fjx1 = vec_nmsub(fs11,dx11,nul);
8943 fjy1 = vec_nmsub(fs11,dy11,nul);
8944 fjz1 = vec_nmsub(fs11,dz11,nul);
8945 fjx2 = vec_nmsub(fs12,dx12,nul);
8946 fjy2 = vec_nmsub(fs12,dy12,nul);
8947 fjz2 = vec_nmsub(fs12,dz12,nul);
8948 fjx3 = vec_nmsub(fs13,dx13,nul);
8949 fjy3 = vec_nmsub(fs13,dy13,nul);
8950 fjz3 = vec_nmsub(fs13,dz13,nul);
8952 fjx1 = vec_nmsub(fs21,dx21,fjx1);
8953 fjy1 = vec_nmsub(fs21,dy21,fjy1);
8954 fjz1 = vec_nmsub(fs21,dz21,fjz1);
8955 fjx2 = vec_nmsub(fs22,dx22,fjx2);
8956 fjy2 = vec_nmsub(fs22,dy22,fjy2);
8957 fjz2 = vec_nmsub(fs22,dz22,fjz2);
8958 fjx3 = vec_nmsub(fs23,dx23,fjx3);
8959 fjy3 = vec_nmsub(fs23,dy23,fjy3);
8960 fjz3 = vec_nmsub(fs23,dz23,fjz3);
8962 fjx1 = vec_nmsub(fs31,dx31,fjx1);
8963 fjy1 = vec_nmsub(fs31,dy31,fjy1);
8964 fjz1 = vec_nmsub(fs31,dz31,fjz1);
8965 fjx2 = vec_nmsub(fs32,dx32,fjx2);
8966 fjy2 = vec_nmsub(fs32,dy32,fjy2);
8967 fjz2 = vec_nmsub(fs32,dz32,fjz2);
8968 fjx3 = vec_nmsub(fs33,dx33,fjx3);
8969 fjy3 = vec_nmsub(fs33,dy33,fjy3);
8970 fjz3 = vec_nmsub(fs33,dz33,fjz3);
8972 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
8973 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
8975 if(k<(nj1-2)) {
8976 jnra = jjnr[k];
8977 jnrb = jjnr[k+1];
8978 jnrc = jjnr[k+2];
8979 j3a = 3*jnra;
8980 j3b = 3*jnrb;
8981 j3c = 3*jnrc;
8982 load_3_water(pos+j3a,pos+j3b,pos+j3c,
8983 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
8984 qqOOt = vec_sld(qqOO,nul,4);
8985 qqOHt = vec_sld(qqOH,nul,4);
8986 qqHHt = vec_sld(qqHH,nul,4);
8987 c6t = vec_sld(c6,nul,4);
8988 c12t = vec_sld(c12,nul,4);
8990 dx11 = vec_sub(ix1,jx1);
8991 dx12 = vec_sub(ix1,jx2);
8992 dx13 = vec_sub(ix1,jx3);
8993 dy11 = vec_sub(iy1,jy1);
8994 dy12 = vec_sub(iy1,jy2);
8995 dy13 = vec_sub(iy1,jy3);
8996 dz11 = vec_sub(iz1,jz1);
8997 dz12 = vec_sub(iz1,jz2);
8998 dz13 = vec_sub(iz1,jz3);
8999 dx21 = vec_sub(ix2,jx1);
9000 dx22 = vec_sub(ix2,jx2);
9001 dx23 = vec_sub(ix2,jx3);
9002 dy21 = vec_sub(iy2,jy1);
9003 dy22 = vec_sub(iy2,jy2);
9004 dy23 = vec_sub(iy2,jy3);
9005 dz21 = vec_sub(iz2,jz1);
9006 dz22 = vec_sub(iz2,jz2);
9007 dz23 = vec_sub(iz2,jz3);
9008 dx31 = vec_sub(ix3,jx1);
9009 dx32 = vec_sub(ix3,jx2);
9010 dx33 = vec_sub(ix3,jx3);
9011 dy31 = vec_sub(iy3,jy1);
9012 dy32 = vec_sub(iy3,jy2);
9013 dy33 = vec_sub(iy3,jy3);
9014 dz31 = vec_sub(iz3,jz1);
9015 dz32 = vec_sub(iz3,jz2);
9016 dz33 = vec_sub(iz3,jz3);
9018 rsq11 = vec_madd(dx11,dx11,nul);
9019 rsq12 = vec_madd(dx12,dx12,nul);
9020 rsq13 = vec_madd(dx13,dx13,nul);
9021 rsq21 = vec_madd(dx21,dx21,nul);
9022 rsq22 = vec_madd(dx22,dx22,nul);
9023 rsq23 = vec_madd(dx23,dx23,nul);
9024 rsq31 = vec_madd(dx31,dx31,nul);
9025 rsq32 = vec_madd(dx32,dx32,nul);
9026 rsq33 = vec_madd(dx33,dx33,nul);
9027 rsq11 = vec_madd(dy11,dy11,rsq11);
9028 rsq12 = vec_madd(dy12,dy12,rsq12);
9029 rsq13 = vec_madd(dy13,dy13,rsq13);
9030 rsq21 = vec_madd(dy21,dy21,rsq21);
9031 rsq22 = vec_madd(dy22,dy22,rsq22);
9032 rsq23 = vec_madd(dy23,dy23,rsq23);
9033 rsq31 = vec_madd(dy31,dy31,rsq31);
9034 rsq32 = vec_madd(dy32,dy32,rsq32);
9035 rsq33 = vec_madd(dy33,dy33,rsq33);
9036 rsq11 = vec_madd(dz11,dz11,rsq11);
9037 rsq12 = vec_madd(dz12,dz12,rsq12);
9038 rsq13 = vec_madd(dz13,dz13,rsq13);
9039 rsq21 = vec_madd(dz21,dz21,rsq21);
9040 rsq22 = vec_madd(dz22,dz22,rsq22);
9041 rsq23 = vec_madd(dz23,dz23,rsq23);
9042 rsq31 = vec_madd(dz31,dz31,rsq31);
9043 rsq32 = vec_madd(dz32,dz32,rsq32);
9044 rsq33 = vec_madd(dz33,dz33,rsq33);
9046 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
9047 &rsq21,&rsq22,&rsq23,
9048 &rsq31,&rsq32,&rsq33);
9050 do_9_invsqrt(rsq11,rsq12,rsq13,
9051 rsq21,rsq22,rsq23,
9052 rsq31,rsq32,rsq33,
9053 &rinv11,&rinv12,&rinv13,
9054 &rinv21,&rinv22,&rinv23,
9055 &rinv31,&rinv32,&rinv33);
9057 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
9058 &rinv21,&rinv22,&rinv23,
9059 &rinv31,&rinv32,&rinv33);
9061 krsq11 = vec_madd(vkrf,rsq11,nul);
9062 krsq12 = vec_madd(vkrf,rsq12,nul);
9063 krsq13 = vec_madd(vkrf,rsq13,nul);
9064 krsq21 = vec_madd(vkrf,rsq21,nul);
9065 krsq22 = vec_madd(vkrf,rsq22,nul);
9066 krsq23 = vec_madd(vkrf,rsq23,nul);
9067 krsq31 = vec_madd(vkrf,rsq31,nul);
9068 krsq32 = vec_madd(vkrf,rsq32,nul);
9069 krsq33 = vec_madd(vkrf,rsq33,nul);
9071 rinvsq11 = vec_madd(rinv11,rinv11,nul);
9072 rinvsq12 = vec_madd(rinv12,rinv12,nul);
9073 rinvsq13 = vec_madd(rinv13,rinv13,nul);
9074 rinvsq21 = vec_madd(rinv21,rinv21,nul);
9075 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
9076 rinvsq22 = vec_madd(rinv22,rinv22,nul);
9077 rinvsq23 = vec_madd(rinv23,rinv23,nul);
9078 rinvsq31 = vec_madd(rinv31,rinv31,nul);
9079 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
9080 rinvsq32 = vec_madd(rinv32,rinv32,nul);
9081 rinvsq33 = vec_madd(rinv33,rinv33,nul);
9083 vnb6 = vec_madd(c6t,rinvsix,nul);
9084 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
9085 vnbtot = vec_add(vnbtot,vnb12);
9086 vnbtot = vec_sub(vnbtot,vnb6);
9088 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
9089 vc11 = vec_add(rinv11,krsq11);
9090 vc12 = vec_add(rinv12,krsq12);
9091 vc13 = vec_add(rinv13,krsq13);
9092 vc21 = vec_add(rinv21,krsq21);
9093 vc22 = vec_add(rinv22,krsq22);
9094 vc23 = vec_add(rinv23,krsq23);
9095 vc31 = vec_add(rinv31,krsq31);
9096 vc32 = vec_add(rinv32,krsq32);
9097 vc33 = vec_add(rinv33,krsq33);
9099 fs11 = vec_madd(qqOOt,fs11,nul);
9100 vc11 = vec_sub(vc11,vcrf);
9101 vc12 = vec_sub(vc12,vcrf);
9102 vc13 = vec_sub(vc13,vcrf);
9103 vc21 = vec_sub(vc21,vcrf);
9104 vc22 = vec_sub(vc22,vcrf);
9105 vc23 = vec_sub(vc23,vcrf);
9106 vc31 = vec_sub(vc31,vcrf);
9107 vc32 = vec_sub(vc32,vcrf);
9108 vc33 = vec_sub(vc33,vcrf);
9110 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
9111 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
9112 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
9113 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
9114 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
9115 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
9116 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
9117 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
9118 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
9120 fs11 = vec_madd(vec_twelve(),vnb12,fs11);
9121 fs12 = vec_madd(fs12,qqOHt,nul);
9122 fs13 = vec_madd(fs13,qqOHt,nul);
9123 fs21 = vec_madd(fs21,qqOHt,nul);
9124 fs22 = vec_madd(fs22,qqHHt,nul);
9125 fs23 = vec_madd(fs23,qqHHt,nul);
9126 fs31 = vec_madd(fs31,qqOHt,nul);
9127 fs32 = vec_madd(fs32,qqHHt,nul);
9128 fs33 = vec_madd(fs33,qqHHt,nul);
9130 fs11 = vec_madd(fs11,rinvsq11,nul);
9131 fs12 = vec_madd(fs12,rinvsq12,nul);
9132 fs13 = vec_madd(fs13,rinvsq13,nul);
9133 fs21 = vec_madd(fs21,rinvsq21,nul);
9134 fs22 = vec_madd(fs22,rinvsq22,nul);
9135 fs23 = vec_madd(fs23,rinvsq23,nul);
9136 fs31 = vec_madd(fs31,rinvsq31,nul);
9137 fs32 = vec_madd(fs32,rinvsq32,nul);
9138 fs33 = vec_madd(fs33,rinvsq33,nul);
9140 vctot = vec_madd(qqOOt,vc11,vctot);
9141 vctot = vec_madd(qqOHt,vc12,vctot);
9142 vctot = vec_madd(qqOHt,vc13,vctot);
9143 vctot = vec_madd(qqOHt,vc21,vctot);
9144 vctot = vec_madd(qqHHt,vc22,vctot);
9145 vctot = vec_madd(qqHHt,vc23,vctot);
9146 vctot = vec_madd(qqOHt,vc31,vctot);
9147 vctot = vec_madd(qqHHt,vc32,vctot);
9148 vctot = vec_madd(qqHHt,vc33,vctot);
9150 fix1 = vec_madd(fs11,dx11,fix1);
9151 fiy1 = vec_madd(fs11,dy11,fiy1);
9152 fiz1 = vec_madd(fs11,dz11,fiz1);
9153 fix2 = vec_madd(fs21,dx21,fix2);
9154 fiy2 = vec_madd(fs21,dy21,fiy2);
9155 fiz2 = vec_madd(fs21,dz21,fiz2);
9156 fix3 = vec_madd(fs31,dx31,fix3);
9157 fiy3 = vec_madd(fs31,dy31,fiy3);
9158 fiz3 = vec_madd(fs31,dz31,fiz3);
9160 fix1 = vec_madd(fs12,dx12,fix1);
9161 fiy1 = vec_madd(fs12,dy12,fiy1);
9162 fiz1 = vec_madd(fs12,dz12,fiz1);
9163 fix2 = vec_madd(fs22,dx22,fix2);
9164 fiy2 = vec_madd(fs22,dy22,fiy2);
9165 fiz2 = vec_madd(fs22,dz22,fiz2);
9166 fix3 = vec_madd(fs32,dx32,fix3);
9167 fiy3 = vec_madd(fs32,dy32,fiy3);
9168 fiz3 = vec_madd(fs32,dz32,fiz3);
9170 fix1 = vec_madd(fs13,dx13,fix1);
9171 fiy1 = vec_madd(fs13,dy13,fiy1);
9172 fiz1 = vec_madd(fs13,dz13,fiz1);
9173 fix2 = vec_madd(fs23,dx23,fix2);
9174 fiy2 = vec_madd(fs23,dy23,fiy2);
9175 fiz2 = vec_madd(fs23,dz23,fiz2);
9176 fix3 = vec_madd(fs33,dx33,fix3);
9177 fiy3 = vec_madd(fs33,dy33,fiy3);
9178 fiz3 = vec_madd(fs33,dz33,fiz3);
9180 fjx1 = vec_nmsub(fs11,dx11,nul);
9181 fjy1 = vec_nmsub(fs11,dy11,nul);
9182 fjz1 = vec_nmsub(fs11,dz11,nul);
9183 fjx2 = vec_nmsub(fs12,dx12,nul);
9184 fjy2 = vec_nmsub(fs12,dy12,nul);
9185 fjz2 = vec_nmsub(fs12,dz12,nul);
9186 fjx3 = vec_nmsub(fs13,dx13,nul);
9187 fjy3 = vec_nmsub(fs13,dy13,nul);
9188 fjz3 = vec_nmsub(fs13,dz13,nul);
9190 fjx1 = vec_nmsub(fs21,dx21,fjx1);
9191 fjy1 = vec_nmsub(fs21,dy21,fjy1);
9192 fjz1 = vec_nmsub(fs21,dz21,fjz1);
9193 fjx2 = vec_nmsub(fs22,dx22,fjx2);
9194 fjy2 = vec_nmsub(fs22,dy22,fjy2);
9195 fjz2 = vec_nmsub(fs22,dz22,fjz2);
9196 fjx3 = vec_nmsub(fs23,dx23,fjx3);
9197 fjy3 = vec_nmsub(fs23,dy23,fjy3);
9198 fjz3 = vec_nmsub(fs23,dz23,fjz3);
9200 fjx1 = vec_nmsub(fs31,dx31,fjx1);
9201 fjy1 = vec_nmsub(fs31,dy31,fjy1);
9202 fjz1 = vec_nmsub(fs31,dz31,fjz1);
9203 fjx2 = vec_nmsub(fs32,dx32,fjx2);
9204 fjy2 = vec_nmsub(fs32,dy32,fjy2);
9205 fjz2 = vec_nmsub(fs32,dz32,fjz2);
9206 fjx3 = vec_nmsub(fs33,dx33,fjx3);
9207 fjy3 = vec_nmsub(fs33,dy33,fjy3);
9208 fjz3 = vec_nmsub(fs33,dz33,fjz3);
9210 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
9211 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
9212 } else if(k<(nj1-1)) {
9213 jnra = jjnr[k];
9214 jnrb = jjnr[k+1];
9215 j3a = 3*jnra;
9216 j3b = 3*jnrb;
9217 load_2_water(pos+j3a,pos+j3b,
9218 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
9219 qqOOt = vec_sld(qqOO,nul,8);
9220 qqOHt = vec_sld(qqOH,nul,8);
9221 qqHHt = vec_sld(qqHH,nul,8);
9222 c6t = vec_sld(c6,nul,8);
9223 c12t = vec_sld(c12,nul,8);
9225 dx11 = vec_sub(ix1,jx1);
9226 dx12 = vec_sub(ix1,jx2);
9227 dx13 = vec_sub(ix1,jx3);
9228 dy11 = vec_sub(iy1,jy1);
9229 dy12 = vec_sub(iy1,jy2);
9230 dy13 = vec_sub(iy1,jy3);
9231 dz11 = vec_sub(iz1,jz1);
9232 dz12 = vec_sub(iz1,jz2);
9233 dz13 = vec_sub(iz1,jz3);
9234 dx21 = vec_sub(ix2,jx1);
9235 dx22 = vec_sub(ix2,jx2);
9236 dx23 = vec_sub(ix2,jx3);
9237 dy21 = vec_sub(iy2,jy1);
9238 dy22 = vec_sub(iy2,jy2);
9239 dy23 = vec_sub(iy2,jy3);
9240 dz21 = vec_sub(iz2,jz1);
9241 dz22 = vec_sub(iz2,jz2);
9242 dz23 = vec_sub(iz2,jz3);
9243 dx31 = vec_sub(ix3,jx1);
9244 dx32 = vec_sub(ix3,jx2);
9245 dx33 = vec_sub(ix3,jx3);
9246 dy31 = vec_sub(iy3,jy1);
9247 dy32 = vec_sub(iy3,jy2);
9248 dy33 = vec_sub(iy3,jy3);
9249 dz31 = vec_sub(iz3,jz1);
9250 dz32 = vec_sub(iz3,jz2);
9251 dz33 = vec_sub(iz3,jz3);
9253 rsq11 = vec_madd(dx11,dx11,nul);
9254 rsq12 = vec_madd(dx12,dx12,nul);
9255 rsq13 = vec_madd(dx13,dx13,nul);
9256 rsq21 = vec_madd(dx21,dx21,nul);
9257 rsq22 = vec_madd(dx22,dx22,nul);
9258 rsq23 = vec_madd(dx23,dx23,nul);
9259 rsq31 = vec_madd(dx31,dx31,nul);
9260 rsq32 = vec_madd(dx32,dx32,nul);
9261 rsq33 = vec_madd(dx33,dx33,nul);
9262 rsq11 = vec_madd(dy11,dy11,rsq11);
9263 rsq12 = vec_madd(dy12,dy12,rsq12);
9264 rsq13 = vec_madd(dy13,dy13,rsq13);
9265 rsq21 = vec_madd(dy21,dy21,rsq21);
9266 rsq22 = vec_madd(dy22,dy22,rsq22);
9267 rsq23 = vec_madd(dy23,dy23,rsq23);
9268 rsq31 = vec_madd(dy31,dy31,rsq31);
9269 rsq32 = vec_madd(dy32,dy32,rsq32);
9270 rsq33 = vec_madd(dy33,dy33,rsq33);
9271 rsq11 = vec_madd(dz11,dz11,rsq11);
9272 rsq12 = vec_madd(dz12,dz12,rsq12);
9273 rsq13 = vec_madd(dz13,dz13,rsq13);
9274 rsq21 = vec_madd(dz21,dz21,rsq21);
9275 rsq22 = vec_madd(dz22,dz22,rsq22);
9276 rsq23 = vec_madd(dz23,dz23,rsq23);
9277 rsq31 = vec_madd(dz31,dz31,rsq31);
9278 rsq32 = vec_madd(dz32,dz32,rsq32);
9279 rsq33 = vec_madd(dz33,dz33,rsq33);
9281 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
9282 &rsq21,&rsq22,&rsq23,
9283 &rsq31,&rsq32,&rsq33);
9285 do_9_invsqrt(rsq11,rsq12,rsq13,
9286 rsq21,rsq22,rsq23,
9287 rsq31,rsq32,rsq33,
9288 &rinv11,&rinv12,&rinv13,
9289 &rinv21,&rinv22,&rinv23,
9290 &rinv31,&rinv32,&rinv33);
9292 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
9293 &rinv21,&rinv22,&rinv23,
9294 &rinv31,&rinv32,&rinv33);
9296 krsq11 = vec_madd(vkrf,rsq11,nul);
9297 krsq12 = vec_madd(vkrf,rsq12,nul);
9298 krsq13 = vec_madd(vkrf,rsq13,nul);
9299 krsq21 = vec_madd(vkrf,rsq21,nul);
9300 krsq22 = vec_madd(vkrf,rsq22,nul);
9301 krsq23 = vec_madd(vkrf,rsq23,nul);
9302 krsq31 = vec_madd(vkrf,rsq31,nul);
9303 krsq32 = vec_madd(vkrf,rsq32,nul);
9304 krsq33 = vec_madd(vkrf,rsq33,nul);
9306 rinvsq11 = vec_madd(rinv11,rinv11,nul);
9307 rinvsq12 = vec_madd(rinv12,rinv12,nul);
9308 rinvsq13 = vec_madd(rinv13,rinv13,nul);
9309 rinvsq21 = vec_madd(rinv21,rinv21,nul);
9310 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
9311 rinvsq22 = vec_madd(rinv22,rinv22,nul);
9312 rinvsq23 = vec_madd(rinv23,rinv23,nul);
9313 rinvsq31 = vec_madd(rinv31,rinv31,nul);
9314 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
9315 rinvsq32 = vec_madd(rinv32,rinv32,nul);
9316 rinvsq33 = vec_madd(rinv33,rinv33,nul);
9318 vnb6 = vec_madd(c6t,rinvsix,nul);
9319 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
9320 vnbtot = vec_add(vnbtot,vnb12);
9321 vnbtot = vec_sub(vnbtot,vnb6);
9323 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
9324 vc11 = vec_add(rinv11,krsq11);
9325 vc12 = vec_add(rinv12,krsq12);
9326 vc13 = vec_add(rinv13,krsq13);
9327 vc21 = vec_add(rinv21,krsq21);
9328 vc22 = vec_add(rinv22,krsq22);
9329 vc23 = vec_add(rinv23,krsq23);
9330 vc31 = vec_add(rinv31,krsq31);
9331 vc32 = vec_add(rinv32,krsq32);
9332 vc33 = vec_add(rinv33,krsq33);
9334 fs11 = vec_madd(qqOOt,fs11,nul);
9335 vc11 = vec_sub(vc11,vcrf);
9336 vc12 = vec_sub(vc12,vcrf);
9337 vc13 = vec_sub(vc13,vcrf);
9338 vc21 = vec_sub(vc21,vcrf);
9339 vc22 = vec_sub(vc22,vcrf);
9340 vc23 = vec_sub(vc23,vcrf);
9341 vc31 = vec_sub(vc31,vcrf);
9342 vc32 = vec_sub(vc32,vcrf);
9343 vc33 = vec_sub(vc33,vcrf);
9345 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
9346 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
9347 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
9348 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
9349 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
9350 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
9351 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
9352 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
9353 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
9355 fs11 = vec_madd(vec_twelve(),vnb12,fs11);
9356 fs12 = vec_madd(fs12,qqOHt,nul);
9357 fs13 = vec_madd(fs13,qqOHt,nul);
9358 fs21 = vec_madd(fs21,qqOHt,nul);
9359 fs22 = vec_madd(fs22,qqHHt,nul);
9360 fs23 = vec_madd(fs23,qqHHt,nul);
9361 fs31 = vec_madd(fs31,qqOHt,nul);
9362 fs32 = vec_madd(fs32,qqHHt,nul);
9363 fs33 = vec_madd(fs33,qqHHt,nul);
9365 fs11 = vec_madd(fs11,rinvsq11,nul);
9366 fs12 = vec_madd(fs12,rinvsq12,nul);
9367 fs13 = vec_madd(fs13,rinvsq13,nul);
9368 fs21 = vec_madd(fs21,rinvsq21,nul);
9369 fs22 = vec_madd(fs22,rinvsq22,nul);
9370 fs23 = vec_madd(fs23,rinvsq23,nul);
9371 fs31 = vec_madd(fs31,rinvsq31,nul);
9372 fs32 = vec_madd(fs32,rinvsq32,nul);
9373 fs33 = vec_madd(fs33,rinvsq33,nul);
9375 vctot = vec_madd(qqOOt,vc11,vctot);
9376 vctot = vec_madd(qqOHt,vc12,vctot);
9377 vctot = vec_madd(qqOHt,vc13,vctot);
9378 vctot = vec_madd(qqOHt,vc21,vctot);
9379 vctot = vec_madd(qqHHt,vc22,vctot);
9380 vctot = vec_madd(qqHHt,vc23,vctot);
9381 vctot = vec_madd(qqOHt,vc31,vctot);
9382 vctot = vec_madd(qqHHt,vc32,vctot);
9383 vctot = vec_madd(qqHHt,vc33,vctot);
9385 fix1 = vec_madd(fs11,dx11,fix1);
9386 fiy1 = vec_madd(fs11,dy11,fiy1);
9387 fiz1 = vec_madd(fs11,dz11,fiz1);
9388 fix2 = vec_madd(fs21,dx21,fix2);
9389 fiy2 = vec_madd(fs21,dy21,fiy2);
9390 fiz2 = vec_madd(fs21,dz21,fiz2);
9391 fix3 = vec_madd(fs31,dx31,fix3);
9392 fiy3 = vec_madd(fs31,dy31,fiy3);
9393 fiz3 = vec_madd(fs31,dz31,fiz3);
9395 fix1 = vec_madd(fs12,dx12,fix1);
9396 fiy1 = vec_madd(fs12,dy12,fiy1);
9397 fiz1 = vec_madd(fs12,dz12,fiz1);
9398 fix2 = vec_madd(fs22,dx22,fix2);
9399 fiy2 = vec_madd(fs22,dy22,fiy2);
9400 fiz2 = vec_madd(fs22,dz22,fiz2);
9401 fix3 = vec_madd(fs32,dx32,fix3);
9402 fiy3 = vec_madd(fs32,dy32,fiy3);
9403 fiz3 = vec_madd(fs32,dz32,fiz3);
9405 fix1 = vec_madd(fs13,dx13,fix1);
9406 fiy1 = vec_madd(fs13,dy13,fiy1);
9407 fiz1 = vec_madd(fs13,dz13,fiz1);
9408 fix2 = vec_madd(fs23,dx23,fix2);
9409 fiy2 = vec_madd(fs23,dy23,fiy2);
9410 fiz2 = vec_madd(fs23,dz23,fiz2);
9411 fix3 = vec_madd(fs33,dx33,fix3);
9412 fiy3 = vec_madd(fs33,dy33,fiy3);
9413 fiz3 = vec_madd(fs33,dz33,fiz3);
9415 fjx1 = vec_nmsub(fs11,dx11,nul);
9416 fjy1 = vec_nmsub(fs11,dy11,nul);
9417 fjz1 = vec_nmsub(fs11,dz11,nul);
9418 fjx2 = vec_nmsub(fs12,dx12,nul);
9419 fjy2 = vec_nmsub(fs12,dy12,nul);
9420 fjz2 = vec_nmsub(fs12,dz12,nul);
9421 fjx3 = vec_nmsub(fs13,dx13,nul);
9422 fjy3 = vec_nmsub(fs13,dy13,nul);
9423 fjz3 = vec_nmsub(fs13,dz13,nul);
9425 fjx1 = vec_nmsub(fs21,dx21,fjx1);
9426 fjy1 = vec_nmsub(fs21,dy21,fjy1);
9427 fjz1 = vec_nmsub(fs21,dz21,fjz1);
9428 fjx2 = vec_nmsub(fs22,dx22,fjx2);
9429 fjy2 = vec_nmsub(fs22,dy22,fjy2);
9430 fjz2 = vec_nmsub(fs22,dz22,fjz2);
9431 fjx3 = vec_nmsub(fs23,dx23,fjx3);
9432 fjy3 = vec_nmsub(fs23,dy23,fjy3);
9433 fjz3 = vec_nmsub(fs23,dz23,fjz3);
9435 fjx1 = vec_nmsub(fs31,dx31,fjx1);
9436 fjy1 = vec_nmsub(fs31,dy31,fjy1);
9437 fjz1 = vec_nmsub(fs31,dz31,fjz1);
9438 fjx2 = vec_nmsub(fs32,dx32,fjx2);
9439 fjy2 = vec_nmsub(fs32,dy32,fjy2);
9440 fjz2 = vec_nmsub(fs32,dz32,fjz2);
9441 fjx3 = vec_nmsub(fs33,dx33,fjx3);
9442 fjy3 = vec_nmsub(fs33,dy33,fjy3);
9443 fjz3 = vec_nmsub(fs33,dz33,fjz3);
9445 add_force_to_2_water(faction+j3a,faction+j3b,
9446 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
9447 } else if(k<nj1) {
9448 jnra = jjnr[k];
9449 j3a = 3*jnra;
9450 load_1_water(pos+j3a,
9451 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
9452 qqOOt = vec_sld(qqOO,nul,12);
9453 qqOHt = vec_sld(qqOH,nul,12);
9454 qqHHt = vec_sld(qqHH,nul,12);
9455 c6t = vec_sld(c6,nul,12);
9456 c12t = vec_sld(c12,nul,12);
9458 dx11 = vec_sub(ix1,jx1);
9459 dx12 = vec_sub(ix1,jx2);
9460 dx13 = vec_sub(ix1,jx3);
9461 dy11 = vec_sub(iy1,jy1);
9462 dy12 = vec_sub(iy1,jy2);
9463 dy13 = vec_sub(iy1,jy3);
9464 dz11 = vec_sub(iz1,jz1);
9465 dz12 = vec_sub(iz1,jz2);
9466 dz13 = vec_sub(iz1,jz3);
9467 dx21 = vec_sub(ix2,jx1);
9468 dx22 = vec_sub(ix2,jx2);
9469 dx23 = vec_sub(ix2,jx3);
9470 dy21 = vec_sub(iy2,jy1);
9471 dy22 = vec_sub(iy2,jy2);
9472 dy23 = vec_sub(iy2,jy3);
9473 dz21 = vec_sub(iz2,jz1);
9474 dz22 = vec_sub(iz2,jz2);
9475 dz23 = vec_sub(iz2,jz3);
9476 dx31 = vec_sub(ix3,jx1);
9477 dx32 = vec_sub(ix3,jx2);
9478 dx33 = vec_sub(ix3,jx3);
9479 dy31 = vec_sub(iy3,jy1);
9480 dy32 = vec_sub(iy3,jy2);
9481 dy33 = vec_sub(iy3,jy3);
9482 dz31 = vec_sub(iz3,jz1);
9483 dz32 = vec_sub(iz3,jz2);
9484 dz33 = vec_sub(iz3,jz3);
9486 rsq11 = vec_madd(dx11,dx11,nul);
9487 rsq12 = vec_madd(dx12,dx12,nul);
9488 rsq13 = vec_madd(dx13,dx13,nul);
9489 rsq21 = vec_madd(dx21,dx21,nul);
9490 rsq22 = vec_madd(dx22,dx22,nul);
9491 rsq23 = vec_madd(dx23,dx23,nul);
9492 rsq31 = vec_madd(dx31,dx31,nul);
9493 rsq32 = vec_madd(dx32,dx32,nul);
9494 rsq33 = vec_madd(dx33,dx33,nul);
9495 rsq11 = vec_madd(dy11,dy11,rsq11);
9496 rsq12 = vec_madd(dy12,dy12,rsq12);
9497 rsq13 = vec_madd(dy13,dy13,rsq13);
9498 rsq21 = vec_madd(dy21,dy21,rsq21);
9499 rsq22 = vec_madd(dy22,dy22,rsq22);
9500 rsq23 = vec_madd(dy23,dy23,rsq23);
9501 rsq31 = vec_madd(dy31,dy31,rsq31);
9502 rsq32 = vec_madd(dy32,dy32,rsq32);
9503 rsq33 = vec_madd(dy33,dy33,rsq33);
9504 rsq11 = vec_madd(dz11,dz11,rsq11);
9505 rsq12 = vec_madd(dz12,dz12,rsq12);
9506 rsq13 = vec_madd(dz13,dz13,rsq13);
9507 rsq21 = vec_madd(dz21,dz21,rsq21);
9508 rsq22 = vec_madd(dz22,dz22,rsq22);
9509 rsq23 = vec_madd(dz23,dz23,rsq23);
9510 rsq31 = vec_madd(dz31,dz31,rsq31);
9511 rsq32 = vec_madd(dz32,dz32,rsq32);
9512 rsq33 = vec_madd(dz33,dz33,rsq33);
9514 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
9515 &rsq21,&rsq22,&rsq23,
9516 &rsq31,&rsq32,&rsq33);
9518 do_9_invsqrt(rsq11,rsq12,rsq13,
9519 rsq21,rsq22,rsq23,
9520 rsq31,rsq32,rsq33,
9521 &rinv11,&rinv12,&rinv13,
9522 &rinv21,&rinv22,&rinv23,
9523 &rinv31,&rinv32,&rinv33);
9525 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
9526 &rinv21,&rinv22,&rinv23,
9527 &rinv31,&rinv32,&rinv33);
9529 krsq11 = vec_madd(vkrf,rsq11,nul);
9530 krsq12 = vec_madd(vkrf,rsq12,nul);
9531 krsq13 = vec_madd(vkrf,rsq13,nul);
9532 krsq21 = vec_madd(vkrf,rsq21,nul);
9533 krsq22 = vec_madd(vkrf,rsq22,nul);
9534 krsq23 = vec_madd(vkrf,rsq23,nul);
9535 krsq31 = vec_madd(vkrf,rsq31,nul);
9536 krsq32 = vec_madd(vkrf,rsq32,nul);
9537 krsq33 = vec_madd(vkrf,rsq33,nul);
9539 rinvsq11 = vec_madd(rinv11,rinv11,nul);
9540 rinvsq12 = vec_madd(rinv12,rinv12,nul);
9541 rinvsq13 = vec_madd(rinv13,rinv13,nul);
9542 rinvsq21 = vec_madd(rinv21,rinv21,nul);
9543 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
9544 rinvsq22 = vec_madd(rinv22,rinv22,nul);
9545 rinvsq23 = vec_madd(rinv23,rinv23,nul);
9546 rinvsq31 = vec_madd(rinv31,rinv31,nul);
9547 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
9548 rinvsq32 = vec_madd(rinv32,rinv32,nul);
9549 rinvsq33 = vec_madd(rinv33,rinv33,nul);
9551 vnb6 = vec_madd(c6t,rinvsix,nul);
9552 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
9553 vnbtot = vec_add(vnbtot,vnb12);
9554 vnbtot = vec_sub(vnbtot,vnb6);
9556 fs11 = vec_nmsub(vec_two(),krsq11,rinv11);
9557 vc11 = vec_add(rinv11,krsq11);
9558 vc12 = vec_add(rinv12,krsq12);
9559 vc13 = vec_add(rinv13,krsq13);
9560 vc21 = vec_add(rinv21,krsq21);
9561 vc22 = vec_add(rinv22,krsq22);
9562 vc23 = vec_add(rinv23,krsq23);
9563 vc31 = vec_add(rinv31,krsq31);
9564 vc32 = vec_add(rinv32,krsq32);
9565 vc33 = vec_add(rinv33,krsq33);
9567 fs11 = vec_madd(qqOOt,fs11,nul);
9568 vc11 = vec_sub(vc11,vcrf);
9569 vc12 = vec_sub(vc12,vcrf);
9570 vc13 = vec_sub(vc13,vcrf);
9571 vc21 = vec_sub(vc21,vcrf);
9572 vc22 = vec_sub(vc22,vcrf);
9573 vc23 = vec_sub(vc23,vcrf);
9574 vc31 = vec_sub(vc31,vcrf);
9575 vc32 = vec_sub(vc32,vcrf);
9576 vc33 = vec_sub(vc33,vcrf);
9578 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
9579 fs12 = vec_nmsub(vec_two(),krsq12,rinv12);
9580 fs13 = vec_nmsub(vec_two(),krsq13,rinv13);
9581 fs21 = vec_nmsub(vec_two(),krsq21,rinv21);
9582 fs22 = vec_nmsub(vec_two(),krsq22,rinv22);
9583 fs23 = vec_nmsub(vec_two(),krsq23,rinv23);
9584 fs31 = vec_nmsub(vec_two(),krsq31,rinv31);
9585 fs32 = vec_nmsub(vec_two(),krsq32,rinv32);
9586 fs33 = vec_nmsub(vec_two(),krsq33,rinv33);
9588 fs11 = vec_madd(vec_twelve(),vnb12,fs11);
9589 fs12 = vec_madd(fs12,qqOHt,nul);
9590 fs13 = vec_madd(fs13,qqOHt,nul);
9591 fs21 = vec_madd(fs21,qqOHt,nul);
9592 fs22 = vec_madd(fs22,qqHHt,nul);
9593 fs23 = vec_madd(fs23,qqHHt,nul);
9594 fs31 = vec_madd(fs31,qqOHt,nul);
9595 fs32 = vec_madd(fs32,qqHHt,nul);
9596 fs33 = vec_madd(fs33,qqHHt,nul);
9598 fs11 = vec_madd(fs11,rinvsq11,nul);
9599 fs12 = vec_madd(fs12,rinvsq12,nul);
9600 fs13 = vec_madd(fs13,rinvsq13,nul);
9601 fs21 = vec_madd(fs21,rinvsq21,nul);
9602 fs22 = vec_madd(fs22,rinvsq22,nul);
9603 fs23 = vec_madd(fs23,rinvsq23,nul);
9604 fs31 = vec_madd(fs31,rinvsq31,nul);
9605 fs32 = vec_madd(fs32,rinvsq32,nul);
9606 fs33 = vec_madd(fs33,rinvsq33,nul);
9608 vctot = vec_madd(qqOOt,vc11,vctot);
9609 vctot = vec_madd(qqOHt,vc12,vctot);
9610 vctot = vec_madd(qqOHt,vc13,vctot);
9611 vctot = vec_madd(qqOHt,vc21,vctot);
9612 vctot = vec_madd(qqHHt,vc22,vctot);
9613 vctot = vec_madd(qqHHt,vc23,vctot);
9614 vctot = vec_madd(qqOHt,vc31,vctot);
9615 vctot = vec_madd(qqHHt,vc32,vctot);
9616 vctot = vec_madd(qqHHt,vc33,vctot);
9618 fix1 = vec_madd(fs11,dx11,fix1);
9619 fiy1 = vec_madd(fs11,dy11,fiy1);
9620 fiz1 = vec_madd(fs11,dz11,fiz1);
9621 fix2 = vec_madd(fs21,dx21,fix2);
9622 fiy2 = vec_madd(fs21,dy21,fiy2);
9623 fiz2 = vec_madd(fs21,dz21,fiz2);
9624 fix3 = vec_madd(fs31,dx31,fix3);
9625 fiy3 = vec_madd(fs31,dy31,fiy3);
9626 fiz3 = vec_madd(fs31,dz31,fiz3);
9628 fix1 = vec_madd(fs12,dx12,fix1);
9629 fiy1 = vec_madd(fs12,dy12,fiy1);
9630 fiz1 = vec_madd(fs12,dz12,fiz1);
9631 fix2 = vec_madd(fs22,dx22,fix2);
9632 fiy2 = vec_madd(fs22,dy22,fiy2);
9633 fiz2 = vec_madd(fs22,dz22,fiz2);
9634 fix3 = vec_madd(fs32,dx32,fix3);
9635 fiy3 = vec_madd(fs32,dy32,fiy3);
9636 fiz3 = vec_madd(fs32,dz32,fiz3);
9638 fix1 = vec_madd(fs13,dx13,fix1);
9639 fiy1 = vec_madd(fs13,dy13,fiy1);
9640 fiz1 = vec_madd(fs13,dz13,fiz1);
9641 fix2 = vec_madd(fs23,dx23,fix2);
9642 fiy2 = vec_madd(fs23,dy23,fiy2);
9643 fiz2 = vec_madd(fs23,dz23,fiz2);
9644 fix3 = vec_madd(fs33,dx33,fix3);
9645 fiy3 = vec_madd(fs33,dy33,fiy3);
9646 fiz3 = vec_madd(fs33,dz33,fiz3);
9648 fjx1 = vec_nmsub(fs11,dx11,nul);
9649 fjy1 = vec_nmsub(fs11,dy11,nul);
9650 fjz1 = vec_nmsub(fs11,dz11,nul);
9651 fjx2 = vec_nmsub(fs12,dx12,nul);
9652 fjy2 = vec_nmsub(fs12,dy12,nul);
9653 fjz2 = vec_nmsub(fs12,dz12,nul);
9654 fjx3 = vec_nmsub(fs13,dx13,nul);
9655 fjy3 = vec_nmsub(fs13,dy13,nul);
9656 fjz3 = vec_nmsub(fs13,dz13,nul);
9658 fjx1 = vec_nmsub(fs21,dx21,fjx1);
9659 fjy1 = vec_nmsub(fs21,dy21,fjy1);
9660 fjz1 = vec_nmsub(fs21,dz21,fjz1);
9661 fjx2 = vec_nmsub(fs22,dx22,fjx2);
9662 fjy2 = vec_nmsub(fs22,dy22,fjy2);
9663 fjz2 = vec_nmsub(fs22,dz22,fjz2);
9664 fjx3 = vec_nmsub(fs23,dx23,fjx3);
9665 fjy3 = vec_nmsub(fs23,dy23,fjy3);
9666 fjz3 = vec_nmsub(fs23,dz23,fjz3);
9668 fjx1 = vec_nmsub(fs31,dx31,fjx1);
9669 fjy1 = vec_nmsub(fs31,dy31,fjy1);
9670 fjz1 = vec_nmsub(fs31,dz31,fjz1);
9671 fjx2 = vec_nmsub(fs32,dx32,fjx2);
9672 fjy2 = vec_nmsub(fs32,dy32,fjy2);
9673 fjz2 = vec_nmsub(fs32,dz32,fjz2);
9674 fjx3 = vec_nmsub(fs33,dx33,fjx3);
9675 fjy3 = vec_nmsub(fs33,dy33,fjy3);
9676 fjz3 = vec_nmsub(fs33,dz33,fjz3);
9678 add_force_to_1_water(faction+j3a,
9679 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
9681 /* update outer data */
9682 update_i_water_forces(faction+ii3,fshift+is3,
9683 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
9685 add_vector_to_float(Vc+gid[n],vctot);
9686 add_vector_to_float(Vnb+gid[n],vnbtot);
9692 void inl3030_altivec(
9693 int nri,
9694 int iinr[],
9695 int jindex[],
9696 int jjnr[],
9697 int shift[],
9698 float shiftvec[],
9699 float fshift[],
9700 int gid[],
9701 float pos[],
9702 float faction[],
9703 float charge[],
9704 float facel,
9705 float Vc[],
9706 float tabscale,
9707 float VFtab[])
9709 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
9710 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
9712 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
9713 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
9714 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
9716 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
9717 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
9718 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
9719 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33;
9721 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
9722 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33;
9723 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
9724 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
9725 vector float vctot,qqOO,qqOH,qqHH,qO,qH,tsc;
9726 vector float VV11c,FF11c,VV12c,FF12c,VV13c,FF13c;
9727 vector float VV21c,FF21c,VV22c,FF22c,VV23c,FF23c;
9728 vector float VV31c,FF31c,VV32c,FF32c,VV33c,FF33c;
9729 vector float qqOOt,qqOHt,qqHHt;
9731 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
9732 int jnra,jnrb,jnrc,jnrd;
9733 int j3a,j3b,j3c,j3d;
9735 nul=vec_zero();
9736 vfacel=load_float_and_splat(&facel);
9737 tsc=load_float_and_splat(&tabscale);
9738 qO = load_float_and_splat(charge+iinr[0]);
9739 qH = load_float_and_splat(charge+iinr[0]+1);
9740 qqOO = vec_madd(qO,qO,nul);
9741 qqOH = vec_madd(qO,qH,nul);
9742 qqHH = vec_madd(qH,qH,nul);
9743 qqOO = vec_madd(qqOO,vfacel,nul);
9744 qqOH = vec_madd(qqOH,vfacel,nul);
9745 qqHH = vec_madd(qqHH,vfacel,nul);
9747 for(n=0;n<nri;n++) {
9748 is3 = 3*shift[n];
9749 ii = iinr[n];
9750 ii3 = 3*ii;
9751 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
9752 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
9753 vctot = nul;
9754 fix1 = nul;
9755 fiy1 = nul;
9756 fiz1 = nul;
9757 fix2 = nul;
9758 fiy2 = nul;
9759 fiz2 = nul;
9760 fix3 = nul;
9761 fiy3 = nul;
9762 fiz3 = nul;
9763 nj0 = jindex[n];
9764 nj1 = jindex[n+1];
9766 for(k=nj0; k<(nj1-3); k+=4) {
9767 jnra = jjnr[k];
9768 jnrb = jjnr[k+1];
9769 jnrc = jjnr[k+2];
9770 jnrd = jjnr[k+3];
9771 j3a = 3*jnra;
9772 j3b = 3*jnrb;
9773 j3c = 3*jnrc;
9774 j3d = 3*jnrd;
9775 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
9776 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
9777 dx11 = vec_sub(ix1,jx1);
9778 dx12 = vec_sub(ix1,jx2);
9779 dx13 = vec_sub(ix1,jx3);
9780 dy11 = vec_sub(iy1,jy1);
9781 dy12 = vec_sub(iy1,jy2);
9782 dy13 = vec_sub(iy1,jy3);
9783 dz11 = vec_sub(iz1,jz1);
9784 dz12 = vec_sub(iz1,jz2);
9785 dz13 = vec_sub(iz1,jz3);
9786 dx21 = vec_sub(ix2,jx1);
9787 dx22 = vec_sub(ix2,jx2);
9788 dx23 = vec_sub(ix2,jx3);
9789 dy21 = vec_sub(iy2,jy1);
9790 dy22 = vec_sub(iy2,jy2);
9791 dy23 = vec_sub(iy2,jy3);
9792 dz21 = vec_sub(iz2,jz1);
9793 dz22 = vec_sub(iz2,jz2);
9794 dz23 = vec_sub(iz2,jz3);
9795 dx31 = vec_sub(ix3,jx1);
9796 dx32 = vec_sub(ix3,jx2);
9797 dx33 = vec_sub(ix3,jx3);
9798 dy31 = vec_sub(iy3,jy1);
9799 dy32 = vec_sub(iy3,jy2);
9800 dy33 = vec_sub(iy3,jy3);
9801 dz31 = vec_sub(iz3,jz1);
9802 dz32 = vec_sub(iz3,jz2);
9803 dz33 = vec_sub(iz3,jz3);
9805 rsq11 = vec_madd(dx11,dx11,nul);
9806 rsq12 = vec_madd(dx12,dx12,nul);
9807 rsq13 = vec_madd(dx13,dx13,nul);
9808 rsq21 = vec_madd(dx21,dx21,nul);
9809 rsq22 = vec_madd(dx22,dx22,nul);
9810 rsq23 = vec_madd(dx23,dx23,nul);
9811 rsq31 = vec_madd(dx31,dx31,nul);
9812 rsq32 = vec_madd(dx32,dx32,nul);
9813 rsq33 = vec_madd(dx33,dx33,nul);
9814 rsq11 = vec_madd(dy11,dy11,rsq11);
9815 rsq12 = vec_madd(dy12,dy12,rsq12);
9816 rsq13 = vec_madd(dy13,dy13,rsq13);
9817 rsq21 = vec_madd(dy21,dy21,rsq21);
9818 rsq22 = vec_madd(dy22,dy22,rsq22);
9819 rsq23 = vec_madd(dy23,dy23,rsq23);
9820 rsq31 = vec_madd(dy31,dy31,rsq31);
9821 rsq32 = vec_madd(dy32,dy32,rsq32);
9822 rsq33 = vec_madd(dy33,dy33,rsq33);
9823 rsq11 = vec_madd(dz11,dz11,rsq11);
9824 rsq12 = vec_madd(dz12,dz12,rsq12);
9825 rsq13 = vec_madd(dz13,dz13,rsq13);
9826 rsq21 = vec_madd(dz21,dz21,rsq21);
9827 rsq22 = vec_madd(dz22,dz22,rsq22);
9828 rsq23 = vec_madd(dz23,dz23,rsq23);
9829 rsq31 = vec_madd(dz31,dz31,rsq31);
9830 rsq32 = vec_madd(dz32,dz32,rsq32);
9831 rsq33 = vec_madd(dz33,dz33,rsq33);
9833 do_9_invsqrt(rsq11,rsq12,rsq13,
9834 rsq21,rsq22,rsq23,
9835 rsq31,rsq32,rsq33,
9836 &rinv11,&rinv12,&rinv13,
9837 &rinv21,&rinv22,&rinv23,
9838 &rinv31,&rinv32,&rinv33);
9840 r11 = vec_madd(rsq11,rinv11,nul);
9841 r12 = vec_madd(rsq12,rinv12,nul);
9842 r13 = vec_madd(rsq13,rinv13,nul);
9843 r21 = vec_madd(rsq21,rinv21,nul);
9844 r22 = vec_madd(rsq22,rinv22,nul);
9845 r23 = vec_madd(rsq23,rinv23,nul);
9846 r31 = vec_madd(rsq31,rinv31,nul);
9847 r32 = vec_madd(rsq32,rinv32,nul);
9848 r33 = vec_madd(rsq33,rinv33,nul);
9850 do_4_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
9851 do_4_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
9852 do_4_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
9853 do_4_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
9854 do_4_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
9855 do_4_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
9856 do_4_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
9857 do_4_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
9858 do_4_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
9860 fs11 = vec_nmsub(qqOO,FF11c,nul);
9861 fs12 = vec_nmsub(qqOH,FF12c,nul);
9862 fs13 = vec_nmsub(qqOH,FF13c,nul);
9863 fs21 = vec_nmsub(qqOH,FF21c,nul);
9864 fs22 = vec_nmsub(qqHH,FF22c,nul);
9865 fs23 = vec_nmsub(qqHH,FF23c,nul);
9866 fs31 = vec_nmsub(qqOH,FF31c,nul);
9867 fs32 = vec_nmsub(qqHH,FF32c,nul);
9868 fs33 = vec_nmsub(qqHH,FF33c,nul);
9870 vctot = vec_madd(qqOO,VV11c,vctot);
9871 vctot = vec_madd(qqOH,VV12c,vctot);
9872 vctot = vec_madd(qqOH,VV13c,vctot);
9873 vctot = vec_madd(qqOH,VV21c,vctot);
9874 vctot = vec_madd(qqHH,VV22c,vctot);
9875 vctot = vec_madd(qqHH,VV23c,vctot);
9876 vctot = vec_madd(qqOH,VV31c,vctot);
9877 vctot = vec_madd(qqHH,VV32c,vctot);
9878 vctot = vec_madd(qqHH,VV33c,vctot);
9880 fs11 = vec_madd(fs11,tsc,nul);
9881 fs12 = vec_madd(fs12,tsc,nul);
9882 fs13 = vec_madd(fs13,tsc,nul);
9883 fs21 = vec_madd(fs21,tsc,nul);
9884 fs22 = vec_madd(fs22,tsc,nul);
9885 fs23 = vec_madd(fs23,tsc,nul);
9886 fs31 = vec_madd(fs31,tsc,nul);
9887 fs32 = vec_madd(fs32,tsc,nul);
9888 fs33 = vec_madd(fs33,tsc,nul);
9890 fs11 = vec_madd(fs11,rinv11,nul);
9891 fs12 = vec_madd(fs12,rinv12,nul);
9892 fs13 = vec_madd(fs13,rinv13,nul);
9893 fs21 = vec_madd(fs21,rinv21,nul);
9894 fs22 = vec_madd(fs22,rinv22,nul);
9895 fs23 = vec_madd(fs23,rinv23,nul);
9896 fs31 = vec_madd(fs31,rinv31,nul);
9897 fs32 = vec_madd(fs32,rinv32,nul);
9898 fs33 = vec_madd(fs33,rinv33,nul);
9900 fix1 = vec_madd(fs11,dx11,fix1);
9901 fiy1 = vec_madd(fs11,dy11,fiy1);
9902 fiz1 = vec_madd(fs11,dz11,fiz1);
9903 fix2 = vec_madd(fs21,dx21,fix2);
9904 fiy2 = vec_madd(fs21,dy21,fiy2);
9905 fiz2 = vec_madd(fs21,dz21,fiz2);
9906 fix3 = vec_madd(fs31,dx31,fix3);
9907 fiy3 = vec_madd(fs31,dy31,fiy3);
9908 fiz3 = vec_madd(fs31,dz31,fiz3);
9910 fix1 = vec_madd(fs12,dx12,fix1);
9911 fiy1 = vec_madd(fs12,dy12,fiy1);
9912 fiz1 = vec_madd(fs12,dz12,fiz1);
9913 fix2 = vec_madd(fs22,dx22,fix2);
9914 fiy2 = vec_madd(fs22,dy22,fiy2);
9915 fiz2 = vec_madd(fs22,dz22,fiz2);
9916 fix3 = vec_madd(fs32,dx32,fix3);
9917 fiy3 = vec_madd(fs32,dy32,fiy3);
9918 fiz3 = vec_madd(fs32,dz32,fiz3);
9920 fix1 = vec_madd(fs13,dx13,fix1);
9921 fiy1 = vec_madd(fs13,dy13,fiy1);
9922 fiz1 = vec_madd(fs13,dz13,fiz1);
9923 fix2 = vec_madd(fs23,dx23,fix2);
9924 fiy2 = vec_madd(fs23,dy23,fiy2);
9925 fiz2 = vec_madd(fs23,dz23,fiz2);
9926 fix3 = vec_madd(fs33,dx33,fix3);
9927 fiy3 = vec_madd(fs33,dy33,fiy3);
9928 fiz3 = vec_madd(fs33,dz33,fiz3);
9930 fjx1 = vec_nmsub(fs11,dx11,nul);
9931 fjy1 = vec_nmsub(fs11,dy11,nul);
9932 fjz1 = vec_nmsub(fs11,dz11,nul);
9933 fjx2 = vec_nmsub(fs12,dx12,nul);
9934 fjy2 = vec_nmsub(fs12,dy12,nul);
9935 fjz2 = vec_nmsub(fs12,dz12,nul);
9936 fjx3 = vec_nmsub(fs13,dx13,nul);
9937 fjy3 = vec_nmsub(fs13,dy13,nul);
9938 fjz3 = vec_nmsub(fs13,dz13,nul);
9940 fjx1 = vec_nmsub(fs21,dx21,fjx1);
9941 fjy1 = vec_nmsub(fs21,dy21,fjy1);
9942 fjz1 = vec_nmsub(fs21,dz21,fjz1);
9943 fjx2 = vec_nmsub(fs22,dx22,fjx2);
9944 fjy2 = vec_nmsub(fs22,dy22,fjy2);
9945 fjz2 = vec_nmsub(fs22,dz22,fjz2);
9946 fjx3 = vec_nmsub(fs23,dx23,fjx3);
9947 fjy3 = vec_nmsub(fs23,dy23,fjy3);
9948 fjz3 = vec_nmsub(fs23,dz23,fjz3);
9950 fjx1 = vec_nmsub(fs31,dx31,fjx1);
9951 fjy1 = vec_nmsub(fs31,dy31,fjy1);
9952 fjz1 = vec_nmsub(fs31,dz31,fjz1);
9953 fjx2 = vec_nmsub(fs32,dx32,fjx2);
9954 fjy2 = vec_nmsub(fs32,dy32,fjy2);
9955 fjz2 = vec_nmsub(fs32,dz32,fjz2);
9956 fjx3 = vec_nmsub(fs33,dx33,fjx3);
9957 fjy3 = vec_nmsub(fs33,dy33,fjy3);
9958 fjz3 = vec_nmsub(fs33,dz33,fjz3);
9960 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
9961 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
9963 if(k<(nj1-2)) {
9964 jnra = jjnr[k];
9965 jnrb = jjnr[k+1];
9966 jnrc = jjnr[k+2];
9967 j3a = 3*jnra;
9968 j3b = 3*jnrb;
9969 j3c = 3*jnrc;
9970 load_3_water(pos+j3a,pos+j3b,pos+j3c,
9971 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
9972 qqOOt = vec_sld(qqOO,nul,4);
9973 qqOHt = vec_sld(qqOH,nul,4);
9974 qqHHt = vec_sld(qqHH,nul,4);
9976 dx11 = vec_sub(ix1,jx1);
9977 dx12 = vec_sub(ix1,jx2);
9978 dx13 = vec_sub(ix1,jx3);
9979 dy11 = vec_sub(iy1,jy1);
9980 dy12 = vec_sub(iy1,jy2);
9981 dy13 = vec_sub(iy1,jy3);
9982 dz11 = vec_sub(iz1,jz1);
9983 dz12 = vec_sub(iz1,jz2);
9984 dz13 = vec_sub(iz1,jz3);
9985 dx21 = vec_sub(ix2,jx1);
9986 dx22 = vec_sub(ix2,jx2);
9987 dx23 = vec_sub(ix2,jx3);
9988 dy21 = vec_sub(iy2,jy1);
9989 dy22 = vec_sub(iy2,jy2);
9990 dy23 = vec_sub(iy2,jy3);
9991 dz21 = vec_sub(iz2,jz1);
9992 dz22 = vec_sub(iz2,jz2);
9993 dz23 = vec_sub(iz2,jz3);
9994 dx31 = vec_sub(ix3,jx1);
9995 dx32 = vec_sub(ix3,jx2);
9996 dx33 = vec_sub(ix3,jx3);
9997 dy31 = vec_sub(iy3,jy1);
9998 dy32 = vec_sub(iy3,jy2);
9999 dy33 = vec_sub(iy3,jy3);
10000 dz31 = vec_sub(iz3,jz1);
10001 dz32 = vec_sub(iz3,jz2);
10002 dz33 = vec_sub(iz3,jz3);
10004 rsq11 = vec_madd(dx11,dx11,nul);
10005 rsq12 = vec_madd(dx12,dx12,nul);
10006 rsq13 = vec_madd(dx13,dx13,nul);
10007 rsq21 = vec_madd(dx21,dx21,nul);
10008 rsq22 = vec_madd(dx22,dx22,nul);
10009 rsq23 = vec_madd(dx23,dx23,nul);
10010 rsq31 = vec_madd(dx31,dx31,nul);
10011 rsq32 = vec_madd(dx32,dx32,nul);
10012 rsq33 = vec_madd(dx33,dx33,nul);
10013 rsq11 = vec_madd(dy11,dy11,rsq11);
10014 rsq12 = vec_madd(dy12,dy12,rsq12);
10015 rsq13 = vec_madd(dy13,dy13,rsq13);
10016 rsq21 = vec_madd(dy21,dy21,rsq21);
10017 rsq22 = vec_madd(dy22,dy22,rsq22);
10018 rsq23 = vec_madd(dy23,dy23,rsq23);
10019 rsq31 = vec_madd(dy31,dy31,rsq31);
10020 rsq32 = vec_madd(dy32,dy32,rsq32);
10021 rsq33 = vec_madd(dy33,dy33,rsq33);
10022 rsq11 = vec_madd(dz11,dz11,rsq11);
10023 rsq12 = vec_madd(dz12,dz12,rsq12);
10024 rsq13 = vec_madd(dz13,dz13,rsq13);
10025 rsq21 = vec_madd(dz21,dz21,rsq21);
10026 rsq22 = vec_madd(dz22,dz22,rsq22);
10027 rsq23 = vec_madd(dz23,dz23,rsq23);
10028 rsq31 = vec_madd(dz31,dz31,rsq31);
10029 rsq32 = vec_madd(dz32,dz32,rsq32);
10030 rsq33 = vec_madd(dz33,dz33,rsq33);
10032 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
10033 &rsq21,&rsq22,&rsq23,
10034 &rsq31,&rsq32,&rsq33);
10036 do_9_invsqrt(rsq11,rsq12,rsq13,
10037 rsq21,rsq22,rsq23,
10038 rsq31,rsq32,rsq33,
10039 &rinv11,&rinv12,&rinv13,
10040 &rinv21,&rinv22,&rinv23,
10041 &rinv31,&rinv32,&rinv33);
10043 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
10044 &rinv21,&rinv22,&rinv23,
10045 &rinv31,&rinv32,&rinv33);
10047 r11 = vec_madd(rsq11,rinv11,nul);
10048 r12 = vec_madd(rsq12,rinv12,nul);
10049 r13 = vec_madd(rsq13,rinv13,nul);
10050 r21 = vec_madd(rsq21,rinv21,nul);
10051 r22 = vec_madd(rsq22,rinv22,nul);
10052 r23 = vec_madd(rsq23,rinv23,nul);
10053 r31 = vec_madd(rsq31,rinv31,nul);
10054 r32 = vec_madd(rsq32,rinv32,nul);
10055 r33 = vec_madd(rsq33,rinv33,nul);
10057 do_3_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
10058 do_3_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
10059 do_3_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
10060 do_3_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
10061 do_3_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
10062 do_3_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
10063 do_3_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
10064 do_3_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
10065 do_3_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
10067 fs11 = vec_nmsub(qqOOt,FF11c,nul);
10068 fs12 = vec_nmsub(qqOHt,FF12c,nul);
10069 fs13 = vec_nmsub(qqOHt,FF13c,nul);
10070 fs21 = vec_nmsub(qqOHt,FF21c,nul);
10071 fs22 = vec_nmsub(qqHHt,FF22c,nul);
10072 fs23 = vec_nmsub(qqHHt,FF23c,nul);
10073 fs31 = vec_nmsub(qqOHt,FF31c,nul);
10074 fs32 = vec_nmsub(qqHHt,FF32c,nul);
10075 fs33 = vec_nmsub(qqHHt,FF33c,nul);
10077 vctot = vec_madd(qqOOt,VV11c,vctot);
10078 vctot = vec_madd(qqOHt,VV12c,vctot);
10079 vctot = vec_madd(qqOHt,VV13c,vctot);
10080 vctot = vec_madd(qqOHt,VV21c,vctot);
10081 vctot = vec_madd(qqHHt,VV22c,vctot);
10082 vctot = vec_madd(qqHHt,VV23c,vctot);
10083 vctot = vec_madd(qqOHt,VV31c,vctot);
10084 vctot = vec_madd(qqHHt,VV32c,vctot);
10085 vctot = vec_madd(qqHHt,VV33c,vctot);
10087 fs11 = vec_madd(fs11,tsc,nul);
10088 fs12 = vec_madd(fs12,tsc,nul);
10089 fs13 = vec_madd(fs13,tsc,nul);
10090 fs21 = vec_madd(fs21,tsc,nul);
10091 fs22 = vec_madd(fs22,tsc,nul);
10092 fs23 = vec_madd(fs23,tsc,nul);
10093 fs31 = vec_madd(fs31,tsc,nul);
10094 fs32 = vec_madd(fs32,tsc,nul);
10095 fs33 = vec_madd(fs33,tsc,nul);
10097 fs11 = vec_madd(fs11,rinv11,nul);
10098 fs12 = vec_madd(fs12,rinv12,nul);
10099 fs13 = vec_madd(fs13,rinv13,nul);
10100 fs21 = vec_madd(fs21,rinv21,nul);
10101 fs22 = vec_madd(fs22,rinv22,nul);
10102 fs23 = vec_madd(fs23,rinv23,nul);
10103 fs31 = vec_madd(fs31,rinv31,nul);
10104 fs32 = vec_madd(fs32,rinv32,nul);
10105 fs33 = vec_madd(fs33,rinv33,nul);
10107 fix1 = vec_madd(fs11,dx11,fix1);
10108 fiy1 = vec_madd(fs11,dy11,fiy1);
10109 fiz1 = vec_madd(fs11,dz11,fiz1);
10110 fix2 = vec_madd(fs21,dx21,fix2);
10111 fiy2 = vec_madd(fs21,dy21,fiy2);
10112 fiz2 = vec_madd(fs21,dz21,fiz2);
10113 fix3 = vec_madd(fs31,dx31,fix3);
10114 fiy3 = vec_madd(fs31,dy31,fiy3);
10115 fiz3 = vec_madd(fs31,dz31,fiz3);
10117 fix1 = vec_madd(fs12,dx12,fix1);
10118 fiy1 = vec_madd(fs12,dy12,fiy1);
10119 fiz1 = vec_madd(fs12,dz12,fiz1);
10120 fix2 = vec_madd(fs22,dx22,fix2);
10121 fiy2 = vec_madd(fs22,dy22,fiy2);
10122 fiz2 = vec_madd(fs22,dz22,fiz2);
10123 fix3 = vec_madd(fs32,dx32,fix3);
10124 fiy3 = vec_madd(fs32,dy32,fiy3);
10125 fiz3 = vec_madd(fs32,dz32,fiz3);
10127 fix1 = vec_madd(fs13,dx13,fix1);
10128 fiy1 = vec_madd(fs13,dy13,fiy1);
10129 fiz1 = vec_madd(fs13,dz13,fiz1);
10130 fix2 = vec_madd(fs23,dx23,fix2);
10131 fiy2 = vec_madd(fs23,dy23,fiy2);
10132 fiz2 = vec_madd(fs23,dz23,fiz2);
10133 fix3 = vec_madd(fs33,dx33,fix3);
10134 fiy3 = vec_madd(fs33,dy33,fiy3);
10135 fiz3 = vec_madd(fs33,dz33,fiz3);
10137 fjx1 = vec_nmsub(fs11,dx11,nul);
10138 fjy1 = vec_nmsub(fs11,dy11,nul);
10139 fjz1 = vec_nmsub(fs11,dz11,nul);
10140 fjx2 = vec_nmsub(fs12,dx12,nul);
10141 fjy2 = vec_nmsub(fs12,dy12,nul);
10142 fjz2 = vec_nmsub(fs12,dz12,nul);
10143 fjx3 = vec_nmsub(fs13,dx13,nul);
10144 fjy3 = vec_nmsub(fs13,dy13,nul);
10145 fjz3 = vec_nmsub(fs13,dz13,nul);
10147 fjx1 = vec_nmsub(fs21,dx21,fjx1);
10148 fjy1 = vec_nmsub(fs21,dy21,fjy1);
10149 fjz1 = vec_nmsub(fs21,dz21,fjz1);
10150 fjx2 = vec_nmsub(fs22,dx22,fjx2);
10151 fjy2 = vec_nmsub(fs22,dy22,fjy2);
10152 fjz2 = vec_nmsub(fs22,dz22,fjz2);
10153 fjx3 = vec_nmsub(fs23,dx23,fjx3);
10154 fjy3 = vec_nmsub(fs23,dy23,fjy3);
10155 fjz3 = vec_nmsub(fs23,dz23,fjz3);
10157 fjx1 = vec_nmsub(fs31,dx31,fjx1);
10158 fjy1 = vec_nmsub(fs31,dy31,fjy1);
10159 fjz1 = vec_nmsub(fs31,dz31,fjz1);
10160 fjx2 = vec_nmsub(fs32,dx32,fjx2);
10161 fjy2 = vec_nmsub(fs32,dy32,fjy2);
10162 fjz2 = vec_nmsub(fs32,dz32,fjz2);
10163 fjx3 = vec_nmsub(fs33,dx33,fjx3);
10164 fjy3 = vec_nmsub(fs33,dy33,fjy3);
10165 fjz3 = vec_nmsub(fs33,dz33,fjz3);
10167 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
10168 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
10169 } else if(k<(nj1-1)) {
10170 jnra = jjnr[k];
10171 jnrb = jjnr[k+1];
10172 j3a = 3*jnra;
10173 j3b = 3*jnrb;
10174 load_2_water(pos+j3a,pos+j3b,
10175 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
10176 qqOOt = vec_sld(qqOO,nul,8);
10177 qqOHt = vec_sld(qqOH,nul,8);
10178 qqHHt = vec_sld(qqHH,nul,8);
10180 dx11 = vec_sub(ix1,jx1);
10181 dx12 = vec_sub(ix1,jx2);
10182 dx13 = vec_sub(ix1,jx3);
10183 dy11 = vec_sub(iy1,jy1);
10184 dy12 = vec_sub(iy1,jy2);
10185 dy13 = vec_sub(iy1,jy3);
10186 dz11 = vec_sub(iz1,jz1);
10187 dz12 = vec_sub(iz1,jz2);
10188 dz13 = vec_sub(iz1,jz3);
10189 dx21 = vec_sub(ix2,jx1);
10190 dx22 = vec_sub(ix2,jx2);
10191 dx23 = vec_sub(ix2,jx3);
10192 dy21 = vec_sub(iy2,jy1);
10193 dy22 = vec_sub(iy2,jy2);
10194 dy23 = vec_sub(iy2,jy3);
10195 dz21 = vec_sub(iz2,jz1);
10196 dz22 = vec_sub(iz2,jz2);
10197 dz23 = vec_sub(iz2,jz3);
10198 dx31 = vec_sub(ix3,jx1);
10199 dx32 = vec_sub(ix3,jx2);
10200 dx33 = vec_sub(ix3,jx3);
10201 dy31 = vec_sub(iy3,jy1);
10202 dy32 = vec_sub(iy3,jy2);
10203 dy33 = vec_sub(iy3,jy3);
10204 dz31 = vec_sub(iz3,jz1);
10205 dz32 = vec_sub(iz3,jz2);
10206 dz33 = vec_sub(iz3,jz3);
10208 rsq11 = vec_madd(dx11,dx11,nul);
10209 rsq12 = vec_madd(dx12,dx12,nul);
10210 rsq13 = vec_madd(dx13,dx13,nul);
10211 rsq21 = vec_madd(dx21,dx21,nul);
10212 rsq22 = vec_madd(dx22,dx22,nul);
10213 rsq23 = vec_madd(dx23,dx23,nul);
10214 rsq31 = vec_madd(dx31,dx31,nul);
10215 rsq32 = vec_madd(dx32,dx32,nul);
10216 rsq33 = vec_madd(dx33,dx33,nul);
10217 rsq11 = vec_madd(dy11,dy11,rsq11);
10218 rsq12 = vec_madd(dy12,dy12,rsq12);
10219 rsq13 = vec_madd(dy13,dy13,rsq13);
10220 rsq21 = vec_madd(dy21,dy21,rsq21);
10221 rsq22 = vec_madd(dy22,dy22,rsq22);
10222 rsq23 = vec_madd(dy23,dy23,rsq23);
10223 rsq31 = vec_madd(dy31,dy31,rsq31);
10224 rsq32 = vec_madd(dy32,dy32,rsq32);
10225 rsq33 = vec_madd(dy33,dy33,rsq33);
10226 rsq11 = vec_madd(dz11,dz11,rsq11);
10227 rsq12 = vec_madd(dz12,dz12,rsq12);
10228 rsq13 = vec_madd(dz13,dz13,rsq13);
10229 rsq21 = vec_madd(dz21,dz21,rsq21);
10230 rsq22 = vec_madd(dz22,dz22,rsq22);
10231 rsq23 = vec_madd(dz23,dz23,rsq23);
10232 rsq31 = vec_madd(dz31,dz31,rsq31);
10233 rsq32 = vec_madd(dz32,dz32,rsq32);
10234 rsq33 = vec_madd(dz33,dz33,rsq33);
10236 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
10237 &rsq21,&rsq22,&rsq23,
10238 &rsq31,&rsq32,&rsq33);
10240 do_9_invsqrt(rsq11,rsq12,rsq13,
10241 rsq21,rsq22,rsq23,
10242 rsq31,rsq32,rsq33,
10243 &rinv11,&rinv12,&rinv13,
10244 &rinv21,&rinv22,&rinv23,
10245 &rinv31,&rinv32,&rinv33);
10247 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
10248 &rinv21,&rinv22,&rinv23,
10249 &rinv31,&rinv32,&rinv33);
10251 r11 = vec_madd(rsq11,rinv11,nul);
10252 r12 = vec_madd(rsq12,rinv12,nul);
10253 r13 = vec_madd(rsq13,rinv13,nul);
10254 r21 = vec_madd(rsq21,rinv21,nul);
10255 r22 = vec_madd(rsq22,rinv22,nul);
10256 r23 = vec_madd(rsq23,rinv23,nul);
10257 r31 = vec_madd(rsq31,rinv31,nul);
10258 r32 = vec_madd(rsq32,rinv32,nul);
10259 r33 = vec_madd(rsq33,rinv33,nul);
10261 do_2_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
10262 do_2_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
10263 do_2_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
10264 do_2_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
10265 do_2_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
10266 do_2_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
10267 do_2_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
10268 do_2_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
10269 do_2_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
10271 fs11 = vec_nmsub(qqOOt,FF11c,nul);
10272 fs12 = vec_nmsub(qqOHt,FF12c,nul);
10273 fs13 = vec_nmsub(qqOHt,FF13c,nul);
10274 fs21 = vec_nmsub(qqOHt,FF21c,nul);
10275 fs22 = vec_nmsub(qqHHt,FF22c,nul);
10276 fs23 = vec_nmsub(qqHHt,FF23c,nul);
10277 fs31 = vec_nmsub(qqOHt,FF31c,nul);
10278 fs32 = vec_nmsub(qqHHt,FF32c,nul);
10279 fs33 = vec_nmsub(qqHHt,FF33c,nul);
10281 vctot = vec_madd(qqOOt,VV11c,vctot);
10282 vctot = vec_madd(qqOHt,VV12c,vctot);
10283 vctot = vec_madd(qqOHt,VV13c,vctot);
10284 vctot = vec_madd(qqOHt,VV21c,vctot);
10285 vctot = vec_madd(qqHHt,VV22c,vctot);
10286 vctot = vec_madd(qqHHt,VV23c,vctot);
10287 vctot = vec_madd(qqOHt,VV31c,vctot);
10288 vctot = vec_madd(qqHHt,VV32c,vctot);
10289 vctot = vec_madd(qqHHt,VV33c,vctot);
10291 fs11 = vec_madd(fs11,tsc,nul);
10292 fs12 = vec_madd(fs12,tsc,nul);
10293 fs13 = vec_madd(fs13,tsc,nul);
10294 fs21 = vec_madd(fs21,tsc,nul);
10295 fs22 = vec_madd(fs22,tsc,nul);
10296 fs23 = vec_madd(fs23,tsc,nul);
10297 fs31 = vec_madd(fs31,tsc,nul);
10298 fs32 = vec_madd(fs32,tsc,nul);
10299 fs33 = vec_madd(fs33,tsc,nul);
10301 fs11 = vec_madd(fs11,rinv11,nul);
10302 fs12 = vec_madd(fs12,rinv12,nul);
10303 fs13 = vec_madd(fs13,rinv13,nul);
10304 fs21 = vec_madd(fs21,rinv21,nul);
10305 fs22 = vec_madd(fs22,rinv22,nul);
10306 fs23 = vec_madd(fs23,rinv23,nul);
10307 fs31 = vec_madd(fs31,rinv31,nul);
10308 fs32 = vec_madd(fs32,rinv32,nul);
10309 fs33 = vec_madd(fs33,rinv33,nul);
10311 fix1 = vec_madd(fs11,dx11,fix1);
10312 fiy1 = vec_madd(fs11,dy11,fiy1);
10313 fiz1 = vec_madd(fs11,dz11,fiz1);
10314 fix2 = vec_madd(fs21,dx21,fix2);
10315 fiy2 = vec_madd(fs21,dy21,fiy2);
10316 fiz2 = vec_madd(fs21,dz21,fiz2);
10317 fix3 = vec_madd(fs31,dx31,fix3);
10318 fiy3 = vec_madd(fs31,dy31,fiy3);
10319 fiz3 = vec_madd(fs31,dz31,fiz3);
10321 fix1 = vec_madd(fs12,dx12,fix1);
10322 fiy1 = vec_madd(fs12,dy12,fiy1);
10323 fiz1 = vec_madd(fs12,dz12,fiz1);
10324 fix2 = vec_madd(fs22,dx22,fix2);
10325 fiy2 = vec_madd(fs22,dy22,fiy2);
10326 fiz2 = vec_madd(fs22,dz22,fiz2);
10327 fix3 = vec_madd(fs32,dx32,fix3);
10328 fiy3 = vec_madd(fs32,dy32,fiy3);
10329 fiz3 = vec_madd(fs32,dz32,fiz3);
10331 fix1 = vec_madd(fs13,dx13,fix1);
10332 fiy1 = vec_madd(fs13,dy13,fiy1);
10333 fiz1 = vec_madd(fs13,dz13,fiz1);
10334 fix2 = vec_madd(fs23,dx23,fix2);
10335 fiy2 = vec_madd(fs23,dy23,fiy2);
10336 fiz2 = vec_madd(fs23,dz23,fiz2);
10337 fix3 = vec_madd(fs33,dx33,fix3);
10338 fiy3 = vec_madd(fs33,dy33,fiy3);
10339 fiz3 = vec_madd(fs33,dz33,fiz3);
10341 fjx1 = vec_nmsub(fs11,dx11,nul);
10342 fjy1 = vec_nmsub(fs11,dy11,nul);
10343 fjz1 = vec_nmsub(fs11,dz11,nul);
10344 fjx2 = vec_nmsub(fs12,dx12,nul);
10345 fjy2 = vec_nmsub(fs12,dy12,nul);
10346 fjz2 = vec_nmsub(fs12,dz12,nul);
10347 fjx3 = vec_nmsub(fs13,dx13,nul);
10348 fjy3 = vec_nmsub(fs13,dy13,nul);
10349 fjz3 = vec_nmsub(fs13,dz13,nul);
10351 fjx1 = vec_nmsub(fs21,dx21,fjx1);
10352 fjy1 = vec_nmsub(fs21,dy21,fjy1);
10353 fjz1 = vec_nmsub(fs21,dz21,fjz1);
10354 fjx2 = vec_nmsub(fs22,dx22,fjx2);
10355 fjy2 = vec_nmsub(fs22,dy22,fjy2);
10356 fjz2 = vec_nmsub(fs22,dz22,fjz2);
10357 fjx3 = vec_nmsub(fs23,dx23,fjx3);
10358 fjy3 = vec_nmsub(fs23,dy23,fjy3);
10359 fjz3 = vec_nmsub(fs23,dz23,fjz3);
10361 fjx1 = vec_nmsub(fs31,dx31,fjx1);
10362 fjy1 = vec_nmsub(fs31,dy31,fjy1);
10363 fjz1 = vec_nmsub(fs31,dz31,fjz1);
10364 fjx2 = vec_nmsub(fs32,dx32,fjx2);
10365 fjy2 = vec_nmsub(fs32,dy32,fjy2);
10366 fjz2 = vec_nmsub(fs32,dz32,fjz2);
10367 fjx3 = vec_nmsub(fs33,dx33,fjx3);
10368 fjy3 = vec_nmsub(fs33,dy33,fjy3);
10369 fjz3 = vec_nmsub(fs33,dz33,fjz3);
10371 add_force_to_2_water(faction+j3a,faction+j3b,
10372 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
10373 } else if(k<nj1) {
10374 jnra = jjnr[k];
10375 j3a = 3*jnra;
10376 load_1_water(pos+j3a,
10377 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
10378 qqOOt = vec_sld(qqOO,nul,12);
10379 qqOHt = vec_sld(qqOH,nul,12);
10380 qqHHt = vec_sld(qqHH,nul,12);
10382 dx11 = vec_sub(ix1,jx1);
10383 dx12 = vec_sub(ix1,jx2);
10384 dx13 = vec_sub(ix1,jx3);
10385 dy11 = vec_sub(iy1,jy1);
10386 dy12 = vec_sub(iy1,jy2);
10387 dy13 = vec_sub(iy1,jy3);
10388 dz11 = vec_sub(iz1,jz1);
10389 dz12 = vec_sub(iz1,jz2);
10390 dz13 = vec_sub(iz1,jz3);
10391 dx21 = vec_sub(ix2,jx1);
10392 dx22 = vec_sub(ix2,jx2);
10393 dx23 = vec_sub(ix2,jx3);
10394 dy21 = vec_sub(iy2,jy1);
10395 dy22 = vec_sub(iy2,jy2);
10396 dy23 = vec_sub(iy2,jy3);
10397 dz21 = vec_sub(iz2,jz1);
10398 dz22 = vec_sub(iz2,jz2);
10399 dz23 = vec_sub(iz2,jz3);
10400 dx31 = vec_sub(ix3,jx1);
10401 dx32 = vec_sub(ix3,jx2);
10402 dx33 = vec_sub(ix3,jx3);
10403 dy31 = vec_sub(iy3,jy1);
10404 dy32 = vec_sub(iy3,jy2);
10405 dy33 = vec_sub(iy3,jy3);
10406 dz31 = vec_sub(iz3,jz1);
10407 dz32 = vec_sub(iz3,jz2);
10408 dz33 = vec_sub(iz3,jz3);
10410 rsq11 = vec_madd(dx11,dx11,nul);
10411 rsq12 = vec_madd(dx12,dx12,nul);
10412 rsq13 = vec_madd(dx13,dx13,nul);
10413 rsq21 = vec_madd(dx21,dx21,nul);
10414 rsq22 = vec_madd(dx22,dx22,nul);
10415 rsq23 = vec_madd(dx23,dx23,nul);
10416 rsq31 = vec_madd(dx31,dx31,nul);
10417 rsq32 = vec_madd(dx32,dx32,nul);
10418 rsq33 = vec_madd(dx33,dx33,nul);
10419 rsq11 = vec_madd(dy11,dy11,rsq11);
10420 rsq12 = vec_madd(dy12,dy12,rsq12);
10421 rsq13 = vec_madd(dy13,dy13,rsq13);
10422 rsq21 = vec_madd(dy21,dy21,rsq21);
10423 rsq22 = vec_madd(dy22,dy22,rsq22);
10424 rsq23 = vec_madd(dy23,dy23,rsq23);
10425 rsq31 = vec_madd(dy31,dy31,rsq31);
10426 rsq32 = vec_madd(dy32,dy32,rsq32);
10427 rsq33 = vec_madd(dy33,dy33,rsq33);
10428 rsq11 = vec_madd(dz11,dz11,rsq11);
10429 rsq12 = vec_madd(dz12,dz12,rsq12);
10430 rsq13 = vec_madd(dz13,dz13,rsq13);
10431 rsq21 = vec_madd(dz21,dz21,rsq21);
10432 rsq22 = vec_madd(dz22,dz22,rsq22);
10433 rsq23 = vec_madd(dz23,dz23,rsq23);
10434 rsq31 = vec_madd(dz31,dz31,rsq31);
10435 rsq32 = vec_madd(dz32,dz32,rsq32);
10436 rsq33 = vec_madd(dz33,dz33,rsq33);
10438 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
10439 &rsq21,&rsq22,&rsq23,
10440 &rsq31,&rsq32,&rsq33);
10442 do_9_invsqrt(rsq11,rsq12,rsq13,
10443 rsq21,rsq22,rsq23,
10444 rsq31,rsq32,rsq33,
10445 &rinv11,&rinv12,&rinv13,
10446 &rinv21,&rinv22,&rinv23,
10447 &rinv31,&rinv32,&rinv33);
10449 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
10450 &rinv21,&rinv22,&rinv23,
10451 &rinv31,&rinv32,&rinv33);
10453 r11 = vec_madd(rsq11,rinv11,nul);
10454 r12 = vec_madd(rsq12,rinv12,nul);
10455 r13 = vec_madd(rsq13,rinv13,nul);
10456 r21 = vec_madd(rsq21,rinv21,nul);
10457 r22 = vec_madd(rsq22,rinv22,nul);
10458 r23 = vec_madd(rsq23,rinv23,nul);
10459 r31 = vec_madd(rsq31,rinv31,nul);
10460 r32 = vec_madd(rsq32,rinv32,nul);
10461 r33 = vec_madd(rsq33,rinv33,nul);
10463 do_1_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
10464 do_1_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
10465 do_1_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
10466 do_1_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
10467 do_1_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
10468 do_1_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
10469 do_1_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
10470 do_1_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
10471 do_1_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
10473 fs11 = vec_nmsub(qqOOt,FF11c,nul);
10474 fs12 = vec_nmsub(qqOHt,FF12c,nul);
10475 fs13 = vec_nmsub(qqOHt,FF13c,nul);
10476 fs21 = vec_nmsub(qqOHt,FF21c,nul);
10477 fs22 = vec_nmsub(qqHHt,FF22c,nul);
10478 fs23 = vec_nmsub(qqHHt,FF23c,nul);
10479 fs31 = vec_nmsub(qqOHt,FF31c,nul);
10480 fs32 = vec_nmsub(qqHHt,FF32c,nul);
10481 fs33 = vec_nmsub(qqHHt,FF33c,nul);
10483 vctot = vec_madd(qqOOt,VV11c,vctot);
10484 vctot = vec_madd(qqOHt,VV12c,vctot);
10485 vctot = vec_madd(qqOHt,VV13c,vctot);
10486 vctot = vec_madd(qqOHt,VV21c,vctot);
10487 vctot = vec_madd(qqHHt,VV22c,vctot);
10488 vctot = vec_madd(qqHHt,VV23c,vctot);
10489 vctot = vec_madd(qqOHt,VV31c,vctot);
10490 vctot = vec_madd(qqHHt,VV32c,vctot);
10491 vctot = vec_madd(qqHHt,VV33c,vctot);
10493 fs11 = vec_madd(fs11,tsc,nul);
10494 fs12 = vec_madd(fs12,tsc,nul);
10495 fs13 = vec_madd(fs13,tsc,nul);
10496 fs21 = vec_madd(fs21,tsc,nul);
10497 fs22 = vec_madd(fs22,tsc,nul);
10498 fs23 = vec_madd(fs23,tsc,nul);
10499 fs31 = vec_madd(fs31,tsc,nul);
10500 fs32 = vec_madd(fs32,tsc,nul);
10501 fs33 = vec_madd(fs33,tsc,nul);
10503 fs11 = vec_madd(fs11,rinv11,nul);
10504 fs12 = vec_madd(fs12,rinv12,nul);
10505 fs13 = vec_madd(fs13,rinv13,nul);
10506 fs21 = vec_madd(fs21,rinv21,nul);
10507 fs22 = vec_madd(fs22,rinv22,nul);
10508 fs23 = vec_madd(fs23,rinv23,nul);
10509 fs31 = vec_madd(fs31,rinv31,nul);
10510 fs32 = vec_madd(fs32,rinv32,nul);
10511 fs33 = vec_madd(fs33,rinv33,nul);
10513 fix1 = vec_madd(fs11,dx11,fix1);
10514 fiy1 = vec_madd(fs11,dy11,fiy1);
10515 fiz1 = vec_madd(fs11,dz11,fiz1);
10516 fix2 = vec_madd(fs21,dx21,fix2);
10517 fiy2 = vec_madd(fs21,dy21,fiy2);
10518 fiz2 = vec_madd(fs21,dz21,fiz2);
10519 fix3 = vec_madd(fs31,dx31,fix3);
10520 fiy3 = vec_madd(fs31,dy31,fiy3);
10521 fiz3 = vec_madd(fs31,dz31,fiz3);
10523 fix1 = vec_madd(fs12,dx12,fix1);
10524 fiy1 = vec_madd(fs12,dy12,fiy1);
10525 fiz1 = vec_madd(fs12,dz12,fiz1);
10526 fix2 = vec_madd(fs22,dx22,fix2);
10527 fiy2 = vec_madd(fs22,dy22,fiy2);
10528 fiz2 = vec_madd(fs22,dz22,fiz2);
10529 fix3 = vec_madd(fs32,dx32,fix3);
10530 fiy3 = vec_madd(fs32,dy32,fiy3);
10531 fiz3 = vec_madd(fs32,dz32,fiz3);
10533 fix1 = vec_madd(fs13,dx13,fix1);
10534 fiy1 = vec_madd(fs13,dy13,fiy1);
10535 fiz1 = vec_madd(fs13,dz13,fiz1);
10536 fix2 = vec_madd(fs23,dx23,fix2);
10537 fiy2 = vec_madd(fs23,dy23,fiy2);
10538 fiz2 = vec_madd(fs23,dz23,fiz2);
10539 fix3 = vec_madd(fs33,dx33,fix3);
10540 fiy3 = vec_madd(fs33,dy33,fiy3);
10541 fiz3 = vec_madd(fs33,dz33,fiz3);
10543 fjx1 = vec_nmsub(fs11,dx11,nul);
10544 fjy1 = vec_nmsub(fs11,dy11,nul);
10545 fjz1 = vec_nmsub(fs11,dz11,nul);
10546 fjx2 = vec_nmsub(fs12,dx12,nul);
10547 fjy2 = vec_nmsub(fs12,dy12,nul);
10548 fjz2 = vec_nmsub(fs12,dz12,nul);
10549 fjx3 = vec_nmsub(fs13,dx13,nul);
10550 fjy3 = vec_nmsub(fs13,dy13,nul);
10551 fjz3 = vec_nmsub(fs13,dz13,nul);
10553 fjx1 = vec_nmsub(fs21,dx21,fjx1);
10554 fjy1 = vec_nmsub(fs21,dy21,fjy1);
10555 fjz1 = vec_nmsub(fs21,dz21,fjz1);
10556 fjx2 = vec_nmsub(fs22,dx22,fjx2);
10557 fjy2 = vec_nmsub(fs22,dy22,fjy2);
10558 fjz2 = vec_nmsub(fs22,dz22,fjz2);
10559 fjx3 = vec_nmsub(fs23,dx23,fjx3);
10560 fjy3 = vec_nmsub(fs23,dy23,fjy3);
10561 fjz3 = vec_nmsub(fs23,dz23,fjz3);
10563 fjx1 = vec_nmsub(fs31,dx31,fjx1);
10564 fjy1 = vec_nmsub(fs31,dy31,fjy1);
10565 fjz1 = vec_nmsub(fs31,dz31,fjz1);
10566 fjx2 = vec_nmsub(fs32,dx32,fjx2);
10567 fjy2 = vec_nmsub(fs32,dy32,fjy2);
10568 fjz2 = vec_nmsub(fs32,dz32,fjz2);
10569 fjx3 = vec_nmsub(fs33,dx33,fjx3);
10570 fjy3 = vec_nmsub(fs33,dy33,fjy3);
10571 fjz3 = vec_nmsub(fs33,dz33,fjz3);
10573 add_force_to_1_water(faction+j3a,
10574 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
10576 /* update outer data */
10577 update_i_water_forces(faction+ii3,fshift+is3,
10578 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
10580 add_vector_to_float(Vc+gid[n],vctot);
10586 void inl3130_altivec(
10587 int nri,
10588 int iinr[],
10589 int jindex[],
10590 int jjnr[],
10591 int shift[],
10592 float shiftvec[],
10593 float fshift[],
10594 int gid[],
10595 float pos[],
10596 float faction[],
10597 float charge[],
10598 float facel,
10599 float Vc[],
10600 int type[],
10601 int ntype,
10602 float nbfp[],
10603 float Vnb[],
10604 float tabscale,
10605 float VFtab[])
10607 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
10608 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
10610 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
10611 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
10612 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
10614 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
10615 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
10616 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
10617 vector float rinvsq11;
10618 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33,tsc,VVc,FFc;
10620 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
10621 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33,fs11c;
10622 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
10623 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
10624 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12,rinvsix;
10625 vector float vnb6,vnb12,vnbtot,qqOOt,qqOHt,qqHHt,c6t,c12t;
10626 vector float VV11c,FF11c,VV12c,FF12c,VV13c,FF13c;
10627 vector float VV21c,FF21c,VV22c,FF22c,VV23c,FF23c;
10628 vector float VV31c,FF31c,VV32c,FF32c,VV33c,FF33c;
10630 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
10631 int jnra,jnrb,jnrc,jnrd,tp,tj;
10632 int j3a,j3b,j3c,j3d;
10634 nul=vec_zero();
10635 vfacel=load_float_and_splat(&facel);
10636 tsc=load_float_and_splat(&tabscale);
10637 ii = iinr[0];
10638 qO = load_float_and_splat(charge+ii);
10639 qH = load_float_and_splat(charge+ii+1);
10640 qqOO = vec_madd(qO,qO,nul);
10641 qqOH = vec_madd(qO,qH,nul);
10642 qqHH = vec_madd(qH,qH,nul);
10643 qqOO = vec_madd(qqOO,vfacel,nul);
10644 qqOH = vec_madd(qqOH,vfacel,nul);
10645 qqHH = vec_madd(qqHH,vfacel,nul);
10646 tp = 2*type[ii];
10647 tj = (ntype+1)*tp;
10648 load_1_pair(nbfp+tj,&c6,&c12);
10649 c6 = vec_splat(c6,0);
10650 c12 = vec_splat(c12,0);
10652 for(n=0;n<nri;n++) {
10653 is3 = 3*shift[n];
10654 ii = iinr[n];
10655 ii3 = 3*ii;
10656 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
10657 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
10658 vctot = nul;
10659 vnbtot = nul;
10660 fix1 = nul;
10661 fiy1 = nul;
10662 fiz1 = nul;
10663 fix2 = nul;
10664 fiy2 = nul;
10665 fiz2 = nul;
10666 fix3 = nul;
10667 fiy3 = nul;
10668 fiz3 = nul;
10669 nj0 = jindex[n];
10670 nj1 = jindex[n+1];
10672 for(k=nj0; k<(nj1-3); k+=4) {
10673 jnra = jjnr[k];
10674 jnrb = jjnr[k+1];
10675 jnrc = jjnr[k+2];
10676 jnrd = jjnr[k+3];
10677 j3a = 3*jnra;
10678 j3b = 3*jnrb;
10679 j3c = 3*jnrc;
10680 j3d = 3*jnrd;
10681 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
10682 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
10683 dx11 = vec_sub(ix1,jx1);
10684 dx12 = vec_sub(ix1,jx2);
10685 dx13 = vec_sub(ix1,jx3);
10686 dy11 = vec_sub(iy1,jy1);
10687 dy12 = vec_sub(iy1,jy2);
10688 dy13 = vec_sub(iy1,jy3);
10689 dz11 = vec_sub(iz1,jz1);
10690 dz12 = vec_sub(iz1,jz2);
10691 dz13 = vec_sub(iz1,jz3);
10692 dx21 = vec_sub(ix2,jx1);
10693 dx22 = vec_sub(ix2,jx2);
10694 dx23 = vec_sub(ix2,jx3);
10695 dy21 = vec_sub(iy2,jy1);
10696 dy22 = vec_sub(iy2,jy2);
10697 dy23 = vec_sub(iy2,jy3);
10698 dz21 = vec_sub(iz2,jz1);
10699 dz22 = vec_sub(iz2,jz2);
10700 dz23 = vec_sub(iz2,jz3);
10701 dx31 = vec_sub(ix3,jx1);
10702 dx32 = vec_sub(ix3,jx2);
10703 dx33 = vec_sub(ix3,jx3);
10704 dy31 = vec_sub(iy3,jy1);
10705 dy32 = vec_sub(iy3,jy2);
10706 dy33 = vec_sub(iy3,jy3);
10707 dz31 = vec_sub(iz3,jz1);
10708 dz32 = vec_sub(iz3,jz2);
10709 dz33 = vec_sub(iz3,jz3);
10711 rsq11 = vec_madd(dx11,dx11,nul);
10712 rsq12 = vec_madd(dx12,dx12,nul);
10713 rsq13 = vec_madd(dx13,dx13,nul);
10714 rsq21 = vec_madd(dx21,dx21,nul);
10715 rsq22 = vec_madd(dx22,dx22,nul);
10716 rsq23 = vec_madd(dx23,dx23,nul);
10717 rsq31 = vec_madd(dx31,dx31,nul);
10718 rsq32 = vec_madd(dx32,dx32,nul);
10719 rsq33 = vec_madd(dx33,dx33,nul);
10720 rsq11 = vec_madd(dy11,dy11,rsq11);
10721 rsq12 = vec_madd(dy12,dy12,rsq12);
10722 rsq13 = vec_madd(dy13,dy13,rsq13);
10723 rsq21 = vec_madd(dy21,dy21,rsq21);
10724 rsq22 = vec_madd(dy22,dy22,rsq22);
10725 rsq23 = vec_madd(dy23,dy23,rsq23);
10726 rsq31 = vec_madd(dy31,dy31,rsq31);
10727 rsq32 = vec_madd(dy32,dy32,rsq32);
10728 rsq33 = vec_madd(dy33,dy33,rsq33);
10729 rsq11 = vec_madd(dz11,dz11,rsq11);
10730 rsq12 = vec_madd(dz12,dz12,rsq12);
10731 rsq13 = vec_madd(dz13,dz13,rsq13);
10732 rsq21 = vec_madd(dz21,dz21,rsq21);
10733 rsq22 = vec_madd(dz22,dz22,rsq22);
10734 rsq23 = vec_madd(dz23,dz23,rsq23);
10735 rsq31 = vec_madd(dz31,dz31,rsq31);
10736 rsq32 = vec_madd(dz32,dz32,rsq32);
10737 rsq33 = vec_madd(dz33,dz33,rsq33);
10739 do_9_invsqrt(rsq11,rsq12,rsq13,
10740 rsq21,rsq22,rsq23,
10741 rsq31,rsq32,rsq33,
10742 &rinv11,&rinv12,&rinv13,
10743 &rinv21,&rinv22,&rinv23,
10744 &rinv31,&rinv32,&rinv33);
10746 rinvsq11 = vec_madd(rinv11,rinv11,nul);
10747 r11 = vec_madd(rsq11,rinv11,nul);
10748 r12 = vec_madd(rsq12,rinv12,nul);
10749 r13 = vec_madd(rsq13,rinv13,nul);
10750 r21 = vec_madd(rsq21,rinv21,nul);
10751 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
10752 r22 = vec_madd(rsq22,rinv22,nul);
10753 r23 = vec_madd(rsq23,rinv23,nul);
10754 r31 = vec_madd(rsq31,rinv31,nul);
10755 r32 = vec_madd(rsq32,rinv32,nul);
10756 r33 = vec_madd(rsq33,rinv33,nul);
10757 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
10759 do_4_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
10760 do_4_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
10761 do_4_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
10762 do_4_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
10763 do_4_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
10764 do_4_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
10765 do_4_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
10766 do_4_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
10767 do_4_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
10769 vnb6 = vec_madd(c6,rinvsix,nul);
10770 vnb12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),nul);
10771 fs11 = vec_madd(vec_twelve(),vnb12,nul);
10772 fs11c = vec_nmsub(qqOO,FF11c,nul);
10773 fs12 = vec_nmsub(qqOH,FF12c,nul);
10774 fs13 = vec_nmsub(qqOH,FF13c,nul);
10775 fs21 = vec_nmsub(qqOH,FF21c,nul);
10776 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
10777 fs22 = vec_nmsub(qqHH,FF22c,nul);
10778 fs23 = vec_nmsub(qqHH,FF23c,nul);
10779 fs31 = vec_nmsub(qqOH,FF31c,nul);
10780 fs32 = vec_nmsub(qqHH,FF32c,nul);
10781 fs11 = vec_madd(fs11,rinv11,nul);
10782 fs33 = vec_nmsub(qqHH,FF33c,nul);
10783 vnbtot = vec_add(vnbtot,vnb12);
10784 vnbtot = vec_sub(vnbtot,vnb6);
10785 vctot = vec_madd(qqOO,VV11c,vctot);
10786 vctot = vec_madd(qqOH,VV12c,vctot);
10787 vctot = vec_madd(qqOH,VV13c,vctot);
10788 vctot = vec_madd(qqOH,VV21c,vctot);
10789 vctot = vec_madd(qqHH,VV22c,vctot);
10790 vctot = vec_madd(qqHH,VV23c,vctot);
10791 vctot = vec_madd(qqOH,VV31c,vctot);
10792 vctot = vec_madd(qqHH,VV32c,vctot);
10793 vctot = vec_madd(qqHH,VV33c,vctot);
10795 fs11 = vec_madd(fs11c,tsc,fs11);
10796 fs12 = vec_madd(fs12,tsc,nul);
10797 fs13 = vec_madd(fs13,tsc,nul);
10798 fs21 = vec_madd(fs21,tsc,nul);
10799 fs22 = vec_madd(fs22,tsc,nul);
10800 fs23 = vec_madd(fs23,tsc,nul);
10801 fs31 = vec_madd(fs31,tsc,nul);
10802 fs32 = vec_madd(fs32,tsc,nul);
10803 fs33 = vec_madd(fs33,tsc,nul);
10805 fs11 = vec_madd(fs11,rinv11,nul);
10806 fs12 = vec_madd(fs12,rinv12,nul);
10807 fs13 = vec_madd(fs13,rinv13,nul);
10808 fs21 = vec_madd(fs21,rinv21,nul);
10809 fs22 = vec_madd(fs22,rinv22,nul);
10810 fs23 = vec_madd(fs23,rinv23,nul);
10811 fs31 = vec_madd(fs31,rinv31,nul);
10812 fs32 = vec_madd(fs32,rinv32,nul);
10813 fs33 = vec_madd(fs33,rinv33,nul);
10815 fix1 = vec_madd(fs11,dx11,fix1);
10816 fiy1 = vec_madd(fs11,dy11,fiy1);
10817 fiz1 = vec_madd(fs11,dz11,fiz1);
10818 fix2 = vec_madd(fs21,dx21,fix2);
10819 fiy2 = vec_madd(fs21,dy21,fiy2);
10820 fiz2 = vec_madd(fs21,dz21,fiz2);
10821 fix3 = vec_madd(fs31,dx31,fix3);
10822 fiy3 = vec_madd(fs31,dy31,fiy3);
10823 fiz3 = vec_madd(fs31,dz31,fiz3);
10825 fix1 = vec_madd(fs12,dx12,fix1);
10826 fiy1 = vec_madd(fs12,dy12,fiy1);
10827 fiz1 = vec_madd(fs12,dz12,fiz1);
10828 fix2 = vec_madd(fs22,dx22,fix2);
10829 fiy2 = vec_madd(fs22,dy22,fiy2);
10830 fiz2 = vec_madd(fs22,dz22,fiz2);
10831 fix3 = vec_madd(fs32,dx32,fix3);
10832 fiy3 = vec_madd(fs32,dy32,fiy3);
10833 fiz3 = vec_madd(fs32,dz32,fiz3);
10835 fix1 = vec_madd(fs13,dx13,fix1);
10836 fiy1 = vec_madd(fs13,dy13,fiy1);
10837 fiz1 = vec_madd(fs13,dz13,fiz1);
10838 fix2 = vec_madd(fs23,dx23,fix2);
10839 fiy2 = vec_madd(fs23,dy23,fiy2);
10840 fiz2 = vec_madd(fs23,dz23,fiz2);
10841 fix3 = vec_madd(fs33,dx33,fix3);
10842 fiy3 = vec_madd(fs33,dy33,fiy3);
10843 fiz3 = vec_madd(fs33,dz33,fiz3);
10845 fjx1 = vec_nmsub(fs11,dx11,nul);
10846 fjy1 = vec_nmsub(fs11,dy11,nul);
10847 fjz1 = vec_nmsub(fs11,dz11,nul);
10848 fjx2 = vec_nmsub(fs12,dx12,nul);
10849 fjy2 = vec_nmsub(fs12,dy12,nul);
10850 fjz2 = vec_nmsub(fs12,dz12,nul);
10851 fjx3 = vec_nmsub(fs13,dx13,nul);
10852 fjy3 = vec_nmsub(fs13,dy13,nul);
10853 fjz3 = vec_nmsub(fs13,dz13,nul);
10855 fjx1 = vec_nmsub(fs21,dx21,fjx1);
10856 fjy1 = vec_nmsub(fs21,dy21,fjy1);
10857 fjz1 = vec_nmsub(fs21,dz21,fjz1);
10858 fjx2 = vec_nmsub(fs22,dx22,fjx2);
10859 fjy2 = vec_nmsub(fs22,dy22,fjy2);
10860 fjz2 = vec_nmsub(fs22,dz22,fjz2);
10861 fjx3 = vec_nmsub(fs23,dx23,fjx3);
10862 fjy3 = vec_nmsub(fs23,dy23,fjy3);
10863 fjz3 = vec_nmsub(fs23,dz23,fjz3);
10865 fjx1 = vec_nmsub(fs31,dx31,fjx1);
10866 fjy1 = vec_nmsub(fs31,dy31,fjy1);
10867 fjz1 = vec_nmsub(fs31,dz31,fjz1);
10868 fjx2 = vec_nmsub(fs32,dx32,fjx2);
10869 fjy2 = vec_nmsub(fs32,dy32,fjy2);
10870 fjz2 = vec_nmsub(fs32,dz32,fjz2);
10871 fjx3 = vec_nmsub(fs33,dx33,fjx3);
10872 fjy3 = vec_nmsub(fs33,dy33,fjy3);
10873 fjz3 = vec_nmsub(fs33,dz33,fjz3);
10875 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
10876 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
10878 if(k<(nj1-2)) {
10879 jnra = jjnr[k];
10880 jnrb = jjnr[k+1];
10881 jnrc = jjnr[k+2];
10882 j3a = 3*jnra;
10883 j3b = 3*jnrb;
10884 j3c = 3*jnrc;
10885 load_3_water(pos+j3a,pos+j3b,pos+j3c,
10886 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
10887 qqOOt = vec_sld(qqOO,nul,4);
10888 qqOHt = vec_sld(qqOH,nul,4);
10889 qqHHt = vec_sld(qqHH,nul,4);
10890 c6t = vec_sld(c6,nul,4);
10891 c12t = vec_sld(c12,nul,4);
10893 dx11 = vec_sub(ix1,jx1);
10894 dx12 = vec_sub(ix1,jx2);
10895 dx13 = vec_sub(ix1,jx3);
10896 dy11 = vec_sub(iy1,jy1);
10897 dy12 = vec_sub(iy1,jy2);
10898 dy13 = vec_sub(iy1,jy3);
10899 dz11 = vec_sub(iz1,jz1);
10900 dz12 = vec_sub(iz1,jz2);
10901 dz13 = vec_sub(iz1,jz3);
10902 dx21 = vec_sub(ix2,jx1);
10903 dx22 = vec_sub(ix2,jx2);
10904 dx23 = vec_sub(ix2,jx3);
10905 dy21 = vec_sub(iy2,jy1);
10906 dy22 = vec_sub(iy2,jy2);
10907 dy23 = vec_sub(iy2,jy3);
10908 dz21 = vec_sub(iz2,jz1);
10909 dz22 = vec_sub(iz2,jz2);
10910 dz23 = vec_sub(iz2,jz3);
10911 dx31 = vec_sub(ix3,jx1);
10912 dx32 = vec_sub(ix3,jx2);
10913 dx33 = vec_sub(ix3,jx3);
10914 dy31 = vec_sub(iy3,jy1);
10915 dy32 = vec_sub(iy3,jy2);
10916 dy33 = vec_sub(iy3,jy3);
10917 dz31 = vec_sub(iz3,jz1);
10918 dz32 = vec_sub(iz3,jz2);
10919 dz33 = vec_sub(iz3,jz3);
10921 rsq11 = vec_madd(dx11,dx11,nul);
10922 rsq12 = vec_madd(dx12,dx12,nul);
10923 rsq13 = vec_madd(dx13,dx13,nul);
10924 rsq21 = vec_madd(dx21,dx21,nul);
10925 rsq22 = vec_madd(dx22,dx22,nul);
10926 rsq23 = vec_madd(dx23,dx23,nul);
10927 rsq31 = vec_madd(dx31,dx31,nul);
10928 rsq32 = vec_madd(dx32,dx32,nul);
10929 rsq33 = vec_madd(dx33,dx33,nul);
10930 rsq11 = vec_madd(dy11,dy11,rsq11);
10931 rsq12 = vec_madd(dy12,dy12,rsq12);
10932 rsq13 = vec_madd(dy13,dy13,rsq13);
10933 rsq21 = vec_madd(dy21,dy21,rsq21);
10934 rsq22 = vec_madd(dy22,dy22,rsq22);
10935 rsq23 = vec_madd(dy23,dy23,rsq23);
10936 rsq31 = vec_madd(dy31,dy31,rsq31);
10937 rsq32 = vec_madd(dy32,dy32,rsq32);
10938 rsq33 = vec_madd(dy33,dy33,rsq33);
10939 rsq11 = vec_madd(dz11,dz11,rsq11);
10940 rsq12 = vec_madd(dz12,dz12,rsq12);
10941 rsq13 = vec_madd(dz13,dz13,rsq13);
10942 rsq21 = vec_madd(dz21,dz21,rsq21);
10943 rsq22 = vec_madd(dz22,dz22,rsq22);
10944 rsq23 = vec_madd(dz23,dz23,rsq23);
10945 rsq31 = vec_madd(dz31,dz31,rsq31);
10946 rsq32 = vec_madd(dz32,dz32,rsq32);
10947 rsq33 = vec_madd(dz33,dz33,rsq33);
10949 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
10950 &rsq21,&rsq22,&rsq23,
10951 &rsq31,&rsq32,&rsq33);
10953 do_9_invsqrt(rsq11,rsq12,rsq13,
10954 rsq21,rsq22,rsq23,
10955 rsq31,rsq32,rsq33,
10956 &rinv11,&rinv12,&rinv13,
10957 &rinv21,&rinv22,&rinv23,
10958 &rinv31,&rinv32,&rinv33);
10960 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
10961 &rinv21,&rinv22,&rinv23,
10962 &rinv31,&rinv32,&rinv33);
10964 rinvsq11 = vec_madd(rinv11,rinv11,nul);
10965 r11 = vec_madd(rsq11,rinv11,nul);
10966 r12 = vec_madd(rsq12,rinv12,nul);
10967 r13 = vec_madd(rsq13,rinv13,nul);
10968 r21 = vec_madd(rsq21,rinv21,nul);
10969 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
10970 r22 = vec_madd(rsq22,rinv22,nul);
10971 r23 = vec_madd(rsq23,rinv23,nul);
10972 r31 = vec_madd(rsq31,rinv31,nul);
10973 r32 = vec_madd(rsq32,rinv32,nul);
10974 r33 = vec_madd(rsq33,rinv33,nul);
10975 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
10977 do_3_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
10978 do_3_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
10979 do_3_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
10980 do_3_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
10981 do_3_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
10982 do_3_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
10983 do_3_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
10984 do_3_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
10985 do_3_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
10987 vnb6 = vec_madd(c6t,rinvsix,nul);
10988 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
10989 fs11 = vec_madd(vec_twelve(),vnb12,nul);
10990 fs11c = vec_nmsub(qqOOt,FF11c,nul);
10991 fs12 = vec_nmsub(qqOHt,FF12c,nul);
10992 fs13 = vec_nmsub(qqOHt,FF13c,nul);
10993 fs21 = vec_nmsub(qqOHt,FF21c,nul);
10994 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
10995 fs22 = vec_nmsub(qqHHt,FF22c,nul);
10996 fs23 = vec_nmsub(qqHHt,FF23c,nul);
10997 fs31 = vec_nmsub(qqOHt,FF31c,nul);
10998 fs32 = vec_nmsub(qqHHt,FF32c,nul);
10999 fs11 = vec_madd(fs11,rinv11,nul);
11000 fs33 = vec_nmsub(qqHHt,FF33c,nul);
11001 vnbtot = vec_add(vnbtot,vnb12);
11002 vnbtot = vec_sub(vnbtot,vnb6);
11003 vctot = vec_madd(qqOOt,VV11c,vctot);
11004 vctot = vec_madd(qqOHt,VV12c,vctot);
11005 vctot = vec_madd(qqOHt,VV13c,vctot);
11006 vctot = vec_madd(qqOHt,VV21c,vctot);
11007 vctot = vec_madd(qqHHt,VV22c,vctot);
11008 vctot = vec_madd(qqHHt,VV23c,vctot);
11009 vctot = vec_madd(qqOHt,VV31c,vctot);
11010 vctot = vec_madd(qqHHt,VV32c,vctot);
11011 vctot = vec_madd(qqHHt,VV33c,vctot);
11013 fs11 = vec_madd(fs11c,tsc,fs11);
11014 fs12 = vec_madd(fs12,tsc,nul);
11015 fs13 = vec_madd(fs13,tsc,nul);
11016 fs21 = vec_madd(fs21,tsc,nul);
11017 fs22 = vec_madd(fs22,tsc,nul);
11018 fs23 = vec_madd(fs23,tsc,nul);
11019 fs31 = vec_madd(fs31,tsc,nul);
11020 fs32 = vec_madd(fs32,tsc,nul);
11021 fs33 = vec_madd(fs33,tsc,nul);
11023 fs11 = vec_madd(fs11,rinv11,nul);
11024 fs12 = vec_madd(fs12,rinv12,nul);
11025 fs13 = vec_madd(fs13,rinv13,nul);
11026 fs21 = vec_madd(fs21,rinv21,nul);
11027 fs22 = vec_madd(fs22,rinv22,nul);
11028 fs23 = vec_madd(fs23,rinv23,nul);
11029 fs31 = vec_madd(fs31,rinv31,nul);
11030 fs32 = vec_madd(fs32,rinv32,nul);
11031 fs33 = vec_madd(fs33,rinv33,nul);
11033 fix1 = vec_madd(fs11,dx11,fix1);
11034 fiy1 = vec_madd(fs11,dy11,fiy1);
11035 fiz1 = vec_madd(fs11,dz11,fiz1);
11036 fix2 = vec_madd(fs21,dx21,fix2);
11037 fiy2 = vec_madd(fs21,dy21,fiy2);
11038 fiz2 = vec_madd(fs21,dz21,fiz2);
11039 fix3 = vec_madd(fs31,dx31,fix3);
11040 fiy3 = vec_madd(fs31,dy31,fiy3);
11041 fiz3 = vec_madd(fs31,dz31,fiz3);
11043 fix1 = vec_madd(fs12,dx12,fix1);
11044 fiy1 = vec_madd(fs12,dy12,fiy1);
11045 fiz1 = vec_madd(fs12,dz12,fiz1);
11046 fix2 = vec_madd(fs22,dx22,fix2);
11047 fiy2 = vec_madd(fs22,dy22,fiy2);
11048 fiz2 = vec_madd(fs22,dz22,fiz2);
11049 fix3 = vec_madd(fs32,dx32,fix3);
11050 fiy3 = vec_madd(fs32,dy32,fiy3);
11051 fiz3 = vec_madd(fs32,dz32,fiz3);
11053 fix1 = vec_madd(fs13,dx13,fix1);
11054 fiy1 = vec_madd(fs13,dy13,fiy1);
11055 fiz1 = vec_madd(fs13,dz13,fiz1);
11056 fix2 = vec_madd(fs23,dx23,fix2);
11057 fiy2 = vec_madd(fs23,dy23,fiy2);
11058 fiz2 = vec_madd(fs23,dz23,fiz2);
11059 fix3 = vec_madd(fs33,dx33,fix3);
11060 fiy3 = vec_madd(fs33,dy33,fiy3);
11061 fiz3 = vec_madd(fs33,dz33,fiz3);
11063 fjx1 = vec_nmsub(fs11,dx11,nul);
11064 fjy1 = vec_nmsub(fs11,dy11,nul);
11065 fjz1 = vec_nmsub(fs11,dz11,nul);
11066 fjx2 = vec_nmsub(fs12,dx12,nul);
11067 fjy2 = vec_nmsub(fs12,dy12,nul);
11068 fjz2 = vec_nmsub(fs12,dz12,nul);
11069 fjx3 = vec_nmsub(fs13,dx13,nul);
11070 fjy3 = vec_nmsub(fs13,dy13,nul);
11071 fjz3 = vec_nmsub(fs13,dz13,nul);
11073 fjx1 = vec_nmsub(fs21,dx21,fjx1);
11074 fjy1 = vec_nmsub(fs21,dy21,fjy1);
11075 fjz1 = vec_nmsub(fs21,dz21,fjz1);
11076 fjx2 = vec_nmsub(fs22,dx22,fjx2);
11077 fjy2 = vec_nmsub(fs22,dy22,fjy2);
11078 fjz2 = vec_nmsub(fs22,dz22,fjz2);
11079 fjx3 = vec_nmsub(fs23,dx23,fjx3);
11080 fjy3 = vec_nmsub(fs23,dy23,fjy3);
11081 fjz3 = vec_nmsub(fs23,dz23,fjz3);
11083 fjx1 = vec_nmsub(fs31,dx31,fjx1);
11084 fjy1 = vec_nmsub(fs31,dy31,fjy1);
11085 fjz1 = vec_nmsub(fs31,dz31,fjz1);
11086 fjx2 = vec_nmsub(fs32,dx32,fjx2);
11087 fjy2 = vec_nmsub(fs32,dy32,fjy2);
11088 fjz2 = vec_nmsub(fs32,dz32,fjz2);
11089 fjx3 = vec_nmsub(fs33,dx33,fjx3);
11090 fjy3 = vec_nmsub(fs33,dy33,fjy3);
11091 fjz3 = vec_nmsub(fs33,dz33,fjz3);
11093 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
11094 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
11095 } else if(k<(nj1-1)) {
11096 jnra = jjnr[k];
11097 jnrb = jjnr[k+1];
11098 j3a = 3*jnra;
11099 j3b = 3*jnrb;
11100 load_2_water(pos+j3a,pos+j3b,
11101 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
11102 qqOOt = vec_sld(qqOO,nul,8);
11103 qqOHt = vec_sld(qqOH,nul,8);
11104 qqHHt = vec_sld(qqHH,nul,8);
11105 c6t = vec_sld(c6,nul,8);
11106 c12t = vec_sld(c12,nul,8);
11108 dx11 = vec_sub(ix1,jx1);
11109 dx12 = vec_sub(ix1,jx2);
11110 dx13 = vec_sub(ix1,jx3);
11111 dy11 = vec_sub(iy1,jy1);
11112 dy12 = vec_sub(iy1,jy2);
11113 dy13 = vec_sub(iy1,jy3);
11114 dz11 = vec_sub(iz1,jz1);
11115 dz12 = vec_sub(iz1,jz2);
11116 dz13 = vec_sub(iz1,jz3);
11117 dx21 = vec_sub(ix2,jx1);
11118 dx22 = vec_sub(ix2,jx2);
11119 dx23 = vec_sub(ix2,jx3);
11120 dy21 = vec_sub(iy2,jy1);
11121 dy22 = vec_sub(iy2,jy2);
11122 dy23 = vec_sub(iy2,jy3);
11123 dz21 = vec_sub(iz2,jz1);
11124 dz22 = vec_sub(iz2,jz2);
11125 dz23 = vec_sub(iz2,jz3);
11126 dx31 = vec_sub(ix3,jx1);
11127 dx32 = vec_sub(ix3,jx2);
11128 dx33 = vec_sub(ix3,jx3);
11129 dy31 = vec_sub(iy3,jy1);
11130 dy32 = vec_sub(iy3,jy2);
11131 dy33 = vec_sub(iy3,jy3);
11132 dz31 = vec_sub(iz3,jz1);
11133 dz32 = vec_sub(iz3,jz2);
11134 dz33 = vec_sub(iz3,jz3);
11136 rsq11 = vec_madd(dx11,dx11,nul);
11137 rsq12 = vec_madd(dx12,dx12,nul);
11138 rsq13 = vec_madd(dx13,dx13,nul);
11139 rsq21 = vec_madd(dx21,dx21,nul);
11140 rsq22 = vec_madd(dx22,dx22,nul);
11141 rsq23 = vec_madd(dx23,dx23,nul);
11142 rsq31 = vec_madd(dx31,dx31,nul);
11143 rsq32 = vec_madd(dx32,dx32,nul);
11144 rsq33 = vec_madd(dx33,dx33,nul);
11145 rsq11 = vec_madd(dy11,dy11,rsq11);
11146 rsq12 = vec_madd(dy12,dy12,rsq12);
11147 rsq13 = vec_madd(dy13,dy13,rsq13);
11148 rsq21 = vec_madd(dy21,dy21,rsq21);
11149 rsq22 = vec_madd(dy22,dy22,rsq22);
11150 rsq23 = vec_madd(dy23,dy23,rsq23);
11151 rsq31 = vec_madd(dy31,dy31,rsq31);
11152 rsq32 = vec_madd(dy32,dy32,rsq32);
11153 rsq33 = vec_madd(dy33,dy33,rsq33);
11154 rsq11 = vec_madd(dz11,dz11,rsq11);
11155 rsq12 = vec_madd(dz12,dz12,rsq12);
11156 rsq13 = vec_madd(dz13,dz13,rsq13);
11157 rsq21 = vec_madd(dz21,dz21,rsq21);
11158 rsq22 = vec_madd(dz22,dz22,rsq22);
11159 rsq23 = vec_madd(dz23,dz23,rsq23);
11160 rsq31 = vec_madd(dz31,dz31,rsq31);
11161 rsq32 = vec_madd(dz32,dz32,rsq32);
11162 rsq33 = vec_madd(dz33,dz33,rsq33);
11164 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
11165 &rsq21,&rsq22,&rsq23,
11166 &rsq31,&rsq32,&rsq33);
11168 do_9_invsqrt(rsq11,rsq12,rsq13,
11169 rsq21,rsq22,rsq23,
11170 rsq31,rsq32,rsq33,
11171 &rinv11,&rinv12,&rinv13,
11172 &rinv21,&rinv22,&rinv23,
11173 &rinv31,&rinv32,&rinv33);
11175 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
11176 &rinv21,&rinv22,&rinv23,
11177 &rinv31,&rinv32,&rinv33);
11179 rinvsq11 = vec_madd(rinv11,rinv11,nul);
11180 r11 = vec_madd(rsq11,rinv11,nul);
11181 r12 = vec_madd(rsq12,rinv12,nul);
11182 r13 = vec_madd(rsq13,rinv13,nul);
11183 r21 = vec_madd(rsq21,rinv21,nul);
11184 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
11185 r22 = vec_madd(rsq22,rinv22,nul);
11186 r23 = vec_madd(rsq23,rinv23,nul);
11187 r31 = vec_madd(rsq31,rinv31,nul);
11188 r32 = vec_madd(rsq32,rinv32,nul);
11189 r33 = vec_madd(rsq33,rinv33,nul);
11190 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
11192 do_2_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
11193 do_2_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
11194 do_2_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
11195 do_2_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
11196 do_2_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
11197 do_2_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
11198 do_2_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
11199 do_2_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
11200 do_2_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
11202 vnb6 = vec_madd(c6t,rinvsix,nul);
11203 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
11204 fs11 = vec_madd(vec_twelve(),vnb12,nul);
11205 fs11c = vec_nmsub(qqOOt,FF11c,nul);
11206 fs12 = vec_nmsub(qqOHt,FF12c,nul);
11207 fs13 = vec_nmsub(qqOHt,FF13c,nul);
11208 fs21 = vec_nmsub(qqOHt,FF21c,nul);
11209 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
11210 fs22 = vec_nmsub(qqHHt,FF22c,nul);
11211 fs23 = vec_nmsub(qqHHt,FF23c,nul);
11212 fs31 = vec_nmsub(qqOHt,FF31c,nul);
11213 fs32 = vec_nmsub(qqHHt,FF32c,nul);
11214 fs11 = vec_madd(fs11,rinv11,nul);
11215 fs33 = vec_nmsub(qqHHt,FF33c,nul);
11216 vnbtot = vec_add(vnbtot,vnb12);
11217 vnbtot = vec_sub(vnbtot,vnb6);
11218 vctot = vec_madd(qqOOt,VV11c,vctot);
11219 vctot = vec_madd(qqOHt,VV12c,vctot);
11220 vctot = vec_madd(qqOHt,VV13c,vctot);
11221 vctot = vec_madd(qqOHt,VV21c,vctot);
11222 vctot = vec_madd(qqHHt,VV22c,vctot);
11223 vctot = vec_madd(qqHHt,VV23c,vctot);
11224 vctot = vec_madd(qqOHt,VV31c,vctot);
11225 vctot = vec_madd(qqHHt,VV32c,vctot);
11226 vctot = vec_madd(qqHHt,VV33c,vctot);
11228 fs11 = vec_madd(fs11c,tsc,fs11);
11229 fs12 = vec_madd(fs12,tsc,nul);
11230 fs13 = vec_madd(fs13,tsc,nul);
11231 fs21 = vec_madd(fs21,tsc,nul);
11232 fs22 = vec_madd(fs22,tsc,nul);
11233 fs23 = vec_madd(fs23,tsc,nul);
11234 fs31 = vec_madd(fs31,tsc,nul);
11235 fs32 = vec_madd(fs32,tsc,nul);
11236 fs33 = vec_madd(fs33,tsc,nul);
11238 fs11 = vec_madd(fs11,rinv11,nul);
11239 fs12 = vec_madd(fs12,rinv12,nul);
11240 fs13 = vec_madd(fs13,rinv13,nul);
11241 fs21 = vec_madd(fs21,rinv21,nul);
11242 fs22 = vec_madd(fs22,rinv22,nul);
11243 fs23 = vec_madd(fs23,rinv23,nul);
11244 fs31 = vec_madd(fs31,rinv31,nul);
11245 fs32 = vec_madd(fs32,rinv32,nul);
11246 fs33 = vec_madd(fs33,rinv33,nul);
11248 fix1 = vec_madd(fs11,dx11,fix1);
11249 fiy1 = vec_madd(fs11,dy11,fiy1);
11250 fiz1 = vec_madd(fs11,dz11,fiz1);
11251 fix2 = vec_madd(fs21,dx21,fix2);
11252 fiy2 = vec_madd(fs21,dy21,fiy2);
11253 fiz2 = vec_madd(fs21,dz21,fiz2);
11254 fix3 = vec_madd(fs31,dx31,fix3);
11255 fiy3 = vec_madd(fs31,dy31,fiy3);
11256 fiz3 = vec_madd(fs31,dz31,fiz3);
11258 fix1 = vec_madd(fs12,dx12,fix1);
11259 fiy1 = vec_madd(fs12,dy12,fiy1);
11260 fiz1 = vec_madd(fs12,dz12,fiz1);
11261 fix2 = vec_madd(fs22,dx22,fix2);
11262 fiy2 = vec_madd(fs22,dy22,fiy2);
11263 fiz2 = vec_madd(fs22,dz22,fiz2);
11264 fix3 = vec_madd(fs32,dx32,fix3);
11265 fiy3 = vec_madd(fs32,dy32,fiy3);
11266 fiz3 = vec_madd(fs32,dz32,fiz3);
11268 fix1 = vec_madd(fs13,dx13,fix1);
11269 fiy1 = vec_madd(fs13,dy13,fiy1);
11270 fiz1 = vec_madd(fs13,dz13,fiz1);
11271 fix2 = vec_madd(fs23,dx23,fix2);
11272 fiy2 = vec_madd(fs23,dy23,fiy2);
11273 fiz2 = vec_madd(fs23,dz23,fiz2);
11274 fix3 = vec_madd(fs33,dx33,fix3);
11275 fiy3 = vec_madd(fs33,dy33,fiy3);
11276 fiz3 = vec_madd(fs33,dz33,fiz3);
11278 fjx1 = vec_nmsub(fs11,dx11,nul);
11279 fjy1 = vec_nmsub(fs11,dy11,nul);
11280 fjz1 = vec_nmsub(fs11,dz11,nul);
11281 fjx2 = vec_nmsub(fs12,dx12,nul);
11282 fjy2 = vec_nmsub(fs12,dy12,nul);
11283 fjz2 = vec_nmsub(fs12,dz12,nul);
11284 fjx3 = vec_nmsub(fs13,dx13,nul);
11285 fjy3 = vec_nmsub(fs13,dy13,nul);
11286 fjz3 = vec_nmsub(fs13,dz13,nul);
11288 fjx1 = vec_nmsub(fs21,dx21,fjx1);
11289 fjy1 = vec_nmsub(fs21,dy21,fjy1);
11290 fjz1 = vec_nmsub(fs21,dz21,fjz1);
11291 fjx2 = vec_nmsub(fs22,dx22,fjx2);
11292 fjy2 = vec_nmsub(fs22,dy22,fjy2);
11293 fjz2 = vec_nmsub(fs22,dz22,fjz2);
11294 fjx3 = vec_nmsub(fs23,dx23,fjx3);
11295 fjy3 = vec_nmsub(fs23,dy23,fjy3);
11296 fjz3 = vec_nmsub(fs23,dz23,fjz3);
11298 fjx1 = vec_nmsub(fs31,dx31,fjx1);
11299 fjy1 = vec_nmsub(fs31,dy31,fjy1);
11300 fjz1 = vec_nmsub(fs31,dz31,fjz1);
11301 fjx2 = vec_nmsub(fs32,dx32,fjx2);
11302 fjy2 = vec_nmsub(fs32,dy32,fjy2);
11303 fjz2 = vec_nmsub(fs32,dz32,fjz2);
11304 fjx3 = vec_nmsub(fs33,dx33,fjx3);
11305 fjy3 = vec_nmsub(fs33,dy33,fjy3);
11306 fjz3 = vec_nmsub(fs33,dz33,fjz3);
11308 add_force_to_2_water(faction+j3a,faction+j3b,
11309 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
11310 } else if(k<nj1) {
11311 jnra = jjnr[k];
11312 j3a = 3*jnra;
11313 load_1_water(pos+j3a,
11314 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
11315 qqOOt = vec_sld(qqOO,nul,12);
11316 qqOHt = vec_sld(qqOH,nul,12);
11317 qqHHt = vec_sld(qqHH,nul,12);
11318 c6t = vec_sld(c6,nul,12);
11319 c12t = vec_sld(c12,nul,12);
11321 dx11 = vec_sub(ix1,jx1);
11322 dx12 = vec_sub(ix1,jx2);
11323 dx13 = vec_sub(ix1,jx3);
11324 dy11 = vec_sub(iy1,jy1);
11325 dy12 = vec_sub(iy1,jy2);
11326 dy13 = vec_sub(iy1,jy3);
11327 dz11 = vec_sub(iz1,jz1);
11328 dz12 = vec_sub(iz1,jz2);
11329 dz13 = vec_sub(iz1,jz3);
11330 dx21 = vec_sub(ix2,jx1);
11331 dx22 = vec_sub(ix2,jx2);
11332 dx23 = vec_sub(ix2,jx3);
11333 dy21 = vec_sub(iy2,jy1);
11334 dy22 = vec_sub(iy2,jy2);
11335 dy23 = vec_sub(iy2,jy3);
11336 dz21 = vec_sub(iz2,jz1);
11337 dz22 = vec_sub(iz2,jz2);
11338 dz23 = vec_sub(iz2,jz3);
11339 dx31 = vec_sub(ix3,jx1);
11340 dx32 = vec_sub(ix3,jx2);
11341 dx33 = vec_sub(ix3,jx3);
11342 dy31 = vec_sub(iy3,jy1);
11343 dy32 = vec_sub(iy3,jy2);
11344 dy33 = vec_sub(iy3,jy3);
11345 dz31 = vec_sub(iz3,jz1);
11346 dz32 = vec_sub(iz3,jz2);
11347 dz33 = vec_sub(iz3,jz3);
11349 rsq11 = vec_madd(dx11,dx11,nul);
11350 rsq12 = vec_madd(dx12,dx12,nul);
11351 rsq13 = vec_madd(dx13,dx13,nul);
11352 rsq21 = vec_madd(dx21,dx21,nul);
11353 rsq22 = vec_madd(dx22,dx22,nul);
11354 rsq23 = vec_madd(dx23,dx23,nul);
11355 rsq31 = vec_madd(dx31,dx31,nul);
11356 rsq32 = vec_madd(dx32,dx32,nul);
11357 rsq33 = vec_madd(dx33,dx33,nul);
11358 rsq11 = vec_madd(dy11,dy11,rsq11);
11359 rsq12 = vec_madd(dy12,dy12,rsq12);
11360 rsq13 = vec_madd(dy13,dy13,rsq13);
11361 rsq21 = vec_madd(dy21,dy21,rsq21);
11362 rsq22 = vec_madd(dy22,dy22,rsq22);
11363 rsq23 = vec_madd(dy23,dy23,rsq23);
11364 rsq31 = vec_madd(dy31,dy31,rsq31);
11365 rsq32 = vec_madd(dy32,dy32,rsq32);
11366 rsq33 = vec_madd(dy33,dy33,rsq33);
11367 rsq11 = vec_madd(dz11,dz11,rsq11);
11368 rsq12 = vec_madd(dz12,dz12,rsq12);
11369 rsq13 = vec_madd(dz13,dz13,rsq13);
11370 rsq21 = vec_madd(dz21,dz21,rsq21);
11371 rsq22 = vec_madd(dz22,dz22,rsq22);
11372 rsq23 = vec_madd(dz23,dz23,rsq23);
11373 rsq31 = vec_madd(dz31,dz31,rsq31);
11374 rsq32 = vec_madd(dz32,dz32,rsq32);
11375 rsq33 = vec_madd(dz33,dz33,rsq33);
11377 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
11378 &rsq21,&rsq22,&rsq23,
11379 &rsq31,&rsq32,&rsq33);
11381 do_9_invsqrt(rsq11,rsq12,rsq13,
11382 rsq21,rsq22,rsq23,
11383 rsq31,rsq32,rsq33,
11384 &rinv11,&rinv12,&rinv13,
11385 &rinv21,&rinv22,&rinv23,
11386 &rinv31,&rinv32,&rinv33);
11388 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
11389 &rinv21,&rinv22,&rinv23,
11390 &rinv31,&rinv32,&rinv33);
11392 rinvsq11 = vec_madd(rinv11,rinv11,nul);
11393 r11 = vec_madd(rsq11,rinv11,nul);
11394 r12 = vec_madd(rsq12,rinv12,nul);
11395 r13 = vec_madd(rsq13,rinv13,nul);
11396 r21 = vec_madd(rsq21,rinv21,nul);
11397 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
11398 r22 = vec_madd(rsq22,rinv22,nul);
11399 r23 = vec_madd(rsq23,rinv23,nul);
11400 r31 = vec_madd(rsq31,rinv31,nul);
11401 r32 = vec_madd(rsq32,rinv32,nul);
11402 r33 = vec_madd(rsq33,rinv33,nul);
11403 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
11405 do_1_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c,&FF11c);
11406 do_1_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
11407 do_1_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
11408 do_1_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
11409 do_1_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
11410 do_1_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
11411 do_1_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
11412 do_1_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
11413 do_1_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
11415 vnb6 = vec_madd(c6t,rinvsix,nul);
11416 vnb12 = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),nul);
11417 fs11 = vec_madd(vec_twelve(),vnb12,nul);
11418 fs11c = vec_nmsub(qqOOt,FF11c,nul);
11419 fs12 = vec_nmsub(qqOHt,FF12c,nul);
11420 fs13 = vec_nmsub(qqOHt,FF13c,nul);
11421 fs21 = vec_nmsub(qqOHt,FF21c,nul);
11422 fs11 = vec_nmsub(vec_six(),vnb6,fs11);
11423 fs22 = vec_nmsub(qqHHt,FF22c,nul);
11424 fs23 = vec_nmsub(qqHHt,FF23c,nul);
11425 fs31 = vec_nmsub(qqOHt,FF31c,nul);
11426 fs32 = vec_nmsub(qqHHt,FF32c,nul);
11427 fs11 = vec_madd(fs11,rinv11,nul);
11428 fs33 = vec_nmsub(qqHHt,FF33c,nul);
11429 vnbtot = vec_add(vnbtot,vnb12);
11430 vnbtot = vec_sub(vnbtot,vnb6);
11431 vctot = vec_madd(qqOOt,VV11c,vctot);
11432 vctot = vec_madd(qqOHt,VV12c,vctot);
11433 vctot = vec_madd(qqOHt,VV13c,vctot);
11434 vctot = vec_madd(qqOHt,VV21c,vctot);
11435 vctot = vec_madd(qqHHt,VV22c,vctot);
11436 vctot = vec_madd(qqHHt,VV23c,vctot);
11437 vctot = vec_madd(qqOHt,VV31c,vctot);
11438 vctot = vec_madd(qqHHt,VV32c,vctot);
11439 vctot = vec_madd(qqHHt,VV33c,vctot);
11441 fs11 = vec_madd(fs11c,tsc,fs11);
11442 fs12 = vec_madd(fs12,tsc,nul);
11443 fs13 = vec_madd(fs13,tsc,nul);
11444 fs21 = vec_madd(fs21,tsc,nul);
11445 fs22 = vec_madd(fs22,tsc,nul);
11446 fs23 = vec_madd(fs23,tsc,nul);
11447 fs31 = vec_madd(fs31,tsc,nul);
11448 fs32 = vec_madd(fs32,tsc,nul);
11449 fs33 = vec_madd(fs33,tsc,nul);
11451 fs11 = vec_madd(fs11,rinv11,nul);
11452 fs12 = vec_madd(fs12,rinv12,nul);
11453 fs13 = vec_madd(fs13,rinv13,nul);
11454 fs21 = vec_madd(fs21,rinv21,nul);
11455 fs22 = vec_madd(fs22,rinv22,nul);
11456 fs23 = vec_madd(fs23,rinv23,nul);
11457 fs31 = vec_madd(fs31,rinv31,nul);
11458 fs32 = vec_madd(fs32,rinv32,nul);
11459 fs33 = vec_madd(fs33,rinv33,nul);
11461 fix1 = vec_madd(fs11,dx11,fix1);
11462 fiy1 = vec_madd(fs11,dy11,fiy1);
11463 fiz1 = vec_madd(fs11,dz11,fiz1);
11464 fix2 = vec_madd(fs21,dx21,fix2);
11465 fiy2 = vec_madd(fs21,dy21,fiy2);
11466 fiz2 = vec_madd(fs21,dz21,fiz2);
11467 fix3 = vec_madd(fs31,dx31,fix3);
11468 fiy3 = vec_madd(fs31,dy31,fiy3);
11469 fiz3 = vec_madd(fs31,dz31,fiz3);
11471 fix1 = vec_madd(fs12,dx12,fix1);
11472 fiy1 = vec_madd(fs12,dy12,fiy1);
11473 fiz1 = vec_madd(fs12,dz12,fiz1);
11474 fix2 = vec_madd(fs22,dx22,fix2);
11475 fiy2 = vec_madd(fs22,dy22,fiy2);
11476 fiz2 = vec_madd(fs22,dz22,fiz2);
11477 fix3 = vec_madd(fs32,dx32,fix3);
11478 fiy3 = vec_madd(fs32,dy32,fiy3);
11479 fiz3 = vec_madd(fs32,dz32,fiz3);
11481 fix1 = vec_madd(fs13,dx13,fix1);
11482 fiy1 = vec_madd(fs13,dy13,fiy1);
11483 fiz1 = vec_madd(fs13,dz13,fiz1);
11484 fix2 = vec_madd(fs23,dx23,fix2);
11485 fiy2 = vec_madd(fs23,dy23,fiy2);
11486 fiz2 = vec_madd(fs23,dz23,fiz2);
11487 fix3 = vec_madd(fs33,dx33,fix3);
11488 fiy3 = vec_madd(fs33,dy33,fiy3);
11489 fiz3 = vec_madd(fs33,dz33,fiz3);
11491 fjx1 = vec_nmsub(fs11,dx11,nul);
11492 fjy1 = vec_nmsub(fs11,dy11,nul);
11493 fjz1 = vec_nmsub(fs11,dz11,nul);
11494 fjx2 = vec_nmsub(fs12,dx12,nul);
11495 fjy2 = vec_nmsub(fs12,dy12,nul);
11496 fjz2 = vec_nmsub(fs12,dz12,nul);
11497 fjx3 = vec_nmsub(fs13,dx13,nul);
11498 fjy3 = vec_nmsub(fs13,dy13,nul);
11499 fjz3 = vec_nmsub(fs13,dz13,nul);
11501 fjx1 = vec_nmsub(fs21,dx21,fjx1);
11502 fjy1 = vec_nmsub(fs21,dy21,fjy1);
11503 fjz1 = vec_nmsub(fs21,dz21,fjz1);
11504 fjx2 = vec_nmsub(fs22,dx22,fjx2);
11505 fjy2 = vec_nmsub(fs22,dy22,fjy2);
11506 fjz2 = vec_nmsub(fs22,dz22,fjz2);
11507 fjx3 = vec_nmsub(fs23,dx23,fjx3);
11508 fjy3 = vec_nmsub(fs23,dy23,fjy3);
11509 fjz3 = vec_nmsub(fs23,dz23,fjz3);
11511 fjx1 = vec_nmsub(fs31,dx31,fjx1);
11512 fjy1 = vec_nmsub(fs31,dy31,fjy1);
11513 fjz1 = vec_nmsub(fs31,dz31,fjz1);
11514 fjx2 = vec_nmsub(fs32,dx32,fjx2);
11515 fjy2 = vec_nmsub(fs32,dy32,fjy2);
11516 fjz2 = vec_nmsub(fs32,dz32,fjz2);
11517 fjx3 = vec_nmsub(fs33,dx33,fjx3);
11518 fjy3 = vec_nmsub(fs33,dy33,fjy3);
11519 fjz3 = vec_nmsub(fs33,dz33,fjz3);
11521 add_force_to_1_water(faction+j3a,
11522 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
11524 /* update outer data */
11525 update_i_water_forces(faction+ii3,fshift+is3,
11526 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
11528 add_vector_to_float(Vc+gid[n],vctot);
11529 add_vector_to_float(Vnb+gid[n],vnbtot);
11534 void inl3330_altivec(
11535 int nri,
11536 int iinr[],
11537 int jindex[],
11538 int jjnr[],
11539 int shift[],
11540 float shiftvec[],
11541 float fshift[],
11542 int gid[],
11543 float pos[],
11544 float faction[],
11545 float charge[],
11546 float facel,
11547 float Vc[],
11548 int type[],
11549 int ntype,
11550 float nbfp[],
11551 float Vnb[],
11552 float tabscale,
11553 float VFtab[])
11555 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
11556 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
11558 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
11559 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
11560 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
11562 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
11563 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
11564 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
11565 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33;
11567 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
11568 vector float fs11,fs12,fs13,fs21,fs22,fs23,fs31,fs32,fs33;
11569 vector float fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
11570 vector float fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
11571 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12;
11572 vector float vnb6,vnb12,vnbtot,tsc,qqOOt,qqOHt,qqHHt,c6t,c12t;
11573 vector float VV11c,FF11c,VV12c,FF12c,VV13c,FF13c;
11574 vector float VV21c,FF21c,VV22c,FF22c,VV23c,FF23c;
11575 vector float VV31c,FF31c,VV32c,FF32c,VV33c,FF33c;
11576 vector float VVd,FFd,VVr,FFr;
11578 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
11579 int jnra,jnrb,jnrc,jnrd,tp,tj;
11580 int j3a,j3b,j3c,j3d;
11582 nul=vec_zero();
11583 vfacel=load_float_and_splat(&facel);
11584 tsc=load_float_and_splat(&tabscale);
11585 ii = iinr[0];
11586 qO = load_float_and_splat(charge+ii);
11587 qH = load_float_and_splat(charge+ii+1);
11588 qqOO = vec_madd(qO,qO,nul);
11589 qqOH = vec_madd(qO,qH,nul);
11590 qqHH = vec_madd(qH,qH,nul);
11591 qqOO = vec_madd(qqOO,vfacel,nul);
11592 qqOH = vec_madd(qqOH,vfacel,nul);
11593 qqHH = vec_madd(qqHH,vfacel,nul);
11594 tp = 2*type[ii];
11595 tj = (ntype+1)*tp;
11596 load_1_pair(nbfp+tj,&c6,&c12);
11597 c6 = vec_splat(c6,0);
11598 c12 = vec_splat(c12,0);
11600 for(n=0;n<nri;n++) {
11601 is3 = 3*shift[n];
11602 ii = iinr[n];
11603 ii3 = 3*ii;
11604 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
11605 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
11606 vctot = nul;
11607 vnbtot = nul;
11608 fix1 = nul;
11609 fiy1 = nul;
11610 fiz1 = nul;
11611 fix2 = nul;
11612 fiy2 = nul;
11613 fiz2 = nul;
11614 fix3 = nul;
11615 fiy3 = nul;
11616 fiz3 = nul;
11617 nj0 = jindex[n];
11618 nj1 = jindex[n+1];
11620 for(k=nj0; k<(nj1-3); k+=4) {
11621 jnra = jjnr[k];
11622 jnrb = jjnr[k+1];
11623 jnrc = jjnr[k+2];
11624 jnrd = jjnr[k+3];
11625 j3a = 3*jnra;
11626 j3b = 3*jnrb;
11627 j3c = 3*jnrc;
11628 j3d = 3*jnrd;
11629 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
11630 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
11631 dx11 = vec_sub(ix1,jx1);
11632 dx12 = vec_sub(ix1,jx2);
11633 dx13 = vec_sub(ix1,jx3);
11634 dy11 = vec_sub(iy1,jy1);
11635 dy12 = vec_sub(iy1,jy2);
11636 dy13 = vec_sub(iy1,jy3);
11637 dz11 = vec_sub(iz1,jz1);
11638 dz12 = vec_sub(iz1,jz2);
11639 dz13 = vec_sub(iz1,jz3);
11640 dx21 = vec_sub(ix2,jx1);
11641 dx22 = vec_sub(ix2,jx2);
11642 dx23 = vec_sub(ix2,jx3);
11643 dy21 = vec_sub(iy2,jy1);
11644 dy22 = vec_sub(iy2,jy2);
11645 dy23 = vec_sub(iy2,jy3);
11646 dz21 = vec_sub(iz2,jz1);
11647 dz22 = vec_sub(iz2,jz2);
11648 dz23 = vec_sub(iz2,jz3);
11649 dx31 = vec_sub(ix3,jx1);
11650 dx32 = vec_sub(ix3,jx2);
11651 dx33 = vec_sub(ix3,jx3);
11652 dy31 = vec_sub(iy3,jy1);
11653 dy32 = vec_sub(iy3,jy2);
11654 dy33 = vec_sub(iy3,jy3);
11655 dz31 = vec_sub(iz3,jz1);
11656 dz32 = vec_sub(iz3,jz2);
11657 dz33 = vec_sub(iz3,jz3);
11659 rsq11 = vec_madd(dx11,dx11,nul);
11660 rsq12 = vec_madd(dx12,dx12,nul);
11661 rsq13 = vec_madd(dx13,dx13,nul);
11662 rsq21 = vec_madd(dx21,dx21,nul);
11663 rsq22 = vec_madd(dx22,dx22,nul);
11664 rsq23 = vec_madd(dx23,dx23,nul);
11665 rsq31 = vec_madd(dx31,dx31,nul);
11666 rsq32 = vec_madd(dx32,dx32,nul);
11667 rsq33 = vec_madd(dx33,dx33,nul);
11668 rsq11 = vec_madd(dy11,dy11,rsq11);
11669 rsq12 = vec_madd(dy12,dy12,rsq12);
11670 rsq13 = vec_madd(dy13,dy13,rsq13);
11671 rsq21 = vec_madd(dy21,dy21,rsq21);
11672 rsq22 = vec_madd(dy22,dy22,rsq22);
11673 rsq23 = vec_madd(dy23,dy23,rsq23);
11674 rsq31 = vec_madd(dy31,dy31,rsq31);
11675 rsq32 = vec_madd(dy32,dy32,rsq32);
11676 rsq33 = vec_madd(dy33,dy33,rsq33);
11677 rsq11 = vec_madd(dz11,dz11,rsq11);
11678 rsq12 = vec_madd(dz12,dz12,rsq12);
11679 rsq13 = vec_madd(dz13,dz13,rsq13);
11680 rsq21 = vec_madd(dz21,dz21,rsq21);
11681 rsq22 = vec_madd(dz22,dz22,rsq22);
11682 rsq23 = vec_madd(dz23,dz23,rsq23);
11683 rsq31 = vec_madd(dz31,dz31,rsq31);
11684 rsq32 = vec_madd(dz32,dz32,rsq32);
11685 rsq33 = vec_madd(dz33,dz33,rsq33);
11687 do_9_invsqrt(rsq11,rsq12,rsq13,
11688 rsq21,rsq22,rsq23,
11689 rsq31,rsq32,rsq33,
11690 &rinv11,&rinv12,&rinv13,
11691 &rinv21,&rinv22,&rinv23,
11692 &rinv31,&rinv32,&rinv33);
11694 r11 = vec_madd(rsq11,rinv11,nul);
11695 r12 = vec_madd(rsq12,rinv12,nul);
11696 r13 = vec_madd(rsq13,rinv13,nul);
11697 r21 = vec_madd(rsq21,rinv21,nul);
11698 r22 = vec_madd(rsq22,rinv22,nul);
11699 r23 = vec_madd(rsq23,rinv23,nul);
11700 r31 = vec_madd(rsq31,rinv31,nul);
11701 r32 = vec_madd(rsq32,rinv32,nul);
11702 r33 = vec_madd(rsq33,rinv33,nul);
11704 do_4_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
11705 &VV11c,&FF11c,&VVd,&FFd,&VVr,&FFr);
11706 do_4_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
11707 do_4_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
11708 do_4_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
11709 do_4_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
11710 do_4_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
11711 do_4_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
11712 do_4_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
11713 do_4_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
11715 vnbtot = vec_madd(c6,VVd,vnbtot);
11716 vnbtot = vec_madd(c12,VVr,vnbtot);
11718 fs11 = vec_nmsub(qqOO,FF11c,nul);
11719 fs12 = vec_nmsub(qqOH,FF12c,nul);
11720 fs13 = vec_nmsub(qqOH,FF13c,nul);
11721 fs21 = vec_nmsub(qqOH,FF21c,nul);
11722 fs11 = vec_nmsub(c6,FFd,fs11);
11723 fs22 = vec_nmsub(qqHH,FF22c,nul);
11724 fs23 = vec_nmsub(qqHH,FF23c,nul);
11725 fs31 = vec_nmsub(qqOH,FF31c,nul);
11726 fs32 = vec_nmsub(qqHH,FF32c,nul);
11727 fs33 = vec_nmsub(qqHH,FF33c,nul);
11728 fs11 = vec_nmsub(c12,FFr,fs11);
11730 vctot = vec_madd(qqOO,VV11c,vctot);
11731 vctot = vec_madd(qqOH,VV12c,vctot);
11732 vctot = vec_madd(qqOH,VV13c,vctot);
11733 vctot = vec_madd(qqOH,VV21c,vctot);
11734 vctot = vec_madd(qqHH,VV22c,vctot);
11735 vctot = vec_madd(qqHH,VV23c,vctot);
11736 vctot = vec_madd(qqOH,VV31c,vctot);
11737 vctot = vec_madd(qqHH,VV32c,vctot);
11738 vctot = vec_madd(qqHH,VV33c,vctot);
11740 fs11 = vec_madd(fs11,tsc,nul);
11741 fs12 = vec_madd(fs12,tsc,nul);
11742 fs13 = vec_madd(fs13,tsc,nul);
11743 fs21 = vec_madd(fs21,tsc,nul);
11744 fs22 = vec_madd(fs22,tsc,nul);
11745 fs23 = vec_madd(fs23,tsc,nul);
11746 fs31 = vec_madd(fs31,tsc,nul);
11747 fs32 = vec_madd(fs32,tsc,nul);
11748 fs33 = vec_madd(fs33,tsc,nul);
11750 fs11 = vec_madd(fs11,rinv11,nul);
11751 fs12 = vec_madd(fs12,rinv12,nul);
11752 fs13 = vec_madd(fs13,rinv13,nul);
11753 fs21 = vec_madd(fs21,rinv21,nul);
11754 fs22 = vec_madd(fs22,rinv22,nul);
11755 fs23 = vec_madd(fs23,rinv23,nul);
11756 fs31 = vec_madd(fs31,rinv31,nul);
11757 fs32 = vec_madd(fs32,rinv32,nul);
11758 fs33 = vec_madd(fs33,rinv33,nul);
11760 fix1 = vec_madd(fs11,dx11,fix1);
11761 fiy1 = vec_madd(fs11,dy11,fiy1);
11762 fiz1 = vec_madd(fs11,dz11,fiz1);
11763 fix2 = vec_madd(fs21,dx21,fix2);
11764 fiy2 = vec_madd(fs21,dy21,fiy2);
11765 fiz2 = vec_madd(fs21,dz21,fiz2);
11766 fix3 = vec_madd(fs31,dx31,fix3);
11767 fiy3 = vec_madd(fs31,dy31,fiy3);
11768 fiz3 = vec_madd(fs31,dz31,fiz3);
11770 fix1 = vec_madd(fs12,dx12,fix1);
11771 fiy1 = vec_madd(fs12,dy12,fiy1);
11772 fiz1 = vec_madd(fs12,dz12,fiz1);
11773 fix2 = vec_madd(fs22,dx22,fix2);
11774 fiy2 = vec_madd(fs22,dy22,fiy2);
11775 fiz2 = vec_madd(fs22,dz22,fiz2);
11776 fix3 = vec_madd(fs32,dx32,fix3);
11777 fiy3 = vec_madd(fs32,dy32,fiy3);
11778 fiz3 = vec_madd(fs32,dz32,fiz3);
11780 fix1 = vec_madd(fs13,dx13,fix1);
11781 fiy1 = vec_madd(fs13,dy13,fiy1);
11782 fiz1 = vec_madd(fs13,dz13,fiz1);
11783 fix2 = vec_madd(fs23,dx23,fix2);
11784 fiy2 = vec_madd(fs23,dy23,fiy2);
11785 fiz2 = vec_madd(fs23,dz23,fiz2);
11786 fix3 = vec_madd(fs33,dx33,fix3);
11787 fiy3 = vec_madd(fs33,dy33,fiy3);
11788 fiz3 = vec_madd(fs33,dz33,fiz3);
11790 fjx1 = vec_nmsub(fs11,dx11,nul);
11791 fjy1 = vec_nmsub(fs11,dy11,nul);
11792 fjz1 = vec_nmsub(fs11,dz11,nul);
11793 fjx2 = vec_nmsub(fs12,dx12,nul);
11794 fjy2 = vec_nmsub(fs12,dy12,nul);
11795 fjz2 = vec_nmsub(fs12,dz12,nul);
11796 fjx3 = vec_nmsub(fs13,dx13,nul);
11797 fjy3 = vec_nmsub(fs13,dy13,nul);
11798 fjz3 = vec_nmsub(fs13,dz13,nul);
11800 fjx1 = vec_nmsub(fs21,dx21,fjx1);
11801 fjy1 = vec_nmsub(fs21,dy21,fjy1);
11802 fjz1 = vec_nmsub(fs21,dz21,fjz1);
11803 fjx2 = vec_nmsub(fs22,dx22,fjx2);
11804 fjy2 = vec_nmsub(fs22,dy22,fjy2);
11805 fjz2 = vec_nmsub(fs22,dz22,fjz2);
11806 fjx3 = vec_nmsub(fs23,dx23,fjx3);
11807 fjy3 = vec_nmsub(fs23,dy23,fjy3);
11808 fjz3 = vec_nmsub(fs23,dz23,fjz3);
11810 fjx1 = vec_nmsub(fs31,dx31,fjx1);
11811 fjy1 = vec_nmsub(fs31,dy31,fjy1);
11812 fjz1 = vec_nmsub(fs31,dz31,fjz1);
11813 fjx2 = vec_nmsub(fs32,dx32,fjx2);
11814 fjy2 = vec_nmsub(fs32,dy32,fjy2);
11815 fjz2 = vec_nmsub(fs32,dz32,fjz2);
11816 fjx3 = vec_nmsub(fs33,dx33,fjx3);
11817 fjy3 = vec_nmsub(fs33,dy33,fjy3);
11818 fjz3 = vec_nmsub(fs33,dz33,fjz3);
11820 add_force_to_4_water(faction+j3a,faction+j3b,faction+j3c,faction+j3d,
11821 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
11823 if(k<(nj1-2)) {
11824 jnra = jjnr[k];
11825 jnrb = jjnr[k+1];
11826 jnrc = jjnr[k+2];
11827 j3a = 3*jnra;
11828 j3b = 3*jnrb;
11829 j3c = 3*jnrc;
11830 load_3_water(pos+j3a,pos+j3b,pos+j3c,
11831 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
11832 qqOOt = vec_sld(qqOO,nul,4);
11833 qqOHt = vec_sld(qqOH,nul,4);
11834 qqHHt = vec_sld(qqHH,nul,4);
11835 c6t = vec_sld(c6,nul,4);
11836 c12t = vec_sld(c12,nul,4);
11838 dx11 = vec_sub(ix1,jx1);
11839 dx12 = vec_sub(ix1,jx2);
11840 dx13 = vec_sub(ix1,jx3);
11841 dy11 = vec_sub(iy1,jy1);
11842 dy12 = vec_sub(iy1,jy2);
11843 dy13 = vec_sub(iy1,jy3);
11844 dz11 = vec_sub(iz1,jz1);
11845 dz12 = vec_sub(iz1,jz2);
11846 dz13 = vec_sub(iz1,jz3);
11847 dx21 = vec_sub(ix2,jx1);
11848 dx22 = vec_sub(ix2,jx2);
11849 dx23 = vec_sub(ix2,jx3);
11850 dy21 = vec_sub(iy2,jy1);
11851 dy22 = vec_sub(iy2,jy2);
11852 dy23 = vec_sub(iy2,jy3);
11853 dz21 = vec_sub(iz2,jz1);
11854 dz22 = vec_sub(iz2,jz2);
11855 dz23 = vec_sub(iz2,jz3);
11856 dx31 = vec_sub(ix3,jx1);
11857 dx32 = vec_sub(ix3,jx2);
11858 dx33 = vec_sub(ix3,jx3);
11859 dy31 = vec_sub(iy3,jy1);
11860 dy32 = vec_sub(iy3,jy2);
11861 dy33 = vec_sub(iy3,jy3);
11862 dz31 = vec_sub(iz3,jz1);
11863 dz32 = vec_sub(iz3,jz2);
11864 dz33 = vec_sub(iz3,jz3);
11866 rsq11 = vec_madd(dx11,dx11,nul);
11867 rsq12 = vec_madd(dx12,dx12,nul);
11868 rsq13 = vec_madd(dx13,dx13,nul);
11869 rsq21 = vec_madd(dx21,dx21,nul);
11870 rsq22 = vec_madd(dx22,dx22,nul);
11871 rsq23 = vec_madd(dx23,dx23,nul);
11872 rsq31 = vec_madd(dx31,dx31,nul);
11873 rsq32 = vec_madd(dx32,dx32,nul);
11874 rsq33 = vec_madd(dx33,dx33,nul);
11875 rsq11 = vec_madd(dy11,dy11,rsq11);
11876 rsq12 = vec_madd(dy12,dy12,rsq12);
11877 rsq13 = vec_madd(dy13,dy13,rsq13);
11878 rsq21 = vec_madd(dy21,dy21,rsq21);
11879 rsq22 = vec_madd(dy22,dy22,rsq22);
11880 rsq23 = vec_madd(dy23,dy23,rsq23);
11881 rsq31 = vec_madd(dy31,dy31,rsq31);
11882 rsq32 = vec_madd(dy32,dy32,rsq32);
11883 rsq33 = vec_madd(dy33,dy33,rsq33);
11884 rsq11 = vec_madd(dz11,dz11,rsq11);
11885 rsq12 = vec_madd(dz12,dz12,rsq12);
11886 rsq13 = vec_madd(dz13,dz13,rsq13);
11887 rsq21 = vec_madd(dz21,dz21,rsq21);
11888 rsq22 = vec_madd(dz22,dz22,rsq22);
11889 rsq23 = vec_madd(dz23,dz23,rsq23);
11890 rsq31 = vec_madd(dz31,dz31,rsq31);
11891 rsq32 = vec_madd(dz32,dz32,rsq32);
11892 rsq33 = vec_madd(dz33,dz33,rsq33);
11894 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
11895 &rsq21,&rsq22,&rsq23,
11896 &rsq31,&rsq32,&rsq33);
11898 do_9_invsqrt(rsq11,rsq12,rsq13,
11899 rsq21,rsq22,rsq23,
11900 rsq31,rsq32,rsq33,
11901 &rinv11,&rinv12,&rinv13,
11902 &rinv21,&rinv22,&rinv23,
11903 &rinv31,&rinv32,&rinv33);
11905 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
11906 &rinv21,&rinv22,&rinv23,
11907 &rinv31,&rinv32,&rinv33);
11909 r11 = vec_madd(rsq11,rinv11,nul);
11910 r12 = vec_madd(rsq12,rinv12,nul);
11911 r13 = vec_madd(rsq13,rinv13,nul);
11912 r21 = vec_madd(rsq21,rinv21,nul);
11913 r22 = vec_madd(rsq22,rinv22,nul);
11914 r23 = vec_madd(rsq23,rinv23,nul);
11915 r31 = vec_madd(rsq31,rinv31,nul);
11916 r32 = vec_madd(rsq32,rinv32,nul);
11917 r33 = vec_madd(rsq33,rinv33,nul);
11919 do_3_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
11920 &VV11c,&FF11c,&VVd,&FFd,&VVr,&FFr);
11921 do_3_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
11922 do_3_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
11923 do_3_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
11924 do_3_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
11925 do_3_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
11926 do_3_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
11927 do_3_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
11928 do_3_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
11930 vnbtot = vec_madd(c6t,VVd,vnbtot);
11931 vnbtot = vec_madd(c12t,VVr,vnbtot);
11933 fs11 = vec_nmsub(qqOOt,FF11c,nul);
11934 fs12 = vec_nmsub(qqOHt,FF12c,nul);
11935 fs13 = vec_nmsub(qqOHt,FF13c,nul);
11936 fs21 = vec_nmsub(qqOHt,FF21c,nul);
11937 fs11 = vec_nmsub(c6t,FFd,fs11);
11938 fs22 = vec_nmsub(qqHHt,FF22c,nul);
11939 fs23 = vec_nmsub(qqHHt,FF23c,nul);
11940 fs31 = vec_nmsub(qqOHt,FF31c,nul);
11941 fs32 = vec_nmsub(qqHHt,FF32c,nul);
11942 fs33 = vec_nmsub(qqHHt,FF33c,nul);
11943 fs11 = vec_nmsub(c12t,FFr,fs11);
11945 vctot = vec_madd(qqOOt,VV11c,vctot);
11946 vctot = vec_madd(qqOHt,VV12c,vctot);
11947 vctot = vec_madd(qqOHt,VV13c,vctot);
11948 vctot = vec_madd(qqOHt,VV21c,vctot);
11949 vctot = vec_madd(qqHHt,VV22c,vctot);
11950 vctot = vec_madd(qqHHt,VV23c,vctot);
11951 vctot = vec_madd(qqOHt,VV31c,vctot);
11952 vctot = vec_madd(qqHHt,VV32c,vctot);
11953 vctot = vec_madd(qqHHt,VV33c,vctot);
11955 fs11 = vec_madd(fs11,tsc,nul);
11956 fs12 = vec_madd(fs12,tsc,nul);
11957 fs13 = vec_madd(fs13,tsc,nul);
11958 fs21 = vec_madd(fs21,tsc,nul);
11959 fs22 = vec_madd(fs22,tsc,nul);
11960 fs23 = vec_madd(fs23,tsc,nul);
11961 fs31 = vec_madd(fs31,tsc,nul);
11962 fs32 = vec_madd(fs32,tsc,nul);
11963 fs33 = vec_madd(fs33,tsc,nul);
11965 fs11 = vec_madd(fs11,rinv11,nul);
11966 fs12 = vec_madd(fs12,rinv12,nul);
11967 fs13 = vec_madd(fs13,rinv13,nul);
11968 fs21 = vec_madd(fs21,rinv21,nul);
11969 fs22 = vec_madd(fs22,rinv22,nul);
11970 fs23 = vec_madd(fs23,rinv23,nul);
11971 fs31 = vec_madd(fs31,rinv31,nul);
11972 fs32 = vec_madd(fs32,rinv32,nul);
11973 fs33 = vec_madd(fs33,rinv33,nul);
11975 fix1 = vec_madd(fs11,dx11,fix1);
11976 fiy1 = vec_madd(fs11,dy11,fiy1);
11977 fiz1 = vec_madd(fs11,dz11,fiz1);
11978 fix2 = vec_madd(fs21,dx21,fix2);
11979 fiy2 = vec_madd(fs21,dy21,fiy2);
11980 fiz2 = vec_madd(fs21,dz21,fiz2);
11981 fix3 = vec_madd(fs31,dx31,fix3);
11982 fiy3 = vec_madd(fs31,dy31,fiy3);
11983 fiz3 = vec_madd(fs31,dz31,fiz3);
11985 fix1 = vec_madd(fs12,dx12,fix1);
11986 fiy1 = vec_madd(fs12,dy12,fiy1);
11987 fiz1 = vec_madd(fs12,dz12,fiz1);
11988 fix2 = vec_madd(fs22,dx22,fix2);
11989 fiy2 = vec_madd(fs22,dy22,fiy2);
11990 fiz2 = vec_madd(fs22,dz22,fiz2);
11991 fix3 = vec_madd(fs32,dx32,fix3);
11992 fiy3 = vec_madd(fs32,dy32,fiy3);
11993 fiz3 = vec_madd(fs32,dz32,fiz3);
11995 fix1 = vec_madd(fs13,dx13,fix1);
11996 fiy1 = vec_madd(fs13,dy13,fiy1);
11997 fiz1 = vec_madd(fs13,dz13,fiz1);
11998 fix2 = vec_madd(fs23,dx23,fix2);
11999 fiy2 = vec_madd(fs23,dy23,fiy2);
12000 fiz2 = vec_madd(fs23,dz23,fiz2);
12001 fix3 = vec_madd(fs33,dx33,fix3);
12002 fiy3 = vec_madd(fs33,dy33,fiy3);
12003 fiz3 = vec_madd(fs33,dz33,fiz3);
12005 fjx1 = vec_nmsub(fs11,dx11,nul);
12006 fjy1 = vec_nmsub(fs11,dy11,nul);
12007 fjz1 = vec_nmsub(fs11,dz11,nul);
12008 fjx2 = vec_nmsub(fs12,dx12,nul);
12009 fjy2 = vec_nmsub(fs12,dy12,nul);
12010 fjz2 = vec_nmsub(fs12,dz12,nul);
12011 fjx3 = vec_nmsub(fs13,dx13,nul);
12012 fjy3 = vec_nmsub(fs13,dy13,nul);
12013 fjz3 = vec_nmsub(fs13,dz13,nul);
12015 fjx1 = vec_nmsub(fs21,dx21,fjx1);
12016 fjy1 = vec_nmsub(fs21,dy21,fjy1);
12017 fjz1 = vec_nmsub(fs21,dz21,fjz1);
12018 fjx2 = vec_nmsub(fs22,dx22,fjx2);
12019 fjy2 = vec_nmsub(fs22,dy22,fjy2);
12020 fjz2 = vec_nmsub(fs22,dz22,fjz2);
12021 fjx3 = vec_nmsub(fs23,dx23,fjx3);
12022 fjy3 = vec_nmsub(fs23,dy23,fjy3);
12023 fjz3 = vec_nmsub(fs23,dz23,fjz3);
12025 fjx1 = vec_nmsub(fs31,dx31,fjx1);
12026 fjy1 = vec_nmsub(fs31,dy31,fjy1);
12027 fjz1 = vec_nmsub(fs31,dz31,fjz1);
12028 fjx2 = vec_nmsub(fs32,dx32,fjx2);
12029 fjy2 = vec_nmsub(fs32,dy32,fjy2);
12030 fjz2 = vec_nmsub(fs32,dz32,fjz2);
12031 fjx3 = vec_nmsub(fs33,dx33,fjx3);
12032 fjy3 = vec_nmsub(fs33,dy33,fjy3);
12033 fjz3 = vec_nmsub(fs33,dz33,fjz3);
12035 add_force_to_3_water(faction+j3a,faction+j3b,faction+j3c,
12036 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
12037 } else if(k<(nj1-1)) {
12038 jnra = jjnr[k];
12039 jnrb = jjnr[k+1];
12040 j3a = 3*jnra;
12041 j3b = 3*jnrb;
12042 load_2_water(pos+j3a,pos+j3b,
12043 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
12044 qqOOt = vec_sld(qqOO,nul,8);
12045 qqOHt = vec_sld(qqOH,nul,8);
12046 qqHHt = vec_sld(qqHH,nul,8);
12047 c6t = vec_sld(c6,nul,8);
12048 c12t = vec_sld(c12,nul,8);
12050 dx11 = vec_sub(ix1,jx1);
12051 dx12 = vec_sub(ix1,jx2);
12052 dx13 = vec_sub(ix1,jx3);
12053 dy11 = vec_sub(iy1,jy1);
12054 dy12 = vec_sub(iy1,jy2);
12055 dy13 = vec_sub(iy1,jy3);
12056 dz11 = vec_sub(iz1,jz1);
12057 dz12 = vec_sub(iz1,jz2);
12058 dz13 = vec_sub(iz1,jz3);
12059 dx21 = vec_sub(ix2,jx1);
12060 dx22 = vec_sub(ix2,jx2);
12061 dx23 = vec_sub(ix2,jx3);
12062 dy21 = vec_sub(iy2,jy1);
12063 dy22 = vec_sub(iy2,jy2);
12064 dy23 = vec_sub(iy2,jy3);
12065 dz21 = vec_sub(iz2,jz1);
12066 dz22 = vec_sub(iz2,jz2);
12067 dz23 = vec_sub(iz2,jz3);
12068 dx31 = vec_sub(ix3,jx1);
12069 dx32 = vec_sub(ix3,jx2);
12070 dx33 = vec_sub(ix3,jx3);
12071 dy31 = vec_sub(iy3,jy1);
12072 dy32 = vec_sub(iy3,jy2);
12073 dy33 = vec_sub(iy3,jy3);
12074 dz31 = vec_sub(iz3,jz1);
12075 dz32 = vec_sub(iz3,jz2);
12076 dz33 = vec_sub(iz3,jz3);
12078 rsq11 = vec_madd(dx11,dx11,nul);
12079 rsq12 = vec_madd(dx12,dx12,nul);
12080 rsq13 = vec_madd(dx13,dx13,nul);
12081 rsq21 = vec_madd(dx21,dx21,nul);
12082 rsq22 = vec_madd(dx22,dx22,nul);
12083 rsq23 = vec_madd(dx23,dx23,nul);
12084 rsq31 = vec_madd(dx31,dx31,nul);
12085 rsq32 = vec_madd(dx32,dx32,nul);
12086 rsq33 = vec_madd(dx33,dx33,nul);
12087 rsq11 = vec_madd(dy11,dy11,rsq11);
12088 rsq12 = vec_madd(dy12,dy12,rsq12);
12089 rsq13 = vec_madd(dy13,dy13,rsq13);
12090 rsq21 = vec_madd(dy21,dy21,rsq21);
12091 rsq22 = vec_madd(dy22,dy22,rsq22);
12092 rsq23 = vec_madd(dy23,dy23,rsq23);
12093 rsq31 = vec_madd(dy31,dy31,rsq31);
12094 rsq32 = vec_madd(dy32,dy32,rsq32);
12095 rsq33 = vec_madd(dy33,dy33,rsq33);
12096 rsq11 = vec_madd(dz11,dz11,rsq11);
12097 rsq12 = vec_madd(dz12,dz12,rsq12);
12098 rsq13 = vec_madd(dz13,dz13,rsq13);
12099 rsq21 = vec_madd(dz21,dz21,rsq21);
12100 rsq22 = vec_madd(dz22,dz22,rsq22);
12101 rsq23 = vec_madd(dz23,dz23,rsq23);
12102 rsq31 = vec_madd(dz31,dz31,rsq31);
12103 rsq32 = vec_madd(dz32,dz32,rsq32);
12104 rsq33 = vec_madd(dz33,dz33,rsq33);
12106 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
12107 &rsq21,&rsq22,&rsq23,
12108 &rsq31,&rsq32,&rsq33);
12110 do_9_invsqrt(rsq11,rsq12,rsq13,
12111 rsq21,rsq22,rsq23,
12112 rsq31,rsq32,rsq33,
12113 &rinv11,&rinv12,&rinv13,
12114 &rinv21,&rinv22,&rinv23,
12115 &rinv31,&rinv32,&rinv33);
12117 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
12118 &rinv21,&rinv22,&rinv23,
12119 &rinv31,&rinv32,&rinv33);
12121 r11 = vec_madd(rsq11,rinv11,nul);
12122 r12 = vec_madd(rsq12,rinv12,nul);
12123 r13 = vec_madd(rsq13,rinv13,nul);
12124 r21 = vec_madd(rsq21,rinv21,nul);
12125 r22 = vec_madd(rsq22,rinv22,nul);
12126 r23 = vec_madd(rsq23,rinv23,nul);
12127 r31 = vec_madd(rsq31,rinv31,nul);
12128 r32 = vec_madd(rsq32,rinv32,nul);
12129 r33 = vec_madd(rsq33,rinv33,nul);
12131 do_2_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
12132 &VV11c,&FF11c,&VVd,&FFd,&VVr,&FFr);
12133 do_2_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
12134 do_2_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
12135 do_2_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
12136 do_2_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
12137 do_2_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
12138 do_2_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
12139 do_2_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
12140 do_2_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
12142 vnbtot = vec_madd(c6t,VVd,vnbtot);
12143 vnbtot = vec_madd(c12t,VVr,vnbtot);
12145 fs11 = vec_nmsub(qqOOt,FF11c,nul);
12146 fs12 = vec_nmsub(qqOHt,FF12c,nul);
12147 fs13 = vec_nmsub(qqOHt,FF13c,nul);
12148 fs21 = vec_nmsub(qqOHt,FF21c,nul);
12149 fs11 = vec_nmsub(c6t,FFd,fs11);
12150 fs22 = vec_nmsub(qqHHt,FF22c,nul);
12151 fs23 = vec_nmsub(qqHHt,FF23c,nul);
12152 fs31 = vec_nmsub(qqOHt,FF31c,nul);
12153 fs32 = vec_nmsub(qqHHt,FF32c,nul);
12154 fs33 = vec_nmsub(qqHHt,FF33c,nul);
12155 fs11 = vec_nmsub(c12t,FFr,fs11);
12157 vctot = vec_madd(qqOOt,VV11c,vctot);
12158 vctot = vec_madd(qqOHt,VV12c,vctot);
12159 vctot = vec_madd(qqOHt,VV13c,vctot);
12160 vctot = vec_madd(qqOHt,VV21c,vctot);
12161 vctot = vec_madd(qqHHt,VV22c,vctot);
12162 vctot = vec_madd(qqHHt,VV23c,vctot);
12163 vctot = vec_madd(qqOHt,VV31c,vctot);
12164 vctot = vec_madd(qqHHt,VV32c,vctot);
12165 vctot = vec_madd(qqHHt,VV33c,vctot);
12167 fs11 = vec_madd(fs11,tsc,nul);
12168 fs12 = vec_madd(fs12,tsc,nul);
12169 fs13 = vec_madd(fs13,tsc,nul);
12170 fs21 = vec_madd(fs21,tsc,nul);
12171 fs22 = vec_madd(fs22,tsc,nul);
12172 fs23 = vec_madd(fs23,tsc,nul);
12173 fs31 = vec_madd(fs31,tsc,nul);
12174 fs32 = vec_madd(fs32,tsc,nul);
12175 fs33 = vec_madd(fs33,tsc,nul);
12177 fs11 = vec_madd(fs11,rinv11,nul);
12178 fs12 = vec_madd(fs12,rinv12,nul);
12179 fs13 = vec_madd(fs13,rinv13,nul);
12180 fs21 = vec_madd(fs21,rinv21,nul);
12181 fs22 = vec_madd(fs22,rinv22,nul);
12182 fs23 = vec_madd(fs23,rinv23,nul);
12183 fs31 = vec_madd(fs31,rinv31,nul);
12184 fs32 = vec_madd(fs32,rinv32,nul);
12185 fs33 = vec_madd(fs33,rinv33,nul);
12187 fix1 = vec_madd(fs11,dx11,fix1);
12188 fiy1 = vec_madd(fs11,dy11,fiy1);
12189 fiz1 = vec_madd(fs11,dz11,fiz1);
12190 fix2 = vec_madd(fs21,dx21,fix2);
12191 fiy2 = vec_madd(fs21,dy21,fiy2);
12192 fiz2 = vec_madd(fs21,dz21,fiz2);
12193 fix3 = vec_madd(fs31,dx31,fix3);
12194 fiy3 = vec_madd(fs31,dy31,fiy3);
12195 fiz3 = vec_madd(fs31,dz31,fiz3);
12197 fix1 = vec_madd(fs12,dx12,fix1);
12198 fiy1 = vec_madd(fs12,dy12,fiy1);
12199 fiz1 = vec_madd(fs12,dz12,fiz1);
12200 fix2 = vec_madd(fs22,dx22,fix2);
12201 fiy2 = vec_madd(fs22,dy22,fiy2);
12202 fiz2 = vec_madd(fs22,dz22,fiz2);
12203 fix3 = vec_madd(fs32,dx32,fix3);
12204 fiy3 = vec_madd(fs32,dy32,fiy3);
12205 fiz3 = vec_madd(fs32,dz32,fiz3);
12207 fix1 = vec_madd(fs13,dx13,fix1);
12208 fiy1 = vec_madd(fs13,dy13,fiy1);
12209 fiz1 = vec_madd(fs13,dz13,fiz1);
12210 fix2 = vec_madd(fs23,dx23,fix2);
12211 fiy2 = vec_madd(fs23,dy23,fiy2);
12212 fiz2 = vec_madd(fs23,dz23,fiz2);
12213 fix3 = vec_madd(fs33,dx33,fix3);
12214 fiy3 = vec_madd(fs33,dy33,fiy3);
12215 fiz3 = vec_madd(fs33,dz33,fiz3);
12217 fjx1 = vec_nmsub(fs11,dx11,nul);
12218 fjy1 = vec_nmsub(fs11,dy11,nul);
12219 fjz1 = vec_nmsub(fs11,dz11,nul);
12220 fjx2 = vec_nmsub(fs12,dx12,nul);
12221 fjy2 = vec_nmsub(fs12,dy12,nul);
12222 fjz2 = vec_nmsub(fs12,dz12,nul);
12223 fjx3 = vec_nmsub(fs13,dx13,nul);
12224 fjy3 = vec_nmsub(fs13,dy13,nul);
12225 fjz3 = vec_nmsub(fs13,dz13,nul);
12227 fjx1 = vec_nmsub(fs21,dx21,fjx1);
12228 fjy1 = vec_nmsub(fs21,dy21,fjy1);
12229 fjz1 = vec_nmsub(fs21,dz21,fjz1);
12230 fjx2 = vec_nmsub(fs22,dx22,fjx2);
12231 fjy2 = vec_nmsub(fs22,dy22,fjy2);
12232 fjz2 = vec_nmsub(fs22,dz22,fjz2);
12233 fjx3 = vec_nmsub(fs23,dx23,fjx3);
12234 fjy3 = vec_nmsub(fs23,dy23,fjy3);
12235 fjz3 = vec_nmsub(fs23,dz23,fjz3);
12237 fjx1 = vec_nmsub(fs31,dx31,fjx1);
12238 fjy1 = vec_nmsub(fs31,dy31,fjy1);
12239 fjz1 = vec_nmsub(fs31,dz31,fjz1);
12240 fjx2 = vec_nmsub(fs32,dx32,fjx2);
12241 fjy2 = vec_nmsub(fs32,dy32,fjy2);
12242 fjz2 = vec_nmsub(fs32,dz32,fjz2);
12243 fjx3 = vec_nmsub(fs33,dx33,fjx3);
12244 fjy3 = vec_nmsub(fs33,dy33,fjy3);
12245 fjz3 = vec_nmsub(fs33,dz33,fjz3);
12247 add_force_to_2_water(faction+j3a,faction+j3b,
12248 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
12249 } else if(k<nj1) {
12250 jnra = jjnr[k];
12251 j3a = 3*jnra;
12252 load_1_water(pos+j3a,
12253 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
12254 qqOOt = vec_sld(qqOO,nul,12);
12255 qqOHt = vec_sld(qqOH,nul,12);
12256 qqHHt = vec_sld(qqHH,nul,12);
12257 c6t = vec_sld(c6,nul,12);
12258 c12t = vec_sld(c12,nul,12);
12260 dx11 = vec_sub(ix1,jx1);
12261 dx12 = vec_sub(ix1,jx2);
12262 dx13 = vec_sub(ix1,jx3);
12263 dy11 = vec_sub(iy1,jy1);
12264 dy12 = vec_sub(iy1,jy2);
12265 dy13 = vec_sub(iy1,jy3);
12266 dz11 = vec_sub(iz1,jz1);
12267 dz12 = vec_sub(iz1,jz2);
12268 dz13 = vec_sub(iz1,jz3);
12269 dx21 = vec_sub(ix2,jx1);
12270 dx22 = vec_sub(ix2,jx2);
12271 dx23 = vec_sub(ix2,jx3);
12272 dy21 = vec_sub(iy2,jy1);
12273 dy22 = vec_sub(iy2,jy2);
12274 dy23 = vec_sub(iy2,jy3);
12275 dz21 = vec_sub(iz2,jz1);
12276 dz22 = vec_sub(iz2,jz2);
12277 dz23 = vec_sub(iz2,jz3);
12278 dx31 = vec_sub(ix3,jx1);
12279 dx32 = vec_sub(ix3,jx2);
12280 dx33 = vec_sub(ix3,jx3);
12281 dy31 = vec_sub(iy3,jy1);
12282 dy32 = vec_sub(iy3,jy2);
12283 dy33 = vec_sub(iy3,jy3);
12284 dz31 = vec_sub(iz3,jz1);
12285 dz32 = vec_sub(iz3,jz2);
12286 dz33 = vec_sub(iz3,jz3);
12288 rsq11 = vec_madd(dx11,dx11,nul);
12289 rsq12 = vec_madd(dx12,dx12,nul);
12290 rsq13 = vec_madd(dx13,dx13,nul);
12291 rsq21 = vec_madd(dx21,dx21,nul);
12292 rsq22 = vec_madd(dx22,dx22,nul);
12293 rsq23 = vec_madd(dx23,dx23,nul);
12294 rsq31 = vec_madd(dx31,dx31,nul);
12295 rsq32 = vec_madd(dx32,dx32,nul);
12296 rsq33 = vec_madd(dx33,dx33,nul);
12297 rsq11 = vec_madd(dy11,dy11,rsq11);
12298 rsq12 = vec_madd(dy12,dy12,rsq12);
12299 rsq13 = vec_madd(dy13,dy13,rsq13);
12300 rsq21 = vec_madd(dy21,dy21,rsq21);
12301 rsq22 = vec_madd(dy22,dy22,rsq22);
12302 rsq23 = vec_madd(dy23,dy23,rsq23);
12303 rsq31 = vec_madd(dy31,dy31,rsq31);
12304 rsq32 = vec_madd(dy32,dy32,rsq32);
12305 rsq33 = vec_madd(dy33,dy33,rsq33);
12306 rsq11 = vec_madd(dz11,dz11,rsq11);
12307 rsq12 = vec_madd(dz12,dz12,rsq12);
12308 rsq13 = vec_madd(dz13,dz13,rsq13);
12309 rsq21 = vec_madd(dz21,dz21,rsq21);
12310 rsq22 = vec_madd(dz22,dz22,rsq22);
12311 rsq23 = vec_madd(dz23,dz23,rsq23);
12312 rsq31 = vec_madd(dz31,dz31,rsq31);
12313 rsq32 = vec_madd(dz32,dz32,rsq32);
12314 rsq33 = vec_madd(dz33,dz33,rsq33);
12316 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
12317 &rsq21,&rsq22,&rsq23,
12318 &rsq31,&rsq32,&rsq33);
12320 do_9_invsqrt(rsq11,rsq12,rsq13,
12321 rsq21,rsq22,rsq23,
12322 rsq31,rsq32,rsq33,
12323 &rinv11,&rinv12,&rinv13,
12324 &rinv21,&rinv22,&rinv23,
12325 &rinv31,&rinv32,&rinv33);
12327 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
12328 &rinv21,&rinv22,&rinv23,
12329 &rinv31,&rinv32,&rinv33);
12331 r11 = vec_madd(rsq11,rinv11,nul);
12332 r12 = vec_madd(rsq12,rinv12,nul);
12333 r13 = vec_madd(rsq13,rinv13,nul);
12334 r21 = vec_madd(rsq21,rinv21,nul);
12335 r22 = vec_madd(rsq22,rinv22,nul);
12336 r23 = vec_madd(rsq23,rinv23,nul);
12337 r31 = vec_madd(rsq31,rinv31,nul);
12338 r32 = vec_madd(rsq32,rinv32,nul);
12339 r33 = vec_madd(rsq33,rinv33,nul);
12341 do_1_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
12342 &VV11c,&FF11c,&VVd,&FFd,&VVr,&FFr);
12343 do_1_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c,&FF12c);
12344 do_1_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c,&FF13c);
12345 do_1_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c,&FF21c);
12346 do_1_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c,&FF22c);
12347 do_1_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c,&FF23c);
12348 do_1_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c,&FF31c);
12349 do_1_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c,&FF32c);
12350 do_1_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c,&FF33c);
12352 vnbtot = vec_madd(c6t,VVd,vnbtot);
12353 vnbtot = vec_madd(c12t,VVr,vnbtot);
12355 fs11 = vec_nmsub(qqOOt,FF11c,nul);
12356 fs12 = vec_nmsub(qqOHt,FF12c,nul);
12357 fs13 = vec_nmsub(qqOHt,FF13c,nul);
12358 fs21 = vec_nmsub(qqOHt,FF21c,nul);
12359 fs11 = vec_nmsub(c6t,FFd,fs11);
12360 fs22 = vec_nmsub(qqHHt,FF22c,nul);
12361 fs23 = vec_nmsub(qqHHt,FF23c,nul);
12362 fs31 = vec_nmsub(qqOHt,FF31c,nul);
12363 fs32 = vec_nmsub(qqHHt,FF32c,nul);
12364 fs33 = vec_nmsub(qqHHt,FF33c,nul);
12365 fs11 = vec_nmsub(c12t,FFr,fs11);
12367 vctot = vec_madd(qqOOt,VV11c,vctot);
12368 vctot = vec_madd(qqOHt,VV12c,vctot);
12369 vctot = vec_madd(qqOHt,VV13c,vctot);
12370 vctot = vec_madd(qqOHt,VV21c,vctot);
12371 vctot = vec_madd(qqHHt,VV22c,vctot);
12372 vctot = vec_madd(qqHHt,VV23c,vctot);
12373 vctot = vec_madd(qqOHt,VV31c,vctot);
12374 vctot = vec_madd(qqHHt,VV32c,vctot);
12375 vctot = vec_madd(qqHHt,VV33c,vctot);
12377 fs11 = vec_madd(fs11,tsc,nul);
12378 fs12 = vec_madd(fs12,tsc,nul);
12379 fs13 = vec_madd(fs13,tsc,nul);
12380 fs21 = vec_madd(fs21,tsc,nul);
12381 fs22 = vec_madd(fs22,tsc,nul);
12382 fs23 = vec_madd(fs23,tsc,nul);
12383 fs31 = vec_madd(fs31,tsc,nul);
12384 fs32 = vec_madd(fs32,tsc,nul);
12385 fs33 = vec_madd(fs33,tsc,nul);
12387 fs11 = vec_madd(fs11,rinv11,nul);
12388 fs12 = vec_madd(fs12,rinv12,nul);
12389 fs13 = vec_madd(fs13,rinv13,nul);
12390 fs21 = vec_madd(fs21,rinv21,nul);
12391 fs22 = vec_madd(fs22,rinv22,nul);
12392 fs23 = vec_madd(fs23,rinv23,nul);
12393 fs31 = vec_madd(fs31,rinv31,nul);
12394 fs32 = vec_madd(fs32,rinv32,nul);
12395 fs33 = vec_madd(fs33,rinv33,nul);
12397 fix1 = vec_madd(fs11,dx11,fix1);
12398 fiy1 = vec_madd(fs11,dy11,fiy1);
12399 fiz1 = vec_madd(fs11,dz11,fiz1);
12400 fix2 = vec_madd(fs21,dx21,fix2);
12401 fiy2 = vec_madd(fs21,dy21,fiy2);
12402 fiz2 = vec_madd(fs21,dz21,fiz2);
12403 fix3 = vec_madd(fs31,dx31,fix3);
12404 fiy3 = vec_madd(fs31,dy31,fiy3);
12405 fiz3 = vec_madd(fs31,dz31,fiz3);
12407 fix1 = vec_madd(fs12,dx12,fix1);
12408 fiy1 = vec_madd(fs12,dy12,fiy1);
12409 fiz1 = vec_madd(fs12,dz12,fiz1);
12410 fix2 = vec_madd(fs22,dx22,fix2);
12411 fiy2 = vec_madd(fs22,dy22,fiy2);
12412 fiz2 = vec_madd(fs22,dz22,fiz2);
12413 fix3 = vec_madd(fs32,dx32,fix3);
12414 fiy3 = vec_madd(fs32,dy32,fiy3);
12415 fiz3 = vec_madd(fs32,dz32,fiz3);
12417 fix1 = vec_madd(fs13,dx13,fix1);
12418 fiy1 = vec_madd(fs13,dy13,fiy1);
12419 fiz1 = vec_madd(fs13,dz13,fiz1);
12420 fix2 = vec_madd(fs23,dx23,fix2);
12421 fiy2 = vec_madd(fs23,dy23,fiy2);
12422 fiz2 = vec_madd(fs23,dz23,fiz2);
12423 fix3 = vec_madd(fs33,dx33,fix3);
12424 fiy3 = vec_madd(fs33,dy33,fiy3);
12425 fiz3 = vec_madd(fs33,dz33,fiz3);
12427 fjx1 = vec_nmsub(fs11,dx11,nul);
12428 fjy1 = vec_nmsub(fs11,dy11,nul);
12429 fjz1 = vec_nmsub(fs11,dz11,nul);
12430 fjx2 = vec_nmsub(fs12,dx12,nul);
12431 fjy2 = vec_nmsub(fs12,dy12,nul);
12432 fjz2 = vec_nmsub(fs12,dz12,nul);
12433 fjx3 = vec_nmsub(fs13,dx13,nul);
12434 fjy3 = vec_nmsub(fs13,dy13,nul);
12435 fjz3 = vec_nmsub(fs13,dz13,nul);
12437 fjx1 = vec_nmsub(fs21,dx21,fjx1);
12438 fjy1 = vec_nmsub(fs21,dy21,fjy1);
12439 fjz1 = vec_nmsub(fs21,dz21,fjz1);
12440 fjx2 = vec_nmsub(fs22,dx22,fjx2);
12441 fjy2 = vec_nmsub(fs22,dy22,fjy2);
12442 fjz2 = vec_nmsub(fs22,dz22,fjz2);
12443 fjx3 = vec_nmsub(fs23,dx23,fjx3);
12444 fjy3 = vec_nmsub(fs23,dy23,fjy3);
12445 fjz3 = vec_nmsub(fs23,dz23,fjz3);
12447 fjx1 = vec_nmsub(fs31,dx31,fjx1);
12448 fjy1 = vec_nmsub(fs31,dy31,fjy1);
12449 fjz1 = vec_nmsub(fs31,dz31,fjz1);
12450 fjx2 = vec_nmsub(fs32,dx32,fjx2);
12451 fjy2 = vec_nmsub(fs32,dy32,fjy2);
12452 fjz2 = vec_nmsub(fs32,dz32,fjz2);
12453 fjx3 = vec_nmsub(fs33,dx33,fjx3);
12454 fjy3 = vec_nmsub(fs33,dy33,fjy3);
12455 fjz3 = vec_nmsub(fs33,dz33,fjz3);
12457 add_force_to_1_water(faction+j3a,
12458 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
12460 /* update outer data */
12461 update_i_water_forces(faction+ii3,fshift+is3,
12462 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3);
12464 add_vector_to_float(Vc+gid[n],vctot);
12465 add_vector_to_float(Vnb+gid[n],vnbtot);
12471 void mcinl0100_altivec(
12472 int nri,
12473 int iinr[],
12474 int jindex[],
12475 int jjnr[],
12476 int shift[],
12477 float shiftvec[],
12478 int gid[],
12479 float pos[],
12480 int type[],
12481 int ntype,
12482 float nbfp[],
12483 float Vnb[])
12485 vector float ix,iy,iz,shvec;
12486 vector float nul;
12487 vector float dx,dy,dz;
12488 vector float vnbtot,c6,c12;
12489 vector float rinvsq,rsq,rinvsix;
12491 int n,k,k0,ii,is3,ii3,nj0,nj1;
12492 int jnra,jnrb,jnrc,jnrd;
12493 int j3a,j3b,j3c,j3d;
12494 int ntiA,tja,tjb,tjc,tjd;
12496 nul=vec_zero();
12498 for(n=0;n<nri;n++) {
12499 is3 = 3*shift[n];
12500 shvec = load_xyz(shiftvec+is3);
12501 ii = iinr[n];
12502 ii3 = 3*ii;
12503 ix = load_xyz(pos+ii3);
12504 vnbtot = nul;
12505 ix = vec_add(ix,shvec);
12506 nj0 = jindex[n];
12507 nj1 = jindex[n+1];
12508 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
12509 ntiA = 2*ntype*type[ii];
12510 for(k=nj0; k<(nj1-3); k+=4) {
12511 jnra = jjnr[k];
12512 jnrb = jjnr[k+1];
12513 jnrc = jjnr[k+2];
12514 jnrd = jjnr[k+3];
12515 j3a = 3*jnra;
12516 j3b = 3*jnrb;
12517 j3c = 3*jnrc;
12518 j3d = 3*jnrd;
12519 transpose_4_to_3(load_xyz(pos+j3a),
12520 load_xyz(pos+j3b),
12521 load_xyz(pos+j3c),
12522 load_xyz(pos+j3d),&dx,&dy,&dz);
12523 dx = vec_sub(ix,dx);
12524 dy = vec_sub(iy,dy);
12525 dz = vec_sub(iz,dz);
12526 rsq = vec_madd(dx,dx,nul);
12527 rsq = vec_madd(dy,dy,rsq);
12528 rsq = vec_madd(dz,dz,rsq);
12529 rinvsq = do_recip(rsq);
12530 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12531 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12532 tja = ntiA+2*type[jnra];
12533 tjb = ntiA+2*type[jnrb];
12534 tjc = ntiA+2*type[jnrc];
12535 tjd = ntiA+2*type[jnrd];
12536 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
12537 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12538 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12540 if(k<(nj1-1)) {
12541 jnra = jjnr[k];
12542 jnrb = jjnr[k+1];
12543 j3a = 3*jnra;
12544 j3b = 3*jnrb;
12545 transpose_2_to_3(load_xyz(pos+j3a),
12546 load_xyz(pos+j3b),&dx,&dy,&dz);
12547 dx = vec_sub(ix,dx);
12548 dy = vec_sub(iy,dy);
12549 dz = vec_sub(iz,dz);
12550 rsq = vec_madd(dx,dx,nul);
12551 rsq = vec_madd(dy,dy,rsq);
12552 rsq = vec_madd(dz,dz,rsq);
12553 rinvsq = do_recip(rsq);
12554 zero_highest_2_elements_in_vector(&rinvsq);
12555 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12556 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12557 tja = ntiA+2*type[jnra];
12558 tjb = ntiA+2*type[jnrb];
12559 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
12560 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12561 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12562 k += 2;
12564 if((nj1-nj0)%2) {
12565 jnra = jjnr[k];
12566 j3a = 3*jnra;
12567 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
12568 dx = vec_sub(ix,dx);
12569 dy = vec_sub(iy,dy);
12570 dz = vec_sub(iz,dz);
12571 rsq = vec_madd(dx,dx,nul);
12572 rsq = vec_madd(dy,dy,rsq);
12573 rsq = vec_madd(dz,dz,rsq);
12574 rinvsq = do_recip(rsq);
12575 zero_highest_3_elements_in_vector(&rinvsq);
12576 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12577 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12578 tja = ntiA+2*type[jnra];
12579 load_1_pair(nbfp+tja,&c6,&c12);
12580 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12581 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12583 /* update outer data */
12584 add_vector_to_float(Vnb+gid[n],vnbtot);
12589 void mcinl0300_altivec(
12590 int nri,
12591 int iinr[],
12592 int jindex[],
12593 int jjnr[],
12594 int shift[],
12595 float shiftvec[],
12596 int gid[],
12597 float pos[],
12598 int type[],
12599 int ntype,
12600 float nbfp[],
12601 float Vnb[],
12602 float tabscale,
12603 float VFtab[])
12605 vector float ix,iy,iz,shvec;
12606 vector float nul,tsc;
12607 vector float dx,dy,dz;
12608 vector float vnbtot,c6,c12;
12609 vector float rinv,r,rsq;
12610 vector float VVd,VVr;
12612 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
12613 int jnra,jnrb,jnrc,jnrd;
12614 int j3a,j3b,j3c,j3d;
12615 int tja,tjb,tjc,tjd;
12617 nul=vec_zero();
12618 tsc=load_float_and_splat(&tabscale);
12620 for(n=0;n<nri;n++) {
12621 is3 = 3*shift[n];
12622 shvec = load_xyz(shiftvec+is3);
12623 ii = iinr[n];
12624 ii3 = 3*ii;
12625 ix = load_xyz(pos+ii3);
12626 vnbtot = nul;
12627 ix = vec_add(ix,shvec);
12628 nj0 = jindex[n];
12629 nj1 = jindex[n+1];
12630 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
12631 ntiA = 2*ntype*type[ii];
12633 for(k=nj0; k<(nj1-3); k+=4) {
12634 jnra = jjnr[k];
12635 jnrb = jjnr[k+1];
12636 jnrc = jjnr[k+2];
12637 jnrd = jjnr[k+3];
12638 j3a = 3*jnra;
12639 j3b = 3*jnrb;
12640 j3c = 3*jnrc;
12641 j3d = 3*jnrd;
12642 transpose_4_to_3(load_xyz(pos+j3a),
12643 load_xyz(pos+j3b),
12644 load_xyz(pos+j3c),
12645 load_xyz(pos+j3d),&dx,&dy,&dz);
12646 dx = vec_sub(ix,dx);
12647 dy = vec_sub(iy,dy);
12648 dz = vec_sub(iz,dz);
12649 rsq = vec_madd(dx,dx,nul);
12650 rsq = vec_madd(dy,dy,rsq);
12651 rsq = vec_madd(dz,dz,rsq);
12652 rinv = do_invsqrt(rsq);
12653 r = vec_madd(rinv,rsq,nul);
12654 tja = ntiA+2*type[jnra];
12655 tjb = ntiA+2*type[jnrb];
12656 tjc = ntiA+2*type[jnrc];
12657 tjd = ntiA+2*type[jnrd];
12658 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
12659 do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
12660 vnbtot = vec_madd(c6,VVd,vnbtot);
12661 vnbtot = vec_madd(c12,VVr,vnbtot);
12663 if(k<(nj1-1)) {
12664 jnra = jjnr[k];
12665 jnrb = jjnr[k+1];
12666 j3a = 3*jnra;
12667 j3b = 3*jnrb;
12668 transpose_2_to_3(load_xyz(pos+j3a),
12669 load_xyz(pos+j3b),&dx,&dy,&dz);
12670 dx = vec_sub(ix,dx);
12671 dy = vec_sub(iy,dy);
12672 dz = vec_sub(iz,dz);
12673 rsq = vec_madd(dx,dx,nul);
12674 rsq = vec_madd(dy,dy,rsq);
12675 rsq = vec_madd(dz,dz,rsq);
12676 zero_highest_2_elements_in_vector(&rsq);
12677 rinv = do_invsqrt(rsq);
12678 zero_highest_2_elements_in_vector(&rinv);
12679 r = vec_madd(rinv,rsq,nul);
12680 tja = ntiA+2*type[jnra];
12681 tjb = ntiA+2*type[jnrb];
12682 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
12683 do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
12684 vnbtot = vec_madd(c6,VVd,vnbtot);
12685 vnbtot = vec_madd(c12,VVr,vnbtot);
12686 k += 2;
12688 if((nj1-nj0)%2) {
12689 jnra = jjnr[k];
12690 j3a = 3*jnra;
12691 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
12692 dx = vec_sub(ix,dx);
12693 dy = vec_sub(iy,dy);
12694 dz = vec_sub(iz,dz);
12695 rsq = vec_madd(dx,dx,nul);
12696 rsq = vec_madd(dy,dy,rsq);
12697 rsq = vec_madd(dz,dz,rsq);
12698 zero_highest_3_elements_in_vector(&rsq);
12699 rinv = do_invsqrt(rsq);
12700 zero_highest_3_elements_in_vector(&rinv);
12701 r = vec_madd(rinv,rsq,nul);
12702 tja = ntiA+2*type[jnra];
12703 load_1_pair(nbfp+tja,&c6,&c12);
12704 do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
12705 vnbtot = vec_madd(c6,VVd,vnbtot);
12706 vnbtot = vec_madd(c12,VVr,vnbtot);
12708 /* update outer data */
12709 add_vector_to_float(Vnb+gid[n],vnbtot);
12715 void mcinl1000_altivec(
12716 int nri,
12717 int iinr[],
12718 int jindex[],
12719 int jjnr[],
12720 int shift[],
12721 float shiftvec[],
12722 int gid[],
12723 float pos[],
12724 float charge[],
12725 float facel,
12726 float Vc[])
12728 vector float ix,iy,iz,shvec;
12729 vector float vfacel,nul;
12730 vector float dx,dy,dz;
12731 vector float vctot,qq,iq;
12732 vector float rinv,rsq;
12734 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
12735 int jnra,jnrb,jnrc,jnrd;
12736 int j3a,j3b,j3c,j3d;
12738 nul=vec_zero();
12739 vfacel=load_float_and_splat(&facel);
12741 for(n=0;n<nri;n++) {
12742 is3 = 3*shift[n];
12743 shvec = load_xyz(shiftvec+is3);
12744 ii = iinr[n];
12745 ii3 = 3*ii;
12746 ix = load_xyz(pos+ii3);
12747 vctot = nul;
12748 ix = vec_add(ix,shvec);
12749 nj0 = jindex[n];
12750 nj1 = jindex[n+1];
12751 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
12752 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
12754 for(k=nj0; k<(nj1-3); k+=4) {
12755 jnra = jjnr[k];
12756 jnrb = jjnr[k+1];
12757 jnrc = jjnr[k+2];
12758 jnrd = jjnr[k+3];
12759 j3a = 3*jnra;
12760 j3b = 3*jnrb;
12761 j3c = 3*jnrc;
12762 j3d = 3*jnrd;
12763 transpose_4_to_3(load_xyz(pos+j3a),
12764 load_xyz(pos+j3b),
12765 load_xyz(pos+j3c),
12766 load_xyz(pos+j3d),&dx,&dy,&dz);
12767 dx = vec_sub(ix,dx);
12768 dy = vec_sub(iy,dy);
12769 dz = vec_sub(iz,dz);
12770 rsq = vec_madd(dx,dx,nul);
12771 rsq = vec_madd(dy,dy,rsq);
12772 rsq = vec_madd(dz,dz,rsq);
12773 rinv = do_invsqrt(rsq);
12774 /* load 4 j charges and multiply by iq */
12775 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
12776 charge+jnrc,charge+jnrd),iq,nul);
12777 vctot = vec_madd(qq,rinv,vctot);
12779 if(k<(nj1-1)) {
12780 jnra = jjnr[k];
12781 jnrb = jjnr[k+1];
12782 j3a = 3*jnra;
12783 j3b = 3*jnrb;
12784 transpose_2_to_3(load_xyz(pos+j3a),
12785 load_xyz(pos+j3b),&dx,&dy,&dz);
12786 dx = vec_sub(ix,dx);
12787 dy = vec_sub(iy,dy);
12788 dz = vec_sub(iz,dz);
12789 rsq = vec_madd(dx,dx,nul);
12790 rsq = vec_madd(dy,dy,rsq);
12791 rsq = vec_madd(dz,dz,rsq);
12792 rinv = do_invsqrt(rsq);
12793 zero_highest_2_elements_in_vector(&rinv);
12794 /* load 2 j charges and multiply by iq */
12795 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
12796 vctot = vec_madd(qq,rinv,vctot);
12797 k += 2;
12799 if((nj1-nj0)%2) {
12800 jnra = jjnr[k];
12801 j3a = 3*jnra;
12802 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
12803 dx = vec_sub(ix,dx);
12804 dy = vec_sub(iy,dy);
12805 dz = vec_sub(iz,dz);
12806 rsq = vec_madd(dx,dx,nul);
12807 rsq = vec_madd(dy,dy,rsq);
12808 rsq = vec_madd(dz,dz,rsq);
12809 rinv = do_invsqrt(rsq);
12810 zero_highest_3_elements_in_vector(&rinv);
12811 /* load 1 j charge and multiply by iq */
12812 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
12813 vctot = vec_madd(qq,rinv,vctot);
12815 /* update outer data */
12816 add_vector_to_float(Vc+gid[n],vctot);
12822 void mcinl1100_altivec(
12823 int nri,
12824 int iinr[],
12825 int jindex[],
12826 int jjnr[],
12827 int shift[],
12828 float shiftvec[],
12829 int gid[],
12830 float pos[],
12831 float charge[],
12832 float facel,
12833 float Vc[],
12834 int type[],
12835 int ntype,
12836 float nbfp[],
12837 float Vnb[])
12839 vector float ix,iy,iz,shvec;
12840 vector float vfacel,nul;
12841 vector float dx,dy,dz;
12842 vector float vnbtot,vctot,qq,iq,c6,c12;
12843 vector float rinv,rinvsq,rsq,rinvsix;
12845 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
12846 int jnra,jnrb,jnrc,jnrd;
12847 int j3a,j3b,j3c,j3d;
12848 int tja,tjb,tjc,tjd;
12850 nul=vec_zero();
12851 vfacel=load_float_and_splat(&facel);
12853 for(n=0;n<nri;n++) {
12854 is3 = 3*shift[n];
12855 shvec = load_xyz(shiftvec+is3);
12856 ii = iinr[n];
12857 ii3 = 3*ii;
12858 ix = load_xyz(pos+ii3);
12859 vnbtot = nul;
12860 vctot = nul;
12861 ix = vec_add(ix,shvec);
12862 nj0 = jindex[n];
12863 nj1 = jindex[n+1];
12864 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
12865 ntiA = 2*ntype*type[ii];
12866 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
12868 for(k=nj0; k<(nj1-3); k+=4) {
12869 jnra = jjnr[k];
12870 jnrb = jjnr[k+1];
12871 jnrc = jjnr[k+2];
12872 jnrd = jjnr[k+3];
12873 j3a = 3*jnra;
12874 j3b = 3*jnrb;
12875 j3c = 3*jnrc;
12876 j3d = 3*jnrd;
12877 transpose_4_to_3(load_xyz(pos+j3a),
12878 load_xyz(pos+j3b),
12879 load_xyz(pos+j3c),
12880 load_xyz(pos+j3d),&dx,&dy,&dz);
12881 dx = vec_sub(ix,dx);
12882 dy = vec_sub(iy,dy);
12883 dz = vec_sub(iz,dz);
12884 rsq = vec_madd(dx,dx,nul);
12885 rsq = vec_madd(dy,dy,rsq);
12886 rsq = vec_madd(dz,dz,rsq);
12887 rinv = do_invsqrt(rsq);
12888 rinvsq = vec_madd(rinv,rinv,nul);
12889 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12890 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12891 tja = ntiA+2*type[jnra];
12892 tjb = ntiA+2*type[jnrb];
12893 tjc = ntiA+2*type[jnrc];
12894 tjd = ntiA+2*type[jnrd];
12895 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
12896 charge+jnrc,charge+jnrd),iq,nul);
12897 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
12898 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12899 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12900 vctot = vec_madd(qq,rinv,vctot);
12902 if(k<(nj1-1)) {
12903 jnra = jjnr[k];
12904 jnrb = jjnr[k+1];
12905 j3a = 3*jnra;
12906 j3b = 3*jnrb;
12907 transpose_2_to_3(load_xyz(pos+j3a),
12908 load_xyz(pos+j3b),&dx,&dy,&dz);
12909 dx = vec_sub(ix,dx);
12910 dy = vec_sub(iy,dy);
12911 dz = vec_sub(iz,dz);
12912 rsq = vec_madd(dx,dx,nul);
12913 rsq = vec_madd(dy,dy,rsq);
12914 rsq = vec_madd(dz,dz,rsq);
12915 rinv = do_invsqrt(rsq);
12916 zero_highest_2_elements_in_vector(&rinv);
12917 rinvsq = vec_madd(rinv,rinv,nul);
12918 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12919 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12920 tja = ntiA+2*type[jnra];
12921 tjb = ntiA+2*type[jnrb];
12922 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
12923 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
12924 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12925 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12926 vctot = vec_madd(qq,rinv,vctot);
12927 k += 2;
12929 if((nj1-nj0)%2) {
12930 jnra = jjnr[k];
12931 j3a = 3*jnra;
12932 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
12933 dx = vec_sub(ix,dx);
12934 dy = vec_sub(iy,dy);
12935 dz = vec_sub(iz,dz);
12936 rsq = vec_madd(dx,dx,nul);
12937 rsq = vec_madd(dy,dy,rsq);
12938 rsq = vec_madd(dz,dz,rsq);
12939 rinv = do_invsqrt(rsq);
12940 zero_highest_3_elements_in_vector(&rinv);
12941 rinvsq = vec_madd(rinv,rinv,nul);
12942 rinvsix = vec_madd(rinvsq,rinvsq,nul);
12943 rinvsix = vec_madd(rinvsix,rinvsq,nul);
12944 tja = ntiA+2*type[jnra];
12945 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
12946 load_1_pair(nbfp+tja,&c6,&c12);
12947 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
12948 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
12949 vctot = vec_madd(qq,rinv,vctot);
12951 /* update outer data */
12952 add_vector_to_float(Vc+gid[n],vctot);
12953 add_vector_to_float(Vnb+gid[n],vnbtot);
12960 void mcinl2000_altivec(
12961 int nri,
12962 int iinr[],
12963 int jindex[],
12964 int jjnr[],
12965 int shift[],
12966 float shiftvec[],
12967 int gid[],
12968 float pos[],
12969 float charge[],
12970 float facel,
12971 float Vc[],
12972 float krf,
12973 float crf)
12975 vector float ix,iy,iz,shvec;
12976 vector float vfacel,vkrf,vcrf,krsq,nul,vcoul;
12977 vector float dx,dy,dz;
12978 vector float vctot,qq,iq;
12979 vector float rinv,rsq;
12981 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
12982 int jnra,jnrb,jnrc,jnrd;
12983 int j3a,j3b,j3c,j3d;
12985 nul=vec_zero();
12986 vfacel=load_float_and_splat(&facel);
12987 vkrf=load_float_and_splat(&krf);
12988 vcrf=load_float_and_splat(&crf);
12990 for(n=0;n<nri;n++) {
12991 is3 = 3*shift[n];
12992 shvec = load_xyz(shiftvec+is3);
12993 ii = iinr[n];
12994 ii3 = 3*ii;
12995 ix = load_xyz(pos+ii3);
12996 vctot = nul;
12997 ix = vec_add(ix,shvec);
12998 nj0 = jindex[n];
12999 nj1 = jindex[n+1];
13000 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
13001 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13003 for(k=nj0; k<(nj1-3); k+=4) {
13004 jnra = jjnr[k];
13005 jnrb = jjnr[k+1];
13006 jnrc = jjnr[k+2];
13007 jnrd = jjnr[k+3];
13008 j3a = 3*jnra;
13009 j3b = 3*jnrb;
13010 j3c = 3*jnrc;
13011 j3d = 3*jnrd;
13012 transpose_4_to_3(load_xyz(pos+j3a),
13013 load_xyz(pos+j3b),
13014 load_xyz(pos+j3c),
13015 load_xyz(pos+j3d),&dx,&dy,&dz);
13016 dx = vec_sub(ix,dx);
13017 dy = vec_sub(iy,dy);
13018 dz = vec_sub(iz,dz);
13019 rsq = vec_madd(dx,dx,nul);
13020 rsq = vec_madd(dy,dy,rsq);
13021 rsq = vec_madd(dz,dz,rsq);
13022 rinv = do_invsqrt(rsq);
13023 /* load 4 j charges and multiply by iq */
13024 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
13025 charge+jnrc,charge+jnrd),iq,nul);
13026 krsq = vec_madd(vkrf,rsq,nul);
13027 vcoul = vec_add(rinv,krsq);
13028 vcoul = vec_sub(vcoul,vcrf);
13029 vctot = vec_madd(qq,vcoul,vctot);
13031 if(k<(nj1-1)) {
13032 jnra = jjnr[k];
13033 jnrb = jjnr[k+1];
13034 j3a = 3*jnra;
13035 j3b = 3*jnrb;
13036 transpose_2_to_3(load_xyz(pos+j3a),
13037 load_xyz(pos+j3b),&dx,&dy,&dz);
13038 dx = vec_sub(ix,dx);
13039 dy = vec_sub(iy,dy);
13040 dz = vec_sub(iz,dz);
13041 rsq = vec_madd(dx,dx,nul);
13042 rsq = vec_madd(dy,dy,rsq);
13043 rsq = vec_madd(dz,dz,rsq);
13044 zero_highest_2_elements_in_vector(&rsq);
13045 rinv = do_invsqrt(rsq);
13046 zero_highest_2_elements_in_vector(&rinv);
13047 /* load 2 j charges and multiply by iq */
13048 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
13049 krsq = vec_madd(vkrf,rsq,nul);
13050 vcoul = vec_add(rinv,krsq);
13051 vcoul = vec_sub(vcoul,vcrf);
13052 vctot = vec_madd(qq,vcoul,vctot);
13053 k += 2;
13055 if((nj1-nj0)%2) {
13056 jnra = jjnr[k];
13057 j3a = 3*jnra;
13058 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
13059 dx = vec_sub(ix,dx);
13060 dy = vec_sub(iy,dy);
13061 dz = vec_sub(iz,dz);
13062 rsq = vec_madd(dx,dx,nul);
13063 rsq = vec_madd(dy,dy,rsq);
13064 rsq = vec_madd(dz,dz,rsq);
13065 zero_highest_3_elements_in_vector(&rsq);
13066 rinv = do_invsqrt(rsq);
13067 zero_highest_3_elements_in_vector(&rinv);
13068 /* load 1 j charge and multiply by iq */
13069 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
13070 krsq = vec_madd(vkrf,rsq,nul);
13071 vcoul = vec_add(rinv,krsq);
13072 vcoul = vec_sub(vcoul,vcrf);
13073 vctot = vec_madd(qq,vcoul,vctot);
13075 /* update outer data */
13076 add_vector_to_float(Vc+gid[n],vctot);
13082 void mcinl2100_altivec(
13083 int nri,
13084 int iinr[],
13085 int jindex[],
13086 int jjnr[],
13087 int shift[],
13088 float shiftvec[],
13089 int gid[],
13090 float pos[],
13091 float charge[],
13092 float facel,
13093 float Vc[],
13094 float krf,
13095 float crf,
13096 int type[],
13097 int ntype,
13098 float nbfp[],
13099 float Vnb[])
13101 vector float ix,iy,iz,shvec;
13102 vector float vfacel,vkrf,vcrf,krsq,vcoul,nul;
13103 vector float dx,dy,dz;
13104 vector float vnbtot,vctot,qq,iq,c6,c12;
13105 vector float rinv,rinvsq,rsq,rinvsix;
13107 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13108 int jnra,jnrb,jnrc,jnrd;
13109 int j3a,j3b,j3c,j3d;
13110 int tja,tjb,tjc,tjd;
13112 nul=vec_zero();
13113 vfacel=load_float_and_splat(&facel);
13114 vkrf=load_float_and_splat(&krf);
13115 vcrf=load_float_and_splat(&crf);
13117 for(n=0;n<nri;n++) {
13118 is3 = 3*shift[n];
13119 shvec = load_xyz(shiftvec+is3);
13120 ii = iinr[n];
13121 ii3 = 3*ii;
13122 ix = load_xyz(pos+ii3);
13123 vnbtot = nul;
13124 vctot = nul;
13125 ix = vec_add(ix,shvec);
13126 nj0 = jindex[n];
13127 nj1 = jindex[n+1];
13128 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
13129 ntiA = 2*ntype*type[ii];
13130 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13132 for(k=nj0; k<(nj1-3); k+=4) {
13133 jnra = jjnr[k];
13134 jnrb = jjnr[k+1];
13135 jnrc = jjnr[k+2];
13136 jnrd = jjnr[k+3];
13137 j3a = 3*jnra;
13138 j3b = 3*jnrb;
13139 j3c = 3*jnrc;
13140 j3d = 3*jnrd;
13141 transpose_4_to_3(load_xyz(pos+j3a),
13142 load_xyz(pos+j3b),
13143 load_xyz(pos+j3c),
13144 load_xyz(pos+j3d),&dx,&dy,&dz);
13145 dx = vec_sub(ix,dx);
13146 dy = vec_sub(iy,dy);
13147 dz = vec_sub(iz,dz);
13148 rsq = vec_madd(dx,dx,nul);
13149 rsq = vec_madd(dy,dy,rsq);
13150 rsq = vec_madd(dz,dz,rsq);
13151 rinv = do_invsqrt(rsq);
13152 rinvsq = vec_madd(rinv,rinv,nul);
13153 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13154 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13155 tja = ntiA+2*type[jnra];
13156 tjb = ntiA+2*type[jnrb];
13157 tjc = ntiA+2*type[jnrc];
13158 tjd = ntiA+2*type[jnrd];
13159 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
13160 charge+jnrc,charge+jnrd),iq,nul);
13161 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
13162 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13163 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13164 krsq = vec_madd(vkrf,rsq,nul);
13165 vcoul = vec_add(rinv,krsq);
13166 vcoul = vec_sub(vcoul,vcrf);
13167 vctot = vec_madd(qq,vcoul,vctot);
13169 if(k<(nj1-1)) {
13170 jnra = jjnr[k];
13171 jnrb = jjnr[k+1];
13172 j3a = 3*jnra;
13173 j3b = 3*jnrb;
13174 transpose_2_to_3(load_xyz(pos+j3a),
13175 load_xyz(pos+j3b),&dx,&dy,&dz);
13176 dx = vec_sub(ix,dx);
13177 dy = vec_sub(iy,dy);
13178 dz = vec_sub(iz,dz);
13179 rsq = vec_madd(dx,dx,nul);
13180 rsq = vec_madd(dy,dy,rsq);
13181 rsq = vec_madd(dz,dz,rsq);
13182 zero_highest_2_elements_in_vector(&rsq);
13183 rinv = do_invsqrt(rsq);
13184 zero_highest_2_elements_in_vector(&rinv);
13185 rinvsq = vec_madd(rinv,rinv,nul);
13186 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13187 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13188 tja = ntiA+2*type[jnra];
13189 tjb = ntiA+2*type[jnrb];
13190 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
13191 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
13192 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13193 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13194 krsq = vec_madd(vkrf,rsq,nul);
13195 vcoul = vec_add(rinv,krsq);
13196 vcoul = vec_sub(vcoul,vcrf);
13197 vctot = vec_madd(qq,vcoul,vctot);
13198 k += 2;
13200 if((nj1-nj0)%2) {
13201 jnra = jjnr[k];
13202 j3a = 3*jnra;
13203 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
13204 dx = vec_sub(ix,dx);
13205 dy = vec_sub(iy,dy);
13206 dz = vec_sub(iz,dz);
13207 rsq = vec_madd(dx,dx,nul);
13208 rsq = vec_madd(dy,dy,rsq);
13209 rsq = vec_madd(dz,dz,rsq);
13210 zero_highest_3_elements_in_vector(&rsq);
13211 rinv = do_invsqrt(rsq);
13212 zero_highest_3_elements_in_vector(&rinv);
13213 rinvsq = vec_madd(rinv,rinv,nul);
13214 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13215 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13216 tja = ntiA+2*type[jnra];
13217 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
13218 load_1_pair(nbfp+tja,&c6,&c12);
13219 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13220 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13221 krsq = vec_madd(vkrf,rsq,nul);
13222 vcoul = vec_add(rinv,krsq);
13223 vcoul = vec_sub(vcoul,vcrf);
13224 vctot = vec_madd(qq,vcoul,vctot);
13226 /* update outer data */
13227 add_vector_to_float(Vc+gid[n],vctot);
13228 add_vector_to_float(Vnb+gid[n],vnbtot);
13237 void mcinl3000_altivec(
13238 int nri,
13239 int iinr[],
13240 int jindex[],
13241 int jjnr[],
13242 int shift[],
13243 float shiftvec[],
13244 int gid[],
13245 float pos[],
13246 float charge[],
13247 float facel,
13248 float Vc[],
13249 float tabscale,
13250 float VFtab[])
13252 vector float ix,iy,iz,shvec;
13253 vector float vfacel,tsc,nul;
13254 vector float dx,dy,dz;
13255 vector float vctot,qq,iq;
13256 vector float rinv,r,rsq,VVc;
13258 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13259 int jnra,jnrb,jnrc,jnrd;
13260 int j3a,j3b,j3c,j3d;
13262 nul=vec_zero();
13263 vfacel=load_float_and_splat(&facel);
13264 tsc=load_float_and_splat(&tabscale);
13266 for(n=0;n<nri;n++) {
13267 is3 = 3*shift[n];
13268 shvec = load_xyz(shiftvec+is3);
13269 ii = iinr[n];
13270 ii3 = 3*ii;
13271 ix = load_xyz(pos+ii3);
13272 vctot = nul;
13273 ix = vec_add(ix,shvec);
13274 nj0 = jindex[n];
13275 nj1 = jindex[n+1];
13276 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
13277 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13279 for(k=nj0; k<(nj1-3); k+=4) {
13280 jnra = jjnr[k];
13281 jnrb = jjnr[k+1];
13282 jnrc = jjnr[k+2];
13283 jnrd = jjnr[k+3];
13284 j3a = 3*jnra;
13285 j3b = 3*jnrb;
13286 j3c = 3*jnrc;
13287 j3d = 3*jnrd;
13288 transpose_4_to_3(load_xyz(pos+j3a),
13289 load_xyz(pos+j3b),
13290 load_xyz(pos+j3c),
13291 load_xyz(pos+j3d),&dx,&dy,&dz);
13292 dx = vec_sub(ix,dx);
13293 dy = vec_sub(iy,dy);
13294 dz = vec_sub(iz,dz);
13295 rsq = vec_madd(dx,dx,nul);
13296 rsq = vec_madd(dy,dy,rsq);
13297 rsq = vec_madd(dz,dz,rsq);
13298 rinv = do_invsqrt(rsq);
13299 r = vec_madd(rinv,rsq,nul);
13300 /* load 4 j charges and multiply by iq */
13301 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
13302 charge+jnrc,charge+jnrd),iq,nul);
13303 do_vonly_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13304 vctot = vec_madd(qq,VVc,vctot);
13306 if(k<(nj1-1)) {
13307 jnra = jjnr[k];
13308 jnrb = jjnr[k+1];
13309 j3a = 3*jnra;
13310 j3b = 3*jnrb;
13311 transpose_2_to_3(load_xyz(pos+j3a),
13312 load_xyz(pos+j3b),&dx,&dy,&dz);
13313 dx = vec_sub(ix,dx);
13314 dy = vec_sub(iy,dy);
13315 dz = vec_sub(iz,dz);
13316 rsq = vec_madd(dx,dx,nul);
13317 rsq = vec_madd(dy,dy,rsq);
13318 rsq = vec_madd(dz,dz,rsq);
13319 zero_highest_2_elements_in_vector(&rsq);
13320 rinv = do_invsqrt(rsq);
13321 zero_highest_2_elements_in_vector(&rinv);
13322 r = vec_madd(rinv,rsq,nul);
13323 /* load 2 j charges and multiply by iq */
13324 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
13325 do_vonly_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13326 vctot = vec_madd(qq,VVc,vctot);
13327 k += 2;
13329 if((nj1-nj0)%2) {
13330 jnra = jjnr[k];
13331 j3a = 3*jnra;
13332 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
13333 dx = vec_sub(ix,dx);
13334 dy = vec_sub(iy,dy);
13335 dz = vec_sub(iz,dz);
13336 rsq = vec_madd(dx,dx,nul);
13337 rsq = vec_madd(dy,dy,rsq);
13338 rsq = vec_madd(dz,dz,rsq);
13339 zero_highest_3_elements_in_vector(&rsq);
13340 rinv = do_invsqrt(rsq);
13341 zero_highest_3_elements_in_vector(&rinv);
13342 r = vec_madd(rinv,rsq,nul);
13343 /* load 1 j charge and multiply by iq */
13344 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
13345 do_vonly_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13346 vctot = vec_madd(qq,VVc,vctot);
13348 /* update outer data */
13349 add_vector_to_float(Vc+gid[n],vctot);
13355 void mcinl3100_altivec(
13356 int nri,
13357 int iinr[],
13358 int jindex[],
13359 int jjnr[],
13360 int shift[],
13361 float shiftvec[],
13362 int gid[],
13363 float pos[],
13364 float charge[],
13365 float facel,
13366 float Vc[],
13367 int type[],
13368 int ntype,
13369 float nbfp[],
13370 float Vnb[],
13371 float tabscale,
13372 float VFtab[])
13374 vector float ix,iy,iz,shvec;
13375 vector float vfacel,tsc,nul;
13376 vector float dx,dy,dz;
13377 vector float vnbtot,vctot,qq,iq,c6,c12,VVc;
13378 vector float rinv,r,rinvsq,rsq,rinvsix;
13380 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13381 int jnra,jnrb,jnrc,jnrd;
13382 int j3a,j3b,j3c,j3d;
13383 int tja,tjb,tjc,tjd;
13385 nul=vec_zero();
13386 vfacel=load_float_and_splat(&facel);
13387 tsc=load_float_and_splat(&tabscale);
13389 for(n=0;n<nri;n++) {
13390 is3 = 3*shift[n];
13391 shvec = load_xyz(shiftvec+is3);
13392 ii = iinr[n];
13393 ii3 = 3*ii;
13394 ix = load_xyz(pos+ii3);
13395 vnbtot = nul;
13396 vctot = nul;
13397 ix = vec_add(ix,shvec);
13398 nj0 = jindex[n];
13399 nj1 = jindex[n+1];
13400 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
13401 ntiA = 2*ntype*type[ii];
13402 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13404 for(k=nj0; k<(nj1-3); k+=4) {
13405 jnra = jjnr[k];
13406 jnrb = jjnr[k+1];
13407 jnrc = jjnr[k+2];
13408 jnrd = jjnr[k+3];
13409 j3a = 3*jnra;
13410 j3b = 3*jnrb;
13411 j3c = 3*jnrc;
13412 j3d = 3*jnrd;
13413 transpose_4_to_3(load_xyz(pos+j3a),
13414 load_xyz(pos+j3b),
13415 load_xyz(pos+j3c),
13416 load_xyz(pos+j3d),&dx,&dy,&dz);
13417 dx = vec_sub(ix,dx);
13418 dy = vec_sub(iy,dy);
13419 dz = vec_sub(iz,dz);
13420 rsq = vec_madd(dx,dx,nul);
13421 rsq = vec_madd(dy,dy,rsq);
13422 rsq = vec_madd(dz,dz,rsq);
13423 rinv = do_invsqrt(rsq);
13424 rinvsq = vec_madd(rinv,rinv,nul);
13425 r = vec_madd(rinv,rsq,nul);
13426 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13427 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13428 tja = ntiA+2*type[jnra];
13429 tjb = ntiA+2*type[jnrb];
13430 tjc = ntiA+2*type[jnrc];
13431 tjd = ntiA+2*type[jnrd];
13432 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
13433 charge+jnrc,charge+jnrd),iq,nul);
13434 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
13435 do_vonly_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13436 vctot = vec_madd(qq,VVc,vctot);
13437 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13438 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13440 if(k<(nj1-1)) {
13441 jnra = jjnr[k];
13442 jnrb = jjnr[k+1];
13443 j3a = 3*jnra;
13444 j3b = 3*jnrb;
13445 transpose_2_to_3(load_xyz(pos+j3a),
13446 load_xyz(pos+j3b),&dx,&dy,&dz);
13447 dx = vec_sub(ix,dx);
13448 dy = vec_sub(iy,dy);
13449 dz = vec_sub(iz,dz);
13450 rsq = vec_madd(dx,dx,nul);
13451 rsq = vec_madd(dy,dy,rsq);
13452 rsq = vec_madd(dz,dz,rsq);
13453 zero_highest_2_elements_in_vector(&rsq);
13454 rinv = do_invsqrt(rsq);
13455 zero_highest_2_elements_in_vector(&rinv);
13456 rinvsq = vec_madd(rinv,rinv,nul);
13457 r = vec_madd(rinv,rsq,nul);
13458 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13459 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13460 tja = ntiA+2*type[jnra];
13461 tjb = ntiA+2*type[jnrb];
13462 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
13463 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
13464 do_vonly_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13465 vctot = vec_madd(qq,VVc,vctot);
13466 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13467 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13468 k += 2;
13470 if((nj1-nj0)%2) {
13471 jnra = jjnr[k];
13472 j3a = 3*jnra;
13473 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
13474 dx = vec_sub(ix,dx);
13475 dy = vec_sub(iy,dy);
13476 dz = vec_sub(iz,dz);
13477 rsq = vec_madd(dx,dx,nul);
13478 rsq = vec_madd(dy,dy,rsq);
13479 rsq = vec_madd(dz,dz,rsq);
13480 zero_highest_3_elements_in_vector(&rsq);
13481 rinv = do_invsqrt(rsq);
13482 zero_highest_3_elements_in_vector(&rinv);
13483 rinvsq = vec_madd(rinv,rinv,nul);
13484 r = vec_madd(rinv,rsq,nul);
13485 rinvsix = vec_madd(rinvsq,rinvsq,nul);
13486 rinvsix = vec_madd(rinvsix,rinvsq,nul);
13487 tja = ntiA+2*type[jnra];
13488 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
13489 load_1_pair(nbfp+tja,&c6,&c12);
13490 do_vonly_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc);
13491 vctot = vec_madd(qq,VVc,vctot);
13492 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13493 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13495 /* update outer data */
13496 add_vector_to_float(Vc+gid[n],vctot);
13497 add_vector_to_float(Vnb+gid[n],vnbtot);
13502 void mcinl3300_altivec(
13503 int nri,
13504 int iinr[],
13505 int jindex[],
13506 int jjnr[],
13507 int shift[],
13508 float shiftvec[],
13509 int gid[],
13510 float pos[],
13511 float charge[],
13512 float facel,
13513 float Vc[],
13514 int type[],
13515 int ntype,
13516 float nbfp[],
13517 float Vnb[],
13518 float tabscale,
13519 float VFtab[])
13521 vector float ix,iy,iz,shvec;
13522 vector float fs,nul,tsc;
13523 vector float dx,dy,dz,vfacel,vctot;
13524 vector float vnbtot,c6,c12,iq,qq;
13525 vector float rinv,r,rsq;
13526 vector float VVc,VVd,VVr;
13528 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13529 int jnra,jnrb,jnrc,jnrd;
13530 int j3a,j3b,j3c,j3d;
13531 int tja,tjb,tjc,tjd;
13533 nul=vec_zero();
13534 tsc=load_float_and_splat(&tabscale);
13535 vfacel=load_float_and_splat(&facel);
13537 for(n=0;n<nri;n++) {
13538 is3 = 3*shift[n];
13539 shvec = load_xyz(shiftvec+is3);
13540 ii = iinr[n];
13541 ii3 = 3*ii;
13542 ix = load_xyz(pos+ii3);
13543 vnbtot = nul;
13544 vctot = nul;
13545 ix = vec_add(ix,shvec);
13546 nj0 = jindex[n];
13547 nj1 = jindex[n+1];
13548 splat_xyz_to_vectors(ix,&ix,&iy,&iz);
13549 ntiA = 2*ntype*type[ii];
13550 iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13552 for(k=nj0; k<(nj1-3); k+=4) {
13553 jnra = jjnr[k];
13554 jnrb = jjnr[k+1];
13555 jnrc = jjnr[k+2];
13556 jnrd = jjnr[k+3];
13557 j3a = 3*jnra;
13558 j3b = 3*jnrb;
13559 j3c = 3*jnrc;
13560 j3d = 3*jnrd;
13561 transpose_4_to_3(load_xyz(pos+j3a),
13562 load_xyz(pos+j3b),
13563 load_xyz(pos+j3c),
13564 load_xyz(pos+j3d),&dx,&dy,&dz);
13565 dx = vec_sub(ix,dx);
13566 dy = vec_sub(iy,dy);
13567 dz = vec_sub(iz,dz);
13568 rsq = vec_madd(dx,dx,nul);
13569 rsq = vec_madd(dy,dy,rsq);
13570 rsq = vec_madd(dz,dz,rsq);
13571 rinv = do_invsqrt(rsq);
13572 r = vec_madd(rinv,rsq,nul);
13573 qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
13574 charge+jnrc,charge+jnrd),iq,nul);
13575 tja = ntiA+2*type[jnra];
13576 tjb = ntiA+2*type[jnrb];
13577 tjc = ntiA+2*type[jnrc];
13578 tjd = ntiA+2*type[jnrd];
13579 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
13580 do_vonly_4_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&VVd,&VVr);
13581 vctot = vec_madd(qq,VVc,vctot);
13582 vnbtot = vec_madd(c6,VVd,vnbtot);
13583 vnbtot = vec_madd(c12,VVr,vnbtot);
13585 if(k<(nj1-1)) {
13586 jnra = jjnr[k];
13587 jnrb = jjnr[k+1];
13588 j3a = 3*jnra;
13589 j3b = 3*jnrb;
13590 transpose_2_to_3(load_xyz(pos+j3a),
13591 load_xyz(pos+j3b),&dx,&dy,&dz);
13592 dx = vec_sub(ix,dx);
13593 dy = vec_sub(iy,dy);
13594 dz = vec_sub(iz,dz);
13595 rsq = vec_madd(dx,dx,nul);
13596 rsq = vec_madd(dy,dy,rsq);
13597 rsq = vec_madd(dz,dz,rsq);
13598 zero_highest_2_elements_in_vector(&rsq);
13599 rinv = do_invsqrt(rsq);
13600 zero_highest_2_elements_in_vector(&rinv);
13601 r = vec_madd(rinv,rsq,nul);
13602 qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
13603 tja = ntiA+2*type[jnra];
13604 tjb = ntiA+2*type[jnrb];
13605 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
13606 do_vonly_2_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&VVd,&VVr);
13607 vctot = vec_madd(qq,VVc,vctot);
13608 vnbtot = vec_madd(c6,VVd,vnbtot);
13609 vnbtot = vec_madd(c12,VVr,vnbtot);
13610 k += 2;
13612 if((nj1-nj0)%2) {
13613 jnra = jjnr[k];
13614 j3a = 3*jnra;
13615 transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
13616 dx = vec_sub(ix,dx);
13617 dy = vec_sub(iy,dy);
13618 dz = vec_sub(iz,dz);
13619 rsq = vec_madd(dx,dx,nul);
13620 rsq = vec_madd(dy,dy,rsq);
13621 rsq = vec_madd(dz,dz,rsq);
13622 zero_highest_3_elements_in_vector(&rsq);
13623 rinv = do_invsqrt(rsq);
13624 zero_highest_3_elements_in_vector(&rinv);
13625 r = vec_madd(rinv,rsq,nul);
13626 qq = vec_madd(load_1_float(charge+jnra),iq,nul);
13627 tja = ntiA+2*type[jnra];
13628 load_1_pair(nbfp+tja,&c6,&c12);
13629 do_vonly_1_ljctable_coul_and_lj(VFtab,vec_madd(r,tsc,nul),&VVc,&VVd,&VVr);
13630 vctot = vec_madd(qq,VVc,vctot);
13631 vnbtot = vec_madd(c6,VVd,vnbtot);
13632 vnbtot = vec_madd(c12,VVr,vnbtot);
13634 /* update outer data */
13635 add_vector_to_float(Vnb+gid[n],vnbtot);
13636 add_vector_to_float(Vc+gid[n],vctot);
13641 void mcinl1020_altivec(
13642 int nri,
13643 int iinr[],
13644 int jindex[],
13645 int jjnr[],
13646 int shift[],
13647 float shiftvec[],
13648 int gid[],
13649 float pos[],
13650 float charge[],
13651 float facel,
13652 float Vc[])
13654 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
13655 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
13656 vector float vfacel,nul;
13657 vector float vctot,qqO,qqH,iqO,iqH,jq;
13658 vector float rinvO,rinvH1,rinvH2,rsqO,rsqH1,rsqH2;
13661 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13662 int jnra,jnrb,jnrc,jnrd;
13663 int j3a,j3b,j3c,j3d;
13665 nul=vec_zero();
13666 vfacel=load_float_and_splat(&facel);
13667 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
13668 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
13670 for(n=0;n<nri;n++) {
13671 is3 = 3*shift[n];
13672 ii = iinr[n];
13673 ii3 = 3*ii;
13674 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
13675 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
13676 vctot = nul;
13677 nj0 = jindex[n];
13678 nj1 = jindex[n+1];
13680 for(k=nj0; k<(nj1-3); k+=4) {
13681 jnra = jjnr[k];
13682 jnrb = jjnr[k+1];
13683 jnrc = jjnr[k+2];
13684 jnrd = jjnr[k+3];
13685 j3a = 3*jnra;
13686 j3b = 3*jnrb;
13687 j3c = 3*jnrc;
13688 j3d = 3*jnrd;
13689 transpose_4_to_3(load_xyz(pos+j3a),
13690 load_xyz(pos+j3b),
13691 load_xyz(pos+j3c),
13692 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
13693 dOx = vec_sub(iOx,dH2x);
13694 dOy = vec_sub(iOy,dH2y);
13695 dOz = vec_sub(iOz,dH2z);
13696 dH1x = vec_sub(iH1x,dH2x);
13697 dH1y = vec_sub(iH1y,dH2y);
13698 dH1z = vec_sub(iH1z,dH2z);
13699 dH2x = vec_sub(iH2x,dH2x);
13700 dH2y = vec_sub(iH2y,dH2y);
13701 dH2z = vec_sub(iH2z,dH2z);
13703 rsqO = vec_madd(dOx,dOx,nul);
13704 rsqH1 = vec_madd(dH1x,dH1x,nul);
13705 rsqH2 = vec_madd(dH2x,dH2x,nul);
13706 rsqO = vec_madd(dOy,dOy,rsqO);
13707 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13708 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13709 rsqO = vec_madd(dOz,dOz,rsqO);
13710 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13711 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13712 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13713 /* load 4 j charges and multiply by iq */
13714 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
13715 qqO = vec_madd(iqO,jq,nul);
13716 qqH = vec_madd(iqH,jq,nul);
13717 vctot = vec_madd(qqO,rinvO,vctot);
13718 vctot = vec_madd(qqH,rinvH1,vctot);
13719 vctot = vec_madd(qqH,rinvH2,vctot);
13721 if(k<(nj1-2)) {
13722 jnra = jjnr[k];
13723 jnrb = jjnr[k+1];
13724 jnrc = jjnr[k+2];
13725 j3a = 3*jnra;
13726 j3b = 3*jnrb;
13727 j3c = 3*jnrc;
13728 transpose_4_to_3(load_xyz(pos+j3a),
13729 load_xyz(pos+j3b),
13730 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
13731 dOx = vec_sub(iOx,dH2x);
13732 dOy = vec_sub(iOy,dH2y);
13733 dOz = vec_sub(iOz,dH2z);
13734 dH1x = vec_sub(iH1x,dH2x);
13735 dH1y = vec_sub(iH1y,dH2y);
13736 dH1z = vec_sub(iH1z,dH2z);
13737 dH2x = vec_sub(iH2x,dH2x);
13738 dH2y = vec_sub(iH2y,dH2y);
13739 dH2z = vec_sub(iH2z,dH2z);
13741 rsqO = vec_madd(dOx,dOx,nul);
13742 rsqH1 = vec_madd(dH1x,dH1x,nul);
13743 rsqH2 = vec_madd(dH2x,dH2x,nul);
13744 rsqO = vec_madd(dOy,dOy,rsqO);
13745 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13746 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13747 rsqO = vec_madd(dOz,dOz,rsqO);
13748 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13749 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13750 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13751 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
13753 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
13754 qqO = vec_madd(iqO,jq,nul);
13755 qqH = vec_madd(iqH,jq,nul);
13756 vctot = vec_madd(qqO,rinvO,vctot);
13757 vctot = vec_madd(qqH,rinvH1,vctot);
13758 vctot = vec_madd(qqH,rinvH2,vctot);
13759 } else if(k<(nj1-1)) {
13760 jnra = jjnr[k];
13761 jnrb = jjnr[k+1];
13762 j3a = 3*jnra;
13763 j3b = 3*jnrb;
13764 transpose_2_to_3(load_xyz(pos+j3a),
13765 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
13766 dOx = vec_sub(iOx,dH2x);
13767 dOy = vec_sub(iOy,dH2y);
13768 dOz = vec_sub(iOz,dH2z);
13769 dH1x = vec_sub(iH1x,dH2x);
13770 dH1y = vec_sub(iH1y,dH2y);
13771 dH1z = vec_sub(iH1z,dH2z);
13772 dH2x = vec_sub(iH2x,dH2x);
13773 dH2y = vec_sub(iH2y,dH2y);
13774 dH2z = vec_sub(iH2z,dH2z);
13776 rsqO = vec_madd(dOx,dOx,nul);
13777 rsqH1 = vec_madd(dH1x,dH1x,nul);
13778 rsqH2 = vec_madd(dH2x,dH2x,nul);
13779 rsqO = vec_madd(dOy,dOy,rsqO);
13780 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13781 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13782 rsqO = vec_madd(dOz,dOz,rsqO);
13783 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13784 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13785 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13786 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
13788 /* load 2 j charges and multiply by iq */
13789 jq=load_2_float(charge+jnra,charge+jnrb);
13790 qqO = vec_madd(iqO,jq,nul);
13791 qqH = vec_madd(iqH,jq,nul);
13792 vctot = vec_madd(qqO,rinvO,vctot);
13793 vctot = vec_madd(qqH,rinvH1,vctot);
13794 vctot = vec_madd(qqH,rinvH2,vctot);
13795 } else if(k<nj1) {
13796 jnra = jjnr[k];
13797 j3a = 3*jnra;
13798 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
13799 dOx = vec_sub(iOx,dH2x);
13800 dOy = vec_sub(iOy,dH2y);
13801 dOz = vec_sub(iOz,dH2z);
13802 dH1x = vec_sub(iH1x,dH2x);
13803 dH1y = vec_sub(iH1y,dH2y);
13804 dH1z = vec_sub(iH1z,dH2z);
13805 dH2x = vec_sub(iH2x,dH2x);
13806 dH2y = vec_sub(iH2y,dH2y);
13807 dH2z = vec_sub(iH2z,dH2z);
13809 rsqO = vec_madd(dOx,dOx,nul);
13810 rsqH1 = vec_madd(dH1x,dH1x,nul);
13811 rsqH2 = vec_madd(dH2x,dH2x,nul);
13812 rsqO = vec_madd(dOy,dOy,rsqO);
13813 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13814 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13815 rsqO = vec_madd(dOz,dOz,rsqO);
13816 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13817 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13818 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13819 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
13820 /* load 1 j charges and multiply by iq */
13821 jq=load_1_float(charge+jnra);
13822 qqO = vec_madd(iqO,jq,nul);
13823 qqH = vec_madd(iqH,jq,nul);
13824 vctot = vec_madd(qqO,rinvO,vctot);
13825 vctot = vec_madd(qqH,rinvH1,vctot);
13826 vctot = vec_madd(qqH,rinvH2,vctot);
13828 /* update outer data */
13829 add_vector_to_float(Vc+gid[n],vctot);
13834 void mcinl1120_altivec(
13835 int nri,
13836 int iinr[],
13837 int jindex[],
13838 int jjnr[],
13839 int shift[],
13840 float shiftvec[],
13841 int gid[],
13842 float pos[],
13843 float charge[],
13844 float facel,
13845 float Vc[],
13846 int type[],
13847 int ntype,
13848 float nbfp[],
13849 float Vnb[])
13851 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
13852 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
13853 vector float vfacel,nul;
13854 vector float vnbtot,c6,c12,rinvsix;
13855 vector float vctot,qqO,qqH,iqO,iqH,jq;
13856 vector float rinvO,rinvH1,rinvH2,rinvsqO,rsqO,rsqH1,rsqH2;
13858 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
13859 int jnra,jnrb,jnrc,jnrd;
13860 int j3a,j3b,j3c,j3d;
13861 int tja,tjb,tjc,tjd;
13863 nul=vec_zero();
13864 vfacel=load_float_and_splat(&facel);
13865 ii = iinr[0];
13866 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
13867 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
13868 ntiA = 2*ntype*type[ii];
13870 for(n=0;n<nri;n++) {
13871 is3 = 3*shift[n];
13872 ii = iinr[n];
13873 ii3 = 3*ii;
13874 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
13875 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
13876 vctot = nul;
13877 vnbtot = nul;
13878 nj0 = jindex[n];
13879 nj1 = jindex[n+1];
13881 for(k=nj0; k<(nj1-3); k+=4) {
13882 jnra = jjnr[k];
13883 jnrb = jjnr[k+1];
13884 jnrc = jjnr[k+2];
13885 jnrd = jjnr[k+3];
13886 j3a = 3*jnra;
13887 j3b = 3*jnrb;
13888 j3c = 3*jnrc;
13889 j3d = 3*jnrd;
13890 transpose_4_to_3(load_xyz(pos+j3a),
13891 load_xyz(pos+j3b),
13892 load_xyz(pos+j3c),
13893 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
13894 dOx = vec_sub(iOx,dH2x);
13895 dOy = vec_sub(iOy,dH2y);
13896 dOz = vec_sub(iOz,dH2z);
13897 dH1x = vec_sub(iH1x,dH2x);
13898 dH1y = vec_sub(iH1y,dH2y);
13899 dH1z = vec_sub(iH1z,dH2z);
13900 dH2x = vec_sub(iH2x,dH2x);
13901 dH2y = vec_sub(iH2y,dH2y);
13902 dH2z = vec_sub(iH2z,dH2z);
13904 rsqO = vec_madd(dOx,dOx,nul);
13905 rsqH1 = vec_madd(dH1x,dH1x,nul);
13906 rsqH2 = vec_madd(dH2x,dH2x,nul);
13907 rsqO = vec_madd(dOy,dOy,rsqO);
13908 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13909 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13910 rsqO = vec_madd(dOz,dOz,rsqO);
13911 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13912 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13913 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13914 rinvsqO = vec_madd(rinvO,rinvO,nul);
13915 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
13916 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
13917 tja = ntiA+2*type[jnra];
13918 tjb = ntiA+2*type[jnrb];
13919 tjc = ntiA+2*type[jnrc];
13920 tjd = ntiA+2*type[jnrd];
13921 /* load 4 j charges and multiply by iq */
13922 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
13923 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
13924 qqO = vec_madd(iqO,jq,nul);
13925 qqH = vec_madd(iqH,jq,nul);
13926 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13927 vctot = vec_madd(qqO,rinvO,vctot);
13928 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13929 vctot = vec_madd(qqH,rinvH1,vctot);
13930 vctot = vec_madd(qqH,rinvH2,vctot);
13932 if(k<(nj1-2)) {
13933 jnra = jjnr[k];
13934 jnrb = jjnr[k+1];
13935 jnrc = jjnr[k+2];
13936 j3a = 3*jnra;
13937 j3b = 3*jnrb;
13938 j3c = 3*jnrc;
13939 transpose_4_to_3(load_xyz(pos+j3a),
13940 load_xyz(pos+j3b),
13941 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
13942 dOx = vec_sub(iOx,dH2x);
13943 dOy = vec_sub(iOy,dH2y);
13944 dOz = vec_sub(iOz,dH2z);
13945 dH1x = vec_sub(iH1x,dH2x);
13946 dH1y = vec_sub(iH1y,dH2y);
13947 dH1z = vec_sub(iH1z,dH2z);
13948 dH2x = vec_sub(iH2x,dH2x);
13949 dH2y = vec_sub(iH2y,dH2y);
13950 dH2z = vec_sub(iH2z,dH2z);
13952 rsqO = vec_madd(dOx,dOx,nul);
13953 rsqH1 = vec_madd(dH1x,dH1x,nul);
13954 rsqH2 = vec_madd(dH2x,dH2x,nul);
13955 rsqO = vec_madd(dOy,dOy,rsqO);
13956 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
13957 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
13958 rsqO = vec_madd(dOz,dOz,rsqO);
13959 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
13960 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
13961 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
13962 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
13964 rinvsqO = vec_madd(rinvO,rinvO,nul);
13965 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
13966 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
13967 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
13968 tja = ntiA+2*type[jnra];
13969 tjb = ntiA+2*type[jnrb];
13970 tjc = ntiA+2*type[jnrc];
13971 /* load 3 j charges and multiply by iq */
13972 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
13973 qqO = vec_madd(iqO,jq,nul);
13974 qqH = vec_madd(iqH,jq,nul);
13975 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
13976 vctot = vec_madd(qqO,rinvO,vctot);
13977 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
13978 vctot = vec_madd(qqH,rinvH1,vctot);
13979 vctot = vec_madd(qqH,rinvH2,vctot);
13980 } else if(k<(nj1-1)) {
13981 jnra = jjnr[k];
13982 jnrb = jjnr[k+1];
13983 j3a = 3*jnra;
13984 j3b = 3*jnrb;
13985 transpose_2_to_3(load_xyz(pos+j3a),
13986 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
13987 dOx = vec_sub(iOx,dH2x);
13988 dOy = vec_sub(iOy,dH2y);
13989 dOz = vec_sub(iOz,dH2z);
13990 dH1x = vec_sub(iH1x,dH2x);
13991 dH1y = vec_sub(iH1y,dH2y);
13992 dH1z = vec_sub(iH1z,dH2z);
13993 dH2x = vec_sub(iH2x,dH2x);
13994 dH2y = vec_sub(iH2y,dH2y);
13995 dH2z = vec_sub(iH2z,dH2z);
13997 rsqO = vec_madd(dOx,dOx,nul);
13998 rsqH1 = vec_madd(dH1x,dH1x,nul);
13999 rsqH2 = vec_madd(dH2x,dH2x,nul);
14000 rsqO = vec_madd(dOy,dOy,rsqO);
14001 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14002 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14003 rsqO = vec_madd(dOz,dOz,rsqO);
14004 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14005 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14006 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14007 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14009 rinvsqO = vec_madd(rinvO,rinvO,nul);
14010 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14011 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14012 tja = ntiA+2*type[jnra];
14013 tjb = ntiA+2*type[jnrb];
14014 /* load 2 j charges and multiply by iq */
14015 jq=load_2_float(charge+jnra,charge+jnrb);
14016 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
14017 qqO = vec_madd(iqO,jq,nul);
14018 qqH = vec_madd(iqH,jq,nul);
14019 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14020 vctot = vec_madd(qqO,rinvO,vctot);
14021 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14022 vctot = vec_madd(qqH,rinvH1,vctot);
14023 vctot = vec_madd(qqH,rinvH2,vctot);
14024 } else if(k<nj1) {
14025 jnra = jjnr[k];
14026 j3a = 3*jnra;
14027 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
14028 dOx = vec_sub(iOx,dH2x);
14029 dOy = vec_sub(iOy,dH2y);
14030 dOz = vec_sub(iOz,dH2z);
14031 dH1x = vec_sub(iH1x,dH2x);
14032 dH1y = vec_sub(iH1y,dH2y);
14033 dH1z = vec_sub(iH1z,dH2z);
14034 dH2x = vec_sub(iH2x,dH2x);
14035 dH2y = vec_sub(iH2y,dH2y);
14036 dH2z = vec_sub(iH2z,dH2z);
14038 rsqO = vec_madd(dOx,dOx,nul);
14039 rsqH1 = vec_madd(dH1x,dH1x,nul);
14040 rsqH2 = vec_madd(dH2x,dH2x,nul);
14041 rsqO = vec_madd(dOy,dOy,rsqO);
14042 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14043 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14044 rsqO = vec_madd(dOz,dOz,rsqO);
14045 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14046 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14047 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14048 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14050 rinvsqO = vec_madd(rinvO,rinvO,nul);
14051 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14052 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14053 tja = ntiA+2*type[jnra];
14054 /* load 1 j charges and multiply by iq */
14055 jq=load_1_float(charge+jnra);
14056 load_1_pair(nbfp+tja,&c6,&c12);
14057 qqO = vec_madd(iqO,jq,nul);
14058 qqH = vec_madd(iqH,jq,nul);
14059 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14060 vctot = vec_madd(qqO,rinvO,vctot);
14061 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14062 vctot = vec_madd(qqH,rinvH1,vctot);
14063 vctot = vec_madd(qqH,rinvH2,vctot);
14065 /* update outer data */
14066 add_vector_to_float(Vc+gid[n],vctot);
14067 add_vector_to_float(Vnb+gid[n],vnbtot);
14073 void mcinl2020_altivec(
14074 int nri,
14075 int iinr[],
14076 int jindex[],
14077 int jjnr[],
14078 int shift[],
14079 float shiftvec[],
14080 int gid[],
14081 float pos[],
14082 float charge[],
14083 float facel,
14084 float Vc[],
14085 float krf,
14086 float crf)
14088 vector float vkrf,vcrf;
14089 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
14090 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
14091 vector float vfacel,nul;
14092 vector float krsqO,krsqH1,krsqH2;
14093 vector float vctot,qqO,qqH,iqO,iqH,jq,vcoulO,vcoulH1,vcoulH2;
14094 vector float rinvO,rinvH1,rinvH2,rsqO,rsqH1,rsqH2;
14096 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
14097 int jnra,jnrb,jnrc,jnrd;
14098 int j3a,j3b,j3c,j3d;
14100 nul=vec_zero();
14101 vfacel=load_float_and_splat(&facel);
14102 vkrf=load_float_and_splat(&krf);
14103 vcrf=load_float_and_splat(&crf);
14105 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
14106 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
14108 for(n=0;n<nri;n++) {
14109 is3 = 3*shift[n];
14110 ii = iinr[n];
14111 ii3 = 3*ii;
14112 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
14113 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
14114 vctot = nul;
14115 nj0 = jindex[n];
14116 nj1 = jindex[n+1];
14118 for(k=nj0; k<(nj1-3); k+=4) {
14119 jnra = jjnr[k];
14120 jnrb = jjnr[k+1];
14121 jnrc = jjnr[k+2];
14122 jnrd = jjnr[k+3];
14123 j3a = 3*jnra;
14124 j3b = 3*jnrb;
14125 j3c = 3*jnrc;
14126 j3d = 3*jnrd;
14127 transpose_4_to_3(load_xyz(pos+j3a),
14128 load_xyz(pos+j3b),
14129 load_xyz(pos+j3c),
14130 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
14131 dOx = vec_sub(iOx,dH2x);
14132 dOy = vec_sub(iOy,dH2y);
14133 dOz = vec_sub(iOz,dH2z);
14134 dH1x = vec_sub(iH1x,dH2x);
14135 dH1y = vec_sub(iH1y,dH2y);
14136 dH1z = vec_sub(iH1z,dH2z);
14137 dH2x = vec_sub(iH2x,dH2x);
14138 dH2y = vec_sub(iH2y,dH2y);
14139 dH2z = vec_sub(iH2z,dH2z);
14141 rsqO = vec_madd(dOx,dOx,nul);
14142 rsqH1 = vec_madd(dH1x,dH1x,nul);
14143 rsqH2 = vec_madd(dH2x,dH2x,nul);
14144 rsqO = vec_madd(dOy,dOy,rsqO);
14145 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14146 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14147 rsqO = vec_madd(dOz,dOz,rsqO);
14148 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14149 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14150 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14151 /* load 4 j charges and multiply by iq */
14152 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
14153 qqO = vec_madd(iqO,jq,nul);
14154 qqH = vec_madd(iqH,jq,nul);
14155 krsqO = vec_madd(vkrf,rsqO,nul);
14156 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14157 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14158 vcoulO = vec_add(rinvO,krsqO);
14159 vcoulH1 = vec_add(rinvH1,krsqH1);
14160 vcoulH2 = vec_add(rinvH2,krsqH2);
14161 vcoulO = vec_sub(vcoulO,vcrf);
14162 vcoulH1 = vec_sub(vcoulH1,vcrf);
14163 vcoulH2 = vec_sub(vcoulH2,vcrf);
14164 vctot = vec_madd(qqO,vcoulO,vctot);
14165 vctot = vec_madd(qqH,vcoulH1,vctot);
14166 vctot = vec_madd(qqH,vcoulH2,vctot);
14168 if(k<(nj1-2)) {
14169 jnra = jjnr[k];
14170 jnrb = jjnr[k+1];
14171 jnrc = jjnr[k+2];
14172 j3a = 3*jnra;
14173 j3b = 3*jnrb;
14174 j3c = 3*jnrc;
14175 transpose_4_to_3(load_xyz(pos+j3a),
14176 load_xyz(pos+j3b),
14177 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
14178 dOx = vec_sub(iOx,dH2x);
14179 dOy = vec_sub(iOy,dH2y);
14180 dOz = vec_sub(iOz,dH2z);
14181 dH1x = vec_sub(iH1x,dH2x);
14182 dH1y = vec_sub(iH1y,dH2y);
14183 dH1z = vec_sub(iH1z,dH2z);
14184 dH2x = vec_sub(iH2x,dH2x);
14185 dH2y = vec_sub(iH2y,dH2y);
14186 dH2z = vec_sub(iH2z,dH2z);
14188 rsqO = vec_madd(dOx,dOx,nul);
14189 rsqH1 = vec_madd(dH1x,dH1x,nul);
14190 rsqH2 = vec_madd(dH2x,dH2x,nul);
14191 rsqO = vec_madd(dOy,dOy,rsqO);
14192 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14193 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14194 rsqO = vec_madd(dOz,dOz,rsqO);
14195 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14196 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14197 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14198 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14199 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14201 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
14202 /* load 3 j charges and multiply by iq */
14203 qqO = vec_madd(iqO,jq,nul);
14204 qqH = vec_madd(iqH,jq,nul);
14205 krsqO = vec_madd(vkrf,rsqO,nul);
14206 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14207 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14208 vcoulO = vec_add(rinvO,krsqO);
14209 vcoulH1 = vec_add(rinvH1,krsqH1);
14210 vcoulH2 = vec_add(rinvH2,krsqH2);
14211 vcoulO = vec_sub(vcoulO,vcrf);
14212 vcoulH1 = vec_sub(vcoulH1,vcrf);
14213 vcoulH2 = vec_sub(vcoulH2,vcrf);
14214 vctot = vec_madd(qqO,vcoulO,vctot);
14215 vctot = vec_madd(qqH,vcoulH1,vctot);
14216 vctot = vec_madd(qqH,vcoulH2,vctot);
14217 } else if(k<(nj1-1)) {
14218 jnra = jjnr[k];
14219 jnrb = jjnr[k+1];
14220 j3a = 3*jnra;
14221 j3b = 3*jnrb;
14222 transpose_2_to_3(load_xyz(pos+j3a),
14223 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
14224 dOx = vec_sub(iOx,dH2x);
14225 dOy = vec_sub(iOy,dH2y);
14226 dOz = vec_sub(iOz,dH2z);
14227 dH1x = vec_sub(iH1x,dH2x);
14228 dH1y = vec_sub(iH1y,dH2y);
14229 dH1z = vec_sub(iH1z,dH2z);
14230 dH2x = vec_sub(iH2x,dH2x);
14231 dH2y = vec_sub(iH2y,dH2y);
14232 dH2z = vec_sub(iH2z,dH2z);
14234 rsqO = vec_madd(dOx,dOx,nul);
14235 rsqH1 = vec_madd(dH1x,dH1x,nul);
14236 rsqH2 = vec_madd(dH2x,dH2x,nul);
14237 rsqO = vec_madd(dOy,dOy,rsqO);
14238 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14239 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14240 rsqO = vec_madd(dOz,dOz,rsqO);
14241 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14242 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14243 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14244 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14245 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14246 /* load 2 j charges and multiply by iq */
14247 jq=load_2_float(charge+jnra,charge+jnrb);
14248 qqO = vec_madd(iqO,jq,nul);
14249 qqH = vec_madd(iqH,jq,nul);
14250 krsqO = vec_madd(vkrf,rsqO,nul);
14251 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14252 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14253 vcoulO = vec_add(rinvO,krsqO);
14254 vcoulH1 = vec_add(rinvH1,krsqH1);
14255 vcoulH2 = vec_add(rinvH2,krsqH2);
14256 vcoulO = vec_sub(vcoulO,vcrf);
14257 vcoulH1 = vec_sub(vcoulH1,vcrf);
14258 vcoulH2 = vec_sub(vcoulH2,vcrf);
14259 vctot = vec_madd(qqO,vcoulO,vctot);
14260 vctot = vec_madd(qqH,vcoulH1,vctot);
14261 vctot = vec_madd(qqH,vcoulH2,vctot);
14262 } else if(k<nj1) {
14263 jnra = jjnr[k];
14264 j3a = 3*jnra;
14265 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
14266 dOx = vec_sub(iOx,dH2x);
14267 dOy = vec_sub(iOy,dH2y);
14268 dOz = vec_sub(iOz,dH2z);
14269 dH1x = vec_sub(iH1x,dH2x);
14270 dH1y = vec_sub(iH1y,dH2y);
14271 dH1z = vec_sub(iH1z,dH2z);
14272 dH2x = vec_sub(iH2x,dH2x);
14273 dH2y = vec_sub(iH2y,dH2y);
14274 dH2z = vec_sub(iH2z,dH2z);
14276 rsqO = vec_madd(dOx,dOx,nul);
14277 rsqH1 = vec_madd(dH1x,dH1x,nul);
14278 rsqH2 = vec_madd(dH2x,dH2x,nul);
14279 rsqO = vec_madd(dOy,dOy,rsqO);
14280 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14281 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14282 rsqO = vec_madd(dOz,dOz,rsqO);
14283 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14284 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14285 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14286 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14287 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14288 /* load 1 j charges and multiply by iq */
14289 jq=load_1_float(charge+jnra);
14290 qqO = vec_madd(iqO,jq,nul);
14291 qqH = vec_madd(iqH,jq,nul);
14292 krsqO = vec_madd(vkrf,rsqO,nul);
14293 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14294 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14295 vcoulO = vec_add(rinvO,krsqO);
14296 vcoulH1 = vec_add(rinvH1,krsqH1);
14297 vcoulH2 = vec_add(rinvH2,krsqH2);
14298 vcoulO = vec_sub(vcoulO,vcrf);
14299 vcoulH1 = vec_sub(vcoulH1,vcrf);
14300 vcoulH2 = vec_sub(vcoulH2,vcrf);
14301 vctot = vec_madd(qqO,vcoulO,vctot);
14302 vctot = vec_madd(qqH,vcoulH1,vctot);
14303 vctot = vec_madd(qqH,vcoulH2,vctot);
14305 /* update outer data */
14306 add_vector_to_float(Vc+gid[n],vctot);
14312 void mcinl2120_altivec(
14313 int nri,
14314 int iinr[],
14315 int jindex[],
14316 int jjnr[],
14317 int shift[],
14318 float shiftvec[],
14319 int gid[],
14320 float pos[],
14321 float charge[],
14322 float facel,
14323 float Vc[],
14324 float krf,
14325 float crf,
14326 int type[],
14327 int ntype,
14328 float nbfp[],
14329 float Vnb[])
14331 vector float vkrf,vcrf;
14332 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
14333 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
14334 vector float vfacel,nul,vcoulO,vcoulH1,vcoulH2;
14335 vector float vnbtot,c6,c12,rinvsix;
14336 vector float krsqO,krsqH1,krsqH2;
14337 vector float vctot,qqO,qqH,iqO,iqH,jq;
14338 vector float rinvO,rinvH1,rinvH2,rinvsqO,rsqO,rsqH1,rsqH2;
14340 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
14341 int jnra,jnrb,jnrc,jnrd;
14342 int j3a,j3b,j3c,j3d;
14343 int tja,tjb,tjc,tjd;
14345 nul=vec_zero();
14346 vfacel=load_float_and_splat(&facel);
14347 vkrf=load_float_and_splat(&krf);
14348 vcrf=load_float_and_splat(&crf);
14349 ii = iinr[0];
14350 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
14351 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
14352 ntiA = 2*ntype*type[ii];
14354 for(n=0;n<nri;n++) {
14355 is3 = 3*shift[n];
14356 ii = iinr[n];
14357 ii3 = 3*ii;
14358 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
14359 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
14360 vctot = nul;
14361 vnbtot = nul;
14362 nj0 = jindex[n];
14363 nj1 = jindex[n+1];
14365 for(k=nj0; k<(nj1-3); k+=4) {
14366 jnra = jjnr[k];
14367 jnrb = jjnr[k+1];
14368 jnrc = jjnr[k+2];
14369 jnrd = jjnr[k+3];
14370 j3a = 3*jnra;
14371 j3b = 3*jnrb;
14372 j3c = 3*jnrc;
14373 j3d = 3*jnrd;
14374 transpose_4_to_3(load_xyz(pos+j3a),
14375 load_xyz(pos+j3b),
14376 load_xyz(pos+j3c),
14377 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
14378 dOx = vec_sub(iOx,dH2x);
14379 dOy = vec_sub(iOy,dH2y);
14380 dOz = vec_sub(iOz,dH2z);
14381 dH1x = vec_sub(iH1x,dH2x);
14382 dH1y = vec_sub(iH1y,dH2y);
14383 dH1z = vec_sub(iH1z,dH2z);
14384 dH2x = vec_sub(iH2x,dH2x);
14385 dH2y = vec_sub(iH2y,dH2y);
14386 dH2z = vec_sub(iH2z,dH2z);
14388 rsqO = vec_madd(dOx,dOx,nul);
14389 rsqH1 = vec_madd(dH1x,dH1x,nul);
14390 rsqH2 = vec_madd(dH2x,dH2x,nul);
14391 rsqO = vec_madd(dOy,dOy,rsqO);
14392 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14393 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14394 rsqO = vec_madd(dOz,dOz,rsqO);
14395 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14396 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14397 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14398 rinvsqO = vec_madd(rinvO,rinvO,nul);
14399 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14400 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14401 tja = ntiA+2*type[jnra];
14402 tjb = ntiA+2*type[jnrb];
14403 tjc = ntiA+2*type[jnrc];
14404 tjd = ntiA+2*type[jnrd];
14405 /* load 4 j charges and multiply by iq */
14406 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
14407 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
14408 qqO = vec_madd(iqO,jq,nul);
14409 qqH = vec_madd(iqH,jq,nul);
14410 krsqO = vec_madd(vkrf,rsqO,nul);
14411 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14412 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14413 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14414 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14415 vcoulO = vec_add(rinvO,krsqO);
14416 vcoulH1 = vec_add(rinvH1,krsqH1);
14417 vcoulH2 = vec_add(rinvH2,krsqH2);
14418 vcoulO = vec_sub(vcoulO,vcrf);
14419 vcoulH1 = vec_sub(vcoulH1,vcrf);
14420 vcoulH2 = vec_sub(vcoulH2,vcrf);
14421 vctot = vec_madd(qqO,vcoulO,vctot);
14422 vctot = vec_madd(qqH,vcoulH1,vctot);
14423 vctot = vec_madd(qqH,vcoulH2,vctot);
14425 if(k<(nj1-2)) {
14426 jnra = jjnr[k];
14427 jnrb = jjnr[k+1];
14428 jnrc = jjnr[k+2];
14429 j3a = 3*jnra;
14430 j3b = 3*jnrb;
14431 j3c = 3*jnrc;
14432 transpose_4_to_3(load_xyz(pos+j3a),
14433 load_xyz(pos+j3b),
14434 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
14435 dOx = vec_sub(iOx,dH2x);
14436 dOy = vec_sub(iOy,dH2y);
14437 dOz = vec_sub(iOz,dH2z);
14438 dH1x = vec_sub(iH1x,dH2x);
14439 dH1y = vec_sub(iH1y,dH2y);
14440 dH1z = vec_sub(iH1z,dH2z);
14441 dH2x = vec_sub(iH2x,dH2x);
14442 dH2y = vec_sub(iH2y,dH2y);
14443 dH2z = vec_sub(iH2z,dH2z);
14445 rsqO = vec_madd(dOx,dOx,nul);
14446 rsqH1 = vec_madd(dH1x,dH1x,nul);
14447 rsqH2 = vec_madd(dH2x,dH2x,nul);
14448 rsqO = vec_madd(dOy,dOy,rsqO);
14449 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14450 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14451 rsqO = vec_madd(dOz,dOz,rsqO);
14452 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14453 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14454 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14455 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14456 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14457 rinvsqO = vec_madd(rinvO,rinvO,nul);
14458 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14459 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14460 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
14461 tja = ntiA+2*type[jnra];
14462 tjb = ntiA+2*type[jnrb];
14463 tjc = ntiA+2*type[jnrc];
14464 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
14465 qqO = vec_madd(iqO,jq,nul);
14466 qqH = vec_madd(iqH,jq,nul);
14467 krsqO = vec_madd(vkrf,rsqO,nul);
14468 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14469 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14470 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14471 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14472 vcoulO = vec_add(rinvO,krsqO);
14473 vcoulH1 = vec_add(rinvH1,krsqH1);
14474 vcoulH2 = vec_add(rinvH2,krsqH2);
14475 vcoulO = vec_sub(vcoulO,vcrf);
14476 vcoulH1 = vec_sub(vcoulH1,vcrf);
14477 vcoulH2 = vec_sub(vcoulH2,vcrf);
14478 vctot = vec_madd(qqO,vcoulO,vctot);
14479 vctot = vec_madd(qqH,vcoulH1,vctot);
14480 vctot = vec_madd(qqH,vcoulH2,vctot);
14481 } else if(k<(nj1-1)) {
14482 jnra = jjnr[k];
14483 jnrb = jjnr[k+1];
14484 j3a = 3*jnra;
14485 j3b = 3*jnrb;
14486 transpose_2_to_3(load_xyz(pos+j3a),
14487 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
14488 dOx = vec_sub(iOx,dH2x);
14489 dOy = vec_sub(iOy,dH2y);
14490 dOz = vec_sub(iOz,dH2z);
14491 dH1x = vec_sub(iH1x,dH2x);
14492 dH1y = vec_sub(iH1y,dH2y);
14493 dH1z = vec_sub(iH1z,dH2z);
14494 dH2x = vec_sub(iH2x,dH2x);
14495 dH2y = vec_sub(iH2y,dH2y);
14496 dH2z = vec_sub(iH2z,dH2z);
14498 rsqO = vec_madd(dOx,dOx,nul);
14499 rsqH1 = vec_madd(dH1x,dH1x,nul);
14500 rsqH2 = vec_madd(dH2x,dH2x,nul);
14501 rsqO = vec_madd(dOy,dOy,rsqO);
14502 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14503 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14504 rsqO = vec_madd(dOz,dOz,rsqO);
14505 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14506 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14507 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14508 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14509 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14510 rinvsqO = vec_madd(rinvO,rinvO,nul);
14511 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14512 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14513 tja = ntiA+2*type[jnra];
14514 tjb = ntiA+2*type[jnrb];
14515 /* load 2 j charges and multiply by iq */
14516 jq=load_2_float(charge+jnra,charge+jnrb);
14517 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
14518 qqO = vec_madd(iqO,jq,nul);
14519 qqH = vec_madd(iqH,jq,nul);
14520 krsqO = vec_madd(vkrf,rsqO,nul);
14521 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14522 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14523 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14524 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14525 vcoulO = vec_add(rinvO,krsqO);
14526 vcoulH1 = vec_add(rinvH1,krsqH1);
14527 vcoulH2 = vec_add(rinvH2,krsqH2);
14528 vcoulO = vec_sub(vcoulO,vcrf);
14529 vcoulH1 = vec_sub(vcoulH1,vcrf);
14530 vcoulH2 = vec_sub(vcoulH2,vcrf);
14531 vctot = vec_madd(qqO,vcoulO,vctot);
14532 vctot = vec_madd(qqH,vcoulH1,vctot);
14533 vctot = vec_madd(qqH,vcoulH2,vctot);
14534 } else if(k<nj1) {
14535 jnra = jjnr[k];
14536 j3a = 3*jnra;
14537 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
14538 dOx = vec_sub(iOx,dH2x);
14539 dOy = vec_sub(iOy,dH2y);
14540 dOz = vec_sub(iOz,dH2z);
14541 dH1x = vec_sub(iH1x,dH2x);
14542 dH1y = vec_sub(iH1y,dH2y);
14543 dH1z = vec_sub(iH1z,dH2z);
14544 dH2x = vec_sub(iH2x,dH2x);
14545 dH2y = vec_sub(iH2y,dH2y);
14546 dH2z = vec_sub(iH2z,dH2z);
14548 rsqO = vec_madd(dOx,dOx,nul);
14549 rsqH1 = vec_madd(dH1x,dH1x,nul);
14550 rsqH2 = vec_madd(dH2x,dH2x,nul);
14551 rsqO = vec_madd(dOy,dOy,rsqO);
14552 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14553 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14554 rsqO = vec_madd(dOz,dOz,rsqO);
14555 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14556 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14557 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14558 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14559 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14560 rinvsqO = vec_madd(rinvO,rinvO,nul);
14561 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14562 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14563 tja = ntiA+2*type[jnra];
14564 /* load 1 j charges and multiply by iq */
14565 jq=load_1_float(charge+jnra);
14566 load_1_pair(nbfp+tja,&c6,&c12);
14567 qqO = vec_madd(iqO,jq,nul);
14568 qqH = vec_madd(iqH,jq,nul);
14569 krsqO = vec_madd(vkrf,rsqO,nul);
14570 krsqH1 = vec_madd(vkrf,rsqH1,nul);
14571 krsqH2 = vec_madd(vkrf,rsqH2,nul);
14572 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14573 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14574 vcoulO = vec_add(rinvO,krsqO);
14575 vcoulH1 = vec_add(rinvH1,krsqH1);
14576 vcoulH2 = vec_add(rinvH2,krsqH2);
14577 vcoulO = vec_sub(vcoulO,vcrf);
14578 vcoulH1 = vec_sub(vcoulH1,vcrf);
14579 vcoulH2 = vec_sub(vcoulH2,vcrf);
14580 vctot = vec_madd(qqO,vcoulO,vctot);
14581 vctot = vec_madd(qqH,vcoulH1,vctot);
14582 vctot = vec_madd(qqH,vcoulH2,vctot);
14584 /* update outer data */
14585 add_vector_to_float(Vc+gid[n],vctot);
14586 add_vector_to_float(Vnb+gid[n],vnbtot);
14592 void mcinl3020_altivec(
14593 int nri,
14594 int iinr[],
14595 int jindex[],
14596 int jjnr[],
14597 int shift[],
14598 float shiftvec[],
14599 int gid[],
14600 float pos[],
14601 float charge[],
14602 float facel,
14603 float Vc[],
14604 float tabscale,
14605 float VFtab[])
14607 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
14608 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
14609 vector float vfacel,nul;
14610 vector float tsc,VVcO,VVcH1,VVcH2;
14611 vector float vctot,qqO,qqH,iqO,iqH,jq;
14612 vector float rinvO,rinvH1,rinvH2,rO,rH1,rH2,rsqO,rsqH1,rsqH2;
14614 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
14615 int jnra,jnrb,jnrc,jnrd;
14616 int j3a,j3b,j3c,j3d;
14618 nul=vec_zero();
14619 vfacel=load_float_and_splat(&facel);
14620 tsc=load_float_and_splat(&tabscale);
14621 iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul);
14622 iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul);
14624 for(n=0;n<nri;n++) {
14625 is3 = 3*shift[n];
14626 ii = iinr[n];
14627 ii3 = 3*ii;
14628 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
14629 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
14630 vctot = nul;
14631 nj0 = jindex[n];
14632 nj1 = jindex[n+1];
14634 for(k=nj0; k<(nj1-3); k+=4) {
14635 jnra = jjnr[k];
14636 jnrb = jjnr[k+1];
14637 jnrc = jjnr[k+2];
14638 jnrd = jjnr[k+3];
14639 j3a = 3*jnra;
14640 j3b = 3*jnrb;
14641 j3c = 3*jnrc;
14642 j3d = 3*jnrd;
14643 transpose_4_to_3(load_xyz(pos+j3a),
14644 load_xyz(pos+j3b),
14645 load_xyz(pos+j3c),
14646 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
14647 dOx = vec_sub(iOx,dH2x);
14648 dOy = vec_sub(iOy,dH2y);
14649 dOz = vec_sub(iOz,dH2z);
14650 dH1x = vec_sub(iH1x,dH2x);
14651 dH1y = vec_sub(iH1y,dH2y);
14652 dH1z = vec_sub(iH1z,dH2z);
14653 dH2x = vec_sub(iH2x,dH2x);
14654 dH2y = vec_sub(iH2y,dH2y);
14655 dH2z = vec_sub(iH2z,dH2z);
14657 rsqO = vec_madd(dOx,dOx,nul);
14658 rsqH1 = vec_madd(dH1x,dH1x,nul);
14659 rsqH2 = vec_madd(dH2x,dH2x,nul);
14660 rsqO = vec_madd(dOy,dOy,rsqO);
14661 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14662 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14663 rsqO = vec_madd(dOz,dOz,rsqO);
14664 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14665 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14666 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14667 rO = vec_madd(rsqO,rinvO,nul);
14668 rH1 = vec_madd(rsqH1,rinvH1,nul);
14669 rH2 = vec_madd(rsqH2,rinvH2,nul);
14671 /* load 4 j charges and multiply by iq */
14672 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
14673 do_vonly_4_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14674 do_vonly_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14675 do_vonly_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14676 qqO = vec_madd(iqO,jq,nul);
14677 qqH = vec_madd(iqH,jq,nul);
14678 vctot = vec_madd(qqO,VVcO,vctot);
14679 vctot = vec_madd(qqH,VVcH1,vctot);
14680 vctot = vec_madd(qqH,VVcH2,vctot);
14682 if(k<(nj1-2)) {
14683 jnra = jjnr[k];
14684 jnrb = jjnr[k+1];
14685 jnrc = jjnr[k+2];
14686 j3a = 3*jnra;
14687 j3b = 3*jnrb;
14688 j3c = 3*jnrc;
14689 transpose_4_to_3(load_xyz(pos+j3a),
14690 load_xyz(pos+j3b),
14691 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
14692 dOx = vec_sub(iOx,dH2x);
14693 dOy = vec_sub(iOy,dH2y);
14694 dOz = vec_sub(iOz,dH2z);
14695 dH1x = vec_sub(iH1x,dH2x);
14696 dH1y = vec_sub(iH1y,dH2y);
14697 dH1z = vec_sub(iH1z,dH2z);
14698 dH2x = vec_sub(iH2x,dH2x);
14699 dH2y = vec_sub(iH2y,dH2y);
14700 dH2z = vec_sub(iH2z,dH2z);
14702 rsqO = vec_madd(dOx,dOx,nul);
14703 rsqH1 = vec_madd(dH1x,dH1x,nul);
14704 rsqH2 = vec_madd(dH2x,dH2x,nul);
14705 rsqO = vec_madd(dOy,dOy,rsqO);
14706 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14707 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14708 rsqO = vec_madd(dOz,dOz,rsqO);
14709 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14710 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14711 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14712 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14713 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14714 rO = vec_madd(rsqO,rinvO,nul);
14715 rH1 = vec_madd(rsqH1,rinvH1,nul);
14716 rH2 = vec_madd(rsqH2,rinvH2,nul);
14718 /* load 3 j charges and multiply by iq */
14719 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
14720 do_vonly_3_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14721 do_vonly_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14722 do_vonly_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14723 qqO = vec_madd(iqO,jq,nul);
14724 qqH = vec_madd(iqH,jq,nul);
14725 vctot = vec_madd(qqO,VVcO,vctot);
14726 vctot = vec_madd(qqH,VVcH1,vctot);
14727 vctot = vec_madd(qqH,VVcH2,vctot);
14728 } else if(k<(nj1-1)) {
14729 jnra = jjnr[k];
14730 jnrb = jjnr[k+1];
14731 j3a = 3*jnra;
14732 j3b = 3*jnrb;
14733 transpose_2_to_3(load_xyz(pos+j3a),
14734 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
14735 dOx = vec_sub(iOx,dH2x);
14736 dOy = vec_sub(iOy,dH2y);
14737 dOz = vec_sub(iOz,dH2z);
14738 dH1x = vec_sub(iH1x,dH2x);
14739 dH1y = vec_sub(iH1y,dH2y);
14740 dH1z = vec_sub(iH1z,dH2z);
14741 dH2x = vec_sub(iH2x,dH2x);
14742 dH2y = vec_sub(iH2y,dH2y);
14743 dH2z = vec_sub(iH2z,dH2z);
14745 rsqO = vec_madd(dOx,dOx,nul);
14746 rsqH1 = vec_madd(dH1x,dH1x,nul);
14747 rsqH2 = vec_madd(dH2x,dH2x,nul);
14748 rsqO = vec_madd(dOy,dOy,rsqO);
14749 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14750 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14751 rsqO = vec_madd(dOz,dOz,rsqO);
14752 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14753 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14754 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14755 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14756 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14757 rO = vec_madd(rsqO,rinvO,nul);
14758 rH1 = vec_madd(rsqH1,rinvH1,nul);
14759 rH2 = vec_madd(rsqH2,rinvH2,nul);
14761 /* load 2 j charges and multiply by iq */
14762 jq=load_2_float(charge+jnra,charge+jnrb);
14763 do_vonly_2_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14764 do_vonly_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14765 do_vonly_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14766 qqO = vec_madd(iqO,jq,nul);
14767 qqH = vec_madd(iqH,jq,nul);
14768 vctot = vec_madd(qqO,VVcO,vctot);
14769 vctot = vec_madd(qqH,VVcH1,vctot);
14770 vctot = vec_madd(qqH,VVcH2,vctot);
14771 } else if(k<nj1) {
14772 jnra = jjnr[k];
14773 j3a = 3*jnra;
14774 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
14775 dOx = vec_sub(iOx,dH2x);
14776 dOy = vec_sub(iOy,dH2y);
14777 dOz = vec_sub(iOz,dH2z);
14778 dH1x = vec_sub(iH1x,dH2x);
14779 dH1y = vec_sub(iH1y,dH2y);
14780 dH1z = vec_sub(iH1z,dH2z);
14781 dH2x = vec_sub(iH2x,dH2x);
14782 dH2y = vec_sub(iH2y,dH2y);
14783 dH2z = vec_sub(iH2z,dH2z);
14785 rsqO = vec_madd(dOx,dOx,nul);
14786 rsqH1 = vec_madd(dH1x,dH1x,nul);
14787 rsqH2 = vec_madd(dH2x,dH2x,nul);
14788 rsqO = vec_madd(dOy,dOy,rsqO);
14789 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14790 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14791 rsqO = vec_madd(dOz,dOz,rsqO);
14792 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14793 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14794 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14795 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14796 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14797 rO = vec_madd(rsqO,rinvO,nul);
14798 rH1 = vec_madd(rsqH1,rinvH1,nul);
14799 rH2 = vec_madd(rsqH2,rinvH2,nul);
14801 /* load 1 j charges and multiply by iq */
14802 jq=load_1_float(charge+jnra);
14803 do_vonly_1_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14804 do_vonly_1_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14805 do_vonly_1_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14806 qqO = vec_madd(iqO,jq,nul);
14807 qqH = vec_madd(iqH,jq,nul);
14808 vctot = vec_madd(qqO,VVcO,vctot);
14809 vctot = vec_madd(qqH,VVcH1,vctot);
14810 vctot = vec_madd(qqH,VVcH2,vctot);
14812 /* update outer data */
14813 add_vector_to_float(Vc+gid[n],vctot);
14819 void mcinl3120_altivec(
14820 int nri,
14821 int iinr[],
14822 int jindex[],
14823 int jjnr[],
14824 int shift[],
14825 float shiftvec[],
14826 int gid[],
14827 float pos[],
14828 float charge[],
14829 float facel,
14830 float Vc[],
14831 int type[],
14832 int ntype,
14833 float nbfp[],
14834 float Vnb[],
14835 float tabscale,
14836 float VFtab[])
14838 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
14839 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
14840 vector float vfacel,nul;
14841 vector float vnbtot,c6,c12,rinvsix,rinvsqO;
14842 vector float tsc,VVcO,VVcH1,VVcH2;
14843 vector float vctot,qqO,qqH,iqO,iqH,jq;
14844 vector float rinvO,rinvH1,rinvH2,rO,rH1,rH2,rsqO,rsqH1,rsqH2;
14846 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
14847 int jnra,jnrb,jnrc,jnrd;
14848 int j3a,j3b,j3c,j3d;
14849 int tja,tjb,tjc,tjd;
14851 nul=vec_zero();
14852 vfacel=load_float_and_splat(&facel);
14853 tsc=load_float_and_splat(&tabscale);
14854 ii = iinr[0];
14855 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
14856 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
14857 ntiA = 2*ntype*type[ii];
14859 for(n=0;n<nri;n++) {
14860 is3 = 3*shift[n];
14861 ii = iinr[n];
14862 ii3 = 3*ii;
14863 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
14864 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
14865 vctot = nul;
14866 vnbtot = nul;
14867 nj0 = jindex[n];
14868 nj1 = jindex[n+1];
14870 for(k=nj0; k<(nj1-3); k+=4) {
14871 jnra = jjnr[k];
14872 jnrb = jjnr[k+1];
14873 jnrc = jjnr[k+2];
14874 jnrd = jjnr[k+3];
14875 j3a = 3*jnra;
14876 j3b = 3*jnrb;
14877 j3c = 3*jnrc;
14878 j3d = 3*jnrd;
14879 transpose_4_to_3(load_xyz(pos+j3a),
14880 load_xyz(pos+j3b),
14881 load_xyz(pos+j3c),
14882 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
14883 dOx = vec_sub(iOx,dH2x);
14884 dOy = vec_sub(iOy,dH2y);
14885 dOz = vec_sub(iOz,dH2z);
14886 dH1x = vec_sub(iH1x,dH2x);
14887 dH1y = vec_sub(iH1y,dH2y);
14888 dH1z = vec_sub(iH1z,dH2z);
14889 dH2x = vec_sub(iH2x,dH2x);
14890 dH2y = vec_sub(iH2y,dH2y);
14891 dH2z = vec_sub(iH2z,dH2z);
14893 rsqO = vec_madd(dOx,dOx,nul);
14894 rsqH1 = vec_madd(dH1x,dH1x,nul);
14895 rsqH2 = vec_madd(dH2x,dH2x,nul);
14896 rsqO = vec_madd(dOy,dOy,rsqO);
14897 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14898 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14899 rsqO = vec_madd(dOz,dOz,rsqO);
14900 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14901 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14902 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14903 rinvsqO = vec_madd(rinvO,rinvO,nul);
14904 rO = vec_madd(rsqO,rinvO,nul);
14905 rH1 = vec_madd(rsqH1,rinvH1,nul);
14906 rH2 = vec_madd(rsqH2,rinvH2,nul);
14907 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14908 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14909 tja = ntiA+2*type[jnra];
14910 tjb = ntiA+2*type[jnrb];
14911 tjc = ntiA+2*type[jnrc];
14912 tjd = ntiA+2*type[jnrd];
14913 /* load 4 j charges and multiply by iq */
14914 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
14915 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
14916 do_vonly_4_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14917 do_vonly_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14918 do_vonly_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14919 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14920 qqO = vec_madd(iqO,jq,nul);
14921 qqH = vec_madd(iqH,jq,nul);
14922 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14923 vctot = vec_madd(qqO,VVcO,vctot);
14924 vctot = vec_madd(qqH,VVcH1,vctot);
14925 vctot = vec_madd(qqH,VVcH2,vctot);
14927 if(k<(nj1-2)) {
14928 jnra = jjnr[k];
14929 jnrb = jjnr[k+1];
14930 jnrc = jjnr[k+2];
14931 j3a = 3*jnra;
14932 j3b = 3*jnrb;
14933 j3c = 3*jnrc;
14934 transpose_4_to_3(load_xyz(pos+j3a),
14935 load_xyz(pos+j3b),
14936 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
14937 dOx = vec_sub(iOx,dH2x);
14938 dOy = vec_sub(iOy,dH2y);
14939 dOz = vec_sub(iOz,dH2z);
14940 dH1x = vec_sub(iH1x,dH2x);
14941 dH1y = vec_sub(iH1y,dH2y);
14942 dH1z = vec_sub(iH1z,dH2z);
14943 dH2x = vec_sub(iH2x,dH2x);
14944 dH2y = vec_sub(iH2y,dH2y);
14945 dH2z = vec_sub(iH2z,dH2z);
14947 rsqO = vec_madd(dOx,dOx,nul);
14948 rsqH1 = vec_madd(dH1x,dH1x,nul);
14949 rsqH2 = vec_madd(dH2x,dH2x,nul);
14950 rsqO = vec_madd(dOy,dOy,rsqO);
14951 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
14952 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
14953 rsqO = vec_madd(dOz,dOz,rsqO);
14954 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
14955 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
14956 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
14957 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
14958 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
14959 rinvsqO = vec_madd(rinvO,rinvO,nul);
14960 rO = vec_madd(rsqO,rinvO,nul);
14961 rH1 = vec_madd(rsqH1,rinvH1,nul);
14962 rH2 = vec_madd(rsqH2,rinvH2,nul);
14963 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
14964 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
14965 tja = ntiA+2*type[jnra];
14966 tjb = ntiA+2*type[jnrb];
14967 tjc = ntiA+2*type[jnrc];
14968 /* load 3 j charges and multiply by iq */
14969 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
14970 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
14971 do_vonly_3_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
14972 do_vonly_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
14973 do_vonly_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
14974 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
14975 qqO = vec_madd(iqO,jq,nul);
14976 qqH = vec_madd(iqH,jq,nul);
14977 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
14978 vctot = vec_madd(qqO,VVcO,vctot);
14979 vctot = vec_madd(qqH,VVcH1,vctot);
14980 vctot = vec_madd(qqH,VVcH2,vctot);
14981 } else if(k<(nj1-1)) {
14982 jnra = jjnr[k];
14983 jnrb = jjnr[k+1];
14984 j3a = 3*jnra;
14985 j3b = 3*jnrb;
14986 transpose_2_to_3(load_xyz(pos+j3a),
14987 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
14988 dOx = vec_sub(iOx,dH2x);
14989 dOy = vec_sub(iOy,dH2y);
14990 dOz = vec_sub(iOz,dH2z);
14991 dH1x = vec_sub(iH1x,dH2x);
14992 dH1y = vec_sub(iH1y,dH2y);
14993 dH1z = vec_sub(iH1z,dH2z);
14994 dH2x = vec_sub(iH2x,dH2x);
14995 dH2y = vec_sub(iH2y,dH2y);
14996 dH2z = vec_sub(iH2z,dH2z);
14998 rsqO = vec_madd(dOx,dOx,nul);
14999 rsqH1 = vec_madd(dH1x,dH1x,nul);
15000 rsqH2 = vec_madd(dH2x,dH2x,nul);
15001 rsqO = vec_madd(dOy,dOy,rsqO);
15002 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15003 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15004 rsqO = vec_madd(dOz,dOz,rsqO);
15005 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15006 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
15007 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
15008 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15009 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
15010 rinvsqO = vec_madd(rinvO,rinvO,nul);
15011 rO = vec_madd(rsqO,rinvO,nul);
15012 rH1 = vec_madd(rsqH1,rinvH1,nul);
15013 rH2 = vec_madd(rsqH2,rinvH2,nul);
15014 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
15015 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
15016 tja = ntiA+2*type[jnra];
15017 tjb = ntiA+2*type[jnrb];
15018 /* load 2 j charges and multiply by iq */
15019 jq=load_2_float(charge+jnra,charge+jnrb);
15020 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
15021 do_vonly_2_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
15022 do_vonly_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15023 do_vonly_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
15024 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
15025 qqO = vec_madd(iqO,jq,nul);
15026 qqH = vec_madd(iqH,jq,nul);
15027 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
15028 vctot = vec_madd(qqO,VVcO,vctot);
15029 vctot = vec_madd(qqH,VVcH1,vctot);
15030 vctot = vec_madd(qqH,VVcH2,vctot);
15031 } else if(k<nj1) {
15032 jnra = jjnr[k];
15033 j3a = 3*jnra;
15034 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
15035 dOx = vec_sub(iOx,dH2x);
15036 dOy = vec_sub(iOy,dH2y);
15037 dOz = vec_sub(iOz,dH2z);
15038 dH1x = vec_sub(iH1x,dH2x);
15039 dH1y = vec_sub(iH1y,dH2y);
15040 dH1z = vec_sub(iH1z,dH2z);
15041 dH2x = vec_sub(iH2x,dH2x);
15042 dH2y = vec_sub(iH2y,dH2y);
15043 dH2z = vec_sub(iH2z,dH2z);
15045 rsqO = vec_madd(dOx,dOx,nul);
15046 rsqH1 = vec_madd(dH1x,dH1x,nul);
15047 rsqH2 = vec_madd(dH2x,dH2x,nul);
15048 rsqO = vec_madd(dOy,dOy,rsqO);
15049 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15050 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15051 rsqO = vec_madd(dOz,dOz,rsqO);
15052 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15053 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
15054 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
15055 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15056 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
15057 rinvsqO = vec_madd(rinvO,rinvO,nul);
15058 rO = vec_madd(rsqO,rinvO,nul);
15059 rH1 = vec_madd(rsqH1,rinvH1,nul);
15060 rH2 = vec_madd(rsqH2,rinvH2,nul);
15061 rinvsix = vec_madd(rinvsqO,rinvsqO,nul);
15062 rinvsix = vec_madd(rinvsix,rinvsqO,nul);
15063 tja = ntiA+2*type[jnra];
15064 /* load 1 j charges and multiply by iq */
15065 jq=load_1_float(charge+jnra);
15066 load_1_pair(nbfp+tja,&c6,&c12);
15067 do_vonly_1_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO);
15068 do_vonly_1_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15069 do_vonly_1_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
15070 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
15071 qqO = vec_madd(iqO,jq,nul);
15072 qqH = vec_madd(iqH,jq,nul);
15073 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
15074 vctot = vec_madd(qqO,VVcO,vctot);
15075 vctot = vec_madd(qqH,VVcH1,vctot);
15076 vctot = vec_madd(qqH,VVcH2,vctot);
15078 /* update outer data */
15079 add_vector_to_float(Vc+gid[n],vctot);
15080 add_vector_to_float(Vnb+gid[n],vnbtot);
/*
 * mcinl3320_altivec - energy-only AltiVec inner loop for a three-site i
 * molecule (sites named O, H1, H2 in the locals) against single j atoms,
 * using tabulated interactions read from VFtab.
 *
 * For every outer entry n the routine accumulates two energies and adds
 * them to Vc[gid[n]] and Vnb[gid[n]]; no force array is passed in or
 * written.
 *
 *   nri       number of outer (i) entries
 *   iinr      iinr[n] is the atom index of the first site of i entry n
 *   jindex    j entries of n are jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]
 *   jjnr      j atom indices
 *   shift     shift[n]*3 is the offset into shiftvec for entry n
 *   shiftvec  shift vectors, passed to the i-site load helper
 *   gid       gid[n] selects the Vc/Vnb slot that receives the energies
 *   pos       coordinates, indexed as 3*atom
 *   charge    per-atom charges
 *   facel     factor multiplied into the i charges
 *   Vc        output: vctot is added at index gid[n]
 *   type      per-atom type index
 *   ntype     number of types (used for the nbfp row offset)
 *   nbfp      (c6,c12) pairs, read at 2*ntype*type[i] + 2*type[j]
 *   Vnb       output: vnbtot is added at index gid[n]
 *   tabscale  factor applied to r before the table lookup
 *   VFtab     table handed to the do_vonly_*_ljctable_* helpers
 *
 * NOTE(review): k0 is declared but never used in this function.
 */
15085 void mcinl3320_altivec(
15086 int nri,
15087 int iinr[],
15088 int jindex[],
15089 int jjnr[],
15090 int shift[],
15091 float shiftvec[],
15092 int gid[],
15093 float pos[],
15094 float charge[],
15095 float facel,
15096 float Vc[],
15097 int type[],
15098 int ntype,
15099 float nbfp[],
15100 float Vnb[],
15101 float tabscale,
15102 float VFtab[])
15104 vector float tsc;
15105 vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
15106 vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
15107 vector float vfacel,nul;
15108 vector float vnbtot,c6,c12;
15109 vector float vctot,qqO,qqH,iqO,iqH,jq;
15110 vector float rinvO,rinvH1,rinvH2,rsqO,rsqH1,rsqH2;
15111 vector float rO,rH1,rH2,VVcO,VVcH1,VVcH2,VVd,VVr;
15113 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
15114 int jnra,jnrb,jnrc,jnrd;
15115 int j3a,j3b,j3c,j3d;
15116 int tja,tjb,tjc,tjd;
/* Constants: a zero vector, plus tabscale and facel loaded through load_float_and_splat. */
15118 nul=vec_zero();
15119 tsc=load_float_and_splat(&tabscale);
15120 vfacel=load_float_and_splat(&facel);
/* The i charges (times facel) and the nbfp row offset are taken from the
 * first i entry only and reused for every n.
 * NOTE(review): presumably every i molecule has the same charges and type -
 * confirm with the caller. iinr[0] is read before nri is checked. */
15122 ii = iinr[0];
15123 iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
15124 iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
15125 ntiA = 2*ntype*type[ii];
/* Outer loop over i entries. */
15127 for(n=0;n<nri;n++) {
15128 is3 = 3*shift[n];
15129 ii = iinr[n];
15130 ii3 = 3*ii;
/* Load the three i sites; the helper receives the shift vector for this
 * entry (going by its name it applies the shift and splats each component). */
15131 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
15132 &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
/* Reset the per-entry energy accumulators and fetch the j range. */
15133 vctot = nul;
15134 vnbtot = nul;
15135 nj0 = jindex[n];
15136 nj1 = jindex[n+1];
/* Main inner loop: four j atoms per iteration. */
15138 for(k=nj0; k<(nj1-3); k+=4) {
15139 jnra = jjnr[k];
15140 jnrb = jjnr[k+1];
15141 jnrc = jjnr[k+2];
15142 jnrd = jjnr[k+3];
15143 j3a = 3*jnra;
15144 j3b = 3*jnrb;
15145 j3c = 3*jnrc;
15146 j3d = 3*jnrd;
/* The j coordinates are gathered into dH2x/y/z, which serve as scratch here. */
15147 transpose_4_to_3(load_xyz(pos+j3a),
15148 load_xyz(pos+j3b),
15149 load_xyz(pos+j3c),
15150 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
/* i-site minus j differences; dH2* is overwritten last because it still
 * holds the j coordinates until then. */
15151 dOx = vec_sub(iOx,dH2x);
15152 dOy = vec_sub(iOy,dH2y);
15153 dOz = vec_sub(iOz,dH2z);
15154 dH1x = vec_sub(iH1x,dH2x);
15155 dH1y = vec_sub(iH1y,dH2y);
15156 dH1z = vec_sub(iH1z,dH2z);
15157 dH2x = vec_sub(iH2x,dH2x);
15158 dH2y = vec_sub(iH2y,dH2y);
15159 dH2z = vec_sub(iH2z,dH2z);
/* Squared distances, built up one component at a time. */
15161 rsqO = vec_madd(dOx,dOx,nul);
15162 rsqH1 = vec_madd(dH1x,dH1x,nul);
15163 rsqH2 = vec_madd(dH2x,dH2x,nul);
15164 rsqO = vec_madd(dOy,dOy,rsqO);
15165 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15166 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15167 rsqO = vec_madd(dOz,dOz,rsqO);
15168 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15169 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
/* rinv comes from the invsqrt helper; r is then formed as rsq*rinv. */
15170 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15171 rO = vec_madd(rsqO,rinvO,nul);
15172 rH1 = vec_madd(rsqH1,rinvH1,nul);
15173 rH2 = vec_madd(rsqH2,rinvH2,nul);
/* nbfp offsets for the four j atoms. */
15174 tja = ntiA+2*type[jnra];
15175 tjb = ntiA+2*type[jnrb];
15176 tjc = ntiA+2*type[jnrc];
15177 tjd = ntiA+2*type[jnrd];
15178 /* load 4 j charges and multiply by iq */
15179 jq=load_4_float(charge+jnra,charge+jnrb,charge+jnrc,charge+jnrd);
15180 load_4_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,nbfp+tjd,&c6,&c12);
/* Table lookups at r*tabscale: the O site yields a Coulomb value plus VVd
 * and VVr, the H sites a Coulomb value only. */
15181 do_vonly_4_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&VVd,&VVr);
15182 do_vonly_4_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15183 do_vonly_4_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
/* vctot += qq*VVc for the three sites; vnbtot += c6*VVd + c12*VVr. */
15184 qqO = vec_madd(iqO,jq,nul);
15185 qqH = vec_madd(iqH,jq,nul);
15186 vctot = vec_madd(qqO,VVcO,vctot);
15187 vnbtot = vec_madd(c6,VVd,vnbtot);
15188 vctot = vec_madd(qqH,VVcH1,vctot);
15189 vnbtot = vec_madd(c12,VVr,vnbtot);
15190 vctot = vec_madd(qqH,VVcH2,vctot);
/* Remainder handling for the j entries left over after the 4-wide loop.
 * Three left: same computation, with nul as the fourth transpose input.
 * zero_highest_element_in_3_vectors is applied to rsq and again to rinv
 * (presumably to clear the unused lane - confirm in the helper). */
15192 if(k<(nj1-2)) {
15193 jnra = jjnr[k];
15194 jnrb = jjnr[k+1];
15195 jnrc = jjnr[k+2];
15196 j3a = 3*jnra;
15197 j3b = 3*jnrb;
15198 j3c = 3*jnrc;
15199 transpose_4_to_3(load_xyz(pos+j3a),
15200 load_xyz(pos+j3b),
15201 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
15202 dOx = vec_sub(iOx,dH2x);
15203 dOy = vec_sub(iOy,dH2y);
15204 dOz = vec_sub(iOz,dH2z);
15205 dH1x = vec_sub(iH1x,dH2x);
15206 dH1y = vec_sub(iH1y,dH2y);
15207 dH1z = vec_sub(iH1z,dH2z);
15208 dH2x = vec_sub(iH2x,dH2x);
15209 dH2y = vec_sub(iH2y,dH2y);
15210 dH2z = vec_sub(iH2z,dH2z);
15212 rsqO = vec_madd(dOx,dOx,nul);
15213 rsqH1 = vec_madd(dH1x,dH1x,nul);
15214 rsqH2 = vec_madd(dH2x,dH2x,nul);
15215 rsqO = vec_madd(dOy,dOy,rsqO);
15216 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15217 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15218 rsqO = vec_madd(dOz,dOz,rsqO);
15219 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15220 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
15221 zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
15222 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15223 zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
15224 rO = vec_madd(rsqO,rinvO,nul);
15225 rH1 = vec_madd(rsqH1,rinvH1,nul);
15226 rH2 = vec_madd(rsqH2,rinvH2,nul);
15227 tja = ntiA+2*type[jnra];
15228 tjb = ntiA+2*type[jnrb];
15229 tjc = ntiA+2*type[jnrc];
/* Load the three (c6,c12) pairs and the three j charges. */
15231 load_3_pair(nbfp+tja,nbfp+tjb,nbfp+tjc,&c6,&c12);
15232 jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
15233 do_vonly_3_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&VVd,&VVr);
15234 do_vonly_3_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15235 do_vonly_3_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
15236 qqO = vec_madd(iqO,jq,nul);
15237 qqH = vec_madd(iqH,jq,nul);
15238 vctot = vec_madd(qqO,VVcO,vctot);
15239 vnbtot = vec_madd(c6,VVd,vnbtot);
15240 vctot = vec_madd(qqH,VVcH1,vctot);
15241 vnbtot = vec_madd(c12,VVr,vnbtot);
15242 vctot = vec_madd(qqH,VVcH2,vctot);
/* Two left: uses the 2-wide load/transpose/table helpers and the
 * zero_highest_2_elements variant. */
15243 } else if(k<(nj1-1)) {
15244 jnra = jjnr[k];
15245 jnrb = jjnr[k+1];
15246 j3a = 3*jnra;
15247 j3b = 3*jnrb;
15248 transpose_2_to_3(load_xyz(pos+j3a),
15249 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
15250 dOx = vec_sub(iOx,dH2x);
15251 dOy = vec_sub(iOy,dH2y);
15252 dOz = vec_sub(iOz,dH2z);
15253 dH1x = vec_sub(iH1x,dH2x);
15254 dH1y = vec_sub(iH1y,dH2y);
15255 dH1z = vec_sub(iH1z,dH2z);
15256 dH2x = vec_sub(iH2x,dH2x);
15257 dH2y = vec_sub(iH2y,dH2y);
15258 dH2z = vec_sub(iH2z,dH2z);
15260 rsqO = vec_madd(dOx,dOx,nul);
15261 rsqH1 = vec_madd(dH1x,dH1x,nul);
15262 rsqH2 = vec_madd(dH2x,dH2x,nul);
15263 rsqO = vec_madd(dOy,dOy,rsqO);
15264 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15265 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15266 rsqO = vec_madd(dOz,dOz,rsqO);
15267 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15268 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
15269 zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
15270 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15271 zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
15272 rO = vec_madd(rsqO,rinvO,nul);
15273 rH1 = vec_madd(rsqH1,rinvH1,nul);
15274 rH2 = vec_madd(rsqH2,rinvH2,nul);
15275 tja = ntiA+2*type[jnra];
15276 tjb = ntiA+2*type[jnrb];
15277 /* load 2 j charges and multiply by iq */
15278 jq=load_2_float(charge+jnra,charge+jnrb);
15279 load_2_pair(nbfp+tja,nbfp+tjb,&c6,&c12);
15280 do_vonly_2_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&VVd,&VVr);
15281 do_vonly_2_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15282 do_vonly_2_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
15283 qqO = vec_madd(iqO,jq,nul);
15284 qqH = vec_madd(iqH,jq,nul);
15285 vctot = vec_madd(qqO,VVcO,vctot);
15286 vnbtot = vec_madd(c6,VVd,vnbtot);
15287 vctot = vec_madd(qqH,VVcH1,vctot);
15288 vnbtot = vec_madd(c12,VVr,vnbtot);
15289 vctot = vec_madd(qqH,VVcH2,vctot);
/* One left: uses the 1-wide helpers and the zero_highest_3_elements variant. */
15290 } else if(k<nj1) {
15291 jnra = jjnr[k];
15292 j3a = 3*jnra;
15293 transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
15294 dOx = vec_sub(iOx,dH2x);
15295 dOy = vec_sub(iOy,dH2y);
15296 dOz = vec_sub(iOz,dH2z);
15297 dH1x = vec_sub(iH1x,dH2x);
15298 dH1y = vec_sub(iH1y,dH2y);
15299 dH1z = vec_sub(iH1z,dH2z);
15300 dH2x = vec_sub(iH2x,dH2x);
15301 dH2y = vec_sub(iH2y,dH2y);
15302 dH2z = vec_sub(iH2z,dH2z);
15304 rsqO = vec_madd(dOx,dOx,nul);
15305 rsqH1 = vec_madd(dH1x,dH1x,nul);
15306 rsqH2 = vec_madd(dH2x,dH2x,nul);
15307 rsqO = vec_madd(dOy,dOy,rsqO);
15308 rsqH1 = vec_madd(dH1y,dH1y,rsqH1);
15309 rsqH2 = vec_madd(dH2y,dH2y,rsqH2);
15310 rsqO = vec_madd(dOz,dOz,rsqO);
15311 rsqH1 = vec_madd(dH1z,dH1z,rsqH1);
15312 rsqH2 = vec_madd(dH2z,dH2z,rsqH2);
15313 zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
15314 do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
15315 zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);
15316 rO = vec_madd(rsqO,rinvO,nul);
15317 rH1 = vec_madd(rsqH1,rinvH1,nul);
15318 rH2 = vec_madd(rsqH2,rinvH2,nul);
15319 tja = ntiA+2*type[jnra];
15320 /* load 1 j charges and multiply by iq */
15321 jq=load_1_float(charge+jnra);
15322 load_1_pair(nbfp+tja,&c6,&c12);
15323 do_vonly_1_ljctable_coul_and_lj(VFtab,vec_madd(rO,tsc,nul),&VVcO,&VVd,&VVr);
15324 do_vonly_1_ljctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1);
15325 do_vonly_1_ljctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2);
15326 qqO = vec_madd(iqO,jq,nul);
15327 qqH = vec_madd(iqH,jq,nul);
15328 vctot = vec_madd(qqO,VVcO,vctot);
15329 vnbtot = vec_madd(c6,VVd,vnbtot);
15330 vctot = vec_madd(qqH,VVcH1,vctot);
15331 vnbtot = vec_madd(c12,VVr,vnbtot);
15332 vctot = vec_madd(qqH,VVcH2,vctot);
/* Add this entry's accumulated energy vectors to the slots selected by gid[n]. */
15334 /* update outer data */
15335 add_vector_to_float(Vc+gid[n],vctot);
15336 add_vector_to_float(Vnb+gid[n],vnbtot);
/*
 * mcinl1030_altivec - energy-only AltiVec inner loop for three-site i
 * molecules against three-site j molecules, Coulomb term only.
 *
 * For every outer entry n, the inverse distances of all nine site-site
 * pairs are multiplied by the matching charge product and summed into
 * vctot, which is then added to Vc[gid[n]]. No force array is passed in or
 * written.
 *
 *   nri       number of outer (i) entries
 *   iinr      iinr[n] is the atom index of the first site of i entry n
 *   jindex    j entries of n are jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]
 *   jjnr      index of the first site of each j molecule
 *   shift     shift[n]*3 is the offset into shiftvec for entry n
 *   shiftvec  shift vectors, passed to the i-site load helper
 *   gid       gid[n] selects the Vc slot that receives the energy
 *   pos       coordinates, indexed as 3*atom
 *   charge    per-atom charges; only charge[iinr[0]] and charge[iinr[0]+1]
 *             are read
 *   facel     factor multiplied into every charge product
 *   Vc        output: vctot is added at index gid[n]
 *
 * NOTE(review): vcoul1, vcoul2, vcoul3, k0 and ntiA are declared but never
 * used in this function.
 */
15343 void mcinl1030_altivec(
15344 int nri,
15345 int iinr[],
15346 int jindex[],
15347 int jjnr[],
15348 int shift[],
15349 float shiftvec[],
15350 int gid[],
15351 float pos[],
15352 float charge[],
15353 float facel,
15354 float Vc[])
15356 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
15357 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
15359 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
15360 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
15361 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
15363 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
15364 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
15366 vector float vfacel,vcoul1,vcoul2,vcoul3,nul;
15367 vector float vctot,qqOO,qqOH,qqHH,qO,qH,qqOOt,qqOHt,qqHHt;
15371 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
15372 int jnra,jnrb,jnrc,jnrd;
15373 int j3a,j3b,j3c,j3d;
/* Charge products qO*qO, qO*qH and qH*qH, each times facel. They are built
 * once from the first i entry and reused for every i and j molecule.
 * NOTE(review): presumably all molecules handled here share these two
 * charges - confirm with the caller. iinr[0] is read before nri is checked. */
15375 nul=vec_zero();
15376 vfacel=load_float_and_splat(&facel);
15377 qO = load_float_and_splat(charge+iinr[0]);
15378 qH = load_float_and_splat(charge+iinr[0]+1);
15379 qqOO = vec_madd(qO,qO,nul);
15380 qqOH = vec_madd(qO,qH,nul);
15381 qqHH = vec_madd(qH,qH,nul);
15382 qqOO = vec_madd(qqOO,vfacel,nul);
15383 qqOH = vec_madd(qqOH,vfacel,nul);
15384 qqHH = vec_madd(qqHH,vfacel,nul);
/* Outer loop over i entries. */
15386 for(n=0;n<nri;n++) {
15387 is3 = 3*shift[n];
15388 ii = iinr[n];
15389 ii3 = 3*ii;
/* Load the three i sites; the helper receives the shift vector for this
 * entry (going by its name it applies the shift and splats each component). */
15390 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
15391 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
15392 vctot = nul;
15393 nj0 = jindex[n];
15394 nj1 = jindex[n+1];
/* Main inner loop: four j molecules per iteration. */
15396 for(k=nj0; k<(nj1-3); k+=4) {
15397 jnra = jjnr[k];
15398 jnrb = jjnr[k+1];
15399 jnrc = jjnr[k+2];
15400 jnrd = jjnr[k+3];
15401 j3a = 3*jnra;
15402 j3b = 3*jnrb;
15403 j3c = 3*jnrc;
15404 j3d = 3*jnrd;
15405 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
15406 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
/* Differences for all nine pairs: dAB is i site A minus j site B. */
15408 dx11 = vec_sub(ix1,jx1);
15409 dx12 = vec_sub(ix1,jx2);
15410 dx13 = vec_sub(ix1,jx3);
15411 dy11 = vec_sub(iy1,jy1);
15412 dy12 = vec_sub(iy1,jy2);
15413 dy13 = vec_sub(iy1,jy3);
15414 dz11 = vec_sub(iz1,jz1);
15415 dz12 = vec_sub(iz1,jz2);
15416 dz13 = vec_sub(iz1,jz3);
15417 dx21 = vec_sub(ix2,jx1);
15418 dx22 = vec_sub(ix2,jx2);
15419 dx23 = vec_sub(ix2,jx3);
15420 dy21 = vec_sub(iy2,jy1);
15421 dy22 = vec_sub(iy2,jy2);
15422 dy23 = vec_sub(iy2,jy3);
15423 dz21 = vec_sub(iz2,jz1);
15424 dz22 = vec_sub(iz2,jz2);
15425 dz23 = vec_sub(iz2,jz3);
15426 dx31 = vec_sub(ix3,jx1);
15427 dx32 = vec_sub(ix3,jx2);
15428 dx33 = vec_sub(ix3,jx3);
15429 dy31 = vec_sub(iy3,jy1);
15430 dy32 = vec_sub(iy3,jy2);
15431 dy33 = vec_sub(iy3,jy3);
15432 dz31 = vec_sub(iz3,jz1);
15433 dz32 = vec_sub(iz3,jz2);
15434 dz33 = vec_sub(iz3,jz3);
/* Nine squared distances, accumulated x, then y, then z. */
15436 rsq11 = vec_madd(dx11,dx11,nul);
15437 rsq12 = vec_madd(dx12,dx12,nul);
15438 rsq13 = vec_madd(dx13,dx13,nul);
15439 rsq21 = vec_madd(dx21,dx21,nul);
15440 rsq22 = vec_madd(dx22,dx22,nul);
15441 rsq23 = vec_madd(dx23,dx23,nul);
15442 rsq31 = vec_madd(dx31,dx31,nul);
15443 rsq32 = vec_madd(dx32,dx32,nul);
15444 rsq33 = vec_madd(dx33,dx33,nul);
15445 rsq11 = vec_madd(dy11,dy11,rsq11);
15446 rsq12 = vec_madd(dy12,dy12,rsq12);
15447 rsq13 = vec_madd(dy13,dy13,rsq13);
15448 rsq21 = vec_madd(dy21,dy21,rsq21);
15449 rsq22 = vec_madd(dy22,dy22,rsq22);
15450 rsq23 = vec_madd(dy23,dy23,rsq23);
15451 rsq31 = vec_madd(dy31,dy31,rsq31);
15452 rsq32 = vec_madd(dy32,dy32,rsq32);
15453 rsq33 = vec_madd(dy33,dy33,rsq33);
15454 rsq11 = vec_madd(dz11,dz11,rsq11);
15455 rsq12 = vec_madd(dz12,dz12,rsq12);
15456 rsq13 = vec_madd(dz13,dz13,rsq13);
15457 rsq21 = vec_madd(dz21,dz21,rsq21);
15458 rsq22 = vec_madd(dz22,dz22,rsq22);
15459 rsq23 = vec_madd(dz23,dz23,rsq23);
15460 rsq31 = vec_madd(dz31,dz31,rsq31);
15461 rsq32 = vec_madd(dz32,dz32,rsq32);
15462 rsq33 = vec_madd(dz33,dz33,rsq33);
15464 do_9_invsqrt(rsq11,rsq12,rsq13,
15465 rsq21,rsq22,rsq23,
15466 rsq31,rsq32,rsq33,
15467 &rinv11,&rinv12,&rinv13,
15468 &rinv21,&rinv22,&rinv23,
15469 &rinv31,&rinv32,&rinv33);
/* vctot += rinv * qq. Pair 11 uses qqOO, pairs 12/13/21/31 use qqOH and
 * pairs 22/23/32/33 use qqHH, so site 1 carries qO and sites 2 and 3 qH. */
15471 vctot = vec_madd(rinv11,qqOO,vctot);
15472 vctot = vec_madd(rinv12,qqOH,vctot);
15473 vctot = vec_madd(rinv13,qqOH,vctot);
15474 vctot = vec_madd(rinv21,qqOH,vctot);
15475 vctot = vec_madd(rinv22,qqHH,vctot);
15476 vctot = vec_madd(rinv23,qqHH,vctot);
15477 vctot = vec_madd(rinv31,qqOH,vctot);
15478 vctot = vec_madd(rinv32,qqHH,vctot);
15479 vctot = vec_madd(rinv33,qqHH,vctot);
/* Remainder handling for the j molecules left over after the 4-wide loop.
 * Three left: the charge products are shifted left by 4 bytes against nul
 * with vec_sld, which leaves a zero in the last element, and
 * zero_highest_element_in_9_vectors is applied to rinv (presumably to clear
 * the unused lane - confirm in the helper). */
15481 if(k<(nj1-2)) {
15482 jnra = jjnr[k];
15483 jnrb = jjnr[k+1];
15484 jnrc = jjnr[k+2];
15485 j3a = 3*jnra;
15486 j3b = 3*jnrb;
15487 j3c = 3*jnrc;
15488 load_3_water(pos+j3a,pos+j3b,pos+j3c,
15489 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
15490 qqOOt = vec_sld(qqOO,nul,4);
15491 qqOHt = vec_sld(qqOH,nul,4);
15492 qqHHt = vec_sld(qqHH,nul,4);
15494 dx11 = vec_sub(ix1,jx1);
15495 dx12 = vec_sub(ix1,jx2);
15496 dx13 = vec_sub(ix1,jx3);
15497 dy11 = vec_sub(iy1,jy1);
15498 dy12 = vec_sub(iy1,jy2);
15499 dy13 = vec_sub(iy1,jy3);
15500 dz11 = vec_sub(iz1,jz1);
15501 dz12 = vec_sub(iz1,jz2);
15502 dz13 = vec_sub(iz1,jz3);
15503 dx21 = vec_sub(ix2,jx1);
15504 dx22 = vec_sub(ix2,jx2);
15505 dx23 = vec_sub(ix2,jx3);
15506 dy21 = vec_sub(iy2,jy1);
15507 dy22 = vec_sub(iy2,jy2);
15508 dy23 = vec_sub(iy2,jy3);
15509 dz21 = vec_sub(iz2,jz1);
15510 dz22 = vec_sub(iz2,jz2);
15511 dz23 = vec_sub(iz2,jz3);
15512 dx31 = vec_sub(ix3,jx1);
15513 dx32 = vec_sub(ix3,jx2);
15514 dx33 = vec_sub(ix3,jx3);
15515 dy31 = vec_sub(iy3,jy1);
15516 dy32 = vec_sub(iy3,jy2);
15517 dy33 = vec_sub(iy3,jy3);
15518 dz31 = vec_sub(iz3,jz1);
15519 dz32 = vec_sub(iz3,jz2);
15520 dz33 = vec_sub(iz3,jz3);
15522 rsq11 = vec_madd(dx11,dx11,nul);
15523 rsq12 = vec_madd(dx12,dx12,nul);
15524 rsq13 = vec_madd(dx13,dx13,nul);
15525 rsq21 = vec_madd(dx21,dx21,nul);
15526 rsq22 = vec_madd(dx22,dx22,nul);
15527 rsq23 = vec_madd(dx23,dx23,nul);
15528 rsq31 = vec_madd(dx31,dx31,nul);
15529 rsq32 = vec_madd(dx32,dx32,nul);
15530 rsq33 = vec_madd(dx33,dx33,nul);
15531 rsq11 = vec_madd(dy11,dy11,rsq11);
15532 rsq12 = vec_madd(dy12,dy12,rsq12);
15533 rsq13 = vec_madd(dy13,dy13,rsq13);
15534 rsq21 = vec_madd(dy21,dy21,rsq21);
15535 rsq22 = vec_madd(dy22,dy22,rsq22);
15536 rsq23 = vec_madd(dy23,dy23,rsq23);
15537 rsq31 = vec_madd(dy31,dy31,rsq31);
15538 rsq32 = vec_madd(dy32,dy32,rsq32);
15539 rsq33 = vec_madd(dy33,dy33,rsq33);
15540 rsq11 = vec_madd(dz11,dz11,rsq11);
15541 rsq12 = vec_madd(dz12,dz12,rsq12);
15542 rsq13 = vec_madd(dz13,dz13,rsq13);
15543 rsq21 = vec_madd(dz21,dz21,rsq21);
15544 rsq22 = vec_madd(dz22,dz22,rsq22);
15545 rsq23 = vec_madd(dz23,dz23,rsq23);
15546 rsq31 = vec_madd(dz31,dz31,rsq31);
15547 rsq32 = vec_madd(dz32,dz32,rsq32);
15548 rsq33 = vec_madd(dz33,dz33,rsq33);
15550 do_9_invsqrt(rsq11,rsq12,rsq13,
15551 rsq21,rsq22,rsq23,
15552 rsq31,rsq32,rsq33,
15553 &rinv11,&rinv12,&rinv13,
15554 &rinv21,&rinv22,&rinv23,
15555 &rinv31,&rinv32,&rinv33);
15557 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
15558 &rinv21,&rinv22,&rinv23,
15559 &rinv31,&rinv32,&rinv33);
15561 vctot = vec_madd(rinv11,qqOOt,vctot);
15562 vctot = vec_madd(rinv12,qqOHt,vctot);
15563 vctot = vec_madd(rinv13,qqOHt,vctot);
15564 vctot = vec_madd(rinv21,qqOHt,vctot);
15565 vctot = vec_madd(rinv22,qqHHt,vctot);
15566 vctot = vec_madd(rinv23,qqHHt,vctot);
15567 vctot = vec_madd(rinv31,qqOHt,vctot);
15568 vctot = vec_madd(rinv32,qqHHt,vctot);
15569 vctot = vec_madd(rinv33,qqHHt,vctot);
/* Two left: charge products shifted by 8 bytes (last two elements zero),
 * with the zero_highest_2_elements variant applied to rinv. */
15570 } else if(k<(nj1-1)) {
15571 jnra = jjnr[k];
15572 jnrb = jjnr[k+1];
15573 j3a = 3*jnra;
15574 j3b = 3*jnrb;
15575 load_2_water(pos+j3a,pos+j3b,
15576 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
15577 qqOOt = vec_sld(qqOO,nul,8);
15578 qqOHt = vec_sld(qqOH,nul,8);
15579 qqHHt = vec_sld(qqHH,nul,8);
15581 dx11 = vec_sub(ix1,jx1);
15582 dx12 = vec_sub(ix1,jx2);
15583 dx13 = vec_sub(ix1,jx3);
15584 dy11 = vec_sub(iy1,jy1);
15585 dy12 = vec_sub(iy1,jy2);
15586 dy13 = vec_sub(iy1,jy3);
15587 dz11 = vec_sub(iz1,jz1);
15588 dz12 = vec_sub(iz1,jz2);
15589 dz13 = vec_sub(iz1,jz3);
15590 dx21 = vec_sub(ix2,jx1);
15591 dx22 = vec_sub(ix2,jx2);
15592 dx23 = vec_sub(ix2,jx3);
15593 dy21 = vec_sub(iy2,jy1);
15594 dy22 = vec_sub(iy2,jy2);
15595 dy23 = vec_sub(iy2,jy3);
15596 dz21 = vec_sub(iz2,jz1);
15597 dz22 = vec_sub(iz2,jz2);
15598 dz23 = vec_sub(iz2,jz3);
15599 dx31 = vec_sub(ix3,jx1);
15600 dx32 = vec_sub(ix3,jx2);
15601 dx33 = vec_sub(ix3,jx3);
15602 dy31 = vec_sub(iy3,jy1);
15603 dy32 = vec_sub(iy3,jy2);
15604 dy33 = vec_sub(iy3,jy3);
15605 dz31 = vec_sub(iz3,jz1);
15606 dz32 = vec_sub(iz3,jz2);
15607 dz33 = vec_sub(iz3,jz3);
15609 rsq11 = vec_madd(dx11,dx11,nul);
15610 rsq12 = vec_madd(dx12,dx12,nul);
15611 rsq13 = vec_madd(dx13,dx13,nul);
15612 rsq21 = vec_madd(dx21,dx21,nul);
15613 rsq22 = vec_madd(dx22,dx22,nul);
15614 rsq23 = vec_madd(dx23,dx23,nul);
15615 rsq31 = vec_madd(dx31,dx31,nul);
15616 rsq32 = vec_madd(dx32,dx32,nul);
15617 rsq33 = vec_madd(dx33,dx33,nul);
15618 rsq11 = vec_madd(dy11,dy11,rsq11);
15619 rsq12 = vec_madd(dy12,dy12,rsq12);
15620 rsq13 = vec_madd(dy13,dy13,rsq13);
15621 rsq21 = vec_madd(dy21,dy21,rsq21);
15622 rsq22 = vec_madd(dy22,dy22,rsq22);
15623 rsq23 = vec_madd(dy23,dy23,rsq23);
15624 rsq31 = vec_madd(dy31,dy31,rsq31);
15625 rsq32 = vec_madd(dy32,dy32,rsq32);
15626 rsq33 = vec_madd(dy33,dy33,rsq33);
15627 rsq11 = vec_madd(dz11,dz11,rsq11);
15628 rsq12 = vec_madd(dz12,dz12,rsq12);
15629 rsq13 = vec_madd(dz13,dz13,rsq13);
15630 rsq21 = vec_madd(dz21,dz21,rsq21);
15631 rsq22 = vec_madd(dz22,dz22,rsq22);
15632 rsq23 = vec_madd(dz23,dz23,rsq23);
15633 rsq31 = vec_madd(dz31,dz31,rsq31);
15634 rsq32 = vec_madd(dz32,dz32,rsq32);
15635 rsq33 = vec_madd(dz33,dz33,rsq33);
15637 do_9_invsqrt(rsq11,rsq12,rsq13,
15638 rsq21,rsq22,rsq23,
15639 rsq31,rsq32,rsq33,
15640 &rinv11,&rinv12,&rinv13,
15641 &rinv21,&rinv22,&rinv23,
15642 &rinv31,&rinv32,&rinv33);
15644 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
15645 &rinv21,&rinv22,&rinv23,
15646 &rinv31,&rinv32,&rinv33);
15648 vctot = vec_madd(rinv11,qqOOt,vctot);
15649 vctot = vec_madd(rinv12,qqOHt,vctot);
15650 vctot = vec_madd(rinv13,qqOHt,vctot);
15651 vctot = vec_madd(rinv21,qqOHt,vctot);
15652 vctot = vec_madd(rinv22,qqHHt,vctot);
15653 vctot = vec_madd(rinv23,qqHHt,vctot);
15654 vctot = vec_madd(rinv31,qqOHt,vctot);
15655 vctot = vec_madd(rinv32,qqHHt,vctot);
15656 vctot = vec_madd(rinv33,qqHHt,vctot);
/* One left: charge products shifted by 12 bytes (last three elements zero),
 * with the zero_highest_3_elements variant applied to rinv. */
15657 } else if(k<nj1) {
15658 jnra = jjnr[k];
15659 j3a = 3*jnra;
15660 load_1_water(pos+j3a,
15661 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
15662 qqOOt = vec_sld(qqOO,nul,12);
15663 qqOHt = vec_sld(qqOH,nul,12);
15664 qqHHt = vec_sld(qqHH,nul,12);
15666 dx11 = vec_sub(ix1,jx1);
15667 dx12 = vec_sub(ix1,jx2);
15668 dx13 = vec_sub(ix1,jx3);
15669 dy11 = vec_sub(iy1,jy1);
15670 dy12 = vec_sub(iy1,jy2);
15671 dy13 = vec_sub(iy1,jy3);
15672 dz11 = vec_sub(iz1,jz1);
15673 dz12 = vec_sub(iz1,jz2);
15674 dz13 = vec_sub(iz1,jz3);
15675 dx21 = vec_sub(ix2,jx1);
15676 dx22 = vec_sub(ix2,jx2);
15677 dx23 = vec_sub(ix2,jx3);
15678 dy21 = vec_sub(iy2,jy1);
15679 dy22 = vec_sub(iy2,jy2);
15680 dy23 = vec_sub(iy2,jy3);
15681 dz21 = vec_sub(iz2,jz1);
15682 dz22 = vec_sub(iz2,jz2);
15683 dz23 = vec_sub(iz2,jz3);
15684 dx31 = vec_sub(ix3,jx1);
15685 dx32 = vec_sub(ix3,jx2);
15686 dx33 = vec_sub(ix3,jx3);
15687 dy31 = vec_sub(iy3,jy1);
15688 dy32 = vec_sub(iy3,jy2);
15689 dy33 = vec_sub(iy3,jy3);
15690 dz31 = vec_sub(iz3,jz1);
15691 dz32 = vec_sub(iz3,jz2);
15692 dz33 = vec_sub(iz3,jz3);
15694 rsq11 = vec_madd(dx11,dx11,nul);
15695 rsq12 = vec_madd(dx12,dx12,nul);
15696 rsq13 = vec_madd(dx13,dx13,nul);
15697 rsq21 = vec_madd(dx21,dx21,nul);
15698 rsq22 = vec_madd(dx22,dx22,nul);
15699 rsq23 = vec_madd(dx23,dx23,nul);
15700 rsq31 = vec_madd(dx31,dx31,nul);
15701 rsq32 = vec_madd(dx32,dx32,nul);
15702 rsq33 = vec_madd(dx33,dx33,nul);
15703 rsq11 = vec_madd(dy11,dy11,rsq11);
15704 rsq12 = vec_madd(dy12,dy12,rsq12);
15705 rsq13 = vec_madd(dy13,dy13,rsq13);
15706 rsq21 = vec_madd(dy21,dy21,rsq21);
15707 rsq22 = vec_madd(dy22,dy22,rsq22);
15708 rsq23 = vec_madd(dy23,dy23,rsq23);
15709 rsq31 = vec_madd(dy31,dy31,rsq31);
15710 rsq32 = vec_madd(dy32,dy32,rsq32);
15711 rsq33 = vec_madd(dy33,dy33,rsq33);
15712 rsq11 = vec_madd(dz11,dz11,rsq11);
15713 rsq12 = vec_madd(dz12,dz12,rsq12);
15714 rsq13 = vec_madd(dz13,dz13,rsq13);
15715 rsq21 = vec_madd(dz21,dz21,rsq21);
15716 rsq22 = vec_madd(dz22,dz22,rsq22);
15717 rsq23 = vec_madd(dz23,dz23,rsq23);
15718 rsq31 = vec_madd(dz31,dz31,rsq31);
15719 rsq32 = vec_madd(dz32,dz32,rsq32);
15720 rsq33 = vec_madd(dz33,dz33,rsq33);
15722 do_9_invsqrt(rsq11,rsq12,rsq13,
15723 rsq21,rsq22,rsq23,
15724 rsq31,rsq32,rsq33,
15725 &rinv11,&rinv12,&rinv13,
15726 &rinv21,&rinv22,&rinv23,
15727 &rinv31,&rinv32,&rinv33);
15729 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
15730 &rinv21,&rinv22,&rinv23,
15731 &rinv31,&rinv32,&rinv33);
15733 vctot = vec_madd(rinv11,qqOOt,vctot);
15734 vctot = vec_madd(rinv12,qqOHt,vctot);
15735 vctot = vec_madd(rinv13,qqOHt,vctot);
15736 vctot = vec_madd(rinv21,qqOHt,vctot);
15737 vctot = vec_madd(rinv22,qqHHt,vctot);
15738 vctot = vec_madd(rinv23,qqHHt,vctot);
15739 vctot = vec_madd(rinv31,qqOHt,vctot);
15740 vctot = vec_madd(rinv32,qqHHt,vctot);
15741 vctot = vec_madd(rinv33,qqHHt,vctot);
/* Add this entry's accumulated energy vector to the slot selected by gid[n]. */
15743 /* update outer data */
15744 add_vector_to_float(Vc+gid[n],vctot);
15749 void mcinl1130_altivec(
15750 int nri,
15751 int iinr[],
15752 int jindex[],
15753 int jjnr[],
15754 int shift[],
15755 float shiftvec[],
15756 int gid[],
15757 float pos[],
15758 float charge[],
15759 float facel,
15760 float Vc[],
15761 int type[],
15762 int ntype,
15763 float nbfp[],
15764 float Vnb[])
15766 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
15767 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
15769 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
15770 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
15771 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
15773 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
15774 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
15775 vector float rinvsq11;
15777 vector float vfacel,nul;
15778 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12,rinvsix;
15779 vector float vnb6,vnb12,vnbtot,qqOOt,qqOHt,qqHHt,c6t,c12t;
15781 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
15782 int jnra,jnrb,jnrc,jnrd,tp,tj;
15783 int j3a,j3b,j3c,j3d;
15785 nul=vec_zero();
15786 vfacel=load_float_and_splat(&facel);
15787 ii = iinr[0];
15788 qO = load_float_and_splat(charge+ii);
15789 qH = load_float_and_splat(charge+ii+1);
15790 qqOO = vec_madd(qO,qO,nul);
15791 qqOH = vec_madd(qO,qH,nul);
15792 qqHH = vec_madd(qH,qH,nul);
15793 qqOO = vec_madd(qqOO,vfacel,nul);
15794 qqOH = vec_madd(qqOH,vfacel,nul);
15795 qqHH = vec_madd(qqHH,vfacel,nul);
15796 tp = 2*type[ii];
15797 tj = (ntype+1)*tp;
15798 load_1_pair(nbfp+tj,&c6,&c12);
15799 c6 = vec_splat(c6,0);
15800 c12 = vec_splat(c12,0);
15802 for(n=0;n<nri;n++) {
15803 is3 = 3*shift[n];
15804 ii = iinr[n];
15805 ii3 = 3*ii;
15806 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
15807 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
15808 vctot = nul;
15809 vnbtot = nul;
15810 nj0 = jindex[n];
15811 nj1 = jindex[n+1];
15813 for(k=nj0; k<(nj1-3); k+=4) {
15814 jnra = jjnr[k];
15815 jnrb = jjnr[k+1];
15816 jnrc = jjnr[k+2];
15817 jnrd = jjnr[k+3];
15818 j3a = 3*jnra;
15819 j3b = 3*jnrb;
15820 j3c = 3*jnrc;
15821 j3d = 3*jnrd;
15822 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
15823 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
15824 dx11 = vec_sub(ix1,jx1);
15825 dx12 = vec_sub(ix1,jx2);
15826 dx13 = vec_sub(ix1,jx3);
15827 dy11 = vec_sub(iy1,jy1);
15828 dy12 = vec_sub(iy1,jy2);
15829 dy13 = vec_sub(iy1,jy3);
15830 dz11 = vec_sub(iz1,jz1);
15831 dz12 = vec_sub(iz1,jz2);
15832 dz13 = vec_sub(iz1,jz3);
15833 dx21 = vec_sub(ix2,jx1);
15834 dx22 = vec_sub(ix2,jx2);
15835 dx23 = vec_sub(ix2,jx3);
15836 dy21 = vec_sub(iy2,jy1);
15837 dy22 = vec_sub(iy2,jy2);
15838 dy23 = vec_sub(iy2,jy3);
15839 dz21 = vec_sub(iz2,jz1);
15840 dz22 = vec_sub(iz2,jz2);
15841 dz23 = vec_sub(iz2,jz3);
15842 dx31 = vec_sub(ix3,jx1);
15843 dx32 = vec_sub(ix3,jx2);
15844 dx33 = vec_sub(ix3,jx3);
15845 dy31 = vec_sub(iy3,jy1);
15846 dy32 = vec_sub(iy3,jy2);
15847 dy33 = vec_sub(iy3,jy3);
15848 dz31 = vec_sub(iz3,jz1);
15849 dz32 = vec_sub(iz3,jz2);
15850 dz33 = vec_sub(iz3,jz3);
15852 rsq11 = vec_madd(dx11,dx11,nul);
15853 rsq12 = vec_madd(dx12,dx12,nul);
15854 rsq13 = vec_madd(dx13,dx13,nul);
15855 rsq21 = vec_madd(dx21,dx21,nul);
15856 rsq22 = vec_madd(dx22,dx22,nul);
15857 rsq23 = vec_madd(dx23,dx23,nul);
15858 rsq31 = vec_madd(dx31,dx31,nul);
15859 rsq32 = vec_madd(dx32,dx32,nul);
15860 rsq33 = vec_madd(dx33,dx33,nul);
15861 rsq11 = vec_madd(dy11,dy11,rsq11);
15862 rsq12 = vec_madd(dy12,dy12,rsq12);
15863 rsq13 = vec_madd(dy13,dy13,rsq13);
15864 rsq21 = vec_madd(dy21,dy21,rsq21);
15865 rsq22 = vec_madd(dy22,dy22,rsq22);
15866 rsq23 = vec_madd(dy23,dy23,rsq23);
15867 rsq31 = vec_madd(dy31,dy31,rsq31);
15868 rsq32 = vec_madd(dy32,dy32,rsq32);
15869 rsq33 = vec_madd(dy33,dy33,rsq33);
15870 rsq11 = vec_madd(dz11,dz11,rsq11);
15871 rsq12 = vec_madd(dz12,dz12,rsq12);
15872 rsq13 = vec_madd(dz13,dz13,rsq13);
15873 rsq21 = vec_madd(dz21,dz21,rsq21);
15874 rsq22 = vec_madd(dz22,dz22,rsq22);
15875 rsq23 = vec_madd(dz23,dz23,rsq23);
15876 rsq31 = vec_madd(dz31,dz31,rsq31);
15877 rsq32 = vec_madd(dz32,dz32,rsq32);
15878 rsq33 = vec_madd(dz33,dz33,rsq33);
15880 do_9_invsqrt(rsq11,rsq12,rsq13,
15881 rsq21,rsq22,rsq23,
15882 rsq31,rsq32,rsq33,
15883 &rinv11,&rinv12,&rinv13,
15884 &rinv21,&rinv22,&rinv23,
15885 &rinv31,&rinv32,&rinv33);
15887 rinvsq11 = vec_madd(rinv11,rinv11,nul);
15888 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
15889 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
15890 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
15891 rinvsix = vec_madd(rinvsix,rinvsix,nul);
15892 vctot = vec_madd(rinv11,qqOO,vctot);
15893 vctot = vec_madd(rinv12,qqOH,vctot);
15894 vctot = vec_madd(rinv13,qqOH,vctot);
15895 vnbtot = vec_madd(c12,rinvsix,vnbtot);
15896 vctot = vec_madd(rinv21,qqOH,vctot);
15897 vctot = vec_madd(rinv22,qqHH,vctot);
15898 vctot = vec_madd(rinv23,qqHH,vctot);
15899 vctot = vec_madd(rinv31,qqOH,vctot);
15900 vctot = vec_madd(rinv32,qqHH,vctot);
15901 vctot = vec_madd(rinv33,qqHH,vctot);
15903 if(k<(nj1-2)) {
15904 jnra = jjnr[k];
15905 jnrb = jjnr[k+1];
15906 jnrc = jjnr[k+2];
15907 j3a = 3*jnra;
15908 j3b = 3*jnrb;
15909 j3c = 3*jnrc;
15910 load_3_water(pos+j3a,pos+j3b,pos+j3c,
15911 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
15912 qqOOt = vec_sld(qqOO,nul,4);
15913 qqOHt = vec_sld(qqOH,nul,4);
15914 qqHHt = vec_sld(qqHH,nul,4);
15915 c6t = vec_sld(c6,nul,4);
15916 c12t = vec_sld(c12,nul,4);
15918 dx11 = vec_sub(ix1,jx1);
15919 dx12 = vec_sub(ix1,jx2);
15920 dx13 = vec_sub(ix1,jx3);
15921 dy11 = vec_sub(iy1,jy1);
15922 dy12 = vec_sub(iy1,jy2);
15923 dy13 = vec_sub(iy1,jy3);
15924 dz11 = vec_sub(iz1,jz1);
15925 dz12 = vec_sub(iz1,jz2);
15926 dz13 = vec_sub(iz1,jz3);
15927 dx21 = vec_sub(ix2,jx1);
15928 dx22 = vec_sub(ix2,jx2);
15929 dx23 = vec_sub(ix2,jx3);
15930 dy21 = vec_sub(iy2,jy1);
15931 dy22 = vec_sub(iy2,jy2);
15932 dy23 = vec_sub(iy2,jy3);
15933 dz21 = vec_sub(iz2,jz1);
15934 dz22 = vec_sub(iz2,jz2);
15935 dz23 = vec_sub(iz2,jz3);
15936 dx31 = vec_sub(ix3,jx1);
15937 dx32 = vec_sub(ix3,jx2);
15938 dx33 = vec_sub(ix3,jx3);
15939 dy31 = vec_sub(iy3,jy1);
15940 dy32 = vec_sub(iy3,jy2);
15941 dy33 = vec_sub(iy3,jy3);
15942 dz31 = vec_sub(iz3,jz1);
15943 dz32 = vec_sub(iz3,jz2);
15944 dz33 = vec_sub(iz3,jz3);
15946 rsq11 = vec_madd(dx11,dx11,nul);
15947 rsq12 = vec_madd(dx12,dx12,nul);
15948 rsq13 = vec_madd(dx13,dx13,nul);
15949 rsq21 = vec_madd(dx21,dx21,nul);
15950 rsq22 = vec_madd(dx22,dx22,nul);
15951 rsq23 = vec_madd(dx23,dx23,nul);
15952 rsq31 = vec_madd(dx31,dx31,nul);
15953 rsq32 = vec_madd(dx32,dx32,nul);
15954 rsq33 = vec_madd(dx33,dx33,nul);
15955 rsq11 = vec_madd(dy11,dy11,rsq11);
15956 rsq12 = vec_madd(dy12,dy12,rsq12);
15957 rsq13 = vec_madd(dy13,dy13,rsq13);
15958 rsq21 = vec_madd(dy21,dy21,rsq21);
15959 rsq22 = vec_madd(dy22,dy22,rsq22);
15960 rsq23 = vec_madd(dy23,dy23,rsq23);
15961 rsq31 = vec_madd(dy31,dy31,rsq31);
15962 rsq32 = vec_madd(dy32,dy32,rsq32);
15963 rsq33 = vec_madd(dy33,dy33,rsq33);
15964 rsq11 = vec_madd(dz11,dz11,rsq11);
15965 rsq12 = vec_madd(dz12,dz12,rsq12);
15966 rsq13 = vec_madd(dz13,dz13,rsq13);
15967 rsq21 = vec_madd(dz21,dz21,rsq21);
15968 rsq22 = vec_madd(dz22,dz22,rsq22);
15969 rsq23 = vec_madd(dz23,dz23,rsq23);
15970 rsq31 = vec_madd(dz31,dz31,rsq31);
15971 rsq32 = vec_madd(dz32,dz32,rsq32);
15972 rsq33 = vec_madd(dz33,dz33,rsq33);
15974 do_9_invsqrt(rsq11,rsq12,rsq13,
15975 rsq21,rsq22,rsq23,
15976 rsq31,rsq32,rsq33,
15977 &rinv11,&rinv12,&rinv13,
15978 &rinv21,&rinv22,&rinv23,
15979 &rinv31,&rinv32,&rinv33);
15981 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
15982 &rinv21,&rinv22,&rinv23,
15983 &rinv31,&rinv32,&rinv33);
15985 rinvsq11 = vec_madd(rinv11,rinv11,nul);
15986 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
15987 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
15988 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
15989 rinvsix = vec_madd(rinvsix,rinvsix,nul);
15990 vctot = vec_madd(rinv11,qqOOt,vctot);
15991 vctot = vec_madd(rinv12,qqOHt,vctot);
15992 vctot = vec_madd(rinv13,qqOHt,vctot);
15993 vnbtot = vec_madd(c12t,rinvsix,vnbtot);
15994 vctot = vec_madd(rinv21,qqOHt,vctot);
15995 vctot = vec_madd(rinv22,qqHHt,vctot);
15996 vctot = vec_madd(rinv23,qqHHt,vctot);
15997 vctot = vec_madd(rinv31,qqOHt,vctot);
15998 vctot = vec_madd(rinv32,qqHHt,vctot);
15999 vctot = vec_madd(rinv33,qqHHt,vctot);
16000 } else if(k<(nj1-1)) {
16001 jnra = jjnr[k];
16002 jnrb = jjnr[k+1];
16003 j3a = 3*jnra;
16004 j3b = 3*jnrb;
16005 load_2_water(pos+j3a,pos+j3b,
16006 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16007 qqOOt = vec_sld(qqOO,nul,8);
16008 qqOHt = vec_sld(qqOH,nul,8);
16009 qqHHt = vec_sld(qqHH,nul,8);
16010 c6t = vec_sld(c6,nul,8);
16011 c12t = vec_sld(c12,nul,8);
16013 dx11 = vec_sub(ix1,jx1);
16014 dx12 = vec_sub(ix1,jx2);
16015 dx13 = vec_sub(ix1,jx3);
16016 dy11 = vec_sub(iy1,jy1);
16017 dy12 = vec_sub(iy1,jy2);
16018 dy13 = vec_sub(iy1,jy3);
16019 dz11 = vec_sub(iz1,jz1);
16020 dz12 = vec_sub(iz1,jz2);
16021 dz13 = vec_sub(iz1,jz3);
16022 dx21 = vec_sub(ix2,jx1);
16023 dx22 = vec_sub(ix2,jx2);
16024 dx23 = vec_sub(ix2,jx3);
16025 dy21 = vec_sub(iy2,jy1);
16026 dy22 = vec_sub(iy2,jy2);
16027 dy23 = vec_sub(iy2,jy3);
16028 dz21 = vec_sub(iz2,jz1);
16029 dz22 = vec_sub(iz2,jz2);
16030 dz23 = vec_sub(iz2,jz3);
16031 dx31 = vec_sub(ix3,jx1);
16032 dx32 = vec_sub(ix3,jx2);
16033 dx33 = vec_sub(ix3,jx3);
16034 dy31 = vec_sub(iy3,jy1);
16035 dy32 = vec_sub(iy3,jy2);
16036 dy33 = vec_sub(iy3,jy3);
16037 dz31 = vec_sub(iz3,jz1);
16038 dz32 = vec_sub(iz3,jz2);
16039 dz33 = vec_sub(iz3,jz3);
16041 rsq11 = vec_madd(dx11,dx11,nul);
16042 rsq12 = vec_madd(dx12,dx12,nul);
16043 rsq13 = vec_madd(dx13,dx13,nul);
16044 rsq21 = vec_madd(dx21,dx21,nul);
16045 rsq22 = vec_madd(dx22,dx22,nul);
16046 rsq23 = vec_madd(dx23,dx23,nul);
16047 rsq31 = vec_madd(dx31,dx31,nul);
16048 rsq32 = vec_madd(dx32,dx32,nul);
16049 rsq33 = vec_madd(dx33,dx33,nul);
16050 rsq11 = vec_madd(dy11,dy11,rsq11);
16051 rsq12 = vec_madd(dy12,dy12,rsq12);
16052 rsq13 = vec_madd(dy13,dy13,rsq13);
16053 rsq21 = vec_madd(dy21,dy21,rsq21);
16054 rsq22 = vec_madd(dy22,dy22,rsq22);
16055 rsq23 = vec_madd(dy23,dy23,rsq23);
16056 rsq31 = vec_madd(dy31,dy31,rsq31);
16057 rsq32 = vec_madd(dy32,dy32,rsq32);
16058 rsq33 = vec_madd(dy33,dy33,rsq33);
16059 rsq11 = vec_madd(dz11,dz11,rsq11);
16060 rsq12 = vec_madd(dz12,dz12,rsq12);
16061 rsq13 = vec_madd(dz13,dz13,rsq13);
16062 rsq21 = vec_madd(dz21,dz21,rsq21);
16063 rsq22 = vec_madd(dz22,dz22,rsq22);
16064 rsq23 = vec_madd(dz23,dz23,rsq23);
16065 rsq31 = vec_madd(dz31,dz31,rsq31);
16066 rsq32 = vec_madd(dz32,dz32,rsq32);
16067 rsq33 = vec_madd(dz33,dz33,rsq33);
16069 do_9_invsqrt(rsq11,rsq12,rsq13,
16070 rsq21,rsq22,rsq23,
16071 rsq31,rsq32,rsq33,
16072 &rinv11,&rinv12,&rinv13,
16073 &rinv21,&rinv22,&rinv23,
16074 &rinv31,&rinv32,&rinv33);
16076 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
16077 &rinv21,&rinv22,&rinv23,
16078 &rinv31,&rinv32,&rinv33);
16080 rinvsq11 = vec_madd(rinv11,rinv11,nul);
16081 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
16082 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
16083 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
16084 rinvsix = vec_madd(rinvsix,rinvsix,nul);
16085 vctot = vec_madd(rinv11,qqOOt,vctot);
16086 vctot = vec_madd(rinv12,qqOHt,vctot);
16087 vctot = vec_madd(rinv13,qqOHt,vctot);
16088 vnbtot = vec_madd(c12t,rinvsix,vnbtot);
16089 vctot = vec_madd(rinv21,qqOHt,vctot);
16090 vctot = vec_madd(rinv22,qqHHt,vctot);
16091 vctot = vec_madd(rinv23,qqHHt,vctot);
16092 vctot = vec_madd(rinv31,qqOHt,vctot);
16093 vctot = vec_madd(rinv32,qqHHt,vctot);
16094 vctot = vec_madd(rinv33,qqHHt,vctot);
16095 } else if(k<nj1) {
16096 jnra = jjnr[k];
16097 j3a = 3*jnra;
16098 load_1_water(pos+j3a,
16099 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16100 qqOOt = vec_sld(qqOO,nul,12);
16101 qqOHt = vec_sld(qqOH,nul,12);
16102 qqHHt = vec_sld(qqHH,nul,12);
16103 c6t = vec_sld(c6,nul,12);
16104 c12t = vec_sld(c12,nul,12);
16106 dx11 = vec_sub(ix1,jx1);
16107 dx12 = vec_sub(ix1,jx2);
16108 dx13 = vec_sub(ix1,jx3);
16109 dy11 = vec_sub(iy1,jy1);
16110 dy12 = vec_sub(iy1,jy2);
16111 dy13 = vec_sub(iy1,jy3);
16112 dz11 = vec_sub(iz1,jz1);
16113 dz12 = vec_sub(iz1,jz2);
16114 dz13 = vec_sub(iz1,jz3);
16115 dx21 = vec_sub(ix2,jx1);
16116 dx22 = vec_sub(ix2,jx2);
16117 dx23 = vec_sub(ix2,jx3);
16118 dy21 = vec_sub(iy2,jy1);
16119 dy22 = vec_sub(iy2,jy2);
16120 dy23 = vec_sub(iy2,jy3);
16121 dz21 = vec_sub(iz2,jz1);
16122 dz22 = vec_sub(iz2,jz2);
16123 dz23 = vec_sub(iz2,jz3);
16124 dx31 = vec_sub(ix3,jx1);
16125 dx32 = vec_sub(ix3,jx2);
16126 dx33 = vec_sub(ix3,jx3);
16127 dy31 = vec_sub(iy3,jy1);
16128 dy32 = vec_sub(iy3,jy2);
16129 dy33 = vec_sub(iy3,jy3);
16130 dz31 = vec_sub(iz3,jz1);
16131 dz32 = vec_sub(iz3,jz2);
16132 dz33 = vec_sub(iz3,jz3);
16134 rsq11 = vec_madd(dx11,dx11,nul);
16135 rsq12 = vec_madd(dx12,dx12,nul);
16136 rsq13 = vec_madd(dx13,dx13,nul);
16137 rsq21 = vec_madd(dx21,dx21,nul);
16138 rsq22 = vec_madd(dx22,dx22,nul);
16139 rsq23 = vec_madd(dx23,dx23,nul);
16140 rsq31 = vec_madd(dx31,dx31,nul);
16141 rsq32 = vec_madd(dx32,dx32,nul);
16142 rsq33 = vec_madd(dx33,dx33,nul);
16143 rsq11 = vec_madd(dy11,dy11,rsq11);
16144 rsq12 = vec_madd(dy12,dy12,rsq12);
16145 rsq13 = vec_madd(dy13,dy13,rsq13);
16146 rsq21 = vec_madd(dy21,dy21,rsq21);
16147 rsq22 = vec_madd(dy22,dy22,rsq22);
16148 rsq23 = vec_madd(dy23,dy23,rsq23);
16149 rsq31 = vec_madd(dy31,dy31,rsq31);
16150 rsq32 = vec_madd(dy32,dy32,rsq32);
16151 rsq33 = vec_madd(dy33,dy33,rsq33);
16152 rsq11 = vec_madd(dz11,dz11,rsq11);
16153 rsq12 = vec_madd(dz12,dz12,rsq12);
16154 rsq13 = vec_madd(dz13,dz13,rsq13);
16155 rsq21 = vec_madd(dz21,dz21,rsq21);
16156 rsq22 = vec_madd(dz22,dz22,rsq22);
16157 rsq23 = vec_madd(dz23,dz23,rsq23);
16158 rsq31 = vec_madd(dz31,dz31,rsq31);
16159 rsq32 = vec_madd(dz32,dz32,rsq32);
16160 rsq33 = vec_madd(dz33,dz33,rsq33);
16162 do_9_invsqrt(rsq11,rsq12,rsq13,
16163 rsq21,rsq22,rsq23,
16164 rsq31,rsq32,rsq33,
16165 &rinv11,&rinv12,&rinv13,
16166 &rinv21,&rinv22,&rinv23,
16167 &rinv31,&rinv32,&rinv33);
16169 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
16170 &rinv21,&rinv22,&rinv23,
16171 &rinv31,&rinv32,&rinv33);
16173 rinvsq11 = vec_madd(rinv11,rinv11,nul);
16174 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
16175 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
16176 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
16177 rinvsix = vec_madd(rinvsix,rinvsix,nul);
16178 vctot = vec_madd(rinv11,qqOOt,vctot);
16179 vctot = vec_madd(rinv12,qqOHt,vctot);
16180 vctot = vec_madd(rinv13,qqOHt,vctot);
16181 vnbtot = vec_madd(c12t,rinvsix,vnbtot);
16182 vctot = vec_madd(rinv21,qqOHt,vctot);
16183 vctot = vec_madd(rinv22,qqHHt,vctot);
16184 vctot = vec_madd(rinv23,qqHHt,vctot);
16185 vctot = vec_madd(rinv31,qqOHt,vctot);
16186 vctot = vec_madd(rinv32,qqHHt,vctot);
16187 vctot = vec_madd(rinv33,qqHHt,vctot);
16189 add_vector_to_float(Vc+gid[n],vctot);
16190 add_vector_to_float(Vnb+gid[n],vnbtot);
16198 void mcinl2030_altivec(
16199 int nri,
16200 int iinr[],
16201 int jindex[],
16202 int jjnr[],
16203 int shift[],
16204 float shiftvec[],
16205 int gid[],
16206 float pos[],
16207 float charge[],
16208 float facel,
16209 float Vc[],
16210 float krf,
16211 float crf)
16213 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
16214 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
16216 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
16217 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
16218 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
16220 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
16221 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
16223 vector float vfacel,nul;
16224 vector float vctot,qqOO,qqOH,qqHH,qO,qH,vkrf,vcrf;
16225 vector float krsq11,krsq12,krsq13,krsq21,krsq22,krsq23,krsq31,krsq32,krsq33;
16226 vector float qqOOt,qqOHt,qqHHt;
16228 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
16229 int jnra,jnrb,jnrc,jnrd;
16230 int j3a,j3b,j3c,j3d;
16232 nul=vec_zero();
16233 vfacel=load_float_and_splat(&facel);
16234 vkrf=load_float_and_splat(&krf);
16235 vcrf=load_float_and_splat(&crf);
16236 qO = load_float_and_splat(charge+iinr[0]);
16237 qH = load_float_and_splat(charge+iinr[0]+1);
16238 qqOO = vec_madd(qO,qO,nul);
16239 qqOH = vec_madd(qO,qH,nul);
16240 qqHH = vec_madd(qH,qH,nul);
16241 qqOO = vec_madd(qqOO,vfacel,nul);
16242 qqOH = vec_madd(qqOH,vfacel,nul);
16243 qqHH = vec_madd(qqHH,vfacel,nul);
16245 for(n=0;n<nri;n++) {
16246 is3 = 3*shift[n];
16247 ii = iinr[n];
16248 ii3 = 3*ii;
16249 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
16250 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
16251 vctot = nul;
16252 nj0 = jindex[n];
16253 nj1 = jindex[n+1];
16255 for(k=nj0; k<(nj1-3); k+=4) {
16256 jnra = jjnr[k];
16257 jnrb = jjnr[k+1];
16258 jnrc = jjnr[k+2];
16259 jnrd = jjnr[k+3];
16260 j3a = 3*jnra;
16261 j3b = 3*jnrb;
16262 j3c = 3*jnrc;
16263 j3d = 3*jnrd;
16264 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
16265 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16266 dx11 = vec_sub(ix1,jx1);
16267 dx12 = vec_sub(ix1,jx2);
16268 dx13 = vec_sub(ix1,jx3);
16269 dy11 = vec_sub(iy1,jy1);
16270 dy12 = vec_sub(iy1,jy2);
16271 dy13 = vec_sub(iy1,jy3);
16272 dz11 = vec_sub(iz1,jz1);
16273 dz12 = vec_sub(iz1,jz2);
16274 dz13 = vec_sub(iz1,jz3);
16275 dx21 = vec_sub(ix2,jx1);
16276 dx22 = vec_sub(ix2,jx2);
16277 dx23 = vec_sub(ix2,jx3);
16278 dy21 = vec_sub(iy2,jy1);
16279 dy22 = vec_sub(iy2,jy2);
16280 dy23 = vec_sub(iy2,jy3);
16281 dz21 = vec_sub(iz2,jz1);
16282 dz22 = vec_sub(iz2,jz2);
16283 dz23 = vec_sub(iz2,jz3);
16284 dx31 = vec_sub(ix3,jx1);
16285 dx32 = vec_sub(ix3,jx2);
16286 dx33 = vec_sub(ix3,jx3);
16287 dy31 = vec_sub(iy3,jy1);
16288 dy32 = vec_sub(iy3,jy2);
16289 dy33 = vec_sub(iy3,jy3);
16290 dz31 = vec_sub(iz3,jz1);
16291 dz32 = vec_sub(iz3,jz2);
16292 dz33 = vec_sub(iz3,jz3);
16294 rsq11 = vec_madd(dx11,dx11,nul);
16295 rsq12 = vec_madd(dx12,dx12,nul);
16296 rsq13 = vec_madd(dx13,dx13,nul);
16297 rsq21 = vec_madd(dx21,dx21,nul);
16298 rsq22 = vec_madd(dx22,dx22,nul);
16299 rsq23 = vec_madd(dx23,dx23,nul);
16300 rsq31 = vec_madd(dx31,dx31,nul);
16301 rsq32 = vec_madd(dx32,dx32,nul);
16302 rsq33 = vec_madd(dx33,dx33,nul);
16303 rsq11 = vec_madd(dy11,dy11,rsq11);
16304 rsq12 = vec_madd(dy12,dy12,rsq12);
16305 rsq13 = vec_madd(dy13,dy13,rsq13);
16306 rsq21 = vec_madd(dy21,dy21,rsq21);
16307 rsq22 = vec_madd(dy22,dy22,rsq22);
16308 rsq23 = vec_madd(dy23,dy23,rsq23);
16309 rsq31 = vec_madd(dy31,dy31,rsq31);
16310 rsq32 = vec_madd(dy32,dy32,rsq32);
16311 rsq33 = vec_madd(dy33,dy33,rsq33);
16312 rsq11 = vec_madd(dz11,dz11,rsq11);
16313 rsq12 = vec_madd(dz12,dz12,rsq12);
16314 rsq13 = vec_madd(dz13,dz13,rsq13);
16315 rsq21 = vec_madd(dz21,dz21,rsq21);
16316 rsq22 = vec_madd(dz22,dz22,rsq22);
16317 rsq23 = vec_madd(dz23,dz23,rsq23);
16318 rsq31 = vec_madd(dz31,dz31,rsq31);
16319 rsq32 = vec_madd(dz32,dz32,rsq32);
16320 rsq33 = vec_madd(dz33,dz33,rsq33);
16322 do_9_invsqrt(rsq11,rsq12,rsq13,
16323 rsq21,rsq22,rsq23,
16324 rsq31,rsq32,rsq33,
16325 &rinv11,&rinv12,&rinv13,
16326 &rinv21,&rinv22,&rinv23,
16327 &rinv31,&rinv32,&rinv33);
16329 krsq11 = vec_madd(vkrf,rsq11,nul);
16330 krsq12 = vec_madd(vkrf,rsq12,nul);
16331 krsq13 = vec_madd(vkrf,rsq13,nul);
16332 krsq21 = vec_madd(vkrf,rsq21,nul);
16333 krsq22 = vec_madd(vkrf,rsq22,nul);
16334 krsq23 = vec_madd(vkrf,rsq23,nul);
16335 krsq31 = vec_madd(vkrf,rsq31,nul);
16336 krsq32 = vec_madd(vkrf,rsq32,nul);
16337 krsq33 = vec_madd(vkrf,rsq33,nul);
16339 rinv11 = vec_add(rinv11,krsq11);
16340 rinv12 = vec_add(rinv12,krsq12);
16341 rinv13 = vec_add(rinv13,krsq13);
16342 rinv21 = vec_add(rinv21,krsq21);
16343 rinv22 = vec_add(rinv22,krsq22);
16344 rinv23 = vec_add(rinv23,krsq23);
16345 rinv31 = vec_add(rinv31,krsq31);
16346 rinv32 = vec_add(rinv32,krsq32);
16347 rinv33 = vec_add(rinv33,krsq33);
16349 rinv11 = vec_sub(rinv11,vcrf);
16350 rinv12 = vec_sub(rinv12,vcrf);
16351 rinv13 = vec_sub(rinv13,vcrf);
16352 rinv21 = vec_sub(rinv21,vcrf);
16353 rinv22 = vec_sub(rinv22,vcrf);
16354 rinv23 = vec_sub(rinv23,vcrf);
16355 rinv31 = vec_sub(rinv31,vcrf);
16356 rinv32 = vec_sub(rinv32,vcrf);
16357 rinv33 = vec_sub(rinv33,vcrf);
16359 vctot = vec_madd(qqOO,rinv11,vctot);
16360 vctot = vec_madd(qqOH,rinv12,vctot);
16361 vctot = vec_madd(qqOH,rinv13,vctot);
16362 vctot = vec_madd(qqOH,rinv21,vctot);
16363 vctot = vec_madd(qqHH,rinv22,vctot);
16364 vctot = vec_madd(qqHH,rinv23,vctot);
16365 vctot = vec_madd(qqOH,rinv31,vctot);
16366 vctot = vec_madd(qqHH,rinv32,vctot);
16367 vctot = vec_madd(qqHH,rinv33,vctot);
16369 if(k<(nj1-2)) {
16370 jnra = jjnr[k];
16371 jnrb = jjnr[k+1];
16372 jnrc = jjnr[k+2];
16373 j3a = 3*jnra;
16374 j3b = 3*jnrb;
16375 j3c = 3*jnrc;
16376 load_3_water(pos+j3a,pos+j3b,pos+j3c,
16377 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16378 qqOOt = vec_sld(qqOO,nul,4);
16379 qqOHt = vec_sld(qqOH,nul,4);
16380 qqHHt = vec_sld(qqHH,nul,4);
16382 dx11 = vec_sub(ix1,jx1);
16383 dx12 = vec_sub(ix1,jx2);
16384 dx13 = vec_sub(ix1,jx3);
16385 dy11 = vec_sub(iy1,jy1);
16386 dy12 = vec_sub(iy1,jy2);
16387 dy13 = vec_sub(iy1,jy3);
16388 dz11 = vec_sub(iz1,jz1);
16389 dz12 = vec_sub(iz1,jz2);
16390 dz13 = vec_sub(iz1,jz3);
16391 dx21 = vec_sub(ix2,jx1);
16392 dx22 = vec_sub(ix2,jx2);
16393 dx23 = vec_sub(ix2,jx3);
16394 dy21 = vec_sub(iy2,jy1);
16395 dy22 = vec_sub(iy2,jy2);
16396 dy23 = vec_sub(iy2,jy3);
16397 dz21 = vec_sub(iz2,jz1);
16398 dz22 = vec_sub(iz2,jz2);
16399 dz23 = vec_sub(iz2,jz3);
16400 dx31 = vec_sub(ix3,jx1);
16401 dx32 = vec_sub(ix3,jx2);
16402 dx33 = vec_sub(ix3,jx3);
16403 dy31 = vec_sub(iy3,jy1);
16404 dy32 = vec_sub(iy3,jy2);
16405 dy33 = vec_sub(iy3,jy3);
16406 dz31 = vec_sub(iz3,jz1);
16407 dz32 = vec_sub(iz3,jz2);
16408 dz33 = vec_sub(iz3,jz3);
16410 rsq11 = vec_madd(dx11,dx11,nul);
16411 rsq12 = vec_madd(dx12,dx12,nul);
16412 rsq13 = vec_madd(dx13,dx13,nul);
16413 rsq21 = vec_madd(dx21,dx21,nul);
16414 rsq22 = vec_madd(dx22,dx22,nul);
16415 rsq23 = vec_madd(dx23,dx23,nul);
16416 rsq31 = vec_madd(dx31,dx31,nul);
16417 rsq32 = vec_madd(dx32,dx32,nul);
16418 rsq33 = vec_madd(dx33,dx33,nul);
16419 rsq11 = vec_madd(dy11,dy11,rsq11);
16420 rsq12 = vec_madd(dy12,dy12,rsq12);
16421 rsq13 = vec_madd(dy13,dy13,rsq13);
16422 rsq21 = vec_madd(dy21,dy21,rsq21);
16423 rsq22 = vec_madd(dy22,dy22,rsq22);
16424 rsq23 = vec_madd(dy23,dy23,rsq23);
16425 rsq31 = vec_madd(dy31,dy31,rsq31);
16426 rsq32 = vec_madd(dy32,dy32,rsq32);
16427 rsq33 = vec_madd(dy33,dy33,rsq33);
16428 rsq11 = vec_madd(dz11,dz11,rsq11);
16429 rsq12 = vec_madd(dz12,dz12,rsq12);
16430 rsq13 = vec_madd(dz13,dz13,rsq13);
16431 rsq21 = vec_madd(dz21,dz21,rsq21);
16432 rsq22 = vec_madd(dz22,dz22,rsq22);
16433 rsq23 = vec_madd(dz23,dz23,rsq23);
16434 rsq31 = vec_madd(dz31,dz31,rsq31);
16435 rsq32 = vec_madd(dz32,dz32,rsq32);
16436 rsq33 = vec_madd(dz33,dz33,rsq33);
16438 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
16439 &rsq21,&rsq22,&rsq23,
16440 &rsq31,&rsq32,&rsq33);
16442 do_9_invsqrt(rsq11,rsq12,rsq13,
16443 rsq21,rsq22,rsq23,
16444 rsq31,rsq32,rsq33,
16445 &rinv11,&rinv12,&rinv13,
16446 &rinv21,&rinv22,&rinv23,
16447 &rinv31,&rinv32,&rinv33);
16449 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
16450 &rinv21,&rinv22,&rinv23,
16451 &rinv31,&rinv32,&rinv33);
16453 krsq11 = vec_madd(vkrf,rsq11,nul);
16454 krsq12 = vec_madd(vkrf,rsq12,nul);
16455 krsq13 = vec_madd(vkrf,rsq13,nul);
16456 krsq21 = vec_madd(vkrf,rsq21,nul);
16457 krsq22 = vec_madd(vkrf,rsq22,nul);
16458 krsq23 = vec_madd(vkrf,rsq23,nul);
16459 krsq31 = vec_madd(vkrf,rsq31,nul);
16460 krsq32 = vec_madd(vkrf,rsq32,nul);
16461 krsq33 = vec_madd(vkrf,rsq33,nul);
16463 rinv11 = vec_add(rinv11,krsq11);
16464 rinv12 = vec_add(rinv12,krsq12);
16465 rinv13 = vec_add(rinv13,krsq13);
16466 rinv21 = vec_add(rinv21,krsq21);
16467 rinv22 = vec_add(rinv22,krsq22);
16468 rinv23 = vec_add(rinv23,krsq23);
16469 rinv31 = vec_add(rinv31,krsq31);
16470 rinv32 = vec_add(rinv32,krsq32);
16471 rinv33 = vec_add(rinv33,krsq33);
16473 rinv11 = vec_sub(rinv11,vcrf);
16474 rinv12 = vec_sub(rinv12,vcrf);
16475 rinv13 = vec_sub(rinv13,vcrf);
16476 rinv21 = vec_sub(rinv21,vcrf);
16477 rinv22 = vec_sub(rinv22,vcrf);
16478 rinv23 = vec_sub(rinv23,vcrf);
16479 rinv31 = vec_sub(rinv31,vcrf);
16480 rinv32 = vec_sub(rinv32,vcrf);
16481 rinv33 = vec_sub(rinv33,vcrf);
16483 vctot = vec_madd(qqOOt,rinv11,vctot);
16484 vctot = vec_madd(qqOHt,rinv12,vctot);
16485 vctot = vec_madd(qqOHt,rinv13,vctot);
16486 vctot = vec_madd(qqOHt,rinv21,vctot);
16487 vctot = vec_madd(qqHHt,rinv22,vctot);
16488 vctot = vec_madd(qqHHt,rinv23,vctot);
16489 vctot = vec_madd(qqOHt,rinv31,vctot);
16490 vctot = vec_madd(qqHHt,rinv32,vctot);
16491 vctot = vec_madd(qqHHt,rinv33,vctot);
16492 } else if(k<(nj1-1)) {
16493 jnra = jjnr[k];
16494 jnrb = jjnr[k+1];
16495 j3a = 3*jnra;
16496 j3b = 3*jnrb;
16497 load_2_water(pos+j3a,pos+j3b,
16498 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16499 qqOOt = vec_sld(qqOO,nul,8);
16500 qqOHt = vec_sld(qqOH,nul,8);
16501 qqHHt = vec_sld(qqHH,nul,8);
16503 dx11 = vec_sub(ix1,jx1);
16504 dx12 = vec_sub(ix1,jx2);
16505 dx13 = vec_sub(ix1,jx3);
16506 dy11 = vec_sub(iy1,jy1);
16507 dy12 = vec_sub(iy1,jy2);
16508 dy13 = vec_sub(iy1,jy3);
16509 dz11 = vec_sub(iz1,jz1);
16510 dz12 = vec_sub(iz1,jz2);
16511 dz13 = vec_sub(iz1,jz3);
16512 dx21 = vec_sub(ix2,jx1);
16513 dx22 = vec_sub(ix2,jx2);
16514 dx23 = vec_sub(ix2,jx3);
16515 dy21 = vec_sub(iy2,jy1);
16516 dy22 = vec_sub(iy2,jy2);
16517 dy23 = vec_sub(iy2,jy3);
16518 dz21 = vec_sub(iz2,jz1);
16519 dz22 = vec_sub(iz2,jz2);
16520 dz23 = vec_sub(iz2,jz3);
16521 dx31 = vec_sub(ix3,jx1);
16522 dx32 = vec_sub(ix3,jx2);
16523 dx33 = vec_sub(ix3,jx3);
16524 dy31 = vec_sub(iy3,jy1);
16525 dy32 = vec_sub(iy3,jy2);
16526 dy33 = vec_sub(iy3,jy3);
16527 dz31 = vec_sub(iz3,jz1);
16528 dz32 = vec_sub(iz3,jz2);
16529 dz33 = vec_sub(iz3,jz3);
16531 rsq11 = vec_madd(dx11,dx11,nul);
16532 rsq12 = vec_madd(dx12,dx12,nul);
16533 rsq13 = vec_madd(dx13,dx13,nul);
16534 rsq21 = vec_madd(dx21,dx21,nul);
16535 rsq22 = vec_madd(dx22,dx22,nul);
16536 rsq23 = vec_madd(dx23,dx23,nul);
16537 rsq31 = vec_madd(dx31,dx31,nul);
16538 rsq32 = vec_madd(dx32,dx32,nul);
16539 rsq33 = vec_madd(dx33,dx33,nul);
16540 rsq11 = vec_madd(dy11,dy11,rsq11);
16541 rsq12 = vec_madd(dy12,dy12,rsq12);
16542 rsq13 = vec_madd(dy13,dy13,rsq13);
16543 rsq21 = vec_madd(dy21,dy21,rsq21);
16544 rsq22 = vec_madd(dy22,dy22,rsq22);
16545 rsq23 = vec_madd(dy23,dy23,rsq23);
16546 rsq31 = vec_madd(dy31,dy31,rsq31);
16547 rsq32 = vec_madd(dy32,dy32,rsq32);
16548 rsq33 = vec_madd(dy33,dy33,rsq33);
16549 rsq11 = vec_madd(dz11,dz11,rsq11);
16550 rsq12 = vec_madd(dz12,dz12,rsq12);
16551 rsq13 = vec_madd(dz13,dz13,rsq13);
16552 rsq21 = vec_madd(dz21,dz21,rsq21);
16553 rsq22 = vec_madd(dz22,dz22,rsq22);
16554 rsq23 = vec_madd(dz23,dz23,rsq23);
16555 rsq31 = vec_madd(dz31,dz31,rsq31);
16556 rsq32 = vec_madd(dz32,dz32,rsq32);
16557 rsq33 = vec_madd(dz33,dz33,rsq33);
16559 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
16560 &rsq21,&rsq22,&rsq23,
16561 &rsq31,&rsq32,&rsq33);
16563 do_9_invsqrt(rsq11,rsq12,rsq13,
16564 rsq21,rsq22,rsq23,
16565 rsq31,rsq32,rsq33,
16566 &rinv11,&rinv12,&rinv13,
16567 &rinv21,&rinv22,&rinv23,
16568 &rinv31,&rinv32,&rinv33);
16570 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
16571 &rinv21,&rinv22,&rinv23,
16572 &rinv31,&rinv32,&rinv33);
16574 krsq11 = vec_madd(vkrf,rsq11,nul);
16575 krsq12 = vec_madd(vkrf,rsq12,nul);
16576 krsq13 = vec_madd(vkrf,rsq13,nul);
16577 krsq21 = vec_madd(vkrf,rsq21,nul);
16578 krsq22 = vec_madd(vkrf,rsq22,nul);
16579 krsq23 = vec_madd(vkrf,rsq23,nul);
16580 krsq31 = vec_madd(vkrf,rsq31,nul);
16581 krsq32 = vec_madd(vkrf,rsq32,nul);
16582 krsq33 = vec_madd(vkrf,rsq33,nul);
16584 rinv11 = vec_add(rinv11,krsq11);
16585 rinv12 = vec_add(rinv12,krsq12);
16586 rinv13 = vec_add(rinv13,krsq13);
16587 rinv21 = vec_add(rinv21,krsq21);
16588 rinv22 = vec_add(rinv22,krsq22);
16589 rinv23 = vec_add(rinv23,krsq23);
16590 rinv31 = vec_add(rinv31,krsq31);
16591 rinv32 = vec_add(rinv32,krsq32);
16592 rinv33 = vec_add(rinv33,krsq33);
16594 rinv11 = vec_sub(rinv11,vcrf);
16595 rinv12 = vec_sub(rinv12,vcrf);
16596 rinv13 = vec_sub(rinv13,vcrf);
16597 rinv21 = vec_sub(rinv21,vcrf);
16598 rinv22 = vec_sub(rinv22,vcrf);
16599 rinv23 = vec_sub(rinv23,vcrf);
16600 rinv31 = vec_sub(rinv31,vcrf);
16601 rinv32 = vec_sub(rinv32,vcrf);
16602 rinv33 = vec_sub(rinv33,vcrf);
16604 vctot = vec_madd(qqOOt,rinv11,vctot);
16605 vctot = vec_madd(qqOHt,rinv12,vctot);
16606 vctot = vec_madd(qqOHt,rinv13,vctot);
16607 vctot = vec_madd(qqOHt,rinv21,vctot);
16608 vctot = vec_madd(qqHHt,rinv22,vctot);
16609 vctot = vec_madd(qqHHt,rinv23,vctot);
16610 vctot = vec_madd(qqOHt,rinv31,vctot);
16611 vctot = vec_madd(qqHHt,rinv32,vctot);
16612 vctot = vec_madd(qqHHt,rinv33,vctot);
16613 } else if(k<nj1) {
16614 jnra = jjnr[k];
16615 j3a = 3*jnra;
16616 load_1_water(pos+j3a,
16617 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16618 qqOOt = vec_sld(qqOO,nul,12);
16619 qqOHt = vec_sld(qqOH,nul,12);
16620 qqHHt = vec_sld(qqHH,nul,12);
16622 dx11 = vec_sub(ix1,jx1);
16623 dx12 = vec_sub(ix1,jx2);
16624 dx13 = vec_sub(ix1,jx3);
16625 dy11 = vec_sub(iy1,jy1);
16626 dy12 = vec_sub(iy1,jy2);
16627 dy13 = vec_sub(iy1,jy3);
16628 dz11 = vec_sub(iz1,jz1);
16629 dz12 = vec_sub(iz1,jz2);
16630 dz13 = vec_sub(iz1,jz3);
16631 dx21 = vec_sub(ix2,jx1);
16632 dx22 = vec_sub(ix2,jx2);
16633 dx23 = vec_sub(ix2,jx3);
16634 dy21 = vec_sub(iy2,jy1);
16635 dy22 = vec_sub(iy2,jy2);
16636 dy23 = vec_sub(iy2,jy3);
16637 dz21 = vec_sub(iz2,jz1);
16638 dz22 = vec_sub(iz2,jz2);
16639 dz23 = vec_sub(iz2,jz3);
16640 dx31 = vec_sub(ix3,jx1);
16641 dx32 = vec_sub(ix3,jx2);
16642 dx33 = vec_sub(ix3,jx3);
16643 dy31 = vec_sub(iy3,jy1);
16644 dy32 = vec_sub(iy3,jy2);
16645 dy33 = vec_sub(iy3,jy3);
16646 dz31 = vec_sub(iz3,jz1);
16647 dz32 = vec_sub(iz3,jz2);
16648 dz33 = vec_sub(iz3,jz3);
16650 rsq11 = vec_madd(dx11,dx11,nul);
16651 rsq12 = vec_madd(dx12,dx12,nul);
16652 rsq13 = vec_madd(dx13,dx13,nul);
16653 rsq21 = vec_madd(dx21,dx21,nul);
16654 rsq22 = vec_madd(dx22,dx22,nul);
16655 rsq23 = vec_madd(dx23,dx23,nul);
16656 rsq31 = vec_madd(dx31,dx31,nul);
16657 rsq32 = vec_madd(dx32,dx32,nul);
16658 rsq33 = vec_madd(dx33,dx33,nul);
16659 rsq11 = vec_madd(dy11,dy11,rsq11);
16660 rsq12 = vec_madd(dy12,dy12,rsq12);
16661 rsq13 = vec_madd(dy13,dy13,rsq13);
16662 rsq21 = vec_madd(dy21,dy21,rsq21);
16663 rsq22 = vec_madd(dy22,dy22,rsq22);
16664 rsq23 = vec_madd(dy23,dy23,rsq23);
16665 rsq31 = vec_madd(dy31,dy31,rsq31);
16666 rsq32 = vec_madd(dy32,dy32,rsq32);
16667 rsq33 = vec_madd(dy33,dy33,rsq33);
16668 rsq11 = vec_madd(dz11,dz11,rsq11);
16669 rsq12 = vec_madd(dz12,dz12,rsq12);
16670 rsq13 = vec_madd(dz13,dz13,rsq13);
16671 rsq21 = vec_madd(dz21,dz21,rsq21);
16672 rsq22 = vec_madd(dz22,dz22,rsq22);
16673 rsq23 = vec_madd(dz23,dz23,rsq23);
16674 rsq31 = vec_madd(dz31,dz31,rsq31);
16675 rsq32 = vec_madd(dz32,dz32,rsq32);
16676 rsq33 = vec_madd(dz33,dz33,rsq33);
16678 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
16679 &rsq21,&rsq22,&rsq23,
16680 &rsq31,&rsq32,&rsq33);
16682 do_9_invsqrt(rsq11,rsq12,rsq13,
16683 rsq21,rsq22,rsq23,
16684 rsq31,rsq32,rsq33,
16685 &rinv11,&rinv12,&rinv13,
16686 &rinv21,&rinv22,&rinv23,
16687 &rinv31,&rinv32,&rinv33);
16689 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
16690 &rinv21,&rinv22,&rinv23,
16691 &rinv31,&rinv32,&rinv33);
16693 krsq11 = vec_madd(vkrf,rsq11,nul);
16694 krsq12 = vec_madd(vkrf,rsq12,nul);
16695 krsq13 = vec_madd(vkrf,rsq13,nul);
16696 krsq21 = vec_madd(vkrf,rsq21,nul);
16697 krsq22 = vec_madd(vkrf,rsq22,nul);
16698 krsq23 = vec_madd(vkrf,rsq23,nul);
16699 krsq31 = vec_madd(vkrf,rsq31,nul);
16700 krsq32 = vec_madd(vkrf,rsq32,nul);
16701 krsq33 = vec_madd(vkrf,rsq33,nul);
16703 rinv11 = vec_add(rinv11,krsq11);
16704 rinv12 = vec_add(rinv12,krsq12);
16705 rinv13 = vec_add(rinv13,krsq13);
16706 rinv21 = vec_add(rinv21,krsq21);
16707 rinv22 = vec_add(rinv22,krsq22);
16708 rinv23 = vec_add(rinv23,krsq23);
16709 rinv31 = vec_add(rinv31,krsq31);
16710 rinv32 = vec_add(rinv32,krsq32);
16711 rinv33 = vec_add(rinv33,krsq33);
16713 rinv11 = vec_sub(rinv11,vcrf);
16714 rinv12 = vec_sub(rinv12,vcrf);
16715 rinv13 = vec_sub(rinv13,vcrf);
16716 rinv21 = vec_sub(rinv21,vcrf);
16717 rinv22 = vec_sub(rinv22,vcrf);
16718 rinv23 = vec_sub(rinv23,vcrf);
16719 rinv31 = vec_sub(rinv31,vcrf);
16720 rinv32 = vec_sub(rinv32,vcrf);
16721 rinv33 = vec_sub(rinv33,vcrf);
16723 vctot = vec_madd(qqOOt,rinv11,vctot);
16724 vctot = vec_madd(qqOHt,rinv12,vctot);
16725 vctot = vec_madd(qqOHt,rinv13,vctot);
16726 vctot = vec_madd(qqOHt,rinv21,vctot);
16727 vctot = vec_madd(qqHHt,rinv22,vctot);
16728 vctot = vec_madd(qqHHt,rinv23,vctot);
16729 vctot = vec_madd(qqOHt,rinv31,vctot);
16730 vctot = vec_madd(qqHHt,rinv32,vctot);
16731 vctot = vec_madd(qqHHt,rinv33,vctot);
16733 /* update outer data */
16734 add_vector_to_float(Vc+gid[n],vctot);
16740 void mcinl2130_altivec(
16741 int nri,
16742 int iinr[],
16743 int jindex[],
16744 int jjnr[],
16745 int shift[],
16746 float shiftvec[],
16747 int gid[],
16748 float pos[],
16749 float charge[],
16750 float facel,
16751 float Vc[],
16752 float krf,
16753 float crf,
16754 int type[],
16755 int ntype,
16756 float nbfp[],
16757 float Vnb[])
16759 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
16760 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
16762 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
16763 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
16764 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
16766 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
16767 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
16768 vector float rinvsq11,vkrf,vcrf;
16769 vector float krsq11,krsq12,krsq13,krsq21,krsq22,krsq23,krsq31,krsq32,krsq33;
16771 vector float vfacel,nul;
16772 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12,rinvsix;
16773 vector float vnbtot,qqOOt,qqOHt,qqHHt,c6t,c12t;
16775 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
16776 int jnra,jnrb,jnrc,jnrd,tp,tj;
16777 int j3a,j3b,j3c,j3d;
16779 nul=vec_zero();
16780 vfacel=load_float_and_splat(&facel);
16781 vkrf=load_float_and_splat(&krf);
16782 vcrf=load_float_and_splat(&crf);
16783 ii = iinr[0];
16784 qO = load_float_and_splat(charge+ii);
16785 qH = load_float_and_splat(charge+ii+1);
16786 qqOO = vec_madd(qO,qO,nul);
16787 qqOH = vec_madd(qO,qH,nul);
16788 qqHH = vec_madd(qH,qH,nul);
16789 qqOO = vec_madd(qqOO,vfacel,nul);
16790 qqOH = vec_madd(qqOH,vfacel,nul);
16791 qqHH = vec_madd(qqHH,vfacel,nul);
16792 tp = 2*type[ii];
16793 tj = (ntype+1)*tp;
16794 load_1_pair(nbfp+tj,&c6,&c12);
16795 c6 = vec_splat(c6,0);
16796 c12 = vec_splat(c12,0);
16798 for(n=0;n<nri;n++) {
16799 is3 = 3*shift[n];
16800 ii = iinr[n];
16801 ii3 = 3*ii;
16802 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
16803 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
16804 vctot = nul;
16805 vnbtot = nul;
16806 nj0 = jindex[n];
16807 nj1 = jindex[n+1];
16809 for(k=nj0; k<(nj1-3); k+=4) {
16810 jnra = jjnr[k];
16811 jnrb = jjnr[k+1];
16812 jnrc = jjnr[k+2];
16813 jnrd = jjnr[k+3];
16814 j3a = 3*jnra;
16815 j3b = 3*jnrb;
16816 j3c = 3*jnrc;
16817 j3d = 3*jnrd;
16818 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
16819 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16820 dx11 = vec_sub(ix1,jx1);
16821 dx12 = vec_sub(ix1,jx2);
16822 dx13 = vec_sub(ix1,jx3);
16823 dy11 = vec_sub(iy1,jy1);
16824 dy12 = vec_sub(iy1,jy2);
16825 dy13 = vec_sub(iy1,jy3);
16826 dz11 = vec_sub(iz1,jz1);
16827 dz12 = vec_sub(iz1,jz2);
16828 dz13 = vec_sub(iz1,jz3);
16829 dx21 = vec_sub(ix2,jx1);
16830 dx22 = vec_sub(ix2,jx2);
16831 dx23 = vec_sub(ix2,jx3);
16832 dy21 = vec_sub(iy2,jy1);
16833 dy22 = vec_sub(iy2,jy2);
16834 dy23 = vec_sub(iy2,jy3);
16835 dz21 = vec_sub(iz2,jz1);
16836 dz22 = vec_sub(iz2,jz2);
16837 dz23 = vec_sub(iz2,jz3);
16838 dx31 = vec_sub(ix3,jx1);
16839 dx32 = vec_sub(ix3,jx2);
16840 dx33 = vec_sub(ix3,jx3);
16841 dy31 = vec_sub(iy3,jy1);
16842 dy32 = vec_sub(iy3,jy2);
16843 dy33 = vec_sub(iy3,jy3);
16844 dz31 = vec_sub(iz3,jz1);
16845 dz32 = vec_sub(iz3,jz2);
16846 dz33 = vec_sub(iz3,jz3);
16848 rsq11 = vec_madd(dx11,dx11,nul);
16849 rsq12 = vec_madd(dx12,dx12,nul);
16850 rsq13 = vec_madd(dx13,dx13,nul);
16851 rsq21 = vec_madd(dx21,dx21,nul);
16852 rsq22 = vec_madd(dx22,dx22,nul);
16853 rsq23 = vec_madd(dx23,dx23,nul);
16854 rsq31 = vec_madd(dx31,dx31,nul);
16855 rsq32 = vec_madd(dx32,dx32,nul);
16856 rsq33 = vec_madd(dx33,dx33,nul);
16857 rsq11 = vec_madd(dy11,dy11,rsq11);
16858 rsq12 = vec_madd(dy12,dy12,rsq12);
16859 rsq13 = vec_madd(dy13,dy13,rsq13);
16860 rsq21 = vec_madd(dy21,dy21,rsq21);
16861 rsq22 = vec_madd(dy22,dy22,rsq22);
16862 rsq23 = vec_madd(dy23,dy23,rsq23);
16863 rsq31 = vec_madd(dy31,dy31,rsq31);
16864 rsq32 = vec_madd(dy32,dy32,rsq32);
16865 rsq33 = vec_madd(dy33,dy33,rsq33);
16866 rsq11 = vec_madd(dz11,dz11,rsq11);
16867 rsq12 = vec_madd(dz12,dz12,rsq12);
16868 rsq13 = vec_madd(dz13,dz13,rsq13);
16869 rsq21 = vec_madd(dz21,dz21,rsq21);
16870 rsq22 = vec_madd(dz22,dz22,rsq22);
16871 rsq23 = vec_madd(dz23,dz23,rsq23);
16872 rsq31 = vec_madd(dz31,dz31,rsq31);
16873 rsq32 = vec_madd(dz32,dz32,rsq32);
16874 rsq33 = vec_madd(dz33,dz33,rsq33);
16876 do_9_invsqrt(rsq11,rsq12,rsq13,
16877 rsq21,rsq22,rsq23,
16878 rsq31,rsq32,rsq33,
16879 &rinv11,&rinv12,&rinv13,
16880 &rinv21,&rinv22,&rinv23,
16881 &rinv31,&rinv32,&rinv33);
16883 rinvsq11 = vec_madd(rinv11,rinv11,nul);
16884 krsq11 = vec_madd(vkrf,rsq11,nul);
16885 krsq12 = vec_madd(vkrf,rsq12,nul);
16886 krsq13 = vec_madd(vkrf,rsq13,nul);
16887 krsq21 = vec_madd(vkrf,rsq21,nul);
16888 krsq22 = vec_madd(vkrf,rsq22,nul);
16889 krsq23 = vec_madd(vkrf,rsq23,nul);
16890 krsq31 = vec_madd(vkrf,rsq31,nul);
16891 krsq32 = vec_madd(vkrf,rsq32,nul);
16892 krsq33 = vec_madd(vkrf,rsq33,nul);
16894 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
16895 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
16896 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
16897 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
16899 rinv11 = vec_add(rinv11,krsq11);
16900 rinv12 = vec_add(rinv12,krsq12);
16901 rinv13 = vec_add(rinv13,krsq13);
16902 rinv21 = vec_add(rinv21,krsq21);
16903 rinv22 = vec_add(rinv22,krsq22);
16904 rinv23 = vec_add(rinv23,krsq23);
16905 rinv31 = vec_add(rinv31,krsq31);
16906 rinv32 = vec_add(rinv32,krsq32);
16907 rinv33 = vec_add(rinv33,krsq33);
16909 rinv11 = vec_sub(rinv11,vcrf);
16910 rinv12 = vec_sub(rinv12,vcrf);
16911 rinv13 = vec_sub(rinv13,vcrf);
16912 rinv21 = vec_sub(rinv21,vcrf);
16913 rinv22 = vec_sub(rinv22,vcrf);
16914 rinv23 = vec_sub(rinv23,vcrf);
16915 rinv31 = vec_sub(rinv31,vcrf);
16916 rinv32 = vec_sub(rinv32,vcrf);
16917 rinv33 = vec_sub(rinv33,vcrf);
16919 vctot = vec_madd(qqOO,rinv11,vctot);
16920 vctot = vec_madd(qqOH,rinv12,vctot);
16921 vctot = vec_madd(qqOH,rinv13,vctot);
16922 vctot = vec_madd(qqOH,rinv21,vctot);
16923 vctot = vec_madd(qqHH,rinv22,vctot);
16924 vctot = vec_madd(qqHH,rinv23,vctot);
16925 vctot = vec_madd(qqOH,rinv31,vctot);
16926 vctot = vec_madd(qqHH,rinv32,vctot);
16927 vctot = vec_madd(qqHH,rinv33,vctot);
16929 if(k<(nj1-2)) {
16930 jnra = jjnr[k];
16931 jnrb = jjnr[k+1];
16932 jnrc = jjnr[k+2];
16933 j3a = 3*jnra;
16934 j3b = 3*jnrb;
16935 j3c = 3*jnrc;
16936 load_3_water(pos+j3a,pos+j3b,pos+j3c,
16937 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
16938 qqOOt = vec_sld(qqOO,nul,4);
16939 qqOHt = vec_sld(qqOH,nul,4);
16940 qqHHt = vec_sld(qqHH,nul,4);
16941 c6t = vec_sld(c6,nul,4);
16942 c12t = vec_sld(c12,nul,4);
16944 dx11 = vec_sub(ix1,jx1);
16945 dx12 = vec_sub(ix1,jx2);
16946 dx13 = vec_sub(ix1,jx3);
16947 dy11 = vec_sub(iy1,jy1);
16948 dy12 = vec_sub(iy1,jy2);
16949 dy13 = vec_sub(iy1,jy3);
16950 dz11 = vec_sub(iz1,jz1);
16951 dz12 = vec_sub(iz1,jz2);
16952 dz13 = vec_sub(iz1,jz3);
16953 dx21 = vec_sub(ix2,jx1);
16954 dx22 = vec_sub(ix2,jx2);
16955 dx23 = vec_sub(ix2,jx3);
16956 dy21 = vec_sub(iy2,jy1);
16957 dy22 = vec_sub(iy2,jy2);
16958 dy23 = vec_sub(iy2,jy3);
16959 dz21 = vec_sub(iz2,jz1);
16960 dz22 = vec_sub(iz2,jz2);
16961 dz23 = vec_sub(iz2,jz3);
16962 dx31 = vec_sub(ix3,jx1);
16963 dx32 = vec_sub(ix3,jx2);
16964 dx33 = vec_sub(ix3,jx3);
16965 dy31 = vec_sub(iy3,jy1);
16966 dy32 = vec_sub(iy3,jy2);
16967 dy33 = vec_sub(iy3,jy3);
16968 dz31 = vec_sub(iz3,jz1);
16969 dz32 = vec_sub(iz3,jz2);
16970 dz33 = vec_sub(iz3,jz3);
16972 rsq11 = vec_madd(dx11,dx11,nul);
16973 rsq12 = vec_madd(dx12,dx12,nul);
16974 rsq13 = vec_madd(dx13,dx13,nul);
16975 rsq21 = vec_madd(dx21,dx21,nul);
16976 rsq22 = vec_madd(dx22,dx22,nul);
16977 rsq23 = vec_madd(dx23,dx23,nul);
16978 rsq31 = vec_madd(dx31,dx31,nul);
16979 rsq32 = vec_madd(dx32,dx32,nul);
16980 rsq33 = vec_madd(dx33,dx33,nul);
16981 rsq11 = vec_madd(dy11,dy11,rsq11);
16982 rsq12 = vec_madd(dy12,dy12,rsq12);
16983 rsq13 = vec_madd(dy13,dy13,rsq13);
16984 rsq21 = vec_madd(dy21,dy21,rsq21);
16985 rsq22 = vec_madd(dy22,dy22,rsq22);
16986 rsq23 = vec_madd(dy23,dy23,rsq23);
16987 rsq31 = vec_madd(dy31,dy31,rsq31);
16988 rsq32 = vec_madd(dy32,dy32,rsq32);
16989 rsq33 = vec_madd(dy33,dy33,rsq33);
16990 rsq11 = vec_madd(dz11,dz11,rsq11);
16991 rsq12 = vec_madd(dz12,dz12,rsq12);
16992 rsq13 = vec_madd(dz13,dz13,rsq13);
16993 rsq21 = vec_madd(dz21,dz21,rsq21);
16994 rsq22 = vec_madd(dz22,dz22,rsq22);
16995 rsq23 = vec_madd(dz23,dz23,rsq23);
16996 rsq31 = vec_madd(dz31,dz31,rsq31);
16997 rsq32 = vec_madd(dz32,dz32,rsq32);
16998 rsq33 = vec_madd(dz33,dz33,rsq33);
17000 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
17001 &rsq21,&rsq22,&rsq23,
17002 &rsq31,&rsq32,&rsq33);
17004 do_9_invsqrt(rsq11,rsq12,rsq13,
17005 rsq21,rsq22,rsq23,
17006 rsq31,rsq32,rsq33,
17007 &rinv11,&rinv12,&rinv13,
17008 &rinv21,&rinv22,&rinv23,
17009 &rinv31,&rinv32,&rinv33);
17011 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
17012 &rinv21,&rinv22,&rinv23,
17013 &rinv31,&rinv32,&rinv33);
17015 rinvsq11 = vec_madd(rinv11,rinv11,nul);
17016 krsq11 = vec_madd(vkrf,rsq11,nul);
17017 krsq12 = vec_madd(vkrf,rsq12,nul);
17018 krsq13 = vec_madd(vkrf,rsq13,nul);
17019 krsq21 = vec_madd(vkrf,rsq21,nul);
17020 krsq22 = vec_madd(vkrf,rsq22,nul);
17021 krsq23 = vec_madd(vkrf,rsq23,nul);
17022 krsq31 = vec_madd(vkrf,rsq31,nul);
17023 krsq32 = vec_madd(vkrf,rsq32,nul);
17024 krsq33 = vec_madd(vkrf,rsq33,nul);
17026 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
17027 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
17028 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
17029 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
17031 rinv11 = vec_add(rinv11,krsq11);
17032 rinv12 = vec_add(rinv12,krsq12);
17033 rinv13 = vec_add(rinv13,krsq13);
17034 rinv21 = vec_add(rinv21,krsq21);
17035 rinv22 = vec_add(rinv22,krsq22);
17036 rinv23 = vec_add(rinv23,krsq23);
17037 rinv31 = vec_add(rinv31,krsq31);
17038 rinv32 = vec_add(rinv32,krsq32);
17039 rinv33 = vec_add(rinv33,krsq33);
17041 rinv11 = vec_sub(rinv11,vcrf);
17042 rinv12 = vec_sub(rinv12,vcrf);
17043 rinv13 = vec_sub(rinv13,vcrf);
17044 rinv21 = vec_sub(rinv21,vcrf);
17045 rinv22 = vec_sub(rinv22,vcrf);
17046 rinv23 = vec_sub(rinv23,vcrf);
17047 rinv31 = vec_sub(rinv31,vcrf);
17048 rinv32 = vec_sub(rinv32,vcrf);
17049 rinv33 = vec_sub(rinv33,vcrf);
17051 vctot = vec_madd(qqOOt,rinv11,vctot);
17052 vctot = vec_madd(qqOHt,rinv12,vctot);
17053 vctot = vec_madd(qqOHt,rinv13,vctot);
17054 vctot = vec_madd(qqOHt,rinv21,vctot);
17055 vctot = vec_madd(qqHHt,rinv22,vctot);
17056 vctot = vec_madd(qqHHt,rinv23,vctot);
17057 vctot = vec_madd(qqOHt,rinv31,vctot);
17058 vctot = vec_madd(qqHHt,rinv32,vctot);
17059 vctot = vec_madd(qqHHt,rinv33,vctot);
17060 } else if(k<(nj1-1)) {
17061 jnra = jjnr[k];
17062 jnrb = jjnr[k+1];
17063 j3a = 3*jnra;
17064 j3b = 3*jnrb;
17065 load_2_water(pos+j3a,pos+j3b,
17066 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17067 qqOOt = vec_sld(qqOO,nul,8);
17068 qqOHt = vec_sld(qqOH,nul,8);
17069 qqHHt = vec_sld(qqHH,nul,8);
17070 c6t = vec_sld(c6,nul,8);
17071 c12t = vec_sld(c12,nul,8);
17073 dx11 = vec_sub(ix1,jx1);
17074 dx12 = vec_sub(ix1,jx2);
17075 dx13 = vec_sub(ix1,jx3);
17076 dy11 = vec_sub(iy1,jy1);
17077 dy12 = vec_sub(iy1,jy2);
17078 dy13 = vec_sub(iy1,jy3);
17079 dz11 = vec_sub(iz1,jz1);
17080 dz12 = vec_sub(iz1,jz2);
17081 dz13 = vec_sub(iz1,jz3);
17082 dx21 = vec_sub(ix2,jx1);
17083 dx22 = vec_sub(ix2,jx2);
17084 dx23 = vec_sub(ix2,jx3);
17085 dy21 = vec_sub(iy2,jy1);
17086 dy22 = vec_sub(iy2,jy2);
17087 dy23 = vec_sub(iy2,jy3);
17088 dz21 = vec_sub(iz2,jz1);
17089 dz22 = vec_sub(iz2,jz2);
17090 dz23 = vec_sub(iz2,jz3);
17091 dx31 = vec_sub(ix3,jx1);
17092 dx32 = vec_sub(ix3,jx2);
17093 dx33 = vec_sub(ix3,jx3);
17094 dy31 = vec_sub(iy3,jy1);
17095 dy32 = vec_sub(iy3,jy2);
17096 dy33 = vec_sub(iy3,jy3);
17097 dz31 = vec_sub(iz3,jz1);
17098 dz32 = vec_sub(iz3,jz2);
17099 dz33 = vec_sub(iz3,jz3);
17101 rsq11 = vec_madd(dx11,dx11,nul);
17102 rsq12 = vec_madd(dx12,dx12,nul);
17103 rsq13 = vec_madd(dx13,dx13,nul);
17104 rsq21 = vec_madd(dx21,dx21,nul);
17105 rsq22 = vec_madd(dx22,dx22,nul);
17106 rsq23 = vec_madd(dx23,dx23,nul);
17107 rsq31 = vec_madd(dx31,dx31,nul);
17108 rsq32 = vec_madd(dx32,dx32,nul);
17109 rsq33 = vec_madd(dx33,dx33,nul);
17110 rsq11 = vec_madd(dy11,dy11,rsq11);
17111 rsq12 = vec_madd(dy12,dy12,rsq12);
17112 rsq13 = vec_madd(dy13,dy13,rsq13);
17113 rsq21 = vec_madd(dy21,dy21,rsq21);
17114 rsq22 = vec_madd(dy22,dy22,rsq22);
17115 rsq23 = vec_madd(dy23,dy23,rsq23);
17116 rsq31 = vec_madd(dy31,dy31,rsq31);
17117 rsq32 = vec_madd(dy32,dy32,rsq32);
17118 rsq33 = vec_madd(dy33,dy33,rsq33);
17119 rsq11 = vec_madd(dz11,dz11,rsq11);
17120 rsq12 = vec_madd(dz12,dz12,rsq12);
17121 rsq13 = vec_madd(dz13,dz13,rsq13);
17122 rsq21 = vec_madd(dz21,dz21,rsq21);
17123 rsq22 = vec_madd(dz22,dz22,rsq22);
17124 rsq23 = vec_madd(dz23,dz23,rsq23);
17125 rsq31 = vec_madd(dz31,dz31,rsq31);
17126 rsq32 = vec_madd(dz32,dz32,rsq32);
17127 rsq33 = vec_madd(dz33,dz33,rsq33);
17129 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
17130 &rsq21,&rsq22,&rsq23,
17131 &rsq31,&rsq32,&rsq33);
17133 do_9_invsqrt(rsq11,rsq12,rsq13,
17134 rsq21,rsq22,rsq23,
17135 rsq31,rsq32,rsq33,
17136 &rinv11,&rinv12,&rinv13,
17137 &rinv21,&rinv22,&rinv23,
17138 &rinv31,&rinv32,&rinv33);
17140 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
17141 &rinv21,&rinv22,&rinv23,
17142 &rinv31,&rinv32,&rinv33);
17144 rinvsq11 = vec_madd(rinv11,rinv11,nul);
17145 krsq11 = vec_madd(vkrf,rsq11,nul);
17146 krsq12 = vec_madd(vkrf,rsq12,nul);
17147 krsq13 = vec_madd(vkrf,rsq13,nul);
17148 krsq21 = vec_madd(vkrf,rsq21,nul);
17149 krsq22 = vec_madd(vkrf,rsq22,nul);
17150 krsq23 = vec_madd(vkrf,rsq23,nul);
17151 krsq31 = vec_madd(vkrf,rsq31,nul);
17152 krsq32 = vec_madd(vkrf,rsq32,nul);
17153 krsq33 = vec_madd(vkrf,rsq33,nul);
17155 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
17156 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
17157 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
17158 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
17160 rinv11 = vec_add(rinv11,krsq11);
17161 rinv12 = vec_add(rinv12,krsq12);
17162 rinv13 = vec_add(rinv13,krsq13);
17163 rinv21 = vec_add(rinv21,krsq21);
17164 rinv22 = vec_add(rinv22,krsq22);
17165 rinv23 = vec_add(rinv23,krsq23);
17166 rinv31 = vec_add(rinv31,krsq31);
17167 rinv32 = vec_add(rinv32,krsq32);
17168 rinv33 = vec_add(rinv33,krsq33);
17170 rinv11 = vec_sub(rinv11,vcrf);
17171 rinv12 = vec_sub(rinv12,vcrf);
17172 rinv13 = vec_sub(rinv13,vcrf);
17173 rinv21 = vec_sub(rinv21,vcrf);
17174 rinv22 = vec_sub(rinv22,vcrf);
17175 rinv23 = vec_sub(rinv23,vcrf);
17176 rinv31 = vec_sub(rinv31,vcrf);
17177 rinv32 = vec_sub(rinv32,vcrf);
17178 rinv33 = vec_sub(rinv33,vcrf);
17180 vctot = vec_madd(qqOOt,rinv11,vctot);
17181 vctot = vec_madd(qqOHt,rinv12,vctot);
17182 vctot = vec_madd(qqOHt,rinv13,vctot);
17183 vctot = vec_madd(qqOHt,rinv21,vctot);
17184 vctot = vec_madd(qqHHt,rinv22,vctot);
17185 vctot = vec_madd(qqHHt,rinv23,vctot);
17186 vctot = vec_madd(qqOHt,rinv31,vctot);
17187 vctot = vec_madd(qqHHt,rinv32,vctot);
17188 vctot = vec_madd(qqHHt,rinv33,vctot);
17189 } else if(k<nj1) {
17190 jnra = jjnr[k];
17191 j3a = 3*jnra;
17192 load_1_water(pos+j3a,
17193 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17194 qqOOt = vec_sld(qqOO,nul,12);
17195 qqOHt = vec_sld(qqOH,nul,12);
17196 qqHHt = vec_sld(qqHH,nul,12);
17197 c6t = vec_sld(c6,nul,12);
17198 c12t = vec_sld(c12,nul,12);
17200 dx11 = vec_sub(ix1,jx1);
17201 dx12 = vec_sub(ix1,jx2);
17202 dx13 = vec_sub(ix1,jx3);
17203 dy11 = vec_sub(iy1,jy1);
17204 dy12 = vec_sub(iy1,jy2);
17205 dy13 = vec_sub(iy1,jy3);
17206 dz11 = vec_sub(iz1,jz1);
17207 dz12 = vec_sub(iz1,jz2);
17208 dz13 = vec_sub(iz1,jz3);
17209 dx21 = vec_sub(ix2,jx1);
17210 dx22 = vec_sub(ix2,jx2);
17211 dx23 = vec_sub(ix2,jx3);
17212 dy21 = vec_sub(iy2,jy1);
17213 dy22 = vec_sub(iy2,jy2);
17214 dy23 = vec_sub(iy2,jy3);
17215 dz21 = vec_sub(iz2,jz1);
17216 dz22 = vec_sub(iz2,jz2);
17217 dz23 = vec_sub(iz2,jz3);
17218 dx31 = vec_sub(ix3,jx1);
17219 dx32 = vec_sub(ix3,jx2);
17220 dx33 = vec_sub(ix3,jx3);
17221 dy31 = vec_sub(iy3,jy1);
17222 dy32 = vec_sub(iy3,jy2);
17223 dy33 = vec_sub(iy3,jy3);
17224 dz31 = vec_sub(iz3,jz1);
17225 dz32 = vec_sub(iz3,jz2);
17226 dz33 = vec_sub(iz3,jz3);
17228 rsq11 = vec_madd(dx11,dx11,nul);
17229 rsq12 = vec_madd(dx12,dx12,nul);
17230 rsq13 = vec_madd(dx13,dx13,nul);
17231 rsq21 = vec_madd(dx21,dx21,nul);
17232 rsq22 = vec_madd(dx22,dx22,nul);
17233 rsq23 = vec_madd(dx23,dx23,nul);
17234 rsq31 = vec_madd(dx31,dx31,nul);
17235 rsq32 = vec_madd(dx32,dx32,nul);
17236 rsq33 = vec_madd(dx33,dx33,nul);
17237 rsq11 = vec_madd(dy11,dy11,rsq11);
17238 rsq12 = vec_madd(dy12,dy12,rsq12);
17239 rsq13 = vec_madd(dy13,dy13,rsq13);
17240 rsq21 = vec_madd(dy21,dy21,rsq21);
17241 rsq22 = vec_madd(dy22,dy22,rsq22);
17242 rsq23 = vec_madd(dy23,dy23,rsq23);
17243 rsq31 = vec_madd(dy31,dy31,rsq31);
17244 rsq32 = vec_madd(dy32,dy32,rsq32);
17245 rsq33 = vec_madd(dy33,dy33,rsq33);
17246 rsq11 = vec_madd(dz11,dz11,rsq11);
17247 rsq12 = vec_madd(dz12,dz12,rsq12);
17248 rsq13 = vec_madd(dz13,dz13,rsq13);
17249 rsq21 = vec_madd(dz21,dz21,rsq21);
17250 rsq22 = vec_madd(dz22,dz22,rsq22);
17251 rsq23 = vec_madd(dz23,dz23,rsq23);
17252 rsq31 = vec_madd(dz31,dz31,rsq31);
17253 rsq32 = vec_madd(dz32,dz32,rsq32);
17254 rsq33 = vec_madd(dz33,dz33,rsq33);
17256 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
17257 &rsq21,&rsq22,&rsq23,
17258 &rsq31,&rsq32,&rsq33);
17260 do_9_invsqrt(rsq11,rsq12,rsq13,
17261 rsq21,rsq22,rsq23,
17262 rsq31,rsq32,rsq33,
17263 &rinv11,&rinv12,&rinv13,
17264 &rinv21,&rinv22,&rinv23,
17265 &rinv31,&rinv32,&rinv33);
17267 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
17268 &rinv21,&rinv22,&rinv23,
17269 &rinv31,&rinv32,&rinv33);
17271 rinvsq11 = vec_madd(rinv11,rinv11,nul);
17272 krsq11 = vec_madd(vkrf,rsq11,nul);
17273 krsq12 = vec_madd(vkrf,rsq12,nul);
17274 krsq13 = vec_madd(vkrf,rsq13,nul);
17275 krsq21 = vec_madd(vkrf,rsq21,nul);
17276 krsq22 = vec_madd(vkrf,rsq22,nul);
17277 krsq23 = vec_madd(vkrf,rsq23,nul);
17278 krsq31 = vec_madd(vkrf,rsq31,nul);
17279 krsq32 = vec_madd(vkrf,rsq32,nul);
17280 krsq33 = vec_madd(vkrf,rsq33,nul);
17282 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
17283 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
17284 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
17285 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
17287 rinv11 = vec_add(rinv11,krsq11);
17288 rinv12 = vec_add(rinv12,krsq12);
17289 rinv13 = vec_add(rinv13,krsq13);
17290 rinv21 = vec_add(rinv21,krsq21);
17291 rinv22 = vec_add(rinv22,krsq22);
17292 rinv23 = vec_add(rinv23,krsq23);
17293 rinv31 = vec_add(rinv31,krsq31);
17294 rinv32 = vec_add(rinv32,krsq32);
17295 rinv33 = vec_add(rinv33,krsq33);
17297 rinv11 = vec_sub(rinv11,vcrf);
17298 rinv12 = vec_sub(rinv12,vcrf);
17299 rinv13 = vec_sub(rinv13,vcrf);
17300 rinv21 = vec_sub(rinv21,vcrf);
17301 rinv22 = vec_sub(rinv22,vcrf);
17302 rinv23 = vec_sub(rinv23,vcrf);
17303 rinv31 = vec_sub(rinv31,vcrf);
17304 rinv32 = vec_sub(rinv32,vcrf);
17305 rinv33 = vec_sub(rinv33,vcrf);
17307 vctot = vec_madd(qqOOt,rinv11,vctot);
17308 vctot = vec_madd(qqOHt,rinv12,vctot);
17309 vctot = vec_madd(qqOHt,rinv13,vctot);
17310 vctot = vec_madd(qqOHt,rinv21,vctot);
17311 vctot = vec_madd(qqHHt,rinv22,vctot);
17312 vctot = vec_madd(qqHHt,rinv23,vctot);
17313 vctot = vec_madd(qqOHt,rinv31,vctot);
17314 vctot = vec_madd(qqHHt,rinv32,vctot);
17315 vctot = vec_madd(qqHHt,rinv33,vctot);
17317 /* update outer data */
17318 add_vector_to_float(Vc+gid[n],vctot);
17319 add_vector_to_float(Vnb+gid[n],vnbtot);
17325 void mcinl3030_altivec(
17326 int nri,
17327 int iinr[],
17328 int jindex[],
17329 int jjnr[],
17330 int shift[],
17331 float shiftvec[],
17332 int gid[],
17333 float pos[],
17334 float charge[],
17335 float facel,
17336 float Vc[],
17337 float tabscale,
17338 float VFtab[])
17340 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
17341 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
17343 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
17344 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
17345 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
17347 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
17348 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
17349 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
17351 vector float vfacel,nul;
17352 vector float vctot,qqOO,qqOH,qqHH,qO,qH,tsc;
17353 vector float VV11c,VV12c,VV13c;
17354 vector float VV21c,VV22c,VV23c;
17355 vector float VV31c,VV32c,VV33c;
17356 vector float qqOOt,qqOHt,qqHHt;
17358 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
17359 int jnra,jnrb,jnrc,jnrd;
17360 int j3a,j3b,j3c,j3d;
17362 nul=vec_zero();
17363 vfacel=load_float_and_splat(&facel);
17364 tsc=load_float_and_splat(&tabscale);
17365 qO = load_float_and_splat(charge+iinr[0]);
17366 qH = load_float_and_splat(charge+iinr[0]+1);
17367 qqOO = vec_madd(qO,qO,nul);
17368 qqOH = vec_madd(qO,qH,nul);
17369 qqHH = vec_madd(qH,qH,nul);
17370 qqOO = vec_madd(qqOO,vfacel,nul);
17371 qqOH = vec_madd(qqOH,vfacel,nul);
17372 qqHH = vec_madd(qqHH,vfacel,nul);
17374 for(n=0;n<nri;n++) {
17375 is3 = 3*shift[n];
17376 ii = iinr[n];
17377 ii3 = 3*ii;
17378 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
17379 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
17380 vctot = nul;
17381 nj0 = jindex[n];
17382 nj1 = jindex[n+1];
17384 for(k=nj0; k<(nj1-3); k+=4) {
17385 jnra = jjnr[k];
17386 jnrb = jjnr[k+1];
17387 jnrc = jjnr[k+2];
17388 jnrd = jjnr[k+3];
17389 j3a = 3*jnra;
17390 j3b = 3*jnrb;
17391 j3c = 3*jnrc;
17392 j3d = 3*jnrd;
17393 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
17394 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17395 dx11 = vec_sub(ix1,jx1);
17396 dx12 = vec_sub(ix1,jx2);
17397 dx13 = vec_sub(ix1,jx3);
17398 dy11 = vec_sub(iy1,jy1);
17399 dy12 = vec_sub(iy1,jy2);
17400 dy13 = vec_sub(iy1,jy3);
17401 dz11 = vec_sub(iz1,jz1);
17402 dz12 = vec_sub(iz1,jz2);
17403 dz13 = vec_sub(iz1,jz3);
17404 dx21 = vec_sub(ix2,jx1);
17405 dx22 = vec_sub(ix2,jx2);
17406 dx23 = vec_sub(ix2,jx3);
17407 dy21 = vec_sub(iy2,jy1);
17408 dy22 = vec_sub(iy2,jy2);
17409 dy23 = vec_sub(iy2,jy3);
17410 dz21 = vec_sub(iz2,jz1);
17411 dz22 = vec_sub(iz2,jz2);
17412 dz23 = vec_sub(iz2,jz3);
17413 dx31 = vec_sub(ix3,jx1);
17414 dx32 = vec_sub(ix3,jx2);
17415 dx33 = vec_sub(ix3,jx3);
17416 dy31 = vec_sub(iy3,jy1);
17417 dy32 = vec_sub(iy3,jy2);
17418 dy33 = vec_sub(iy3,jy3);
17419 dz31 = vec_sub(iz3,jz1);
17420 dz32 = vec_sub(iz3,jz2);
17421 dz33 = vec_sub(iz3,jz3);
17423 rsq11 = vec_madd(dx11,dx11,nul);
17424 rsq12 = vec_madd(dx12,dx12,nul);
17425 rsq13 = vec_madd(dx13,dx13,nul);
17426 rsq21 = vec_madd(dx21,dx21,nul);
17427 rsq22 = vec_madd(dx22,dx22,nul);
17428 rsq23 = vec_madd(dx23,dx23,nul);
17429 rsq31 = vec_madd(dx31,dx31,nul);
17430 rsq32 = vec_madd(dx32,dx32,nul);
17431 rsq33 = vec_madd(dx33,dx33,nul);
17432 rsq11 = vec_madd(dy11,dy11,rsq11);
17433 rsq12 = vec_madd(dy12,dy12,rsq12);
17434 rsq13 = vec_madd(dy13,dy13,rsq13);
17435 rsq21 = vec_madd(dy21,dy21,rsq21);
17436 rsq22 = vec_madd(dy22,dy22,rsq22);
17437 rsq23 = vec_madd(dy23,dy23,rsq23);
17438 rsq31 = vec_madd(dy31,dy31,rsq31);
17439 rsq32 = vec_madd(dy32,dy32,rsq32);
17440 rsq33 = vec_madd(dy33,dy33,rsq33);
17441 rsq11 = vec_madd(dz11,dz11,rsq11);
17442 rsq12 = vec_madd(dz12,dz12,rsq12);
17443 rsq13 = vec_madd(dz13,dz13,rsq13);
17444 rsq21 = vec_madd(dz21,dz21,rsq21);
17445 rsq22 = vec_madd(dz22,dz22,rsq22);
17446 rsq23 = vec_madd(dz23,dz23,rsq23);
17447 rsq31 = vec_madd(dz31,dz31,rsq31);
17448 rsq32 = vec_madd(dz32,dz32,rsq32);
17449 rsq33 = vec_madd(dz33,dz33,rsq33);
17451 do_9_invsqrt(rsq11,rsq12,rsq13,
17452 rsq21,rsq22,rsq23,
17453 rsq31,rsq32,rsq33,
17454 &rinv11,&rinv12,&rinv13,
17455 &rinv21,&rinv22,&rinv23,
17456 &rinv31,&rinv32,&rinv33);
17458 r11 = vec_madd(rsq11,rinv11,nul);
17459 r12 = vec_madd(rsq12,rinv12,nul);
17460 r13 = vec_madd(rsq13,rinv13,nul);
17461 r21 = vec_madd(rsq21,rinv21,nul);
17462 r22 = vec_madd(rsq22,rinv22,nul);
17463 r23 = vec_madd(rsq23,rinv23,nul);
17464 r31 = vec_madd(rsq31,rinv31,nul);
17465 r32 = vec_madd(rsq32,rinv32,nul);
17466 r33 = vec_madd(rsq33,rinv33,nul);
17468 do_vonly_4_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
17469 do_vonly_4_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
17470 do_vonly_4_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
17471 do_vonly_4_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
17472 do_vonly_4_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
17473 do_vonly_4_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
17474 do_vonly_4_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
17475 do_vonly_4_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
17476 do_vonly_4_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
17478 vctot = vec_madd(qqOO,VV11c,vctot);
17479 vctot = vec_madd(qqOH,VV12c,vctot);
17480 vctot = vec_madd(qqOH,VV13c,vctot);
17481 vctot = vec_madd(qqOH,VV21c,vctot);
17482 vctot = vec_madd(qqHH,VV22c,vctot);
17483 vctot = vec_madd(qqHH,VV23c,vctot);
17484 vctot = vec_madd(qqOH,VV31c,vctot);
17485 vctot = vec_madd(qqHH,VV32c,vctot);
17486 vctot = vec_madd(qqHH,VV33c,vctot);
17488 if(k<(nj1-2)) {
17489 jnra = jjnr[k];
17490 jnrb = jjnr[k+1];
17491 jnrc = jjnr[k+2];
17492 j3a = 3*jnra;
17493 j3b = 3*jnrb;
17494 j3c = 3*jnrc;
17495 load_3_water(pos+j3a,pos+j3b,pos+j3c,
17496 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17497 qqOOt = vec_sld(qqOO,nul,4);
17498 qqOHt = vec_sld(qqOH,nul,4);
17499 qqHHt = vec_sld(qqHH,nul,4);
17501 dx11 = vec_sub(ix1,jx1);
17502 dx12 = vec_sub(ix1,jx2);
17503 dx13 = vec_sub(ix1,jx3);
17504 dy11 = vec_sub(iy1,jy1);
17505 dy12 = vec_sub(iy1,jy2);
17506 dy13 = vec_sub(iy1,jy3);
17507 dz11 = vec_sub(iz1,jz1);
17508 dz12 = vec_sub(iz1,jz2);
17509 dz13 = vec_sub(iz1,jz3);
17510 dx21 = vec_sub(ix2,jx1);
17511 dx22 = vec_sub(ix2,jx2);
17512 dx23 = vec_sub(ix2,jx3);
17513 dy21 = vec_sub(iy2,jy1);
17514 dy22 = vec_sub(iy2,jy2);
17515 dy23 = vec_sub(iy2,jy3);
17516 dz21 = vec_sub(iz2,jz1);
17517 dz22 = vec_sub(iz2,jz2);
17518 dz23 = vec_sub(iz2,jz3);
17519 dx31 = vec_sub(ix3,jx1);
17520 dx32 = vec_sub(ix3,jx2);
17521 dx33 = vec_sub(ix3,jx3);
17522 dy31 = vec_sub(iy3,jy1);
17523 dy32 = vec_sub(iy3,jy2);
17524 dy33 = vec_sub(iy3,jy3);
17525 dz31 = vec_sub(iz3,jz1);
17526 dz32 = vec_sub(iz3,jz2);
17527 dz33 = vec_sub(iz3,jz3);
17529 rsq11 = vec_madd(dx11,dx11,nul);
17530 rsq12 = vec_madd(dx12,dx12,nul);
17531 rsq13 = vec_madd(dx13,dx13,nul);
17532 rsq21 = vec_madd(dx21,dx21,nul);
17533 rsq22 = vec_madd(dx22,dx22,nul);
17534 rsq23 = vec_madd(dx23,dx23,nul);
17535 rsq31 = vec_madd(dx31,dx31,nul);
17536 rsq32 = vec_madd(dx32,dx32,nul);
17537 rsq33 = vec_madd(dx33,dx33,nul);
17538 rsq11 = vec_madd(dy11,dy11,rsq11);
17539 rsq12 = vec_madd(dy12,dy12,rsq12);
17540 rsq13 = vec_madd(dy13,dy13,rsq13);
17541 rsq21 = vec_madd(dy21,dy21,rsq21);
17542 rsq22 = vec_madd(dy22,dy22,rsq22);
17543 rsq23 = vec_madd(dy23,dy23,rsq23);
17544 rsq31 = vec_madd(dy31,dy31,rsq31);
17545 rsq32 = vec_madd(dy32,dy32,rsq32);
17546 rsq33 = vec_madd(dy33,dy33,rsq33);
17547 rsq11 = vec_madd(dz11,dz11,rsq11);
17548 rsq12 = vec_madd(dz12,dz12,rsq12);
17549 rsq13 = vec_madd(dz13,dz13,rsq13);
17550 rsq21 = vec_madd(dz21,dz21,rsq21);
17551 rsq22 = vec_madd(dz22,dz22,rsq22);
17552 rsq23 = vec_madd(dz23,dz23,rsq23);
17553 rsq31 = vec_madd(dz31,dz31,rsq31);
17554 rsq32 = vec_madd(dz32,dz32,rsq32);
17555 rsq33 = vec_madd(dz33,dz33,rsq33);
17557 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
17558 &rsq21,&rsq22,&rsq23,
17559 &rsq31,&rsq32,&rsq33);
17561 do_9_invsqrt(rsq11,rsq12,rsq13,
17562 rsq21,rsq22,rsq23,
17563 rsq31,rsq32,rsq33,
17564 &rinv11,&rinv12,&rinv13,
17565 &rinv21,&rinv22,&rinv23,
17566 &rinv31,&rinv32,&rinv33);
17568 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
17569 &rinv21,&rinv22,&rinv23,
17570 &rinv31,&rinv32,&rinv33);
17572 r11 = vec_madd(rsq11,rinv11,nul);
17573 r12 = vec_madd(rsq12,rinv12,nul);
17574 r13 = vec_madd(rsq13,rinv13,nul);
17575 r21 = vec_madd(rsq21,rinv21,nul);
17576 r22 = vec_madd(rsq22,rinv22,nul);
17577 r23 = vec_madd(rsq23,rinv23,nul);
17578 r31 = vec_madd(rsq31,rinv31,nul);
17579 r32 = vec_madd(rsq32,rinv32,nul);
17580 r33 = vec_madd(rsq33,rinv33,nul);
17582 do_vonly_3_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
17583 do_vonly_3_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
17584 do_vonly_3_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
17585 do_vonly_3_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
17586 do_vonly_3_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
17587 do_vonly_3_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
17588 do_vonly_3_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
17589 do_vonly_3_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
17590 do_vonly_3_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
17592 vctot = vec_madd(qqOOt,VV11c,vctot);
17593 vctot = vec_madd(qqOHt,VV12c,vctot);
17594 vctot = vec_madd(qqOHt,VV13c,vctot);
17595 vctot = vec_madd(qqOHt,VV21c,vctot);
17596 vctot = vec_madd(qqHHt,VV22c,vctot);
17597 vctot = vec_madd(qqHHt,VV23c,vctot);
17598 vctot = vec_madd(qqOHt,VV31c,vctot);
17599 vctot = vec_madd(qqHHt,VV32c,vctot);
17600 vctot = vec_madd(qqHHt,VV33c,vctot);
17601 } else if(k<(nj1-1)) {
17602 jnra = jjnr[k];
17603 jnrb = jjnr[k+1];
17604 j3a = 3*jnra;
17605 j3b = 3*jnrb;
17606 load_2_water(pos+j3a,pos+j3b,
17607 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17608 qqOOt = vec_sld(qqOO,nul,8);
17609 qqOHt = vec_sld(qqOH,nul,8);
17610 qqHHt = vec_sld(qqHH,nul,8);
17612 dx11 = vec_sub(ix1,jx1);
17613 dx12 = vec_sub(ix1,jx2);
17614 dx13 = vec_sub(ix1,jx3);
17615 dy11 = vec_sub(iy1,jy1);
17616 dy12 = vec_sub(iy1,jy2);
17617 dy13 = vec_sub(iy1,jy3);
17618 dz11 = vec_sub(iz1,jz1);
17619 dz12 = vec_sub(iz1,jz2);
17620 dz13 = vec_sub(iz1,jz3);
17621 dx21 = vec_sub(ix2,jx1);
17622 dx22 = vec_sub(ix2,jx2);
17623 dx23 = vec_sub(ix2,jx3);
17624 dy21 = vec_sub(iy2,jy1);
17625 dy22 = vec_sub(iy2,jy2);
17626 dy23 = vec_sub(iy2,jy3);
17627 dz21 = vec_sub(iz2,jz1);
17628 dz22 = vec_sub(iz2,jz2);
17629 dz23 = vec_sub(iz2,jz3);
17630 dx31 = vec_sub(ix3,jx1);
17631 dx32 = vec_sub(ix3,jx2);
17632 dx33 = vec_sub(ix3,jx3);
17633 dy31 = vec_sub(iy3,jy1);
17634 dy32 = vec_sub(iy3,jy2);
17635 dy33 = vec_sub(iy3,jy3);
17636 dz31 = vec_sub(iz3,jz1);
17637 dz32 = vec_sub(iz3,jz2);
17638 dz33 = vec_sub(iz3,jz3);
17640 rsq11 = vec_madd(dx11,dx11,nul);
17641 rsq12 = vec_madd(dx12,dx12,nul);
17642 rsq13 = vec_madd(dx13,dx13,nul);
17643 rsq21 = vec_madd(dx21,dx21,nul);
17644 rsq22 = vec_madd(dx22,dx22,nul);
17645 rsq23 = vec_madd(dx23,dx23,nul);
17646 rsq31 = vec_madd(dx31,dx31,nul);
17647 rsq32 = vec_madd(dx32,dx32,nul);
17648 rsq33 = vec_madd(dx33,dx33,nul);
17649 rsq11 = vec_madd(dy11,dy11,rsq11);
17650 rsq12 = vec_madd(dy12,dy12,rsq12);
17651 rsq13 = vec_madd(dy13,dy13,rsq13);
17652 rsq21 = vec_madd(dy21,dy21,rsq21);
17653 rsq22 = vec_madd(dy22,dy22,rsq22);
17654 rsq23 = vec_madd(dy23,dy23,rsq23);
17655 rsq31 = vec_madd(dy31,dy31,rsq31);
17656 rsq32 = vec_madd(dy32,dy32,rsq32);
17657 rsq33 = vec_madd(dy33,dy33,rsq33);
17658 rsq11 = vec_madd(dz11,dz11,rsq11);
17659 rsq12 = vec_madd(dz12,dz12,rsq12);
17660 rsq13 = vec_madd(dz13,dz13,rsq13);
17661 rsq21 = vec_madd(dz21,dz21,rsq21);
17662 rsq22 = vec_madd(dz22,dz22,rsq22);
17663 rsq23 = vec_madd(dz23,dz23,rsq23);
17664 rsq31 = vec_madd(dz31,dz31,rsq31);
17665 rsq32 = vec_madd(dz32,dz32,rsq32);
17666 rsq33 = vec_madd(dz33,dz33,rsq33);
17668 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
17669 &rsq21,&rsq22,&rsq23,
17670 &rsq31,&rsq32,&rsq33);
17672 do_9_invsqrt(rsq11,rsq12,rsq13,
17673 rsq21,rsq22,rsq23,
17674 rsq31,rsq32,rsq33,
17675 &rinv11,&rinv12,&rinv13,
17676 &rinv21,&rinv22,&rinv23,
17677 &rinv31,&rinv32,&rinv33);
17679 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
17680 &rinv21,&rinv22,&rinv23,
17681 &rinv31,&rinv32,&rinv33);
17683 r11 = vec_madd(rsq11,rinv11,nul);
17684 r12 = vec_madd(rsq12,rinv12,nul);
17685 r13 = vec_madd(rsq13,rinv13,nul);
17686 r21 = vec_madd(rsq21,rinv21,nul);
17687 r22 = vec_madd(rsq22,rinv22,nul);
17688 r23 = vec_madd(rsq23,rinv23,nul);
17689 r31 = vec_madd(rsq31,rinv31,nul);
17690 r32 = vec_madd(rsq32,rinv32,nul);
17691 r33 = vec_madd(rsq33,rinv33,nul);
17693 do_vonly_2_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
17694 do_vonly_2_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
17695 do_vonly_2_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
17696 do_vonly_2_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
17697 do_vonly_2_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
17698 do_vonly_2_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
17699 do_vonly_2_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
17700 do_vonly_2_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
17701 do_vonly_2_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
17703 vctot = vec_madd(qqOOt,VV11c,vctot);
17704 vctot = vec_madd(qqOHt,VV12c,vctot);
17705 vctot = vec_madd(qqOHt,VV13c,vctot);
17706 vctot = vec_madd(qqOHt,VV21c,vctot);
17707 vctot = vec_madd(qqHHt,VV22c,vctot);
17708 vctot = vec_madd(qqHHt,VV23c,vctot);
17709 vctot = vec_madd(qqOHt,VV31c,vctot);
17710 vctot = vec_madd(qqHHt,VV32c,vctot);
17711 vctot = vec_madd(qqHHt,VV33c,vctot);
17712 } else if(k<nj1) {
17713 jnra = jjnr[k];
17714 j3a = 3*jnra;
17715 load_1_water(pos+j3a,
17716 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
17717 qqOOt = vec_sld(qqOO,nul,12);
17718 qqOHt = vec_sld(qqOH,nul,12);
17719 qqHHt = vec_sld(qqHH,nul,12);
17721 dx11 = vec_sub(ix1,jx1);
17722 dx12 = vec_sub(ix1,jx2);
17723 dx13 = vec_sub(ix1,jx3);
17724 dy11 = vec_sub(iy1,jy1);
17725 dy12 = vec_sub(iy1,jy2);
17726 dy13 = vec_sub(iy1,jy3);
17727 dz11 = vec_sub(iz1,jz1);
17728 dz12 = vec_sub(iz1,jz2);
17729 dz13 = vec_sub(iz1,jz3);
17730 dx21 = vec_sub(ix2,jx1);
17731 dx22 = vec_sub(ix2,jx2);
17732 dx23 = vec_sub(ix2,jx3);
17733 dy21 = vec_sub(iy2,jy1);
17734 dy22 = vec_sub(iy2,jy2);
17735 dy23 = vec_sub(iy2,jy3);
17736 dz21 = vec_sub(iz2,jz1);
17737 dz22 = vec_sub(iz2,jz2);
17738 dz23 = vec_sub(iz2,jz3);
17739 dx31 = vec_sub(ix3,jx1);
17740 dx32 = vec_sub(ix3,jx2);
17741 dx33 = vec_sub(ix3,jx3);
17742 dy31 = vec_sub(iy3,jy1);
17743 dy32 = vec_sub(iy3,jy2);
17744 dy33 = vec_sub(iy3,jy3);
17745 dz31 = vec_sub(iz3,jz1);
17746 dz32 = vec_sub(iz3,jz2);
17747 dz33 = vec_sub(iz3,jz3);
17749 rsq11 = vec_madd(dx11,dx11,nul);
17750 rsq12 = vec_madd(dx12,dx12,nul);
17751 rsq13 = vec_madd(dx13,dx13,nul);
17752 rsq21 = vec_madd(dx21,dx21,nul);
17753 rsq22 = vec_madd(dx22,dx22,nul);
17754 rsq23 = vec_madd(dx23,dx23,nul);
17755 rsq31 = vec_madd(dx31,dx31,nul);
17756 rsq32 = vec_madd(dx32,dx32,nul);
17757 rsq33 = vec_madd(dx33,dx33,nul);
17758 rsq11 = vec_madd(dy11,dy11,rsq11);
17759 rsq12 = vec_madd(dy12,dy12,rsq12);
17760 rsq13 = vec_madd(dy13,dy13,rsq13);
17761 rsq21 = vec_madd(dy21,dy21,rsq21);
17762 rsq22 = vec_madd(dy22,dy22,rsq22);
17763 rsq23 = vec_madd(dy23,dy23,rsq23);
17764 rsq31 = vec_madd(dy31,dy31,rsq31);
17765 rsq32 = vec_madd(dy32,dy32,rsq32);
17766 rsq33 = vec_madd(dy33,dy33,rsq33);
17767 rsq11 = vec_madd(dz11,dz11,rsq11);
17768 rsq12 = vec_madd(dz12,dz12,rsq12);
17769 rsq13 = vec_madd(dz13,dz13,rsq13);
17770 rsq21 = vec_madd(dz21,dz21,rsq21);
17771 rsq22 = vec_madd(dz22,dz22,rsq22);
17772 rsq23 = vec_madd(dz23,dz23,rsq23);
17773 rsq31 = vec_madd(dz31,dz31,rsq31);
17774 rsq32 = vec_madd(dz32,dz32,rsq32);
17775 rsq33 = vec_madd(dz33,dz33,rsq33);
17777 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
17778 &rsq21,&rsq22,&rsq23,
17779 &rsq31,&rsq32,&rsq33);
17781 do_9_invsqrt(rsq11,rsq12,rsq13,
17782 rsq21,rsq22,rsq23,
17783 rsq31,rsq32,rsq33,
17784 &rinv11,&rinv12,&rinv13,
17785 &rinv21,&rinv22,&rinv23,
17786 &rinv31,&rinv32,&rinv33);
17788 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
17789 &rinv21,&rinv22,&rinv23,
17790 &rinv31,&rinv32,&rinv33);
17792 r11 = vec_madd(rsq11,rinv11,nul);
17793 r12 = vec_madd(rsq12,rinv12,nul);
17794 r13 = vec_madd(rsq13,rinv13,nul);
17795 r21 = vec_madd(rsq21,rinv21,nul);
17796 r22 = vec_madd(rsq22,rinv22,nul);
17797 r23 = vec_madd(rsq23,rinv23,nul);
17798 r31 = vec_madd(rsq31,rinv31,nul);
17799 r32 = vec_madd(rsq32,rinv32,nul);
17800 r33 = vec_madd(rsq33,rinv33,nul);
17802 do_vonly_1_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
17803 do_vonly_1_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
17804 do_vonly_1_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
17805 do_vonly_1_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
17806 do_vonly_1_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
17807 do_vonly_1_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
17808 do_vonly_1_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
17809 do_vonly_1_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
17810 do_vonly_1_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
17812 vctot = vec_madd(qqOOt,VV11c,vctot);
17813 vctot = vec_madd(qqOHt,VV12c,vctot);
17814 vctot = vec_madd(qqOHt,VV13c,vctot);
17815 vctot = vec_madd(qqOHt,VV21c,vctot);
17816 vctot = vec_madd(qqHHt,VV22c,vctot);
17817 vctot = vec_madd(qqHHt,VV23c,vctot);
17818 vctot = vec_madd(qqOHt,VV31c,vctot);
17819 vctot = vec_madd(qqHHt,VV32c,vctot);
17820 vctot = vec_madd(qqHHt,VV33c,vctot);
17822 /* update outer data */
17823 add_vector_to_float(Vc+gid[n],vctot);
/*
 * mcinl3130_altivec - energy-only AltiVec inner loop for pairs of
 * three-atom molecules (loaded through the *_water helpers).
 *
 * For every outer-loop entry n in [0,nri) this routine accumulates
 *   - a table-based Coulomb energy, summed over all 3x3 atom pairs,
 *     into Vc[gid[n]];
 *   - a 12-6 energy (c12*rinv^12 - c6*rinv^6), evaluated for the
 *     atom pair 1-1 only, into Vnb[gid[n]].
 * No force or shift-force array is passed in, and none is written.
 *
 * Parameters (as used by the code below):
 *   nri       number of outer-loop entries
 *   iinr      iinr[n] is the first-atom index of the i molecule of entry n
 *   jindex    jjnr[jindex[n]] .. jjnr[jindex[n+1]-1] are the j entries of n
 *   jjnr      first-atom indices of the j molecules
 *   shift     shift[n]*3 is the offset into shiftvec for entry n
 *   shiftvec  shift vectors, handed to load_1_water_shift_and_splat
 *             together with the i coordinates (presumably added to them;
 *             confirm against that helper)
 *   gid       gid[n] selects the element of Vc/Vnb that entry n updates
 *   pos       coordinates, three floats per atom (indexed with 3*atom)
 *   charge    charge[ii] and charge[ii+1] are read for ii = iinr[0] only
 *   facel     factor multiplied into every charge product
 *   Vc        Coulomb energy accumulators (updated in place)
 *   type      type[iinr[0]] selects the c6/c12 pair
 *   ntype     used to form the offset (ntype+1)*2*type[ii] into nbfp
 *   nbfp      table of c6/c12 pairs
 *   Vnb       12-6 energy accumulators (updated in place)
 *   tabscale  scale applied to r before the Coulomb table lookup
 *   VFtab     table handed to the do_vonly_N_ctable_coul helpers
 *
 * NOTE(review): charges and c6/c12 are taken from iinr[0] before the loop
 * over n, so nri >= 1 and identical parameters for all i molecules appear
 * to be assumed - confirm with the caller.
 */
17829 void mcinl3130_altivec(
17830 int nri,
17831 int iinr[],
17832 int jindex[],
17833 int jjnr[],
17834 int shift[],
17835 float shiftvec[],
17836 int gid[],
17837 float pos[],
17838 float charge[],
17839 float facel,
17840 float Vc[],
17841 int type[],
17842 int ntype,
17843 float nbfp[],
17844 float Vnb[],
17845 float tabscale,
17846 float VFtab[])
/* i-molecule coordinates (atoms 1..3) and j-molecule coordinates;
 * each j vector holds one value per j molecule processed in parallel.
 */
17848 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
17849 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
/* Distance components; suffix "ab" means i atom a minus j atom b. */
17851 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
17852 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
17853 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
17855 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
17856 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
17857 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
17858 vector float rinvsq11;
/* NOTE(review): vc11..vc33 and VVc are declared but never referenced
 * in this function.
 */
17859 vector float vc11,vc12,vc13,vc21,vc22,vc23,vc31,vc32,vc33,tsc,VVc;
17861 vector float vfacel,nul;
17862 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12,rinvsix;
/* The "...t" variants are copies with trailing lanes zeroed, used only
 * in the remainder branches after the 4-wide loop.
 */
17863 vector float vnbtot,qqOOt,qqOHt,qqHHt,c6t,c12t;
17864 vector float VV11c,VV12c,VV13c;
17865 vector float VV21c,VV22c,VV23c;
17866 vector float VV31c,VV32c,VV33c;
/* NOTE(review): k0 and ntiA are declared but never referenced here. */
17868 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
17869 int jnra,jnrb,jnrc,jnrd,tp,tj;
17870 int j3a,j3b,j3c,j3d;
/* One-time setup: zero vector, splatted facel and tabscale. */
17872 nul=vec_zero();
17873 vfacel=load_float_and_splat(&facel);
17874 tsc=load_float_and_splat(&tabscale);
/* Charge products qO*qO, qO*qH and qH*qH, each scaled by facel.
 * They are built from the first i molecule only and reused for all n.
 */
17875 ii = iinr[0];
17876 qO = load_float_and_splat(charge+ii);
17877 qH = load_float_and_splat(charge+ii+1);
17878 qqOO = vec_madd(qO,qO,nul);
17879 qqOH = vec_madd(qO,qH,nul);
17880 qqHH = vec_madd(qH,qH,nul);
17881 qqOO = vec_madd(qqOO,vfacel,nul);
17882 qqOH = vec_madd(qqOH,vfacel,nul);
17883 qqHH = vec_madd(qqHH,vfacel,nul);
/* c6/c12 for the type of atom ii paired with itself: offset
 * (ntype+1)*2*type[ii] into nbfp; element 0 is then broadcast to all lanes.
 */
17884 tp = 2*type[ii];
17885 tj = (ntype+1)*tp;
17886 load_1_pair(nbfp+tj,&c6,&c12);
17887 c6 = vec_splat(c6,0);
17888 c12 = vec_splat(c12,0);
/* Outer loop over the nri entries. */
17890 for(n=0;n<nri;n++) {
17891 is3 = 3*shift[n];
17892 ii = iinr[n];
17893 ii3 = 3*ii;
17894 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
17895 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Per-entry energy accumulators start at zero. */
17896 vctot = nul;
17897 vnbtot = nul;
17898 nj0 = jindex[n];
17899 nj1 = jindex[n+1];
/* Main inner loop: four j molecules per iteration, one per vector lane. */
17901 for(k=nj0; k<(nj1-3); k+=4) {
17902 jnra = jjnr[k];
17903 jnrb = jjnr[k+1];
17904 jnrc = jjnr[k+2];
17905 jnrd = jjnr[k+3];
17906 j3a = 3*jnra;
17907 j3b = 3*jnrb;
17908 j3c = 3*jnrc;
17909 j3d = 3*jnrd;
17910 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
17911 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
/* Component differences for all nine i/j atom pairs. */
17912 dx11 = vec_sub(ix1,jx1);
17913 dx12 = vec_sub(ix1,jx2);
17914 dx13 = vec_sub(ix1,jx3);
17915 dy11 = vec_sub(iy1,jy1);
17916 dy12 = vec_sub(iy1,jy2);
17917 dy13 = vec_sub(iy1,jy3);
17918 dz11 = vec_sub(iz1,jz1);
17919 dz12 = vec_sub(iz1,jz2);
17920 dz13 = vec_sub(iz1,jz3);
17921 dx21 = vec_sub(ix2,jx1);
17922 dx22 = vec_sub(ix2,jx2);
17923 dx23 = vec_sub(ix2,jx3);
17924 dy21 = vec_sub(iy2,jy1);
17925 dy22 = vec_sub(iy2,jy2);
17926 dy23 = vec_sub(iy2,jy3);
17927 dz21 = vec_sub(iz2,jz1);
17928 dz22 = vec_sub(iz2,jz2);
17929 dz23 = vec_sub(iz2,jz3);
17930 dx31 = vec_sub(ix3,jx1);
17931 dx32 = vec_sub(ix3,jx2);
17932 dx33 = vec_sub(ix3,jx3);
17933 dy31 = vec_sub(iy3,jy1);
17934 dy32 = vec_sub(iy3,jy2);
17935 dy33 = vec_sub(iy3,jy3);
17936 dz31 = vec_sub(iz3,jz1);
17937 dz32 = vec_sub(iz3,jz2);
17938 dz33 = vec_sub(iz3,jz3);
/* rsq = dx*dx + dy*dy + dz*dz, built up with three multiply-adds. */
17940 rsq11 = vec_madd(dx11,dx11,nul);
17941 rsq12 = vec_madd(dx12,dx12,nul);
17942 rsq13 = vec_madd(dx13,dx13,nul);
17943 rsq21 = vec_madd(dx21,dx21,nul);
17944 rsq22 = vec_madd(dx22,dx22,nul);
17945 rsq23 = vec_madd(dx23,dx23,nul);
17946 rsq31 = vec_madd(dx31,dx31,nul);
17947 rsq32 = vec_madd(dx32,dx32,nul);
17948 rsq33 = vec_madd(dx33,dx33,nul);
17949 rsq11 = vec_madd(dy11,dy11,rsq11);
17950 rsq12 = vec_madd(dy12,dy12,rsq12);
17951 rsq13 = vec_madd(dy13,dy13,rsq13);
17952 rsq21 = vec_madd(dy21,dy21,rsq21);
17953 rsq22 = vec_madd(dy22,dy22,rsq22);
17954 rsq23 = vec_madd(dy23,dy23,rsq23);
17955 rsq31 = vec_madd(dy31,dy31,rsq31);
17956 rsq32 = vec_madd(dy32,dy32,rsq32);
17957 rsq33 = vec_madd(dy33,dy33,rsq33);
17958 rsq11 = vec_madd(dz11,dz11,rsq11);
17959 rsq12 = vec_madd(dz12,dz12,rsq12);
17960 rsq13 = vec_madd(dz13,dz13,rsq13);
17961 rsq21 = vec_madd(dz21,dz21,rsq21);
17962 rsq22 = vec_madd(dz22,dz22,rsq22);
17963 rsq23 = vec_madd(dz23,dz23,rsq23);
17964 rsq31 = vec_madd(dz31,dz31,rsq31);
17965 rsq32 = vec_madd(dz32,dz32,rsq32);
17966 rsq33 = vec_madd(dz33,dz33,rsq33);
/* Presumably rinv = 1/sqrt(rsq) for all nine pairs (helper defined
 * elsewhere; confirm).
 */
17968 do_9_invsqrt(rsq11,rsq12,rsq13,
17969 rsq21,rsq22,rsq23,
17970 rsq31,rsq32,rsq33,
17971 &rinv11,&rinv12,&rinv13,
17972 &rinv21,&rinv22,&rinv23,
17973 &rinv31,&rinv32,&rinv33);
/* r = rsq*rinv for every pair. The rinvsix = (rinv11^2)^3 computation
 * for pair 1-1 is interleaved with these independent multiplies.
 */
17975 rinvsq11 = vec_madd(rinv11,rinv11,nul);
17976 r11 = vec_madd(rsq11,rinv11,nul);
17977 r12 = vec_madd(rsq12,rinv12,nul);
17978 r13 = vec_madd(rsq13,rinv13,nul);
17979 r21 = vec_madd(rsq21,rinv21,nul);
17980 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
17981 r22 = vec_madd(rsq22,rinv22,nul);
17982 r23 = vec_madd(rsq23,rinv23,nul);
17983 r31 = vec_madd(rsq31,rinv31,nul);
17984 r32 = vec_madd(rsq32,rinv32,nul);
17985 r33 = vec_madd(rsq33,rinv33,nul);
17986 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
/* Coulomb table lookup at r*tabscale for each pair (4 active lanes). */
17988 do_vonly_4_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
17989 do_vonly_4_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
17990 do_vonly_4_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
17991 do_vonly_4_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
17992 do_vonly_4_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
17993 do_vonly_4_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
17994 do_vonly_4_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
17995 do_vonly_4_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
17996 do_vonly_4_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
/* vnbtot += c12*rinvsix^2 - c6*rinvsix   (vec_nmsub(a,b,c) = c - a*b)
 * vctot  += qq*VV over the nine pairs: pair 1-1 uses qqOO, pairs that
 * involve atom 1 on exactly one side use qqOH, all others use qqHH.
 */
17998 vnbtot = vec_nmsub(c6,rinvsix,vnbtot);
17999 vnbtot = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),vnbtot);
18000 vctot = vec_madd(qqOO,VV11c,vctot);
18001 vctot = vec_madd(qqOH,VV12c,vctot);
18002 vctot = vec_madd(qqOH,VV13c,vctot);
18003 vctot = vec_madd(qqOH,VV21c,vctot);
18004 vctot = vec_madd(qqHH,VV22c,vctot);
18005 vctot = vec_madd(qqHH,VV23c,vctot);
18006 vctot = vec_madd(qqOH,VV31c,vctot);
18007 vctot = vec_madd(qqHH,VV32c,vctot);
18008 vctot = vec_madd(qqHH,VV33c,vctot);
/* Remainder handling. After the 4-wide loop k is the first unprocessed
 * j index; 3, 2 or 1 entries may be left. This branch: exactly 3 left.
 */
18010 if(k<(nj1-2)) {
18011 jnra = jjnr[k];
18012 jnrb = jjnr[k+1];
18013 jnrc = jjnr[k+2];
18014 j3a = 3*jnra;
18015 j3b = 3*jnrb;
18016 j3c = 3*jnrc;
18017 load_3_water(pos+j3a,pos+j3b,pos+j3c,
18018 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
/* vec_sld by 4 bytes against nul: lanes 0..2 keep the (splatted)
 * parameter, the last lane becomes zero so it adds no energy.
 */
18019 qqOOt = vec_sld(qqOO,nul,4);
18020 qqOHt = vec_sld(qqOH,nul,4);
18021 qqHHt = vec_sld(qqHH,nul,4);
18022 c6t = vec_sld(c6,nul,4);
18023 c12t = vec_sld(c12,nul,4);
18025 dx11 = vec_sub(ix1,jx1);
18026 dx12 = vec_sub(ix1,jx2);
18027 dx13 = vec_sub(ix1,jx3);
18028 dy11 = vec_sub(iy1,jy1);
18029 dy12 = vec_sub(iy1,jy2);
18030 dy13 = vec_sub(iy1,jy3);
18031 dz11 = vec_sub(iz1,jz1);
18032 dz12 = vec_sub(iz1,jz2);
18033 dz13 = vec_sub(iz1,jz3);
18034 dx21 = vec_sub(ix2,jx1);
18035 dx22 = vec_sub(ix2,jx2);
18036 dx23 = vec_sub(ix2,jx3);
18037 dy21 = vec_sub(iy2,jy1);
18038 dy22 = vec_sub(iy2,jy2);
18039 dy23 = vec_sub(iy2,jy3);
18040 dz21 = vec_sub(iz2,jz1);
18041 dz22 = vec_sub(iz2,jz2);
18042 dz23 = vec_sub(iz2,jz3);
18043 dx31 = vec_sub(ix3,jx1);
18044 dx32 = vec_sub(ix3,jx2);
18045 dx33 = vec_sub(ix3,jx3);
18046 dy31 = vec_sub(iy3,jy1);
18047 dy32 = vec_sub(iy3,jy2);
18048 dy33 = vec_sub(iy3,jy3);
18049 dz31 = vec_sub(iz3,jz1);
18050 dz32 = vec_sub(iz3,jz2);
18051 dz33 = vec_sub(iz3,jz3);
18053 rsq11 = vec_madd(dx11,dx11,nul);
18054 rsq12 = vec_madd(dx12,dx12,nul);
18055 rsq13 = vec_madd(dx13,dx13,nul);
18056 rsq21 = vec_madd(dx21,dx21,nul);
18057 rsq22 = vec_madd(dx22,dx22,nul);
18058 rsq23 = vec_madd(dx23,dx23,nul);
18059 rsq31 = vec_madd(dx31,dx31,nul);
18060 rsq32 = vec_madd(dx32,dx32,nul);
18061 rsq33 = vec_madd(dx33,dx33,nul);
18062 rsq11 = vec_madd(dy11,dy11,rsq11);
18063 rsq12 = vec_madd(dy12,dy12,rsq12);
18064 rsq13 = vec_madd(dy13,dy13,rsq13);
18065 rsq21 = vec_madd(dy21,dy21,rsq21);
18066 rsq22 = vec_madd(dy22,dy22,rsq22);
18067 rsq23 = vec_madd(dy23,dy23,rsq23);
18068 rsq31 = vec_madd(dy31,dy31,rsq31);
18069 rsq32 = vec_madd(dy32,dy32,rsq32);
18070 rsq33 = vec_madd(dy33,dy33,rsq33);
18071 rsq11 = vec_madd(dz11,dz11,rsq11);
18072 rsq12 = vec_madd(dz12,dz12,rsq12);
18073 rsq13 = vec_madd(dz13,dz13,rsq13);
18074 rsq21 = vec_madd(dz21,dz21,rsq21);
18075 rsq22 = vec_madd(dz22,dz22,rsq22);
18076 rsq23 = vec_madd(dz23,dz23,rsq23);
18077 rsq31 = vec_madd(dz31,dz31,rsq31);
18078 rsq32 = vec_madd(dz32,dz32,rsq32);
18079 rsq33 = vec_madd(dz33,dz33,rsq33);
/* The unused lane is cleared both before and after the inverse square
 * root, so neither rsq nor rinv carries a value for it into the
 * products below.
 */
18081 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
18082 &rsq21,&rsq22,&rsq23,
18083 &rsq31,&rsq32,&rsq33);
18085 do_9_invsqrt(rsq11,rsq12,rsq13,
18086 rsq21,rsq22,rsq23,
18087 rsq31,rsq32,rsq33,
18088 &rinv11,&rinv12,&rinv13,
18089 &rinv21,&rinv22,&rinv23,
18090 &rinv31,&rinv32,&rinv33);
18092 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
18093 &rinv21,&rinv22,&rinv23,
18094 &rinv31,&rinv32,&rinv33);
18096 rinvsq11 = vec_madd(rinv11,rinv11,nul);
18097 r11 = vec_madd(rsq11,rinv11,nul);
18098 r12 = vec_madd(rsq12,rinv12,nul);
18099 r13 = vec_madd(rsq13,rinv13,nul);
18100 r21 = vec_madd(rsq21,rinv21,nul);
18101 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
18102 r22 = vec_madd(rsq22,rinv22,nul);
18103 r23 = vec_madd(rsq23,rinv23,nul);
18104 r31 = vec_madd(rsq31,rinv31,nul);
18105 r32 = vec_madd(rsq32,rinv32,nul);
18106 r33 = vec_madd(rsq33,rinv33,nul);
18107 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
18109 do_vonly_3_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
18110 do_vonly_3_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18111 do_vonly_3_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18112 do_vonly_3_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18113 do_vonly_3_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18114 do_vonly_3_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18115 do_vonly_3_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18116 do_vonly_3_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18117 do_vonly_3_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
/* Same accumulation as in the 4-wide loop, using the lane-masked
 * parameters.
 */
18119 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
18120 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
18121 vctot = vec_madd(qqOOt,VV11c,vctot);
18122 vctot = vec_madd(qqOHt,VV12c,vctot);
18123 vctot = vec_madd(qqOHt,VV13c,vctot);
18124 vctot = vec_madd(qqOHt,VV21c,vctot);
18125 vctot = vec_madd(qqHHt,VV22c,vctot);
18126 vctot = vec_madd(qqHHt,VV23c,vctot);
18127 vctot = vec_madd(qqOHt,VV31c,vctot);
18128 vctot = vec_madd(qqHHt,VV32c,vctot);
18129 vctot = vec_madd(qqHHt,VV33c,vctot);
/* Exactly 2 j entries left: shift by 8 bytes zeroes the last two lanes. */
18130 } else if(k<(nj1-1)) {
18131 jnra = jjnr[k];
18132 jnrb = jjnr[k+1];
18133 j3a = 3*jnra;
18134 j3b = 3*jnrb;
18135 load_2_water(pos+j3a,pos+j3b,
18136 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18137 qqOOt = vec_sld(qqOO,nul,8);
18138 qqOHt = vec_sld(qqOH,nul,8);
18139 qqHHt = vec_sld(qqHH,nul,8);
18140 c6t = vec_sld(c6,nul,8);
18141 c12t = vec_sld(c12,nul,8);
18143 dx11 = vec_sub(ix1,jx1);
18144 dx12 = vec_sub(ix1,jx2);
18145 dx13 = vec_sub(ix1,jx3);
18146 dy11 = vec_sub(iy1,jy1);
18147 dy12 = vec_sub(iy1,jy2);
18148 dy13 = vec_sub(iy1,jy3);
18149 dz11 = vec_sub(iz1,jz1);
18150 dz12 = vec_sub(iz1,jz2);
18151 dz13 = vec_sub(iz1,jz3);
18152 dx21 = vec_sub(ix2,jx1);
18153 dx22 = vec_sub(ix2,jx2);
18154 dx23 = vec_sub(ix2,jx3);
18155 dy21 = vec_sub(iy2,jy1);
18156 dy22 = vec_sub(iy2,jy2);
18157 dy23 = vec_sub(iy2,jy3);
18158 dz21 = vec_sub(iz2,jz1);
18159 dz22 = vec_sub(iz2,jz2);
18160 dz23 = vec_sub(iz2,jz3);
18161 dx31 = vec_sub(ix3,jx1);
18162 dx32 = vec_sub(ix3,jx2);
18163 dx33 = vec_sub(ix3,jx3);
18164 dy31 = vec_sub(iy3,jy1);
18165 dy32 = vec_sub(iy3,jy2);
18166 dy33 = vec_sub(iy3,jy3);
18167 dz31 = vec_sub(iz3,jz1);
18168 dz32 = vec_sub(iz3,jz2);
18169 dz33 = vec_sub(iz3,jz3);
18171 rsq11 = vec_madd(dx11,dx11,nul);
18172 rsq12 = vec_madd(dx12,dx12,nul);
18173 rsq13 = vec_madd(dx13,dx13,nul);
18174 rsq21 = vec_madd(dx21,dx21,nul);
18175 rsq22 = vec_madd(dx22,dx22,nul);
18176 rsq23 = vec_madd(dx23,dx23,nul);
18177 rsq31 = vec_madd(dx31,dx31,nul);
18178 rsq32 = vec_madd(dx32,dx32,nul);
18179 rsq33 = vec_madd(dx33,dx33,nul);
18180 rsq11 = vec_madd(dy11,dy11,rsq11);
18181 rsq12 = vec_madd(dy12,dy12,rsq12);
18182 rsq13 = vec_madd(dy13,dy13,rsq13);
18183 rsq21 = vec_madd(dy21,dy21,rsq21);
18184 rsq22 = vec_madd(dy22,dy22,rsq22);
18185 rsq23 = vec_madd(dy23,dy23,rsq23);
18186 rsq31 = vec_madd(dy31,dy31,rsq31);
18187 rsq32 = vec_madd(dy32,dy32,rsq32);
18188 rsq33 = vec_madd(dy33,dy33,rsq33);
18189 rsq11 = vec_madd(dz11,dz11,rsq11);
18190 rsq12 = vec_madd(dz12,dz12,rsq12);
18191 rsq13 = vec_madd(dz13,dz13,rsq13);
18192 rsq21 = vec_madd(dz21,dz21,rsq21);
18193 rsq22 = vec_madd(dz22,dz22,rsq22);
18194 rsq23 = vec_madd(dz23,dz23,rsq23);
18195 rsq31 = vec_madd(dz31,dz31,rsq31);
18196 rsq32 = vec_madd(dz32,dz32,rsq32);
18197 rsq33 = vec_madd(dz33,dz33,rsq33);
18199 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
18200 &rsq21,&rsq22,&rsq23,
18201 &rsq31,&rsq32,&rsq33);
18203 do_9_invsqrt(rsq11,rsq12,rsq13,
18204 rsq21,rsq22,rsq23,
18205 rsq31,rsq32,rsq33,
18206 &rinv11,&rinv12,&rinv13,
18207 &rinv21,&rinv22,&rinv23,
18208 &rinv31,&rinv32,&rinv33);
18210 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
18211 &rinv21,&rinv22,&rinv23,
18212 &rinv31,&rinv32,&rinv33);
18214 rinvsq11 = vec_madd(rinv11,rinv11,nul);
18215 r11 = vec_madd(rsq11,rinv11,nul);
18216 r12 = vec_madd(rsq12,rinv12,nul);
18217 r13 = vec_madd(rsq13,rinv13,nul);
18218 r21 = vec_madd(rsq21,rinv21,nul);
18219 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
18220 r22 = vec_madd(rsq22,rinv22,nul);
18221 r23 = vec_madd(rsq23,rinv23,nul);
18222 r31 = vec_madd(rsq31,rinv31,nul);
18223 r32 = vec_madd(rsq32,rinv32,nul);
18224 r33 = vec_madd(rsq33,rinv33,nul);
18225 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
18227 do_vonly_2_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
18228 do_vonly_2_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18229 do_vonly_2_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18230 do_vonly_2_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18231 do_vonly_2_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18232 do_vonly_2_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18233 do_vonly_2_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18234 do_vonly_2_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18235 do_vonly_2_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18237 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
18238 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
18239 vctot = vec_madd(qqOOt,VV11c,vctot);
18240 vctot = vec_madd(qqOHt,VV12c,vctot);
18241 vctot = vec_madd(qqOHt,VV13c,vctot);
18242 vctot = vec_madd(qqOHt,VV21c,vctot);
18243 vctot = vec_madd(qqHHt,VV22c,vctot);
18244 vctot = vec_madd(qqHHt,VV23c,vctot);
18245 vctot = vec_madd(qqOHt,VV31c,vctot);
18246 vctot = vec_madd(qqHHt,VV32c,vctot);
18247 vctot = vec_madd(qqHHt,VV33c,vctot);
/* Exactly 1 j entry left: shift by 12 bytes leaves only lane 0 non-zero. */
18248 } else if(k<nj1) {
18249 jnra = jjnr[k];
18250 j3a = 3*jnra;
18251 load_1_water(pos+j3a,
18252 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18253 qqOOt = vec_sld(qqOO,nul,12);
18254 qqOHt = vec_sld(qqOH,nul,12);
18255 qqHHt = vec_sld(qqHH,nul,12);
18256 c6t = vec_sld(c6,nul,12);
18257 c12t = vec_sld(c12,nul,12);
18259 dx11 = vec_sub(ix1,jx1);
18260 dx12 = vec_sub(ix1,jx2);
18261 dx13 = vec_sub(ix1,jx3);
18262 dy11 = vec_sub(iy1,jy1);
18263 dy12 = vec_sub(iy1,jy2);
18264 dy13 = vec_sub(iy1,jy3);
18265 dz11 = vec_sub(iz1,jz1);
18266 dz12 = vec_sub(iz1,jz2);
18267 dz13 = vec_sub(iz1,jz3);
18268 dx21 = vec_sub(ix2,jx1);
18269 dx22 = vec_sub(ix2,jx2);
18270 dx23 = vec_sub(ix2,jx3);
18271 dy21 = vec_sub(iy2,jy1);
18272 dy22 = vec_sub(iy2,jy2);
18273 dy23 = vec_sub(iy2,jy3);
18274 dz21 = vec_sub(iz2,jz1);
18275 dz22 = vec_sub(iz2,jz2);
18276 dz23 = vec_sub(iz2,jz3);
18277 dx31 = vec_sub(ix3,jx1);
18278 dx32 = vec_sub(ix3,jx2);
18279 dx33 = vec_sub(ix3,jx3);
18280 dy31 = vec_sub(iy3,jy1);
18281 dy32 = vec_sub(iy3,jy2);
18282 dy33 = vec_sub(iy3,jy3);
18283 dz31 = vec_sub(iz3,jz1);
18284 dz32 = vec_sub(iz3,jz2);
18285 dz33 = vec_sub(iz3,jz3);
18287 rsq11 = vec_madd(dx11,dx11,nul);
18288 rsq12 = vec_madd(dx12,dx12,nul);
18289 rsq13 = vec_madd(dx13,dx13,nul);
18290 rsq21 = vec_madd(dx21,dx21,nul);
18291 rsq22 = vec_madd(dx22,dx22,nul);
18292 rsq23 = vec_madd(dx23,dx23,nul);
18293 rsq31 = vec_madd(dx31,dx31,nul);
18294 rsq32 = vec_madd(dx32,dx32,nul);
18295 rsq33 = vec_madd(dx33,dx33,nul);
18296 rsq11 = vec_madd(dy11,dy11,rsq11);
18297 rsq12 = vec_madd(dy12,dy12,rsq12);
18298 rsq13 = vec_madd(dy13,dy13,rsq13);
18299 rsq21 = vec_madd(dy21,dy21,rsq21);
18300 rsq22 = vec_madd(dy22,dy22,rsq22);
18301 rsq23 = vec_madd(dy23,dy23,rsq23);
18302 rsq31 = vec_madd(dy31,dy31,rsq31);
18303 rsq32 = vec_madd(dy32,dy32,rsq32);
18304 rsq33 = vec_madd(dy33,dy33,rsq33);
18305 rsq11 = vec_madd(dz11,dz11,rsq11);
18306 rsq12 = vec_madd(dz12,dz12,rsq12);
18307 rsq13 = vec_madd(dz13,dz13,rsq13);
18308 rsq21 = vec_madd(dz21,dz21,rsq21);
18309 rsq22 = vec_madd(dz22,dz22,rsq22);
18310 rsq23 = vec_madd(dz23,dz23,rsq23);
18311 rsq31 = vec_madd(dz31,dz31,rsq31);
18312 rsq32 = vec_madd(dz32,dz32,rsq32);
18313 rsq33 = vec_madd(dz33,dz33,rsq33);
18315 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
18316 &rsq21,&rsq22,&rsq23,
18317 &rsq31,&rsq32,&rsq33);
18319 do_9_invsqrt(rsq11,rsq12,rsq13,
18320 rsq21,rsq22,rsq23,
18321 rsq31,rsq32,rsq33,
18322 &rinv11,&rinv12,&rinv13,
18323 &rinv21,&rinv22,&rinv23,
18324 &rinv31,&rinv32,&rinv33);
18326 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
18327 &rinv21,&rinv22,&rinv23,
18328 &rinv31,&rinv32,&rinv33);
18330 rinvsq11 = vec_madd(rinv11,rinv11,nul);
18331 r11 = vec_madd(rsq11,rinv11,nul);
18332 r12 = vec_madd(rsq12,rinv12,nul);
18333 r13 = vec_madd(rsq13,rinv13,nul);
18334 r21 = vec_madd(rsq21,rinv21,nul);
18335 rinvsix = vec_madd(rinvsq11,rinvsq11,nul);
18336 r22 = vec_madd(rsq22,rinv22,nul);
18337 r23 = vec_madd(rsq23,rinv23,nul);
18338 r31 = vec_madd(rsq31,rinv31,nul);
18339 r32 = vec_madd(rsq32,rinv32,nul);
18340 r33 = vec_madd(rsq33,rinv33,nul);
18341 rinvsix = vec_madd(rinvsix,rinvsq11,nul);
18343 do_vonly_1_ctable_coul(VFtab,vec_madd(r11,tsc,nul),&VV11c);
18344 do_vonly_1_ctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18345 do_vonly_1_ctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18346 do_vonly_1_ctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18347 do_vonly_1_ctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18348 do_vonly_1_ctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18349 do_vonly_1_ctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18350 do_vonly_1_ctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18351 do_vonly_1_ctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18353 vnbtot = vec_nmsub(c6t,rinvsix,vnbtot);
18354 vnbtot = vec_madd(c12t,vec_madd(rinvsix,rinvsix,nul),vnbtot);
18355 vctot = vec_madd(qqOOt,VV11c,vctot);
18356 vctot = vec_madd(qqOHt,VV12c,vctot);
18357 vctot = vec_madd(qqOHt,VV13c,vctot);
18358 vctot = vec_madd(qqOHt,VV21c,vctot);
18359 vctot = vec_madd(qqHHt,VV22c,vctot);
18360 vctot = vec_madd(qqHHt,VV23c,vctot);
18361 vctot = vec_madd(qqOHt,VV31c,vctot);
18362 vctot = vec_madd(qqHHt,VV32c,vctot);
18363 vctot = vec_madd(qqHHt,VV33c,vctot);
/* Fold the per-entry vector accumulators into the scalar energy slots
 * selected by gid[n] (add_vector_to_float is defined elsewhere;
 * presumably it sums the four lanes - confirm).
 */
18365 /* update outer data */
18366 add_vector_to_float(Vc+gid[n],vctot);
18367 add_vector_to_float(Vnb+gid[n],vnbtot);
18372 void mcinl3330_altivec(
18373 int nri,
18374 int iinr[],
18375 int jindex[],
18376 int jjnr[],
18377 int shift[],
18378 float shiftvec[],
18379 int gid[],
18380 float pos[],
18381 float charge[],
18382 float facel,
18383 float Vc[],
18384 int type[],
18385 int ntype,
18386 float nbfp[],
18387 float Vnb[],
18388 float tabscale,
18389 float VFtab[])
18391 vector float ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
18392 vector float jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
18394 vector float dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
18395 vector float dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
18396 vector float dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
18398 vector float rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
18399 vector float r11,r12,r13,r21,r22,r23,r31,r32,r33;
18400 vector float rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
18402 vector float vfacel,nul;
18403 vector float vctot,qqOO,qqOH,qqHH,qO,qH,c6,c12;
18404 vector float vnbtot,tsc,qqOOt,qqOHt,qqHHt,c6t,c12t;
18405 vector float VV11c,VV12c,VV13c;
18406 vector float VV21c,VV22c,VV23c;
18407 vector float VV31c,VV32c,VV33c;
18408 vector float VVd,VVr;
18410 int n,k,k0,ii,is3,ii3,ntiA,nj0,nj1;
18411 int jnra,jnrb,jnrc,jnrd,tp,tj;
18412 int j3a,j3b,j3c,j3d;
18414 nul=vec_zero();
18415 vfacel=load_float_and_splat(&facel);
18416 tsc=load_float_and_splat(&tabscale);
18417 ii = iinr[0];
18418 qO = load_float_and_splat(charge+ii);
18419 qH = load_float_and_splat(charge+ii+1);
18420 qqOO = vec_madd(qO,qO,nul);
18421 qqOH = vec_madd(qO,qH,nul);
18422 qqHH = vec_madd(qH,qH,nul);
18423 qqOO = vec_madd(qqOO,vfacel,nul);
18424 qqOH = vec_madd(qqOH,vfacel,nul);
18425 qqHH = vec_madd(qqHH,vfacel,nul);
18426 tp = 2*type[ii];
18427 tj = (ntype+1)*tp;
18428 load_1_pair(nbfp+tj,&c6,&c12);
18429 c6 = vec_splat(c6,0);
18430 c12 = vec_splat(c12,0);
18432 for(n=0;n<nri;n++) {
18433 is3 = 3*shift[n];
18434 ii = iinr[n];
18435 ii3 = 3*ii;
18436 load_1_water_shift_and_splat(pos+ii3,shiftvec+is3,&ix1,&iy1,&iz1,
18437 &ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
18438 vctot = nul;
18439 vnbtot = nul;
18440 nj0 = jindex[n];
18441 nj1 = jindex[n+1];
18443 for(k=nj0; k<(nj1-3); k+=4) {
18444 jnra = jjnr[k];
18445 jnrb = jjnr[k+1];
18446 jnrc = jjnr[k+2];
18447 jnrd = jjnr[k+3];
18448 j3a = 3*jnra;
18449 j3b = 3*jnrb;
18450 j3c = 3*jnrc;
18451 j3d = 3*jnrd;
18452 load_4_water(pos+j3a,pos+j3b,pos+j3c,pos+j3d,
18453 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18454 dx11 = vec_sub(ix1,jx1);
18455 dx12 = vec_sub(ix1,jx2);
18456 dx13 = vec_sub(ix1,jx3);
18457 dy11 = vec_sub(iy1,jy1);
18458 dy12 = vec_sub(iy1,jy2);
18459 dy13 = vec_sub(iy1,jy3);
18460 dz11 = vec_sub(iz1,jz1);
18461 dz12 = vec_sub(iz1,jz2);
18462 dz13 = vec_sub(iz1,jz3);
18463 dx21 = vec_sub(ix2,jx1);
18464 dx22 = vec_sub(ix2,jx2);
18465 dx23 = vec_sub(ix2,jx3);
18466 dy21 = vec_sub(iy2,jy1);
18467 dy22 = vec_sub(iy2,jy2);
18468 dy23 = vec_sub(iy2,jy3);
18469 dz21 = vec_sub(iz2,jz1);
18470 dz22 = vec_sub(iz2,jz2);
18471 dz23 = vec_sub(iz2,jz3);
18472 dx31 = vec_sub(ix3,jx1);
18473 dx32 = vec_sub(ix3,jx2);
18474 dx33 = vec_sub(ix3,jx3);
18475 dy31 = vec_sub(iy3,jy1);
18476 dy32 = vec_sub(iy3,jy2);
18477 dy33 = vec_sub(iy3,jy3);
18478 dz31 = vec_sub(iz3,jz1);
18479 dz32 = vec_sub(iz3,jz2);
18480 dz33 = vec_sub(iz3,jz3);
18482 rsq11 = vec_madd(dx11,dx11,nul);
18483 rsq12 = vec_madd(dx12,dx12,nul);
18484 rsq13 = vec_madd(dx13,dx13,nul);
18485 rsq21 = vec_madd(dx21,dx21,nul);
18486 rsq22 = vec_madd(dx22,dx22,nul);
18487 rsq23 = vec_madd(dx23,dx23,nul);
18488 rsq31 = vec_madd(dx31,dx31,nul);
18489 rsq32 = vec_madd(dx32,dx32,nul);
18490 rsq33 = vec_madd(dx33,dx33,nul);
18491 rsq11 = vec_madd(dy11,dy11,rsq11);
18492 rsq12 = vec_madd(dy12,dy12,rsq12);
18493 rsq13 = vec_madd(dy13,dy13,rsq13);
18494 rsq21 = vec_madd(dy21,dy21,rsq21);
18495 rsq22 = vec_madd(dy22,dy22,rsq22);
18496 rsq23 = vec_madd(dy23,dy23,rsq23);
18497 rsq31 = vec_madd(dy31,dy31,rsq31);
18498 rsq32 = vec_madd(dy32,dy32,rsq32);
18499 rsq33 = vec_madd(dy33,dy33,rsq33);
18500 rsq11 = vec_madd(dz11,dz11,rsq11);
18501 rsq12 = vec_madd(dz12,dz12,rsq12);
18502 rsq13 = vec_madd(dz13,dz13,rsq13);
18503 rsq21 = vec_madd(dz21,dz21,rsq21);
18504 rsq22 = vec_madd(dz22,dz22,rsq22);
18505 rsq23 = vec_madd(dz23,dz23,rsq23);
18506 rsq31 = vec_madd(dz31,dz31,rsq31);
18507 rsq32 = vec_madd(dz32,dz32,rsq32);
18508 rsq33 = vec_madd(dz33,dz33,rsq33);
18510 do_9_invsqrt(rsq11,rsq12,rsq13,
18511 rsq21,rsq22,rsq23,
18512 rsq31,rsq32,rsq33,
18513 &rinv11,&rinv12,&rinv13,
18514 &rinv21,&rinv22,&rinv23,
18515 &rinv31,&rinv32,&rinv33);
18517 r11 = vec_madd(rsq11,rinv11,nul);
18518 r12 = vec_madd(rsq12,rinv12,nul);
18519 r13 = vec_madd(rsq13,rinv13,nul);
18520 r21 = vec_madd(rsq21,rinv21,nul);
18521 r22 = vec_madd(rsq22,rinv22,nul);
18522 r23 = vec_madd(rsq23,rinv23,nul);
18523 r31 = vec_madd(rsq31,rinv31,nul);
18524 r32 = vec_madd(rsq32,rinv32,nul);
18525 r33 = vec_madd(rsq33,rinv33,nul);
18527 do_vonly_4_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
18528 &VV11c,&VVd,&VVr);
18529 do_vonly_4_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18530 do_vonly_4_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18531 do_vonly_4_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18532 do_vonly_4_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18533 do_vonly_4_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18534 do_vonly_4_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18535 do_vonly_4_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18536 do_vonly_4_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18538 vnbtot = vec_madd(c6,VVd,vnbtot);
18539 vnbtot = vec_madd(c12,VVr,vnbtot);
18540 vctot = vec_madd(qqOO,VV11c,vctot);
18541 vctot = vec_madd(qqOH,VV12c,vctot);
18542 vctot = vec_madd(qqOH,VV13c,vctot);
18543 vctot = vec_madd(qqOH,VV21c,vctot);
18544 vctot = vec_madd(qqHH,VV22c,vctot);
18545 vctot = vec_madd(qqHH,VV23c,vctot);
18546 vctot = vec_madd(qqOH,VV31c,vctot);
18547 vctot = vec_madd(qqHH,VV32c,vctot);
18548 vctot = vec_madd(qqHH,VV33c,vctot);
18550 if(k<(nj1-2)) {
18551 jnra = jjnr[k];
18552 jnrb = jjnr[k+1];
18553 jnrc = jjnr[k+2];
18554 j3a = 3*jnra;
18555 j3b = 3*jnrb;
18556 j3c = 3*jnrc;
18557 load_3_water(pos+j3a,pos+j3b,pos+j3c,
18558 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18559 qqOOt = vec_sld(qqOO,nul,4);
18560 qqOHt = vec_sld(qqOH,nul,4);
18561 qqHHt = vec_sld(qqHH,nul,4);
18562 c6t = vec_sld(c6,nul,4);
18563 c12t = vec_sld(c12,nul,4);
18565 dx11 = vec_sub(ix1,jx1);
18566 dx12 = vec_sub(ix1,jx2);
18567 dx13 = vec_sub(ix1,jx3);
18568 dy11 = vec_sub(iy1,jy1);
18569 dy12 = vec_sub(iy1,jy2);
18570 dy13 = vec_sub(iy1,jy3);
18571 dz11 = vec_sub(iz1,jz1);
18572 dz12 = vec_sub(iz1,jz2);
18573 dz13 = vec_sub(iz1,jz3);
18574 dx21 = vec_sub(ix2,jx1);
18575 dx22 = vec_sub(ix2,jx2);
18576 dx23 = vec_sub(ix2,jx3);
18577 dy21 = vec_sub(iy2,jy1);
18578 dy22 = vec_sub(iy2,jy2);
18579 dy23 = vec_sub(iy2,jy3);
18580 dz21 = vec_sub(iz2,jz1);
18581 dz22 = vec_sub(iz2,jz2);
18582 dz23 = vec_sub(iz2,jz3);
18583 dx31 = vec_sub(ix3,jx1);
18584 dx32 = vec_sub(ix3,jx2);
18585 dx33 = vec_sub(ix3,jx3);
18586 dy31 = vec_sub(iy3,jy1);
18587 dy32 = vec_sub(iy3,jy2);
18588 dy33 = vec_sub(iy3,jy3);
18589 dz31 = vec_sub(iz3,jz1);
18590 dz32 = vec_sub(iz3,jz2);
18591 dz33 = vec_sub(iz3,jz3);
18593 rsq11 = vec_madd(dx11,dx11,nul);
18594 rsq12 = vec_madd(dx12,dx12,nul);
18595 rsq13 = vec_madd(dx13,dx13,nul);
18596 rsq21 = vec_madd(dx21,dx21,nul);
18597 rsq22 = vec_madd(dx22,dx22,nul);
18598 rsq23 = vec_madd(dx23,dx23,nul);
18599 rsq31 = vec_madd(dx31,dx31,nul);
18600 rsq32 = vec_madd(dx32,dx32,nul);
18601 rsq33 = vec_madd(dx33,dx33,nul);
18602 rsq11 = vec_madd(dy11,dy11,rsq11);
18603 rsq12 = vec_madd(dy12,dy12,rsq12);
18604 rsq13 = vec_madd(dy13,dy13,rsq13);
18605 rsq21 = vec_madd(dy21,dy21,rsq21);
18606 rsq22 = vec_madd(dy22,dy22,rsq22);
18607 rsq23 = vec_madd(dy23,dy23,rsq23);
18608 rsq31 = vec_madd(dy31,dy31,rsq31);
18609 rsq32 = vec_madd(dy32,dy32,rsq32);
18610 rsq33 = vec_madd(dy33,dy33,rsq33);
18611 rsq11 = vec_madd(dz11,dz11,rsq11);
18612 rsq12 = vec_madd(dz12,dz12,rsq12);
18613 rsq13 = vec_madd(dz13,dz13,rsq13);
18614 rsq21 = vec_madd(dz21,dz21,rsq21);
18615 rsq22 = vec_madd(dz22,dz22,rsq22);
18616 rsq23 = vec_madd(dz23,dz23,rsq23);
18617 rsq31 = vec_madd(dz31,dz31,rsq31);
18618 rsq32 = vec_madd(dz32,dz32,rsq32);
18619 rsq33 = vec_madd(dz33,dz33,rsq33);
18621 zero_highest_element_in_9_vectors(&rsq11,&rsq12,&rsq13,
18622 &rsq21,&rsq22,&rsq23,
18623 &rsq31,&rsq32,&rsq33);
18625 do_9_invsqrt(rsq11,rsq12,rsq13,
18626 rsq21,rsq22,rsq23,
18627 rsq31,rsq32,rsq33,
18628 &rinv11,&rinv12,&rinv13,
18629 &rinv21,&rinv22,&rinv23,
18630 &rinv31,&rinv32,&rinv33);
18632 zero_highest_element_in_9_vectors(&rinv11,&rinv12,&rinv13,
18633 &rinv21,&rinv22,&rinv23,
18634 &rinv31,&rinv32,&rinv33);
18636 r11 = vec_madd(rsq11,rinv11,nul);
18637 r12 = vec_madd(rsq12,rinv12,nul);
18638 r13 = vec_madd(rsq13,rinv13,nul);
18639 r21 = vec_madd(rsq21,rinv21,nul);
18640 r22 = vec_madd(rsq22,rinv22,nul);
18641 r23 = vec_madd(rsq23,rinv23,nul);
18642 r31 = vec_madd(rsq31,rinv31,nul);
18643 r32 = vec_madd(rsq32,rinv32,nul);
18644 r33 = vec_madd(rsq33,rinv33,nul);
18646 do_vonly_3_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
18647 &VV11c,&VVd,&VVr);
18648 do_vonly_3_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18649 do_vonly_3_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18650 do_vonly_3_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18651 do_vonly_3_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18652 do_vonly_3_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18653 do_vonly_3_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18654 do_vonly_3_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18655 do_vonly_3_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18657 vnbtot = vec_madd(c6t,VVd,vnbtot);
18658 vnbtot = vec_madd(c12t,VVr,vnbtot);
18659 vctot = vec_madd(qqOOt,VV11c,vctot);
18660 vctot = vec_madd(qqOHt,VV12c,vctot);
18661 vctot = vec_madd(qqOHt,VV13c,vctot);
18662 vctot = vec_madd(qqOHt,VV21c,vctot);
18663 vctot = vec_madd(qqHHt,VV22c,vctot);
18664 vctot = vec_madd(qqHHt,VV23c,vctot);
18665 vctot = vec_madd(qqOHt,VV31c,vctot);
18666 vctot = vec_madd(qqHHt,VV32c,vctot);
18667 vctot = vec_madd(qqHHt,VV33c,vctot);
18668 } else if(k<(nj1-1)) {
18669 jnra = jjnr[k];
18670 jnrb = jjnr[k+1];
18671 j3a = 3*jnra;
18672 j3b = 3*jnrb;
18673 load_2_water(pos+j3a,pos+j3b,
18674 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18675 qqOOt = vec_sld(qqOO,nul,8);
18676 qqOHt = vec_sld(qqOH,nul,8);
18677 qqHHt = vec_sld(qqHH,nul,8);
18678 c6t = vec_sld(c6,nul,8);
18679 c12t = vec_sld(c12,nul,8);
18681 dx11 = vec_sub(ix1,jx1);
18682 dx12 = vec_sub(ix1,jx2);
18683 dx13 = vec_sub(ix1,jx3);
18684 dy11 = vec_sub(iy1,jy1);
18685 dy12 = vec_sub(iy1,jy2);
18686 dy13 = vec_sub(iy1,jy3);
18687 dz11 = vec_sub(iz1,jz1);
18688 dz12 = vec_sub(iz1,jz2);
18689 dz13 = vec_sub(iz1,jz3);
18690 dx21 = vec_sub(ix2,jx1);
18691 dx22 = vec_sub(ix2,jx2);
18692 dx23 = vec_sub(ix2,jx3);
18693 dy21 = vec_sub(iy2,jy1);
18694 dy22 = vec_sub(iy2,jy2);
18695 dy23 = vec_sub(iy2,jy3);
18696 dz21 = vec_sub(iz2,jz1);
18697 dz22 = vec_sub(iz2,jz2);
18698 dz23 = vec_sub(iz2,jz3);
18699 dx31 = vec_sub(ix3,jx1);
18700 dx32 = vec_sub(ix3,jx2);
18701 dx33 = vec_sub(ix3,jx3);
18702 dy31 = vec_sub(iy3,jy1);
18703 dy32 = vec_sub(iy3,jy2);
18704 dy33 = vec_sub(iy3,jy3);
18705 dz31 = vec_sub(iz3,jz1);
18706 dz32 = vec_sub(iz3,jz2);
18707 dz33 = vec_sub(iz3,jz3);
18709 rsq11 = vec_madd(dx11,dx11,nul);
18710 rsq12 = vec_madd(dx12,dx12,nul);
18711 rsq13 = vec_madd(dx13,dx13,nul);
18712 rsq21 = vec_madd(dx21,dx21,nul);
18713 rsq22 = vec_madd(dx22,dx22,nul);
18714 rsq23 = vec_madd(dx23,dx23,nul);
18715 rsq31 = vec_madd(dx31,dx31,nul);
18716 rsq32 = vec_madd(dx32,dx32,nul);
18717 rsq33 = vec_madd(dx33,dx33,nul);
18718 rsq11 = vec_madd(dy11,dy11,rsq11);
18719 rsq12 = vec_madd(dy12,dy12,rsq12);
18720 rsq13 = vec_madd(dy13,dy13,rsq13);
18721 rsq21 = vec_madd(dy21,dy21,rsq21);
18722 rsq22 = vec_madd(dy22,dy22,rsq22);
18723 rsq23 = vec_madd(dy23,dy23,rsq23);
18724 rsq31 = vec_madd(dy31,dy31,rsq31);
18725 rsq32 = vec_madd(dy32,dy32,rsq32);
18726 rsq33 = vec_madd(dy33,dy33,rsq33);
18727 rsq11 = vec_madd(dz11,dz11,rsq11);
18728 rsq12 = vec_madd(dz12,dz12,rsq12);
18729 rsq13 = vec_madd(dz13,dz13,rsq13);
18730 rsq21 = vec_madd(dz21,dz21,rsq21);
18731 rsq22 = vec_madd(dz22,dz22,rsq22);
18732 rsq23 = vec_madd(dz23,dz23,rsq23);
18733 rsq31 = vec_madd(dz31,dz31,rsq31);
18734 rsq32 = vec_madd(dz32,dz32,rsq32);
18735 rsq33 = vec_madd(dz33,dz33,rsq33);
18737 zero_highest_2_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
18738 &rsq21,&rsq22,&rsq23,
18739 &rsq31,&rsq32,&rsq33);
18741 do_9_invsqrt(rsq11,rsq12,rsq13,
18742 rsq21,rsq22,rsq23,
18743 rsq31,rsq32,rsq33,
18744 &rinv11,&rinv12,&rinv13,
18745 &rinv21,&rinv22,&rinv23,
18746 &rinv31,&rinv32,&rinv33);
18748 zero_highest_2_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
18749 &rinv21,&rinv22,&rinv23,
18750 &rinv31,&rinv32,&rinv33);
18752 r11 = vec_madd(rsq11,rinv11,nul);
18753 r12 = vec_madd(rsq12,rinv12,nul);
18754 r13 = vec_madd(rsq13,rinv13,nul);
18755 r21 = vec_madd(rsq21,rinv21,nul);
18756 r22 = vec_madd(rsq22,rinv22,nul);
18757 r23 = vec_madd(rsq23,rinv23,nul);
18758 r31 = vec_madd(rsq31,rinv31,nul);
18759 r32 = vec_madd(rsq32,rinv32,nul);
18760 r33 = vec_madd(rsq33,rinv33,nul);
18762 do_vonly_2_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
18763 &VV11c,&VVd,&VVr);
18764 do_vonly_2_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18765 do_vonly_2_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18766 do_vonly_2_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18767 do_vonly_2_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18768 do_vonly_2_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18769 do_vonly_2_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18770 do_vonly_2_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18771 do_vonly_2_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18773 vnbtot = vec_madd(c6t,VVd,vnbtot);
18774 vnbtot = vec_madd(c12t,VVr,vnbtot);
18775 vctot = vec_madd(qqOOt,VV11c,vctot);
18776 vctot = vec_madd(qqOHt,VV12c,vctot);
18777 vctot = vec_madd(qqOHt,VV13c,vctot);
18778 vctot = vec_madd(qqOHt,VV21c,vctot);
18779 vctot = vec_madd(qqHHt,VV22c,vctot);
18780 vctot = vec_madd(qqHHt,VV23c,vctot);
18781 vctot = vec_madd(qqOHt,VV31c,vctot);
18782 vctot = vec_madd(qqHHt,VV32c,vctot);
18783 vctot = vec_madd(qqHHt,VV33c,vctot);
18784 } else if(k<nj1) {
18785 jnra = jjnr[k];
18786 j3a = 3*jnra;
18787 load_1_water(pos+j3a,
18788 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
18789 qqOOt = vec_sld(qqOO,nul,12);
18790 qqOHt = vec_sld(qqOH,nul,12);
18791 qqHHt = vec_sld(qqHH,nul,12);
18792 c6t = vec_sld(c6,nul,12);
18793 c12t = vec_sld(c12,nul,12);
18795 dx11 = vec_sub(ix1,jx1);
18796 dx12 = vec_sub(ix1,jx2);
18797 dx13 = vec_sub(ix1,jx3);
18798 dy11 = vec_sub(iy1,jy1);
18799 dy12 = vec_sub(iy1,jy2);
18800 dy13 = vec_sub(iy1,jy3);
18801 dz11 = vec_sub(iz1,jz1);
18802 dz12 = vec_sub(iz1,jz2);
18803 dz13 = vec_sub(iz1,jz3);
18804 dx21 = vec_sub(ix2,jx1);
18805 dx22 = vec_sub(ix2,jx2);
18806 dx23 = vec_sub(ix2,jx3);
18807 dy21 = vec_sub(iy2,jy1);
18808 dy22 = vec_sub(iy2,jy2);
18809 dy23 = vec_sub(iy2,jy3);
18810 dz21 = vec_sub(iz2,jz1);
18811 dz22 = vec_sub(iz2,jz2);
18812 dz23 = vec_sub(iz2,jz3);
18813 dx31 = vec_sub(ix3,jx1);
18814 dx32 = vec_sub(ix3,jx2);
18815 dx33 = vec_sub(ix3,jx3);
18816 dy31 = vec_sub(iy3,jy1);
18817 dy32 = vec_sub(iy3,jy2);
18818 dy33 = vec_sub(iy3,jy3);
18819 dz31 = vec_sub(iz3,jz1);
18820 dz32 = vec_sub(iz3,jz2);
18821 dz33 = vec_sub(iz3,jz3);
18823 rsq11 = vec_madd(dx11,dx11,nul);
18824 rsq12 = vec_madd(dx12,dx12,nul);
18825 rsq13 = vec_madd(dx13,dx13,nul);
18826 rsq21 = vec_madd(dx21,dx21,nul);
18827 rsq22 = vec_madd(dx22,dx22,nul);
18828 rsq23 = vec_madd(dx23,dx23,nul);
18829 rsq31 = vec_madd(dx31,dx31,nul);
18830 rsq32 = vec_madd(dx32,dx32,nul);
18831 rsq33 = vec_madd(dx33,dx33,nul);
18832 rsq11 = vec_madd(dy11,dy11,rsq11);
18833 rsq12 = vec_madd(dy12,dy12,rsq12);
18834 rsq13 = vec_madd(dy13,dy13,rsq13);
18835 rsq21 = vec_madd(dy21,dy21,rsq21);
18836 rsq22 = vec_madd(dy22,dy22,rsq22);
18837 rsq23 = vec_madd(dy23,dy23,rsq23);
18838 rsq31 = vec_madd(dy31,dy31,rsq31);
18839 rsq32 = vec_madd(dy32,dy32,rsq32);
18840 rsq33 = vec_madd(dy33,dy33,rsq33);
18841 rsq11 = vec_madd(dz11,dz11,rsq11);
18842 rsq12 = vec_madd(dz12,dz12,rsq12);
18843 rsq13 = vec_madd(dz13,dz13,rsq13);
18844 rsq21 = vec_madd(dz21,dz21,rsq21);
18845 rsq22 = vec_madd(dz22,dz22,rsq22);
18846 rsq23 = vec_madd(dz23,dz23,rsq23);
18847 rsq31 = vec_madd(dz31,dz31,rsq31);
18848 rsq32 = vec_madd(dz32,dz32,rsq32);
18849 rsq33 = vec_madd(dz33,dz33,rsq33);
18851 zero_highest_3_elements_in_9_vectors(&rsq11,&rsq12,&rsq13,
18852 &rsq21,&rsq22,&rsq23,
18853 &rsq31,&rsq32,&rsq33);
18855 do_9_invsqrt(rsq11,rsq12,rsq13,
18856 rsq21,rsq22,rsq23,
18857 rsq31,rsq32,rsq33,
18858 &rinv11,&rinv12,&rinv13,
18859 &rinv21,&rinv22,&rinv23,
18860 &rinv31,&rinv32,&rinv33);
18862 zero_highest_3_elements_in_9_vectors(&rinv11,&rinv12,&rinv13,
18863 &rinv21,&rinv22,&rinv23,
18864 &rinv31,&rinv32,&rinv33);
18866 r11 = vec_madd(rsq11,rinv11,nul);
18867 r12 = vec_madd(rsq12,rinv12,nul);
18868 r13 = vec_madd(rsq13,rinv13,nul);
18869 r21 = vec_madd(rsq21,rinv21,nul);
18870 r22 = vec_madd(rsq22,rinv22,nul);
18871 r23 = vec_madd(rsq23,rinv23,nul);
18872 r31 = vec_madd(rsq31,rinv31,nul);
18873 r32 = vec_madd(rsq32,rinv32,nul);
18874 r33 = vec_madd(rsq33,rinv33,nul);
18876 do_vonly_1_ljctable_coul_and_lj(VFtab,vec_madd(r11,tsc,nul),
18877 &VV11c,&VVd,&VVr);
18878 do_vonly_1_ljctable_coul(VFtab,vec_madd(r12,tsc,nul),&VV12c);
18879 do_vonly_1_ljctable_coul(VFtab,vec_madd(r13,tsc,nul),&VV13c);
18880 do_vonly_1_ljctable_coul(VFtab,vec_madd(r21,tsc,nul),&VV21c);
18881 do_vonly_1_ljctable_coul(VFtab,vec_madd(r22,tsc,nul),&VV22c);
18882 do_vonly_1_ljctable_coul(VFtab,vec_madd(r23,tsc,nul),&VV23c);
18883 do_vonly_1_ljctable_coul(VFtab,vec_madd(r31,tsc,nul),&VV31c);
18884 do_vonly_1_ljctable_coul(VFtab,vec_madd(r32,tsc,nul),&VV32c);
18885 do_vonly_1_ljctable_coul(VFtab,vec_madd(r33,tsc,nul),&VV33c);
18887 vnbtot = vec_madd(c6t,VVd,vnbtot);
18888 vnbtot = vec_madd(c12t,VVr,vnbtot);
18889 vctot = vec_madd(qqOOt,VV11c,vctot);
18890 vctot = vec_madd(qqOHt,VV12c,vctot);
18891 vctot = vec_madd(qqOHt,VV13c,vctot);
18892 vctot = vec_madd(qqOHt,VV21c,vctot);
18893 vctot = vec_madd(qqHHt,VV22c,vctot);
18894 vctot = vec_madd(qqHHt,VV23c,vctot);
18895 vctot = vec_madd(qqOHt,VV31c,vctot);
18896 vctot = vec_madd(qqHHt,VV32c,vctot);
18897 vctot = vec_madd(qqHHt,VV33c,vctot);
18899 /* update outer data */
18900 add_vector_to_float(Vc+gid[n],vctot);
18901 add_vector_to_float(Vnb+gid[n],vnbtot);