Removed genalg.h by mistake
[gromacs.git] / include / ppc_altivec.h
blob03f2ab722144ec591504b37653f382b89f2cd919
1 /*
2 * $Id$
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.1
11 * Copyright (c) 1991-2001, University of Groningen, The Netherlands
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * If you want to redistribute modifications, please consider that
18 * scientific software is very special. Version control is crucial -
19 * bugs must be traceable. We will be happy to consider code for
20 * inclusion in the official distribution, but derived work must not
21 * be called official GROMACS. Details are found in the README & COPYING
22 * files - if they are missing, get the official version at www.gromacs.org.
24 * To help us fund GROMACS development, we humbly ask that you cite
25 * the papers on the package - you can find them in the top README file.
27 * For more info, check our website at http://www.gromacs.org
29 * And Hey:
30 * Getting the Right Output Means no Artefacts in Calculating Stuff
33 #ifndef _ppc_altivec_h
34 #define _ppc_altivec_h
36 static char *SRCID_ppc_altivec_h = "$Id$";
37 #ifdef HAVE_CONFIG_H
38 #include <config.h>
39 #endif
40 #include<stdio.h>
44 static void printvec(vector float v)
46 int i;
47 printf(" ");
49 for(i=0;i<4;i++)
50 printf("%8.5f ",*(((float *)&v)+i));
51 printf("\n");
/* Altivec utility routines */

/* Enable non-Java (non-IEEE) floating-point mode by setting the NJ
 * bit of the VSCR. The constant 0x10000 (bit 16) is built entirely in
 * registers as (1 << 8) << 8. All other VSCR bits are preserved.
 */
static void set_non_java_mode(void)
{
    vector unsigned short vsr1,vsr2;
    vector unsigned int tmp;

    vsr1=vec_mfvscr();                                  /* read current VSCR */
    tmp=vec_sl(vec_splat_u32(1),vec_splat_u32(8));      /* 1 << 8 */
    vsr2=(vector unsigned short)vec_sl(tmp,vec_splat_u32(8)); /* << 8 again: bit 16, the NJ bit */
    vsr1=vec_or(vsr1,vsr2);                             /* set NJ, keep the rest */
    vec_mtvscr(vsr1);                                   /* write back */
}
68 /* Simple numerical constants */
69 static inline vector float vec_zero(void)
71 return vec_ctf(vec_splat_u32(0),0);
74 static inline vector float vec_half(void)
75 { /* 0.5 */
76 return vec_ctf(vec_splat_u32(1),1);
79 static inline vector float vec_one(void)
81 return vec_ctf(vec_splat_u32(1),0);
84 static inline vector float vec_two(void)
86 return vec_ctf(vec_splat_u32(2),0);
89 static inline vector float vec_three(void)
91 return vec_ctf(vec_splat_u32(3),0);
94 static inline vector float vec_six(void)
96 return vec_ctf(vec_splat_u32(6),0);
99 static inline vector float vec_twelve(void)
101 return vec_ctf(vec_splat_u32(12),0);
/* load three consecutive floats. We never access the fourth,
 * so this is safe even at the end of an array.
 *
 * Returns [ x y z ? ] where ? is undefined. Uses element loads
 * (vec_lde) so only the three addressed floats are ever touched.
 */
static inline vector float load_xyz(float *address)
{
    vector float c1,c2,c3;
    vector unsigned char perm;

    perm = vec_lvsl( 0, address );
    c1   = vec_lde( 0, address );
    c2   = vec_lde( 4, address );
    c3   = vec_lde( 8, address );
    c1   = vec_perm(c1,c1,perm);      /* x rotated into element 0 */
    c2   = vec_perm(c2,c2,perm);      /* y now in element 1 */
    c3   = vec_perm(c3,c3,perm);      /* z now in element 2 */
    c2   = vec_sld(c2,c2,4);          /* y to element 0 */
    c3   = vec_sld(c3,c3,8);          /* z to element 0 */
    c1   = vec_mergeh(c1,c3);         /* [ x z ? ? ] */

    return vec_mergeh(c1,c2);         /* [ x y z ? ] */
}
/* load four consecutive floats (from unaligned address) */

/* Classic AltiVec unaligned load: vec_ld rounds the address down to a
 * 16-byte boundary, so read the two aligned blocks that cover the data
 * and shift the wanted 16 bytes into place with vec_perm/vec_lvsl.
 */
static inline vector float load_vector_unaligned(float *address)
{
    vector unsigned char perm;
    vector float low,high;

    perm = vec_lvsl( 0, (int *) address );
    low  = vec_ld( 0, address );
    high = vec_ld( 16, address );

    return vec_perm(low,high,perm);
}
/* load a single float and spread it to all vector elements */
static inline vector float load_float_and_splat(float *address)
{
    vector unsigned char perm;
    vector float tmp;

    tmp  = vec_lde(0,address);       /* value lands in the element matching its address */
    perm = vec_lvsl(0,address);
    tmp  = vec_perm(tmp,tmp,perm);   /* rotate it into element 0 */

    return vec_splat(tmp,0);         /* broadcast element 0 to all four */
}
/* load four non-consecutive floats into vector [mem1 mem2 mem3 mem4] */
static inline vector float load_4_float(float *float1,
                                        float *float2,
                                        float *float3,
                                        float *float4)
{
    /* lvsl offsets 12/12/0/0 position each scalar (after the permutes)
     * so the merges and final shift can pack them in order.
     */
    vector unsigned char xshift = vec_lvsl( 12, float1 );
    vector unsigned char yshift = vec_lvsl( 12, float2 );
    vector unsigned char zshift = vec_lvsl( 0, float3 );
    vector unsigned char wshift = vec_lvsl( 0, float4 );

    vector float X = vec_lde( 0, float1 );
    vector float Y = vec_lde( 0, float2 );
    vector float Z = vec_lde( 0, float3 );
    vector float W = vec_lde( 0, float4 );

    X = vec_perm( X, X, xshift);      /* x in element 1 */
    Y = vec_perm( Y, Y, yshift);      /* y in element 1 */
    Z = vec_perm( Z, Z, zshift);      /* z in element 0 */
    W = vec_perm( W, W, wshift);      /* w in element 0 */

    X = vec_mergeh( X, Y );           /* [ ? ? x y ] */
    Z = vec_mergeh( Z, W );           /* [ z w ? ? ] */

    return vec_sld( X, Z, 8 );        /* [ x y z w ] */
}
/* load three non-consecutive floats into vector [mem1 mem2 mem3 ?]
 * (the fourth element is undefined).
 */
static inline vector float load_3_float(float *float1,
                                        float *float2,
                                        float *float3)
{
    vector unsigned char xshift = vec_lvsl( 12, float1 );
    vector unsigned char yshift = vec_lvsl( 12, float2 );
    vector unsigned char zshift = vec_lvsl( 0, float3 );

    vector float X = vec_lde( 0, float1 );
    vector float Y = vec_lde( 0, float2 );
    vector float Z = vec_lde( 0, float3 );

    X = vec_perm( X, X, xshift);      /* x in element 1 */
    Y = vec_perm( Y, Y, yshift);      /* y in element 1 */
    Z = vec_perm( Z, Z, zshift);      /* z in element 0 */

    X = vec_mergeh( X, Y );           /* [ ? ? x y ] */

    return vec_sld( X, Z, 8 );        /* [ x y z ? ] */
}
/* load two non-consecutive floats into vector [mem1 mem2 ? ?]
 * (trailing elements are undefined, not zeroed).
 */
static inline vector float load_2_float(float *float1,
                                        float *float2)
{
    vector unsigned char xshift = vec_lvsl( 8, float1 );
    vector unsigned char yshift = vec_lvsl( 8, float2 );

    vector float X = vec_lde( 0, float1 );
    vector float Y = vec_lde( 0, float2 );

    X = vec_perm( X, X, xshift);      /* x in element 2 */
    Y = vec_perm( Y, Y, yshift);      /* y in element 2 */

    return vec_mergel( X, Y );        /* [ x y ? ? ] */
}
/* load one float into vector [mem1 0 0 0] */
static inline vector float load_1_float(float *float1)
{
    vector unsigned char xshift = vec_lvsl( 4, float1 );

    vector float X = vec_lde( 0, float1 );
    X = vec_perm( X, X, xshift);           /* x in element 3 */

    return vec_sld(X,vec_zero(),12);       /* [ x 0 0 0 ] */
}
/* load lj parameters for four atoms and store in two vectors.
 * The nbfp memory is aligned on an 8-byte boundary and consists
 * of pairs, so the two parameters are either in the upper or
 * lower half of a 16-byte structure.
 *
 * On return *c6 = [c6a c6b c6c c6d] and *c12 = [c12a c12b c12c c12d].
 */
static inline void load_4_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               float *pair4,
                               vector float *c6,
                               vector float *c12)
{
    vector float X = vec_ld( 0,pair1);  /* c6a c12a */
    vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
    vector float Z = vec_ld( 0,pair3);  /* c6c c12c */
    vector float W = vec_ld( 0,pair4);  /* c6d c12d */
    vector unsigned char perm1 = vec_lvsl(0,pair1);
    vector unsigned char perm2 = vec_lvsl(0,pair2);
    vector unsigned char perm3 = vec_lvsl(0,pair3);
    vector unsigned char perm4 = vec_lvsl(0,pair4);

    /* rotate each pair into the low half of its vector */
    X = vec_perm(X,X,perm1);
    Y = vec_perm(Y,Y,perm2);
    Z = vec_perm(Z,Z,perm3);
    W = vec_perm(W,W,perm4);

    X = vec_mergeh(X,Z);   /* c6a c6c c12a c12c */
    Y = vec_mergeh(Y,W);   /* c6b c6d c12b c12d */

    *c6  = vec_mergeh(X,Y);   /* c6a  c6b  c6c  c6d  */
    *c12 = vec_mergel(X,Y);   /* c12a c12b c12c c12d */
}
/* load lj parameters for three atoms and store in two vectors.
 * See load_4_pair for alignment requirements. The fourth element
 * of *c6 and *c12 is zero.
 */
static inline void load_3_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               vector float *c6,
                               vector float *c12)
{
    vector float X = vec_ld( 0,pair1);  /* c6a c12a */
    vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
    vector float Z = vec_ld( 0,pair3);  /* c6c c12c */
    vector unsigned char perm1 = vec_lvsl(0,pair1);
    vector unsigned char perm2 = vec_lvsl(0,pair2);
    vector unsigned char perm3 = vec_lvsl(0,pair3);

    /* rotate each pair into the low half of its vector */
    X = vec_perm(X,X,perm1);
    Y = vec_perm(Y,Y,perm2);
    Z = vec_perm(Z,Z,perm3);

    X = vec_mergeh(X,Z);           /* c6a c6c c12a c12c */
    Y = vec_mergeh(Y,vec_zero());  /* c6b 0   c12b 0    */

    *c6  = vec_mergeh(X,Y);   /* c6a  c6b  c6c  0 */
    *c12 = vec_mergel(X,Y);   /* c12a c12b c12c 0 */
}
/* load lj parameters for two atoms and store in two vectors.
 * See load_4_pair for alignment requirements. Elements 2-3 of
 * *c6 and *c12 are zero.
 */
static inline void load_2_pair(float *pair1,
                               float *pair2,
                               vector float *c6,
                               vector float *c12)
{
    vector float X = vec_ld( 0,pair1);  /* c6a c12a */
    vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
    vector unsigned char perm1 = vec_lvsl(0,pair1);
    vector unsigned char perm2 = vec_lvsl(0,pair2);

    /* rotate each pair into the low half of its vector */
    X = vec_perm(X,X,perm1);
    Y = vec_perm(Y,Y,perm2);

    X = vec_mergeh(X,vec_zero());  /* c6a 0 c12a 0 */
    Y = vec_mergeh(Y,vec_zero());  /* c6b 0 c12b 0 */

    *c6  = vec_mergeh(X,Y);   /* c6a  c6b  0 0 */
    *c12 = vec_mergel(X,Y);   /* c12a c12b 0 0 */
}
/* load lj parameters for one atom and store in two vectors.
 * See load_4_pair for alignment requirements. Elements 1-3 of
 * *c6 and *c12 are zero.
 */
static inline void load_1_pair(float *pair1,
                               vector float *c6,
                               vector float *c12)
{
    vector float X = vec_ld( 0,pair1);  /* c6a c12a */
    vector unsigned char perm1 = vec_lvsl(0,pair1);
    X = vec_perm(X,X,perm1);            /* pair rotated into low half */
    X = vec_mergeh(X,vec_zero());       /* c6a 0 c12a 0 */

    *c6  = vec_mergeh(X,vec_zero());    /* c6a  0 0 0 */
    *c12 = vec_mergel(X,vec_zero());    /* c12a 0 0 0 */
}
335 * permute [x y z ?] into [x x x x] [y y y y] [z z z z]
336 * xreg can be the same as xyzreg, but not yreg/zreg!
338 static inline void splat_xyz_to_vectors(vector float xyzreg,
339 vector float *xreg,
340 vector float *yreg,
341 vector float *zreg)
343 *zreg = vec_splat(xyzreg,2);
344 *yreg = vec_splat(xyzreg,1);
345 *xreg = vec_splat(xyzreg,0);
/* The following transpose routines do not fill with zero padding:
 *
 *   1_to_3
 *   2_to_3
 *   1_to_4
 *   2_to_4
 *
 * ...but the rest do.
 */
/* move 4*[x y z ?] into [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4]
 *
 * Standard merge-network transpose; the fourth element of each input
 * is discarded. yvector/zvector are used as scratch before receiving
 * their final values, so they must not alias the inputs.
 */
static inline void transpose_4_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float xyz3,
                                    vector float xyz4,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
    *yvector = vec_mergeh(xyz1,xyz3);          /* [x1 x3 y1 y3] */
    *zvector = vec_mergeh(xyz2,xyz4);          /* [x2 x4 y2 y4] */
    xyz1     = vec_mergel(xyz1,xyz3);          /* [z1 z3 ?  ? ] */
    xyz2     = vec_mergel(xyz2,xyz4);          /* [z2 z4 ?  ? ] */
    *xvector = vec_mergeh(*yvector,*zvector);  /* [x1 x2 x3 x4] */
    *yvector = vec_mergel(*yvector,*zvector);  /* [y1 y2 y3 y4] */
    *zvector = vec_mergeh(xyz1,xyz2);          /* [z1 z2 z3 z4] */
}
/* move 2*[x y z ?] into [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?]
 * (trailing elements are not zeroed).
 */
static inline void transpose_2_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
    *xvector = vec_mergeh(xyz1,xyz2);         /* x1 x2 y1 y2 */
    *zvector = vec_mergel(xyz1,xyz2);         /* z1 z2 ?  ?  */
    *yvector = vec_sld(*xvector,*xvector,8);  /* y1 y2 x1 x2 - trailing elements unused */
}
394 /* move [x y z ?] into [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?]
396 static inline void transpose_1_to_3(vector float xyz1,
397 vector float *xvector,
398 vector float *yvector,
399 vector float *zvector)
401 /* simply use splat, since elem 2,3,4 dont matter. */
402 *xvector = vec_splat(xyz1,0); /* x1 x1 x1 x1 */
403 *yvector = vec_splat(xyz1,1); /* y1 y1 y1 y1 */
404 *zvector = vec_splat(xyz1,2); /* z1 z1 z1 z1 */
/* move [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] to 4*[x y z 0]
 * (the fourth element of each output is zeroed).
 * xyz2/xyz4 are used as scratch before their final assignment, so
 * the outputs must not alias the inputs.
 */
static inline void transpose_3_to_4(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2,
                                    vector float *xyz3,
                                    vector float *xyz4)
{
    vector float tmp1,tmp2;
    vector float nul=vec_zero();

    *xyz2 = vec_mergeh(xvector,zvector);  /* [x1 z1 x2 z2 ] */
    tmp1  = vec_mergeh(yvector,nul);      /* [y1 0  y2 0  ] */
    *xyz4 = vec_mergel(xvector,zvector);  /* [x3 z3 x4 z4 ] */
    tmp2  = vec_mergel(yvector,nul);      /* [y3 0  y4 0  ] */

    *xyz1 = vec_mergeh(*xyz2,tmp1);       /* [x1 y1 z1 0 ] */
    *xyz2 = vec_mergel(*xyz2,tmp1);       /* [x2 y2 z2 0 ] */
    *xyz3 = vec_mergeh(*xyz4,tmp2);       /* [x3 y3 z3 0 ] */
    *xyz4 = vec_mergel(*xyz4,tmp2);       /* [x4 y4 z4 0 ] */
}
/* move [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] to 2*[x y z 0]
 * (the fourth element of each output is zeroed).
 */
static inline void transpose_3_to_2(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2)
{
    vector float tmp1;

    *xyz2 = vec_mergeh(xvector,zvector);      /* [x1 z1 x2 z2 ] */
    tmp1  = vec_mergeh(yvector,vec_zero());   /* [y1 0  y2 0  ] */

    *xyz1 = vec_mergeh(*xyz2,tmp1);           /* [x1 y1 z1 0 ] */
    *xyz2 = vec_mergel(*xyz2,tmp1);           /* [x2 y2 z2 0 ] */
}
/* move [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?] to 1*[x y z 0]
 * (the fourth element of the output is zeroed).
 */
static inline void transpose_3_to_1(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1)
{
    *xyz1   = vec_mergeh(xvector,zvector);        /* [x1 z1 ? ? ] */
    yvector = vec_mergeh(yvector,vec_zero());     /* [y1 0  ? 0 ] */
    *xyz1   = vec_mergeh(*xyz1,yvector);          /* [x1 y1 z1 0] */
}
/* Full 4x4 transpose: rows in1..in4 become columns out1..out4.
 * out2/out3 are used as scratch before their final assignment, so
 * the outputs must not alias the inputs.
 */
static inline void transpose_4_to_4(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
    *out2 = vec_mergeh(in1,in3);       /* [x1 x3 y1 y3] */
    *out3 = vec_mergeh(in2,in4);       /* [x2 x4 y2 y4] */
    in1   = vec_mergel(in1,in3);       /* [z1 z3 w1 w3] */
    in2   = vec_mergel(in2,in4);       /* [z2 z4 w2 w4] */

    *out1 = vec_mergeh(*out2,*out3);   /* [x1 x2 x3 x4] */
    *out2 = vec_mergel(*out2,*out3);   /* [y1 y2 y3 y4] */
    *out3 = vec_mergeh(in1,in2);       /* [z1 z2 z3 z4] */
    *out4 = vec_mergel(in1,in2);       /* [w1 w2 w3 w4] */
}
/* Transpose the first two elements of four vectors into two:
 * out1 = [x1 x2 x3 x4], out2 = [y1 y2 y3 y4].
 */
static inline void transpose_4_to_2(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2)
{
    vector float tmp;

    tmp   = vec_mergeh(in1,in3);     /* [x1 x3 y1 y3] */
    *out2 = vec_mergeh(in2,in4);     /* [x2 x4 y2 y4] */
    *out1 = vec_mergeh(tmp,*out2);   /* [x1 x2 x3 x4] */
    *out2 = vec_mergel(tmp,*out2);   /* [y1 y2 y3 y4] */
}
/* Gather element 0 of four vectors: out1 = [x1 x2 x3 x4]. */
static inline void transpose_4_to_1(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1)
{
    vector float tmp;

    tmp   = vec_mergeh(in1,in3);     /* [x1 x3 ? ?] */
    *out1 = vec_mergeh(in2,in4);     /* [x2 x4 ? ?] */
    *out1 = vec_mergeh(tmp,*out1);   /* [x1 x2 x3 x4] */
}
/* Spread two 4-element rows into four 2-element columns:
 * out1 = [x1 x2 . .], out2 = [y1 y2 . .], out3 = [z1 z2 . .],
 * out4 = [w1 w2 . .]. Trailing elements are NOT zero padded (the
 * shifted copies of out1/out3 end up there).
 */
static inline void transpose_2_to_4(vector float in1,
                                    vector float in2,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
    *out1 = vec_mergeh(in1,in2);        /* [x1 x2 y1 y2] */
    *out3 = vec_mergel(in1,in2);        /* [z1 z2 w1 w2] */
    *out2 = vec_sld(*out1,*out1,8);     /* [y1 y2 x1 x2] - trailing elements unused */
    *out4 = vec_sld(*out3,*out3,8);     /* [w1 w2 z1 z2] - trailing elements unused */
}
533 static inline void transpose_1_to_4(vector float in1,
534 vector float *out1,
535 vector float *out2,
536 vector float *out3,
537 vector float *out4)
539 *out1 = vec_splat(in1,0); /* [x1 x1 x1 x1] */
540 *out2 = vec_splat(in1,1); /* [y1 y1 y1 y1] */
541 *out3 = vec_splat(in1,2); /* [z1 z1 z1 z1] */
542 *out4 = vec_splat(in1,3); /* [w1 w1 w1 w1] */
/* Add the contents in xyz to an unaligned address location.
 * Uses element loads/stores (vec_lde/vec_ste) in a read-modify-write,
 * so only the three addressed floats are touched.
 */
static inline void add_xyz_to_mem(float *address,vector float vdata)
{
    vector float c1,c2,c3;

    c1 = vec_lde( 0, address);
    c2 = vec_lde( 4, address);
    c3 = vec_lde( 8, address);
    c1 = vec_add(c1,vec_splat(vdata,0));   /* += x */
    c2 = vec_add(c2,vec_splat(vdata,1));   /* += y */
    c3 = vec_add(c3,vec_splat(vdata,2));   /* += z */
    vec_ste( c1, 0, address);
    vec_ste( c2, 4, address);
    vec_ste( c3, 8, address);
}
/* Horizontally sum the four elements of vdata and add the result to
 * the single float at *address (read-modify-write with vec_lde/vec_ste).
 */
static inline void add_vector_to_float(float *address,vector float vdata)
{
    vector float tmp;

    /* log2 reduction: rotate by 8 then 4 bytes and accumulate */
    tmp   = vec_sld(vdata,vdata,8);
    vdata = vec_add(vdata,tmp);
    tmp   = vec_sld(vdata,vdata,4);
    vdata = vec_add(vdata,tmp);
    tmp   = vec_lde(0,address);
    /* all four positions in vdata contain the sum */
    tmp   = vec_add(tmp,vdata);
    vec_ste(tmp,0,address);
}
/* load entire water molecule to vectors. Water is special;
 * the allocated memory is 8-byte aligned, so with nine atoms
 * we can always read 3 16-byte chunks (rounded down by vec_ld)
 * without going outside the current page. The extra elements
 * before and after the water dont matter as long as we dont
 * try to write to them.
 *
 * Loads the 9 coordinates at `address`, adds the periodic shift
 * from `shiftvec` to each, and broadcasts every shifted coordinate
 * to all four elements of its output vector.
 */
static inline void load_1_water_shift_and_splat(float *address,
                                                float *shiftvec,
                                                vector float *Ox,
                                                vector float *Oy,
                                                vector float *Oz,
                                                vector float *H1x,
                                                vector float *H1y,
                                                vector float *H1z,
                                                vector float *H2x,
                                                vector float *H2y,
                                                vector float *H2z)
{
    vector unsigned char perm;
    vector float c1,c2,c3,sha,shb,sh1,sh2,sh3;

    /* load shift */
    shb = load_xyz(shiftvec);          /* [ shX shY shZ - ] */
    /* load the coordinates */
    perm = vec_lvsl( 0, (int *) address );
    c1 = vec_ld( 0, address );
    c2 = vec_ld( 16, address );
    c3 = vec_ld( 32, address );

    sha = vec_sld(shb,shb,12);         /* [ - shX shY shZ ] */

    /* replicate the shift so it lines up with the packed x/y/z
     * pattern of the three coordinate vectors below */
    sh1 = vec_sld(sha,shb,4);          /* [ shX shY shZ shX ] */
    sh2 = vec_sld(sha,shb,8);          /* [ shY shZ shX shY ] */
    sh3 = vec_sld(sha,shb,12);         /* [ shZ shX shY shZ ] */

    c1 = vec_perm(c1,c2,perm);         /* Ox  Oy  Oz  H1x */
    c2 = vec_perm(c2,c3,perm);         /* H1y H1z H2x H2y */
    c3 = vec_perm(c3,c3,perm);         /* H2z -   -   -   */
    c1 = vec_add(c1,sh1);              /* add the periodic shift */
    c2 = vec_add(c2,sh2);
    c3 = vec_add(c3,sh3);

    /* broadcast each shifted coordinate */
    *Ox  = vec_splat(c1,0);
    *Oy  = vec_splat(c1,1);
    *Oz  = vec_splat(c1,2);
    *H1x = vec_splat(c1,3);
    *H1y = vec_splat(c2,0);
    *H1z = vec_splat(c2,1);
    *H2x = vec_splat(c2,2);
    *H2y = vec_splat(c2,3);
    *H2z = vec_splat(c3,0);
}
/* Load four water molecules (9 consecutive floats each, labelled
 * a,b,c,d) and transpose them so each output vector holds the same
 * coordinate for all four molecules, e.g. *Ox = [Oxa Oxb Oxc Oxd].
 * See load_1_water_shift_and_splat for why the unaligned reads of
 * 3x16 bytes per molecule are safe.
 */
static inline void load_4_water(float *address1,
                                float *address2,
                                float *address3,
                                float *address4,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
    vector unsigned char perm;
    vector float tmp1,tmp2,tmp3;
    vector float c1a,c2a,c3a;
    vector float c1b,c2b,c3b;
    vector float c1c,c2c,c3c;
    vector float c1d,c2d,c3d;

    /* load the coordinates */
    perm = vec_lvsl( 0, address1 );
    tmp1 = vec_ld( 0, address1 );
    tmp2 = vec_ld( 16, address1 );
    tmp3 = vec_ld( 32, address1 );
    c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa  Oya  Oza  H1xa */
    c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
    c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za -    -    -    */

    perm = vec_lvsl( 0, address2 );
    tmp1 = vec_ld( 0, address2 );
    tmp2 = vec_ld( 16, address2 );
    tmp3 = vec_ld( 32, address2 );
    c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb  Oyb  Ozb  H1xb */
    c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
    c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb -    -    -    */

    perm = vec_lvsl( 0, address3 );
    tmp1 = vec_ld( 0, address3 );
    tmp2 = vec_ld( 16, address3 );
    tmp3 = vec_ld( 32, address3 );
    c1c  = vec_perm(tmp1,tmp2,perm);   /* Oxc  Oyc  Ozc  H1xc */
    c2c  = vec_perm(tmp2,tmp3,perm);   /* H1yc H1zc H2xc H2yc */
    c3c  = vec_perm(tmp3,tmp3,perm);   /* H2zc -    -    -    */

    perm = vec_lvsl( 0, address4 );
    tmp1 = vec_ld( 0, address4 );
    tmp2 = vec_ld( 16, address4 );
    tmp3 = vec_ld( 32, address4 );
    c1d  = vec_perm(tmp1,tmp2,perm);   /* Oxd  Oyd  Ozd  H1xd */
    c2d  = vec_perm(tmp2,tmp3,perm);   /* H1yd H1zd H2xd H2yd */
    c3d  = vec_perm(tmp3,tmp3,perm);   /* H2zd -    -    -    */

    /* permute things */
    tmp1 = vec_mergeh(c1a,c1c);        /* Oxa Oxc Oya Oyc */
    c1a  = vec_mergel(c1a,c1c);        /* Oza Ozc H1xa H1xc */
    c1c  = vec_mergeh(c1b,c1d);        /* Oxb Oxd Oyb Oyd */
    c1b  = vec_mergel(c1b,c1d);        /* Ozb Ozd H1xb H1xd */

    c1d  = vec_mergeh(c2a,c2c);        /* H1ya H1yc H1za H1zc */
    c2a  = vec_mergel(c2a,c2c);        /* H2xa H2xc H2ya H2yc */
    c2c  = vec_mergeh(c2b,c2d);        /* H1yb H1yd H1zb H1zd */
    c2b  = vec_mergel(c2b,c2d);        /* H2xb H2xd H2yb H2yd */

    c3a  = vec_mergeh(c3a,c3c);        /* H2za H2zc - - */
    c3b  = vec_mergeh(c3b,c3d);        /* H2zb H2zd - - */

    *Ox  = vec_mergeh(tmp1,c1c);       /* Oxa  Oxb  Oxc  Oxd  */
    *Oy  = vec_mergel(tmp1,c1c);       /* Oya  Oyb  Oyc  Oyd  */
    *Oz  = vec_mergeh(c1a,c1b);        /* Oza  Ozb  Ozc  Ozd  */
    *H1x = vec_mergel(c1a,c1b);        /* H1xa H1xb H1xc H1xd */
    *H1y = vec_mergeh(c1d,c2c);        /* H1ya H1yb H1yc H1yd */
    *H1z = vec_mergel(c1d,c2c);        /* H1za H1zb H1zc H1zd */
    *H2x = vec_mergeh(c2a,c2b);        /* H2xa H2xb H2xc H2xd */
    *H2y = vec_mergel(c2a,c2b);        /* H2ya H2yb H2yc H2yd */
    *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb H2zc H2zd */
}
/* Load three water molecules (9 consecutive floats each, labelled
 * a,b,c) and transpose so each output vector holds the same
 * coordinate for all three molecules, e.g. *Ox = [Oxa Oxb Oxc -];
 * the fourth element of each output is undefined.
 */
static inline void load_3_water(float *address1,
                                float *address2,
                                float *address3,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
    vector unsigned char perm;
    vector float tmp1,tmp2,tmp3;
    vector float c1a,c2a,c3a;
    vector float c1b,c2b,c3b;
    vector float c1c,c2c,c3c;

    /* load the coordinates */
    perm = vec_lvsl( 0, (int *) address1 );
    tmp1 = vec_ld( 0, address1 );
    tmp2 = vec_ld( 16, address1 );
    tmp3 = vec_ld( 32, address1 );
    c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa  Oya  Oza  H1xa */
    c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
    c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za -    -    -    */

    perm = vec_lvsl( 0, (int *) address3 );
    tmp1 = vec_ld( 0, address3 );
    tmp2 = vec_ld( 16, address3 );
    tmp3 = vec_ld( 32, address3 );
    c1c  = vec_perm(tmp1,tmp2,perm);   /* Oxc  Oyc  Ozc  H1xc */
    c2c  = vec_perm(tmp2,tmp3,perm);   /* H1yc H1zc H2xc H2yc */
    c3c  = vec_perm(tmp3,tmp3,perm);   /* H2zc -    -    -    */

    perm = vec_lvsl( 0, (int *) address2 );
    tmp1 = vec_ld( 0, address2 );
    tmp2 = vec_ld( 16, address2 );
    tmp3 = vec_ld( 32, address2 );
    c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb  Oyb  Ozb  H1xb */
    c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
    c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb -    -    -    */

    /* permute things; molecule b is merged with itself since there
     * is no fourth molecule to pair it with */
    tmp1 = vec_mergeh(c1a,c1c);        /* Oxa Oxc Oya Oyc */
    c1a  = vec_mergel(c1a,c1c);        /* Oza Ozc H1xa H1xc */
    c1c  = vec_mergeh(c1b,c1b);        /* Oxb -   Oyb -    */
    c1b  = vec_mergel(c1b,c1b);        /* Ozb -   H1xb -   */

    tmp2 = vec_mergeh(c2a,c2c);        /* H1ya H1yc H1za H1zc */
    c2a  = vec_mergel(c2a,c2c);        /* H2xa H2xc H2ya H2yc */
    c2c  = vec_mergeh(c2b,c2b);        /* H1yb -    H1zb -    */
    c2b  = vec_mergel(c2b,c2b);        /* H2xb -    H2yb -    */

    c3a  = vec_mergeh(c3a,c3c);        /* H2za H2zc - - */

    *Ox  = vec_mergeh(tmp1,c1c);       /* Oxa  Oxb  Oxc  - */
    *Oy  = vec_mergel(tmp1,c1c);       /* Oya  Oyb  Oyc  - */
    *Oz  = vec_mergeh(c1a,c1b);        /* Oza  Ozb  Ozc  - */
    *H1x = vec_mergel(c1a,c1b);        /* H1xa H1xb H1xc - */
    *H1y = vec_mergeh(tmp2,c2c);       /* H1ya H1yb H1yc - */
    *H1z = vec_mergel(tmp2,c2c);       /* H1za H1zb H1zc - */
    *H2x = vec_mergeh(c2a,c2b);        /* H2xa H2xb H2xc - */
    *H2y = vec_mergel(c2a,c2b);        /* H2ya H2yb H2yc - */
    *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb H2zc - */
}
/* Load two water molecules (9 consecutive floats each, labelled a,b)
 * and transpose so each output holds [coord_a coord_b ? ?]; the last
 * two elements of each output are undefined.
 */
static inline void load_2_water(float *address1,
                                float *address2,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
    vector unsigned char perm;
    vector float tmp1,tmp2,tmp3;
    vector float c1a,c2a,c3a;
    vector float c1b,c2b,c3b;

    /* load the coordinates */
    perm = vec_lvsl( 0, (int *) address1 );
    tmp1 = vec_ld( 0, address1 );
    tmp2 = vec_ld( 16, address1 );
    tmp3 = vec_ld( 32, address1 );
    c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa  Oya  Oza  H1xa */
    c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
    c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za -    -    -    */

    perm = vec_lvsl( 0, (int *) address2 );
    tmp1 = vec_ld( 0, address2 );
    tmp2 = vec_ld( 16, address2 );
    tmp3 = vec_ld( 32, address2 );
    c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb  Oyb  Ozb  H1xb */
    c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
    c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb -    -    -    */

    /* never mind what we get in the two remaining elements */
    *Ox  = vec_mergeh(c1a,c1b);        /* Oxa  Oxb  Oya  Oyb  */
    *H1y = vec_mergeh(c2a,c2b);        /* H1ya H1yb H1za H1zb */
    *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb -    -    */
    *Oz  = vec_mergel(c1a,c1b);        /* Oza  Ozb  H1xa H1xb */
    *H2x = vec_mergel(c2a,c2b);        /* H2xa H2xb H2ya H2yb */
    *Oy  = vec_sld(*Ox,*Ox,8);         /* shift the pairs above into elements 0-1 */
    *H1z = vec_sld(*H1y,*H1y,8);
    *H1x = vec_sld(*Oz,*Oz,8);
    *H2y = vec_sld(*H2x,*H2x,8);
}
/* Load one water molecule (9 consecutive floats) and place each
 * coordinate in element 0 of its output vector (the remaining
 * elements get splatted copies, which callers ignore).
 */
static inline void load_1_water(float *address1,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
    vector unsigned char perm;
    vector float tmp1,tmp2,tmp3;

    /* load the coordinates */
    perm = vec_lvsl( 0, (int *) address1 );
    tmp1 = vec_ld( 0, address1 );
    tmp2 = vec_ld( 16, address1 );
    tmp3 = vec_ld( 32, address1 );
    *Ox  = vec_perm(tmp1,tmp2,perm);   /* Ox  Oy  Oz  H1x */
    *H1y = vec_perm(tmp2,tmp3,perm);   /* H1y H1z H2x H2y */
    *H2z = vec_perm(tmp3,tmp3,perm);   /* H2z -   -   -   */

    /* just splat things... never mind that we fill all cells :-) */
    *Oy  = vec_splat(*Ox,1);
    *Oz  = vec_splat(*Ox,2);
    *H1x = vec_splat(*Ox,3);
    *H1z = vec_splat(*H1y,1);
    *H2x = vec_splat(*H1y,2);
    *H2y = vec_splat(*H1y,3);
}
/* Reduce the four partial forces per atom of an i-water, add the
 * result to the 9 floats at `water` (masked read-modify-write, so
 * the memory before/after the molecule is preserved), and add the
 * total force on the molecule to `shiftvec`.
 */
static inline void update_i_water_forces(float *water,
                                         float *shiftvec,
                                         vector float Ox,
                                         vector float Oy,
                                         vector float Oz,
                                         vector float H1x,
                                         vector float H1y,
                                         vector float H1z,
                                         vector float H2x,
                                         vector float H2y,
                                         vector float H2z)
{
    vector float l1,l2,l3;
    vector unsigned char perm;
    vector unsigned int mask,ox00;
    vector float nul=vec_zero();

    ox00=(vector unsigned int)vec_splat_s32(0);
    /* load */
    perm = vec_lvsr( 0, water );
    l1 = vec_ld( 0, water);
    l2 = vec_ld( 16, water);
    l3 = vec_ld( 32, water);
    /* mask: zero bytes before `water`, all-ones from `water` on */
    mask = vec_perm(ox00,
                    (vector unsigned int)vec_splat_s32(-1),perm);

    /* accumulate the forces: first reduction, elem 0 += elem 2 etc. */
    Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));      /* Ox  Ox'  -  -  */
    Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));      /* Oy  Oy'  -  -  */
    Oz  = vec_add(Oz,vec_sld(Oz,Oz,8));      /* Oz  Oz'  -  -  */
    H1x = vec_add(H1x,vec_sld(H1x,H1x,8));   /* H1x H1x' -  -  */
    H1y = vec_add(H1y,vec_sld(H1y,H1y,8));   /* H1y H1y' -  -  */
    H1z = vec_add(H1z,vec_sld(H1z,H1z,8));   /* H1z H1z' -  -  */
    H2x = vec_add(H2x,vec_sld(H2x,H2x,8));   /* H2x H2x' -  -  */
    H2y = vec_add(H2y,vec_sld(H2y,H2y,8));   /* H2y H2y' -  -  */
    H2z = vec_add(H2z,vec_sld(H2z,H2z,8));   /* H2z H2z' -  -  */

    /* pair up so the second reduction finishes two coords at once */
    Ox  = vec_mergeh(Ox,Oz);                 /* Ox  Oz  Ox'  Oz'  */
    Oy  = vec_mergeh(Oy,H1x);                /* Oy  H1x Oy'  H1x' */
    H1y = vec_mergeh(H1y,H2x);               /* H1y H2x H1y' H2x' */
    H1z = vec_mergeh(H1z,H2y);               /* H1z H2y H1z' H2y' */
    H2z = vec_mergeh(H2z,nul);               /* H2z 0   H2z' 0    */

    Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));      /* Ox  Oz  - - */
    Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));      /* Oy  H1x - - */
    H1y = vec_add(H1y,vec_sld(H1y,H1y,8));   /* H1y H2x - - */
    H1z = vec_add(H1z,vec_sld(H1z,H1z,8));   /* H1z H2y - - */
    H2z = vec_add(H2z,vec_sld(H2z,H2z,8));   /* H2z 0 ? 0   */

    /* pack the nine summed forces into three vectors */
    Ox  = vec_mergeh(Ox,Oy);                 /* Ox  Oy  Oz  H1x */
    H1y = vec_mergeh(H1y,H1z);               /* H1y H1z H2x H2y */
    H2z = vec_mergeh(H2z,nul);               /* H2z 0   0   0   */

    /* per-atom totals, used below for the shift-force sum */
    Oy  = vec_sld(nul,Ox,12);                /* 0 Ox  Oy  Oz  */
    Oz  = vec_sld(Ox,H1y,8);                 /* - H1x H1y H1z */
    H1x = vec_sld(H1y,H2z,4);                /* - H2x H2y H2z */

    /* align the packed forces with the (unaligned) memory layout */
    H2x = vec_perm(nul,Ox,perm);             /* The part to add to l1 */
    H2y = vec_perm(Ox,H1y,perm);             /* The part to add to l2 */
    H2z = vec_perm(H1y,H2z,perm);            /* The part to add to l3 */

    H2x = vec_add(l1,H2x);
    H2y = vec_add(l2,H2y);
    H2z = vec_add(l3,H2z);

    H2x = vec_sel(l1,H2x,mask);              /* keep original bytes before `water` */
    mask = vec_sld(ox00,mask,12);
    H2z = vec_sel(H2z,l3,mask);              /* keep original bytes after the 9 floats */

    /* store */
    vec_st(H2x, 0, water);
    vec_st(H2y, 16, water);
    vec_st(H2z, 32, water);

    /* total force on the molecule: O + H1 + H2, added to shiftvec */
    Oy = vec_add(Oy,Oz);
    Oy = vec_add(Oy,H1x);
    Oy = vec_sld(Oy,nul,4);                  /* x y z 0 */

    add_xyz_to_mem(shiftvec,Oy);
}
/* Add per-molecule forces (one lane per molecule in each input
 * vector) to four j-waters in memory, using masked unaligned
 * read-modify-write so bytes outside each 9-float molecule are
 * preserved.
 */
static inline void add_force_to_4_water(float *address1,
                                        float *address2,
                                        float *address3,
                                        float *address4,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
    vector float low,medium,high,low2,medium2,high2;
    vector unsigned char perm,perm2;
    vector unsigned int mask,mask2,oxFF,ox00;
    vector float tmp1,tmp2,tmp3;
    vector float nul=vec_zero();

    oxFF=(vector unsigned int)vec_splat_s32(-1);
    ox00=(vector unsigned int)vec_splat_s32(0);

    /* We have to be careful here since water molecules will sometimes
     * be adjacent in memory. When we update the forces on water 1,
     * we usually read & write back some extra neighboring cells in
     * the vector. This means we cannot first read all data, update
     * it and write it out again.
     *
     * Instead of doing the molecules one by one, we rely on the
     * fact that the list will be ordered, so water molecules
     * 1 & 3 can be done first (they cannot be adjacent), and then
     * waters 2 & 4.
     */

    /* transpose the per-lane forces into per-molecule layout */
    tmp1 = vec_mergeh(Ox,Oz);    /* Oxa  Oza  Oxb  Ozb  */
    Ox   = vec_mergel(Ox,Oz);    /* Oxc  Ozc  Oxd  Ozd  */
    Oz   = vec_mergeh(Oy,H1x);   /* Oya  H1xa Oyb  H1xb */
    Oy   = vec_mergel(Oy,H1x);   /* Oyc  H1xc Oyd  H1xd */
    H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
    H1y  = vec_mergel(H1y,H2x);  /* H1yc H2xc H1yd H2xd */

    H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
    H1z  = vec_mergel(H1z,H2y);  /* H1zc H2yc H1zd H2yd */
    H2y  = vec_mergeh(H2z,nul);  /* H2za 0    H2zb 0    */
    H2z  = vec_mergel(H2z,nul);  /* H2zc 0    H2zd 0    */

    tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa  Oya  Oza  H1xa */
    Oz   = vec_mergel(tmp1,Oz);  /* Oxb  Oyb  Ozb  H1xb */
    tmp1 = vec_mergeh(Ox,Oy);    /* Oxc  Oyc  Ozc  H1xc */
    Ox   = vec_mergel(Ox,Oy);    /* Oxd  Oyd  Ozd  H1xd */
    Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
    H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
    H2x  = vec_mergeh(H1y,H1z);  /* H1yc H1zc H2xc H2yc */
    H1y  = vec_mergel(H1y,H1z);  /* H1yd H1zd H2xd H2yd */
    H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
    H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */
    tmp3 = vec_mergeh(H2z,nul);  /* H2zc 0 0 0 */
    H2z  = vec_mergel(H2z,nul);  /* H2zd 0 0 0 */

    /* move into position, load and add - waters 1 & 3 */
    perm  = vec_lvsr( 0, (int *) address1 );
    perm2 = vec_lvsr( 0, (int *) address3 );
    low     = vec_ld( 0, address1);
    low2    = vec_ld( 0, address3);
    medium  = vec_ld( 16, address1);
    medium2 = vec_ld( 16, address3);
    high    = vec_ld( 32, address1);
    high2   = vec_ld( 32, address3);
    mask  = vec_perm(ox00,oxFF,perm);   /* ones from the first molecule byte on */
    mask2 = vec_perm(ox00,oxFF,perm2);

    /* vec_perm(nul,...) shifts zeros into the leading bytes, so the
     * cells before each molecule only have 0.0 added */
    low  = vec_add(vec_perm(nul,tmp2,perm),low);
    low2 = vec_add(vec_perm(nul,tmp1,perm2),low2);

    tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
    tmp1 = vec_add(vec_perm(tmp1,H2x,perm2),medium2);

    Oy   = vec_add(vec_perm(Oy,H1z,perm),high);
    H2x  = vec_add(vec_perm(H2x,tmp3,perm2),high2);

    vec_st(vec_sel(low,low,mask), 0, address1);
    vec_st(vec_sel(low2,low2,mask2), 0, address3);

    /* shift the mask so it now marks bytes past the 9 floats */
    mask  = vec_sld(ox00,mask,12);
    mask2 = vec_sld(ox00,mask2,12);

    vec_st(tmp2, 16, address1);
    vec_st(tmp1, 16, address3);

    vec_st(vec_sel(Oy,high,mask), 32, address1);     /* keep trailing bytes */
    vec_st(vec_sel(H2x,high2,mask2), 32, address3);

    /* Finished 1 & 3 - now do 2 & 4 */

    perm  = vec_lvsr( 0, (int *) address2 );
    perm2 = vec_lvsr( 0, (int *) address4 );

    low     = vec_ld( 0, address2);
    low2    = vec_ld( 0, address4);
    medium  = vec_ld( 16, address2);
    medium2 = vec_ld( 16, address4);
    high    = vec_ld( 32, address2);
    high2   = vec_ld( 32, address4);
    mask  = vec_perm(ox00,oxFF,perm);
    mask2 = vec_perm(ox00,oxFF,perm2);
    H1z = vec_add(vec_perm(nul,Oz,perm),low);
    H2x = vec_add(vec_perm(nul,Ox,perm2),low2);
    Oz  = vec_add(vec_perm(Oz,H1x,perm),medium);
    Ox  = vec_add(vec_perm(Ox,H1y,perm2),medium2);
    H1x = vec_add(vec_perm(H1x,H2y,perm),high);
    H1y = vec_add(vec_perm(H1y,H2z,perm2),high2);
    vec_st(vec_sel(low,H1z,mask), 0, address2);      /* keep leading bytes */
    vec_st(vec_sel(low2,H2x,mask2), 0, address4);
    mask  = vec_sld(ox00,mask,12);
    mask2 = vec_sld(ox00,mask2,12);
    vec_st(Oz, 16, address2);
    vec_st(Ox, 16, address4);
    vec_st(vec_sel(H1x,high,mask), 32, address2);    /* keep trailing bytes */
    vec_st(vec_sel(H1y,high2,mask2), 32, address4);
}
/* Accumulate the forces on three water molecules (slots a, b, c of each
 * component vector) into the 9-float blocks Ox,Oy,Oz,H1x,...,H2z at
 * address1..address3.  The addresses need not be 16-byte aligned: each
 * destination is handled with an unaligned read-modify-write using
 * lvsr-derived permute/select masks so bytes outside the 9 floats are
 * left untouched.  Slot d of the input vectors is ignored.
 */
static inline void add_force_to_3_water(float *address1,
					float *address2,
					float *address3,
					vector float Ox,
					vector float Oy,
					vector float Oz,
					vector float H1x,
					vector float H1y,
					vector float H1z,
					vector float H2x,
					vector float H2y,
					vector float H2z)
{
    vector float low,medium,high;
    vector unsigned char perm;
    vector unsigned int mask,oxFF,ox00;
    vector float tmp1,tmp2,tmp3;
    vector float nul=vec_zero();

    oxFF=(vector unsigned int)vec_splat_s32(-1);
    ox00=(vector unsigned int)vec_splat_s32(0);

    /* First merge stage: interleave components pairwise.  The registers
     * are aggressively reused, so the names no longer match their
     * contents after this point - trust the per-line comments. */
    tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza Oxb Ozb */
    Ox   = vec_mergel(Ox,Oz);    /* Oxc Ozc ?   ?   */
    Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa Oyb H1xb */
    Oy   = vec_mergel(Oy,H1x);   /* Oyc H1xc ?  ?   */
    H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
    H1y  = vec_mergel(H1y,H2x);  /* H1yc H2xc ?  ?  */
    H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
    H1z  = vec_mergel(H1z,H2y);  /* H1zc H2yc ?  ?  */
    H2y  = vec_mergeh(H2z,nul);  /* H2za 0 H2zb 0 */
    H2z  = vec_mergel(H2z,nul);  /* H2zc 0 ?   0 */

    /* Second merge stage: build the per-molecule 4-float groups. */
    tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
    Oz   = vec_mergel(tmp1,Oz);  /* Oxb Oyb Ozb H1xb */
    tmp1 = vec_mergeh(Ox,Oy);    /* Oxc Oyc Ozc H1xc */
    Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
    H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
    H2x  = vec_mergeh(H1y,H1z);  /* H1yc H1zc H2xc H2yc */
    H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
    H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */
    tmp3 = vec_mergeh(H2z,nul);  /* H2zc 0 0 0 */

    /* Molecule a -> address1: shift the 9 floats into store alignment
     * with vec_perm, add the old contents, and use select masks so the
     * first and last partially-covered quadwords keep their other bytes. */
    perm   = vec_lvsr( 0, (int *) address1 );
    low    = vec_ld( 0, address1);
    medium = vec_ld( 16, address1);
    high   = vec_ld( 32, address1);
    mask   = vec_perm(ox00,oxFF,perm);
    H1y    = vec_add(vec_perm(nul,tmp2,perm),low);
    tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
    Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
    vec_st(vec_sel(low,H1y,mask), 0, address1);
    mask   = vec_sld(ox00,mask,12);   /* advance mask past the 9 stored floats */
    vec_st(tmp2, 16, address1);
    vec_st(vec_sel(Oy,high,mask), 32, address1);

    /* Molecule b -> address2 (same pattern). */
    perm   = vec_lvsr( 0, (int *) address2 );
    low    = vec_ld( 0, address2);
    medium = vec_ld( 16, address2);
    high   = vec_ld( 32, address2);
    mask   = vec_perm(ox00,oxFF,perm);
    H1z    = vec_add(vec_perm(nul,Oz,perm),low);
    Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
    H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
    vec_st(vec_sel(low,H1z,mask), 0, address2);
    mask   = vec_sld(ox00,mask,12);
    vec_st(Oz, 16, address2);
    vec_st(vec_sel(H1x,high,mask), 32, address2);

    /* Molecule c -> address3 (same pattern). */
    perm   = vec_lvsr( 0, (int *) address3 );
    low    = vec_ld( 0, address3);
    medium = vec_ld( 16, address3);
    high   = vec_ld( 32, address3);
    mask   = vec_perm(ox00,oxFF,perm);
    H2y    = vec_add(vec_perm(nul,tmp1,perm),low);
    tmp1   = vec_add(vec_perm(tmp1,H2x,perm),medium);
    H2x    = vec_add(vec_perm(H2x,tmp3,perm),high);
    vec_st(vec_sel(low,H2y,mask), 0, address3);
    mask   = vec_sld(ox00,mask,12);
    vec_st(tmp1, 16, address3);
    vec_st(vec_sel(H2x,high,mask), 32, address3);
}
/* Accumulate the forces on two water molecules (slots a and b of each
 * component vector) into the 9-float blocks at address1 and address2.
 * Unaligned destinations are handled with lvsr-derived permute/select
 * masks so bytes outside the 9 floats are preserved.  Slots c and d of
 * the input vectors are ignored.
 * NOTE(review): tmp3 is declared but never used in this function.
 */
static inline void add_force_to_2_water(float *address1,
					float *address2,
					vector float Ox,
					vector float Oy,
					vector float Oz,
					vector float H1x,
					vector float H1y,
					vector float H1z,
					vector float H2x,
					vector float H2y,
					vector float H2z)
{
    vector float low,medium,high;
    vector unsigned char perm;
    vector unsigned int mask,oxFF,ox00;
    vector float tmp1,tmp2,tmp3;
    vector float nul=vec_zero();

    oxFF=(vector unsigned int)vec_splat_s32(-1);
    ox00=(vector unsigned int)vec_splat_s32(0);

    /* First merge stage - registers are reused, see per-line comments. */
    tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza Oxb Ozb */
    Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa Oyb H1xb */
    H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
    H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
    H2y  = vec_mergeh(H2z,nul);  /* H2za 0 H2zb 0 */

    /* Second merge stage: per-molecule 4-float groups. */
    tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
    Oz   = vec_mergel(tmp1,Oz);  /* Oxb Oyb Ozb H1xb */
    Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
    H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
    H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
    H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */

    /* Molecule a -> address1: unaligned read-modify-write; select masks
     * keep the bytes before/after the 9 stored floats intact. */
    perm   = vec_lvsr( 0, (int *) address1 );
    low    = vec_ld( 0, address1);
    medium = vec_ld( 16, address1);
    high   = vec_ld( 32, address1);
    mask   = vec_perm(ox00,oxFF,perm);
    H2x    = vec_add(vec_perm(nul,tmp2,perm),low);
    tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
    Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
    vec_st(vec_sel(low,H2x,mask), 0, address1);
    mask   = vec_sld(ox00,mask,12);   /* advance mask past the stored floats */
    vec_st(tmp2, 16, address1);
    vec_st(vec_sel(Oy,high,mask), 32, address1);

    /* Molecule b -> address2 (same pattern). */
    perm   = vec_lvsr( 0, (int *) address2 );
    low    = vec_ld( 0, address2);
    medium = vec_ld( 16, address2);
    high   = vec_ld( 32, address2);
    mask   = vec_perm(ox00,oxFF,perm);
    H1z    = vec_add(vec_perm(nul,Oz,perm),low);
    Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
    H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
    vec_st(vec_sel(low,H1z,mask), 0, address2);
    mask   = vec_sld(ox00,mask,12);
    vec_st(Oz, 16, address2);
    vec_st(vec_sel(H1x,high,mask), 32, address2);
}
1223 static inline void add_force_to_1_water(float *address1,
1224 vector float Ox,
1225 vector float Oy,
1226 vector float Oz,
1227 vector float H1x,
1228 vector float H1y,
1229 vector float H1z,
1230 vector float H2x,
1231 vector float H2y,
1232 vector float H2z)
1234 vector float low,medium,high;
1235 vector unsigned char perm;
1236 vector unsigned int mask,oxFF,ox00;
1237 vector float tmp1,tmp2,tmp3;
1239 vector float nul=vec_zero();
1240 oxFF=(vector unsigned int)vec_splat_s32(-1);
1241 ox00=(vector unsigned int)vec_splat_s32(0);
1243 /* load */
1244 perm = vec_lvsr( 0, (int *) address1 );
1245 low = vec_ld( 0, address1);
1246 medium = vec_ld( 16, address1);
1247 high = vec_ld( 32, address1);
1249 tmp1 = vec_mergeh(Ox,Oz); /* Oxa Oza ? ? */
1250 Oz = vec_mergeh(Oy,H1x); /* Oya H1xa ? ? */
1251 H1x = vec_mergeh(H1y,H2x); /* H1ya H2xa ? ? */
1252 H2x = vec_mergeh(H1z,H2y); /* H1za H2ya ? ? */
1253 H2y = vec_mergeh(H2z,nul); /* H2za 0 ? 0 */
1255 tmp2 = vec_mergeh(tmp1,Oz); /* Oxa Oya Oza H1xa */
1256 Oy = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
1257 H1z = vec_mergeh(H2y,nul); /* H2za 0 0 0 */
1259 /* move into position and add */
1260 perm = vec_lvsr( 0, (int *) address1 );
1261 low = vec_ld( 0, address1);
1262 medium = vec_ld( 16, address1);
1263 high = vec_ld( 32, address1);
1264 mask = vec_perm(ox00,oxFF,perm);
1265 H2x = vec_add(vec_perm(nul,tmp2,perm),low);
1266 tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
1267 Oy = vec_add(vec_perm(Oy,H1z,perm),high);
1268 vec_st(vec_sel(low,H2x,mask), 0, address1);
1269 mask = vec_sld(ox00,mask,12);
1270 vec_st(tmp2, 16, address1);
1271 vec_st(vec_sel(Oy,high,mask), 32, address1);
1276 static inline void do_4_ctable_coul(float *VFtab,
1277 vector float rtab,
1278 vector float *VV,
1279 vector float *FF)
1281 vector signed int vidx;
1282 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
1283 int idx1,idx2,idx3,idx4;
1285 vidx = vec_cts(rtab,0);
1286 vidx = vec_sl(vidx,vec_splat_u32(2));
1288 idx1 = *((int *)&vidx);
1289 idx2 = *(((int *)&vidx)+1);
1290 idx3 = *(((int *)&vidx)+2);
1291 idx4 = *(((int *)&vidx)+3);
1293 eps = vec_sub(rtab,vec_floor(rtab));
1294 eps2 = vec_madd(eps,eps,vec_zero());
1296 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1297 tab1 = vec_ld( 0, VFtab+idx1);
1298 Y = vec_ld(16, VFtab+idx1);
1299 tab1 = vec_sld(tab1,Y,8);
1300 tab2 = vec_ld( 0, VFtab+idx2);
1301 F = vec_ld(16, VFtab+idx2);
1302 tab2 = vec_sld(tab2,F,8);
1303 tab3 = vec_ld( 0, VFtab+idx3);
1304 G = vec_ld(16, VFtab+idx3);
1305 tab3 = vec_sld(tab3,G,8);
1306 tab4 = vec_ld( 0, VFtab+idx4);
1307 H = vec_ld(16, VFtab+idx4);
1308 tab4 = vec_sld(tab4,H,8);
1309 } else { /* aligned */
1310 tab1=vec_ld(0, VFtab+idx1);
1311 tab2=vec_ld(0, VFtab+idx2);
1312 tab3=vec_ld(0, VFtab+idx3);
1313 tab4=vec_ld(0, VFtab+idx4);
1316 /* table data is aligned */
1317 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
1319 F = vec_madd(G,eps,F); /* F + Geps */
1320 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1321 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1322 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1323 F = vec_madd(G,eps,F); /* Fp + Geps */
1324 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1327 /* do only coulomb, but on a table with both coulomb and lj data */
1328 static inline void do_4_ljctable_coul(float *VFtab,
1329 vector float rtab,
1330 vector float *VV,
1331 vector float *FF)
1333 vector signed int vidx;
1334 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
1335 int idx1,idx2,idx3,idx4;
1337 vidx = vec_cts(rtab,0);
1338 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1339 vidx = vec_sl(vidx,vec_splat_u32(2));
1341 idx1 = *((int *)&vidx);
1342 idx2 = *(((int *)&vidx)+1);
1343 idx3 = *(((int *)&vidx)+2);
1344 idx4 = *(((int *)&vidx)+3);
1346 eps = vec_sub(rtab,vec_floor(rtab));
1347 eps2 = vec_madd(eps,eps,vec_zero());
1349 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1350 tab1 = vec_ld( 0, VFtab+idx1);
1351 Y = vec_ld(16, VFtab+idx1);
1352 tab1 = vec_sld(tab1,Y,8);
1353 tab2 = vec_ld( 0, VFtab+idx2);
1354 F = vec_ld(16, VFtab+idx2);
1355 tab2 = vec_sld(tab2,F,8);
1356 tab3 = vec_ld( 0, VFtab+idx3);
1357 G = vec_ld(16, VFtab+idx3);
1358 tab3 = vec_sld(tab3,G,8);
1359 tab4 = vec_ld( 0, VFtab+idx4);
1360 H = vec_ld(16, VFtab+idx4);
1361 tab4 = vec_sld(tab4,H,8);
1362 } else { /* aligned */
1363 tab1=vec_ld(0, VFtab+idx1);
1364 tab2=vec_ld(0, VFtab+idx2);
1365 tab3=vec_ld(0, VFtab+idx3);
1366 tab4=vec_ld(0, VFtab+idx4);
1369 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
1371 F = vec_madd(G,eps,F); /* F + Geps */
1372 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1373 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1374 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1375 F = vec_madd(G,eps,F); /* Fp + Geps */
1376 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1380 static inline void do_4_ljtable_lj(float *VFtab,
1381 vector float rtab,
1382 vector float *VVdisp,
1383 vector float *FFdisp,
1384 vector float *VVrep,
1385 vector float *FFrep)
1387 vector signed int vidx;
1388 vector float Y,F,G,H,eps,eps2;
1389 vector float tabd1,tabd2,tabd3,tabd4;
1390 vector float tabr1,tabr2,tabr3,tabr4;
1392 int idx1,idx2,idx3,idx4;
1394 vidx = vec_cts(rtab,0);
1395 vidx = vec_sl(vidx,vec_splat_u32(3));
1397 idx1 = *((int *)&vidx);
1398 idx2 = *(((int *)&vidx)+1);
1399 idx3 = *(((int *)&vidx)+2);
1400 idx4 = *(((int *)&vidx)+3);
1402 eps = vec_sub(rtab,vec_floor(rtab));
1403 eps2 = vec_madd(eps,eps,vec_zero());
1405 if(((unsigned int)VFtab)%16) {
1406 /* not 16 byte aligned, i.e. must be 8 byte. */
1407 /* use Y,F,G,H as temp storage */
1408 tabd1 = vec_ld( 0, VFtab+idx1);
1409 tabr1 = vec_ld(16, VFtab+idx1);
1410 Y = vec_ld(32, VFtab+idx1);
1411 tabd1 = vec_sld(tabd1,tabr1,8);
1412 tabr1 = vec_sld(tabr1,Y,8);
1414 tabd2 = vec_ld( 0, VFtab+idx2);
1415 tabr2 = vec_ld(16, VFtab+idx2);
1416 F = vec_ld(32, VFtab+idx2);
1417 tabd2 = vec_sld(tabd2,tabr2,8);
1418 tabr2 = vec_sld(tabr2,F,8);
1420 tabd3 = vec_ld( 0, VFtab+idx3);
1421 tabr3 = vec_ld(16, VFtab+idx3);
1422 G = vec_ld(32, VFtab+idx3);
1423 tabd3 = vec_sld(tabd3,tabr3,8);
1424 tabr3 = vec_sld(tabr3,G,8);
1426 tabd4 = vec_ld( 0, VFtab+idx4);
1427 tabr4 = vec_ld(16, VFtab+idx4);
1428 H = vec_ld(32, VFtab+idx4);
1429 tabd4 = vec_sld(tabd4,tabr4,8);
1430 tabr4 = vec_sld(tabr4,H,8);
1431 } else { /* 16 byte aligned */
1432 tabd1 = vec_ld( 0, VFtab+idx1);
1433 tabr1 = vec_ld(16, VFtab+idx1);
1434 tabd2 = vec_ld( 0, VFtab+idx2);
1435 tabr2 = vec_ld(16, VFtab+idx2);
1436 tabd3 = vec_ld( 0, VFtab+idx3);
1437 tabr3 = vec_ld(16, VFtab+idx3);
1438 tabd4 = vec_ld( 0, VFtab+idx4);
1439 tabr4 = vec_ld(16, VFtab+idx4);
1442 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
1444 F = vec_madd(G,eps,F); /* F + Geps */
1445 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1446 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1447 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1448 F = vec_madd(G,eps,F); /* Fp + Geps */
1449 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1451 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
1453 F = vec_madd(G,eps,F); /* F + Geps */
1454 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1455 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1456 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1457 F = vec_madd(G,eps,F); /* Fp + Geps */
1458 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1462 static inline void do_4_ljctable_coul_and_lj(float *VFtab,
1463 vector float rtab,
1464 vector float *VVcoul,
1465 vector float *FFcoul,
1466 vector float *VVdisp,
1467 vector float *FFdisp,
1468 vector float *VVrep,
1469 vector float *FFrep)
1471 vector signed int vidx;
1472 vector float Y,F,G,H,eps,eps2;
1473 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1474 vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
1475 int idx1,idx2,idx3,idx4;
1477 vidx = vec_cts(rtab,0);
1478 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1479 vidx = vec_sl(vidx,vec_splat_u32(2));
1481 idx1 = *((int *)&vidx);
1482 idx2 = *(((int *)&vidx)+1);
1483 idx3 = *(((int *)&vidx)+2);
1484 idx4 = *(((int *)&vidx)+3);
1486 eps = vec_sub(rtab,vec_floor(rtab));
1487 eps2 = vec_madd(eps,eps,vec_zero());
1489 if(((unsigned int)VFtab)%16) {
1490 /* not 16-byte aligned, but must be 8 byte. */
1491 /* use Y,F,G,H as temp storage */
1492 tabc1 = vec_ld( 0, VFtab+idx1);
1493 tabd1 = vec_ld(16, VFtab+idx1);
1494 tabr1 = vec_ld(32, VFtab+idx1);
1495 Y = vec_ld(48, VFtab+idx1);
1496 tabc1 = vec_sld(tabc1,tabd1,8);
1497 tabd1 = vec_sld(tabd1,tabr1,8);
1498 tabr1 = vec_sld(tabr1,Y,8);
1500 tabc2 = vec_ld( 0, VFtab+idx2);
1501 tabd2 = vec_ld(16, VFtab+idx2);
1502 tabr2 = vec_ld(32, VFtab+idx2);
1503 F = vec_ld(48, VFtab+idx2);
1504 tabc2 = vec_sld(tabc2,tabd2,8);
1505 tabd2 = vec_sld(tabd2,tabr2,8);
1506 tabr2 = vec_sld(tabr2,F,8);
1508 tabc3 = vec_ld( 0, VFtab+idx3);
1509 tabd3 = vec_ld(16, VFtab+idx3);
1510 tabr3 = vec_ld(32, VFtab+idx3);
1511 G = vec_ld(48, VFtab+idx3);
1512 tabc3 = vec_sld(tabc3,tabd3,8);
1513 tabd3 = vec_sld(tabd3,tabr3,8);
1514 tabr3 = vec_sld(tabr3,G,8);
1516 tabc4 = vec_ld( 0, VFtab+idx4);
1517 tabd4 = vec_ld(16, VFtab+idx4);
1518 tabr4 = vec_ld(32, VFtab+idx4);
1519 H = vec_ld(48, VFtab+idx4);
1520 tabc4 = vec_sld(tabc4,tabd4,8);
1521 tabd4 = vec_sld(tabd4,tabr4,8);
1522 tabr4 = vec_sld(tabr4,H,8);
1523 } else { /* 16 byte aligned */
1524 tabc1 = vec_ld( 0, VFtab+idx1);
1525 tabd1 = vec_ld(16, VFtab+idx1);
1526 tabr1 = vec_ld(32, VFtab+idx1);
1527 tabc2 = vec_ld( 0, VFtab+idx2);
1528 tabd2 = vec_ld(16, VFtab+idx2);
1529 tabr2 = vec_ld(32, VFtab+idx2);
1530 tabc3 = vec_ld( 0, VFtab+idx3);
1531 tabd3 = vec_ld(16, VFtab+idx3);
1532 tabr3 = vec_ld(32, VFtab+idx3);
1533 tabc4 = vec_ld( 0, VFtab+idx4);
1534 tabd4 = vec_ld(16, VFtab+idx4);
1535 tabr4 = vec_ld(32, VFtab+idx4);
1537 transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);
1539 F = vec_madd(G,eps,F); /* F + Geps */
1540 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1541 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1542 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1543 F = vec_madd(G,eps,F); /* Fp + Geps */
1544 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1546 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
1548 F = vec_madd(G,eps,F); /* F + Geps */
1549 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1550 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1551 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1552 F = vec_madd(G,eps,F); /* Fp + Geps */
1553 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1555 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
1557 F = vec_madd(G,eps,F); /* F + Geps */
1558 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1559 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1560 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1561 F = vec_madd(G,eps,F); /* Fp + Geps */
1562 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1566 static inline void do_3_ctable_coul(float *VFtab,
1567 vector float rtab,
1568 vector float *VV,
1569 vector float *FF)
1571 vector signed int vidx;
1572 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
1573 int idx1,idx2,idx3;
1575 vidx = vec_cts(rtab,0);
1576 vidx = vec_sl(vidx,vec_splat_u32(2));
1578 idx1 = *((int *)&vidx);
1579 idx2 = *(((int *)&vidx)+1);
1580 idx3 = *(((int *)&vidx)+2);
1582 eps = vec_sub(rtab,vec_floor(rtab));
1583 eps2 = vec_madd(eps,eps,vec_zero());
1585 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1586 tab1 = vec_ld( 0, VFtab+idx1);
1587 Y = vec_ld(16, VFtab+idx1);
1588 tab1 = vec_sld(tab1,Y,8);
1589 tab2 = vec_ld( 0, VFtab+idx2);
1590 F = vec_ld(16, VFtab+idx2);
1591 tab2 = vec_sld(tab2,F,8);
1592 tab3 = vec_ld( 0, VFtab+idx3);
1593 G = vec_ld(16, VFtab+idx3);
1594 tab3 = vec_sld(tab3,G,8);
1595 } else { /* aligned */
1596 tab1=vec_ld(0, VFtab+idx1);
1597 tab2=vec_ld(0, VFtab+idx2);
1598 tab3=vec_ld(0, VFtab+idx3);
1601 /* table data is aligned */
1602 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
1605 F = vec_madd(G,eps,F); /* F + Geps */
1606 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1607 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1608 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1609 F = vec_madd(G,eps,F); /* Fp + Geps */
1610 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1613 /* do only coulomb, but on a table with both coulomb and lj data */
1614 static inline void do_3_ljctable_coul(float *VFtab,
1615 vector float rtab,
1616 vector float *VV,
1617 vector float *FF)
1619 vector signed int vidx;
1620 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
1621 int idx1,idx2,idx3;
1623 vidx = vec_cts(rtab,0);
1624 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1625 vidx = vec_sl(vidx,vec_splat_u32(2));
1627 idx1 = *((int *)&vidx);
1628 idx2 = *(((int *)&vidx)+1);
1629 idx3 = *(((int *)&vidx)+2);
1631 eps = vec_sub(rtab,vec_floor(rtab));
1632 eps2 = vec_madd(eps,eps,vec_zero());
1635 if(((unsigned int)VFtab)%16) {
1636 /* not 16-byte aligned, but must be 8 byte. */
1637 tab1 = vec_ld( 0, VFtab+idx1);
1638 Y = vec_ld(16, VFtab+idx1);
1639 tab1 = vec_sld(tab1,Y,8);
1640 tab2 = vec_ld( 0, VFtab+idx2);
1641 F = vec_ld(16, VFtab+idx2);
1642 tab2 = vec_sld(tab2,F,8);
1643 tab3 = vec_ld( 0, VFtab+idx3);
1644 G = vec_ld(16, VFtab+idx3);
1645 tab3 = vec_sld(tab3,G,8);
1646 } else { /* aligned */
1647 tab1=vec_ld(0, VFtab+idx1);
1648 tab2=vec_ld(0, VFtab+idx2);
1649 tab3=vec_ld(0, VFtab+idx3);
1652 /* table data is aligned */
1653 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
1655 F = vec_madd(G,eps,F); /* F + Geps */
1656 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1657 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1658 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1659 F = vec_madd(G,eps,F); /* Fp + Geps */
1660 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1664 static inline void do_3_ljtable_lj(float *VFtab,
1665 vector float rtab,
1666 vector float *VVdisp,
1667 vector float *FFdisp,
1668 vector float *VVrep,
1669 vector float *FFrep)
1671 vector signed int vidx;
1672 vector float Y,F,G,H,eps,eps2;
1673 int idx1,idx2,idx3;
1674 vector float tabd1,tabd2,tabd3;
1675 vector float tabr1,tabr2,tabr3;
1677 vidx = vec_cts(rtab,0);
1678 vidx = vec_sl(vidx,vec_splat_u32(3));
1680 idx1 = *((int *)&vidx);
1681 idx2 = *(((int *)&vidx)+1);
1682 idx3 = *(((int *)&vidx)+2);
1684 eps = vec_sub(rtab,vec_floor(rtab));
1685 eps2 = vec_madd(eps,eps,vec_zero());
1687 if(((unsigned int)VFtab)%16) {
1688 /* not 16 byte aligned, i.e. must be 8 byte. */
1689 /* use Y,F,G,H as temp storage */
1690 tabd1 = vec_ld( 0, VFtab+idx1);
1691 tabr1 = vec_ld(16, VFtab+idx1);
1692 Y = vec_ld(32, VFtab+idx1);
1693 tabd1 = vec_sld(tabd1,tabr1,8);
1694 tabr1 = vec_sld(tabr1,Y,8);
1696 tabd2 = vec_ld( 0, VFtab+idx2);
1697 tabr2 = vec_ld(16, VFtab+idx2);
1698 F = vec_ld(32, VFtab+idx2);
1699 tabd2 = vec_sld(tabd2,tabr2,8);
1700 tabr2 = vec_sld(tabr2,F,8);
1702 tabd3 = vec_ld( 0, VFtab+idx3);
1703 tabr3 = vec_ld(16, VFtab+idx3);
1704 G = vec_ld(32, VFtab+idx3);
1705 tabd3 = vec_sld(tabd3,tabr3,8);
1706 tabr3 = vec_sld(tabr3,G,8);
1707 } else { /* 16 byte aligned */
1708 tabd1 = vec_ld( 0, VFtab+idx1);
1709 tabr1 = vec_ld(16, VFtab+idx1);
1710 tabd2 = vec_ld( 0, VFtab+idx2);
1711 tabr2 = vec_ld(16, VFtab+idx2);
1712 tabd3 = vec_ld( 0, VFtab+idx3);
1713 tabr3 = vec_ld(16, VFtab+idx3);
1716 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
1718 F = vec_madd(G,eps,F); /* F + Geps */
1719 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1720 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1721 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1722 F = vec_madd(G,eps,F); /* Fp + Geps */
1723 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1725 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
1727 F = vec_madd(G,eps,F); /* F + Geps */
1728 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1729 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1730 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1731 F = vec_madd(G,eps,F); /* Fp + Geps */
1732 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1736 static inline void do_3_ljctable_coul_and_lj(float *VFtab,
1737 vector float rtab,
1738 vector float *VVcoul,
1739 vector float *FFcoul,
1740 vector float *VVdisp,
1741 vector float *FFdisp,
1742 vector float *VVrep,
1743 vector float *FFrep)
1745 vector signed int vidx;
1746 vector float Y,F,G,H,eps,eps2;
1747 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1748 vector float tabd3,tabr3,tabc3;
1749 int idx1,idx2,idx3;
1751 vidx = vec_cts(rtab,0);
1752 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1753 vidx = vec_sl(vidx,vec_splat_u32(2));
1755 idx1 = *((int *)&vidx);
1756 idx2 = *(((int *)&vidx)+1);
1757 idx3 = *(((int *)&vidx)+2);
1759 eps = vec_sub(rtab,vec_floor(rtab));
1760 eps2 = vec_madd(eps,eps,vec_zero());
1762 if(((unsigned int)VFtab)%16) {
1763 /* not 16-byte aligned, but must be 8 byte. */
1764 /* use Y,F,G,H as temp storage */
1765 tabc1 = vec_ld( 0, VFtab+idx1);
1766 tabd1 = vec_ld(16, VFtab+idx1);
1767 tabr1 = vec_ld(32, VFtab+idx1);
1768 Y = vec_ld(48, VFtab+idx1);
1769 tabc1 = vec_sld(tabc1,tabd1,8);
1770 tabd1 = vec_sld(tabd1,tabr1,8);
1771 tabr1 = vec_sld(tabr1,Y,8);
1773 tabc2 = vec_ld( 0, VFtab+idx2);
1774 tabd2 = vec_ld(16, VFtab+idx2);
1775 tabr2 = vec_ld(32, VFtab+idx2);
1776 F = vec_ld(48, VFtab+idx2);
1777 tabc2 = vec_sld(tabc2,tabd2,8);
1778 tabd2 = vec_sld(tabd2,tabr2,8);
1779 tabr2 = vec_sld(tabr2,F,8);
1781 tabc3 = vec_ld( 0, VFtab+idx3);
1782 tabd3 = vec_ld(16, VFtab+idx3);
1783 tabr3 = vec_ld(32, VFtab+idx3);
1784 G = vec_ld(48, VFtab+idx3);
1785 tabc3 = vec_sld(tabc3,tabd3,8);
1786 tabd3 = vec_sld(tabd3,tabr3,8);
1787 tabr3 = vec_sld(tabr3,G,8);
1788 } else { /* 16 byte aligned */
1789 tabc1 = vec_ld( 0, VFtab+idx1);
1790 tabd1 = vec_ld(16, VFtab+idx1);
1791 tabr1 = vec_ld(32, VFtab+idx1);
1792 tabc2 = vec_ld( 0, VFtab+idx2);
1793 tabd2 = vec_ld(16, VFtab+idx2);
1794 tabr2 = vec_ld(32, VFtab+idx2);
1795 tabc3 = vec_ld( 0, VFtab+idx3);
1796 tabd3 = vec_ld(16, VFtab+idx3);
1797 tabr3 = vec_ld(32, VFtab+idx3);
1799 transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);
1801 F = vec_madd(G,eps,F); /* F + Geps */
1802 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1803 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1804 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1805 F = vec_madd(G,eps,F); /* Fp + Geps */
1806 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1808 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
1810 F = vec_madd(G,eps,F); /* F + Geps */
1811 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1812 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1813 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1814 F = vec_madd(G,eps,F); /* Fp + Geps */
1815 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1817 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
1819 F = vec_madd(G,eps,F); /* F + Geps */
1820 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1821 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1822 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1823 F = vec_madd(G,eps,F); /* Fp + Geps */
1824 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1830 static inline void do_2_ctable_coul(float *VFtab,
1831 vector float rtab,
1832 vector float *VV,
1833 vector float *FF)
1835 vector signed int vidx;
1836 vector float Y,F,G,H,eps,eps2,tab1,tab2;
1837 int idx1,idx2;
1839 vidx = vec_cts(rtab,0);
1840 vidx = vec_sl(vidx,vec_splat_u32(2));
1842 idx1 = *((int *)&vidx);
1843 idx2 = *(((int *)&vidx)+1);
1845 eps = vec_sub(rtab,vec_floor(rtab));
1846 eps2 = vec_madd(eps,eps,vec_zero());
1848 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1849 tab1 = vec_ld( 0, VFtab+idx1);
1850 Y = vec_ld(16, VFtab+idx1);
1851 tab1 = vec_sld(tab1,Y,8);
1852 tab2 = vec_ld( 0, VFtab+idx2);
1853 F = vec_ld(16, VFtab+idx2);
1854 tab2 = vec_sld(tab2,F,8);
1855 } else { /* aligned */
1856 tab1=vec_ld(0, VFtab+idx1);
1857 tab2=vec_ld(0, VFtab+idx2);
1860 /* table data is aligned */
1861 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
1863 F = vec_madd(G,eps,F); /* F + Geps */
1864 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1865 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1866 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1867 F = vec_madd(G,eps,F); /* Fp + Geps */
1868 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1871 /* do only coulomb, but on a table with both coulomb and lj data */
1872 static inline void do_2_ljctable_coul(float *VFtab,
1873 vector float rtab,
1874 vector float *VV,
1875 vector float *FF)
1877 vector signed int vidx;
1878 vector float Y,F,G,H,eps,eps2,tab1,tab2;
1879 int idx1,idx2;
1881 vidx = vec_cts(rtab,0);
1882 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1883 vidx = vec_sl(vidx,vec_splat_u32(2));
1885 idx1 = *((int *)&vidx);
1886 idx2 = *(((int *)&vidx)+1);
1888 eps = vec_sub(rtab,vec_floor(rtab));
1889 eps2 = vec_madd(eps,eps,vec_zero());
1892 if(((unsigned int)VFtab)%16) {
1893 /* not 16-byte aligned, but must be 8 byte. */
1894 tab1 = vec_ld( 0, VFtab+idx1);
1895 Y = vec_ld(16, VFtab+idx1);
1896 tab1 = vec_sld(tab1,Y,8);
1897 tab2 = vec_ld( 0, VFtab+idx2);
1898 F = vec_ld(16, VFtab+idx2);
1899 tab2 = vec_sld(tab2,F,8);
1900 } else { /* aligned */
1901 tab1=vec_ld(0, VFtab+idx1);
1902 tab2=vec_ld(0, VFtab+idx2);
1905 /* table data is aligned */
1906 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
1908 F = vec_madd(G,eps,F); /* F + Geps */
1909 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1910 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1911 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1912 F = vec_madd(G,eps,F); /* Fp + Geps */
1913 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1917 static inline void do_2_ljtable_lj(float *VFtab,
1918 vector float rtab,
1919 vector float *VVdisp,
1920 vector float *FFdisp,
1921 vector float *VVrep,
1922 vector float *FFrep)
1924 vector signed int vidx;
1925 vector float Y,F,G,H,eps,eps2;
1926 int idx1,idx2;
1927 vector float tabd1,tabd2;
1928 vector float tabr1,tabr2;
1930 vidx = vec_cts(rtab,0);
1931 vidx = vec_sl(vidx,vec_splat_u32(3));
1933 idx1 = *((int *)&vidx);
1934 idx2 = *(((int *)&vidx)+1);
1936 eps = vec_sub(rtab,vec_floor(rtab));
1937 eps2 = vec_madd(eps,eps,vec_zero());
1939 if(((unsigned int)VFtab)%16) {
1940 /* not 16 byte aligned, i.e. must be 8 byte. */
1941 /* use Y,F,G,H as temp storage */
1942 tabd1 = vec_ld( 0, VFtab+idx1);
1943 tabr1 = vec_ld(16, VFtab+idx1);
1944 Y = vec_ld(32, VFtab+idx1);
1945 tabd1 = vec_sld(tabd1,tabr1,8);
1946 tabr1 = vec_sld(tabr1,Y,8);
1948 tabd2 = vec_ld( 0, VFtab+idx2);
1949 tabr2 = vec_ld(16, VFtab+idx2);
1950 F = vec_ld(32, VFtab+idx2);
1951 tabd2 = vec_sld(tabd2,tabr2,8);
1952 tabr2 = vec_sld(tabr2,F,8);
1953 } else { /* 16 byte aligned */
1954 tabd1 = vec_ld( 0, VFtab+idx1);
1955 tabr1 = vec_ld(16, VFtab+idx1);
1956 tabd2 = vec_ld( 0, VFtab+idx2);
1957 tabr2 = vec_ld(16, VFtab+idx2);
1960 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
1962 F = vec_madd(G,eps,F); /* F + Geps */
1963 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1964 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1965 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1966 F = vec_madd(G,eps,F); /* Fp + Geps */
1967 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1969 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
1971 F = vec_madd(G,eps,F); /* F + Geps */
1972 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1973 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1974 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1975 F = vec_madd(G,eps,F); /* Fp + Geps */
1976 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1980 static inline void do_2_ljctable_coul_and_lj(float *VFtab,
1981 vector float rtab,
1982 vector float *VVcoul,
1983 vector float *FFcoul,
1984 vector float *VVdisp,
1985 vector float *FFdisp,
1986 vector float *VVrep,
1987 vector float *FFrep)
1989 vector signed int vidx;
1990 vector float Y,F,G,H,eps,eps2;
1991 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1992 int idx1,idx2;
1994 vidx = vec_cts(rtab,0);
1995 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1996 vidx = vec_sl(vidx,vec_splat_u32(2));
1998 idx1 = *((int *)&vidx);
1999 idx2 = *(((int *)&vidx)+1);
2001 eps = vec_sub(rtab,vec_floor(rtab));
2002 eps2 = vec_madd(eps,eps,vec_zero());
2004 if(((unsigned int)VFtab)%16) {
2005 /* not 16-byte aligned, but must be 8 byte. */
2006 /* use Y,F,G,H as temp storage */
2007 tabc1 = vec_ld( 0, VFtab+idx1);
2008 tabd1 = vec_ld(16, VFtab+idx1);
2009 tabr1 = vec_ld(32, VFtab+idx1);
2010 Y = vec_ld(48, VFtab+idx1);
2011 tabc1 = vec_sld(tabc1,tabd1,8);
2012 tabd1 = vec_sld(tabd1,tabr1,8);
2013 tabr1 = vec_sld(tabr1,Y,8);
2015 tabc2 = vec_ld( 0, VFtab+idx2);
2016 tabd2 = vec_ld(16, VFtab+idx2);
2017 tabr2 = vec_ld(32, VFtab+idx2);
2018 F = vec_ld(48, VFtab+idx2);
2019 tabc2 = vec_sld(tabc2,tabd2,8);
2020 tabd2 = vec_sld(tabd2,tabr2,8);
2021 tabr2 = vec_sld(tabr2,F,8);
2022 } else { /* 16 byte aligned */
2023 tabc1 = vec_ld( 0, VFtab+idx1);
2024 tabd1 = vec_ld(16, VFtab+idx1);
2025 tabr1 = vec_ld(32, VFtab+idx1);
2026 tabc2 = vec_ld( 0, VFtab+idx2);
2027 tabd2 = vec_ld(16, VFtab+idx2);
2028 tabr2 = vec_ld(32, VFtab+idx2);
2030 transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);
2032 F = vec_madd(G,eps,F); /* F + Geps */
2033 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2034 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2035 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2036 F = vec_madd(G,eps,F); /* Fp + Geps */
2037 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2039 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2041 F = vec_madd(G,eps,F); /* F + Geps */
2042 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2043 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2044 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2045 F = vec_madd(G,eps,F); /* Fp + Geps */
2046 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2048 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2050 F = vec_madd(G,eps,F); /* F + Geps */
2051 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2052 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2053 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2054 F = vec_madd(G,eps,F); /* Fp + Geps */
2055 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2060 static inline void do_1_ctable_coul(float *VFtab,
2061 vector float rtab,
2062 vector float *VV,
2063 vector float *FF)
2065 vector signed int vidx;
2066 vector float Y,F,G,H,eps,eps2,tab1;
2067 int idx1;
2069 vidx = vec_cts(rtab,0);
2070 vidx = vec_sl(vidx,vec_splat_u32(2));
2072 idx1 = *((int *)&vidx);
2074 eps = vec_sub(rtab,vec_floor(rtab));
2075 eps2 = vec_madd(eps,eps,vec_zero());
2077 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2078 tab1 = vec_ld( 0, VFtab+idx1);
2079 Y = vec_ld(16, VFtab+idx1);
2080 tab1 = vec_sld(tab1,Y,8);
2081 } else { /* aligned */
2082 tab1=vec_ld(0, VFtab+idx1);
2085 /* table data is aligned */
2086 transpose_1_to_4(tab1,&Y,&F,&G,&H);
2088 F = vec_madd(G,eps,F); /* F + Geps */
2089 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2090 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2091 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2092 F = vec_madd(G,eps,F); /* Fp + Geps */
2093 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2096 /* do only coulomb, but on a table with both coulomb and lj data */
2097 static inline void do_1_ljctable_coul(float *VFtab,
2098 vector float rtab,
2099 vector float *VV,
2100 vector float *FF)
2102 vector signed int vidx;
2103 vector float Y,F,G,H,eps,eps2,tab1;
2104 int idx1;
2106 vidx = vec_cts(rtab,0);
2107 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2108 vidx = vec_sl(vidx,vec_splat_u32(2));
2110 idx1 = *((int *)&vidx);
2112 eps = vec_sub(rtab,vec_floor(rtab));
2113 eps2 = vec_madd(eps,eps,vec_zero());
2116 if(((unsigned int)VFtab)%16) {
2117 /* not 16-byte aligned, but must be 8 byte. */
2118 tab1 = vec_ld( 0, VFtab+idx1);
2119 Y = vec_ld(16, VFtab+idx1);
2120 tab1 = vec_sld(tab1,Y,8);
2121 } else { /* aligned */
2122 tab1=vec_ld(0, VFtab+idx1);
2125 /* table data is aligned */
2126 transpose_1_to_4(tab1,&Y,&F,&G,&H);
2128 F = vec_madd(G,eps,F); /* F + Geps */
2129 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2130 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2131 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2132 F = vec_madd(G,eps,F); /* Fp + Geps */
2133 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2137 static inline void do_1_ljtable_lj(float *VFtab,
2138 vector float rtab,
2139 vector float *VVdisp,
2140 vector float *FFdisp,
2141 vector float *VVrep,
2142 vector float *FFrep)
2144 vector signed int vidx;
2145 vector float Y,F,G,H,eps,eps2;
2146 int idx1;
2147 vector float tabd1;
2148 vector float tabr1;
2150 vidx = vec_cts(rtab,0);
2151 vidx = vec_sl(vidx,vec_splat_u32(3));
2153 idx1 = *((int *)&vidx);
2155 eps = vec_sub(rtab,vec_floor(rtab));
2156 eps2 = vec_madd(eps,eps,vec_zero());
2158 if(((unsigned int)VFtab)%16) {
2159 /* not 16 byte aligned, i.e. must be 8 byte. */
2160 /* use Y,F,G,H as temp storage */
2161 tabd1 = vec_ld( 0, VFtab+idx1);
2162 tabr1 = vec_ld(16, VFtab+idx1);
2163 Y = vec_ld(32, VFtab+idx1);
2164 tabd1 = vec_sld(tabd1,tabr1,8);
2165 tabr1 = vec_sld(tabr1,Y,8);
2166 } else { /* 16 byte aligned */
2167 tabd1 = vec_ld( 0, VFtab+idx1);
2168 tabr1 = vec_ld(16, VFtab+idx1);
2171 transpose_1_to_4(tabd1,&Y,&F,&G,&H);
2173 F = vec_madd(G,eps,F); /* F + Geps */
2174 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2175 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2176 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2177 F = vec_madd(G,eps,F); /* Fp + Geps */
2178 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2180 transpose_1_to_4(tabr1,&Y,&F,&G,&H);
2182 F = vec_madd(G,eps,F); /* F + Geps */
2183 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2184 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2185 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2186 F = vec_madd(G,eps,F); /* Fp + Geps */
2187 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2191 static inline void do_1_ljctable_coul_and_lj(float *VFtab,
2192 vector float rtab,
2193 vector float *VVcoul,
2194 vector float *FFcoul,
2195 vector float *VVdisp,
2196 vector float *FFdisp,
2197 vector float *VVrep,
2198 vector float *FFrep)
2200 vector signed int vidx;
2201 vector float Y,F,G,H,eps,eps2;
2202 vector float tabd1,tabr1,tabc1;
2203 int idx1;
2205 vidx = vec_cts(rtab,0);
2206 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2207 vidx = vec_sl(vidx,vec_splat_u32(2));
2209 idx1 = *((int *)&vidx);
2211 eps = vec_sub(rtab,vec_floor(rtab));
2212 eps2 = vec_madd(eps,eps,vec_zero());
2214 if(((unsigned int)VFtab)%16) {
2215 /* not 16-byte aligned, but must be 8 byte. */
2216 /* use Y,F,G,H as temp storage */
2217 tabc1 = vec_ld( 0, VFtab+idx1);
2218 tabd1 = vec_ld(16, VFtab+idx1);
2219 tabr1 = vec_ld(32, VFtab+idx1);
2220 Y = vec_ld(48, VFtab+idx1);
2221 tabc1 = vec_sld(tabc1,tabd1,8);
2222 tabd1 = vec_sld(tabd1,tabr1,8);
2223 tabr1 = vec_sld(tabr1,Y,8);
2224 } else { /* 16 byte aligned */
2225 tabc1 = vec_ld( 0, VFtab+idx1);
2226 tabd1 = vec_ld(16, VFtab+idx1);
2227 tabr1 = vec_ld(32, VFtab+idx1);
2229 transpose_1_to_4(tabc1,&Y,&F,&G,&H);
2231 F = vec_madd(G,eps,F); /* F + Geps */
2232 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2233 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2234 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2235 F = vec_madd(G,eps,F); /* Fp + Geps */
2236 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2238 transpose_1_to_4(tabd1,&Y,&F,&G,&H);
2240 F = vec_madd(G,eps,F); /* F + Geps */
2241 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2242 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2243 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2244 F = vec_madd(G,eps,F); /* Fp + Geps */
2245 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2247 transpose_1_to_4(tabr1,&Y,&F,&G,&H);
2249 F = vec_madd(G,eps,F); /* F + Geps */
2250 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2251 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2252 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2253 F = vec_madd(G,eps,F); /* Fp + Geps */
2254 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2260 static inline void do_vonly_4_ctable_coul(float *VFtab,
2261 vector float rtab,
2262 vector float *VV)
2264 vector signed int vidx;
2265 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
2266 int idx1,idx2,idx3,idx4;
2268 vidx = vec_cts(rtab,0);
2269 vidx = vec_sl(vidx,vec_splat_u32(2));
2271 idx1 = *((int *)&vidx);
2272 idx2 = *(((int *)&vidx)+1);
2273 idx3 = *(((int *)&vidx)+2);
2274 idx4 = *(((int *)&vidx)+3);
2276 eps = vec_sub(rtab,vec_floor(rtab));
2277 eps2 = vec_madd(eps,eps,vec_zero());
2279 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2280 tab1 = vec_ld( 0, VFtab+idx1);
2281 Y = vec_ld(16, VFtab+idx1);
2282 tab1 = vec_sld(tab1,Y,8);
2283 tab2 = vec_ld( 0, VFtab+idx2);
2284 F = vec_ld(16, VFtab+idx2);
2285 tab2 = vec_sld(tab2,F,8);
2286 tab3 = vec_ld( 0, VFtab+idx3);
2287 G = vec_ld(16, VFtab+idx3);
2288 tab3 = vec_sld(tab3,G,8);
2289 tab4 = vec_ld( 0, VFtab+idx4);
2290 H = vec_ld(16, VFtab+idx4);
2291 tab4 = vec_sld(tab4,H,8);
2292 } else { /* aligned */
2293 tab1=vec_ld(0, VFtab+idx1);
2294 tab2=vec_ld(0, VFtab+idx2);
2295 tab3=vec_ld(0, VFtab+idx3);
2296 tab4=vec_ld(0, VFtab+idx4);
2299 /* table data is aligned */
2300 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
2302 F = vec_madd(G,eps,F); /* F + Geps */
2303 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2304 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2305 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2308 /* do only coulomb, but on a table with both coulomb and lj data */
2309 static inline void do_vonly_4_ljctable_coul(float *VFtab,
2310 vector float rtab,
2311 vector float *VV)
2313 vector signed int vidx;
2314 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
2315 int idx1,idx2,idx3,idx4;
2317 vidx = vec_cts(rtab,0);
2318 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2319 vidx = vec_sl(vidx,vec_splat_u32(2));
2321 idx1 = *((int *)&vidx);
2322 idx2 = *(((int *)&vidx)+1);
2323 idx3 = *(((int *)&vidx)+2);
2324 idx4 = *(((int *)&vidx)+3);
2326 eps = vec_sub(rtab,vec_floor(rtab));
2327 eps2 = vec_madd(eps,eps,vec_zero());
2329 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2330 tab1 = vec_ld( 0, VFtab+idx1);
2331 Y = vec_ld(16, VFtab+idx1);
2332 tab1 = vec_sld(tab1,Y,8);
2333 tab2 = vec_ld( 0, VFtab+idx2);
2334 F = vec_ld(16, VFtab+idx2);
2335 tab2 = vec_sld(tab2,F,8);
2336 tab3 = vec_ld( 0, VFtab+idx3);
2337 G = vec_ld(16, VFtab+idx3);
2338 tab3 = vec_sld(tab3,G,8);
2339 tab4 = vec_ld( 0, VFtab+idx4);
2340 H = vec_ld(16, VFtab+idx4);
2341 tab4 = vec_sld(tab4,H,8);
2342 } else { /* aligned */
2343 tab1=vec_ld(0, VFtab+idx1);
2344 tab2=vec_ld(0, VFtab+idx2);
2345 tab3=vec_ld(0, VFtab+idx3);
2346 tab4=vec_ld(0, VFtab+idx4);
2349 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
2351 F = vec_madd(G,eps,F); /* F + Geps */
2352 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2353 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2354 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2358 static inline void do_vonly_4_ljtable_lj(float *VFtab,
2359 vector float rtab,
2360 vector float *VVdisp,
2361 vector float *VVrep)
2363 vector signed int vidx;
2364 vector float Y,F,G,H,eps,eps2;
2365 vector float tabd1,tabd2,tabd3,tabd4;
2366 vector float tabr1,tabr2,tabr3,tabr4;
2368 int idx1,idx2,idx3,idx4;
2370 vidx = vec_cts(rtab,0);
2371 vidx = vec_sl(vidx,vec_splat_u32(3));
2373 idx1 = *((int *)&vidx);
2374 idx2 = *(((int *)&vidx)+1);
2375 idx3 = *(((int *)&vidx)+2);
2376 idx4 = *(((int *)&vidx)+3);
2378 eps = vec_sub(rtab,vec_floor(rtab));
2379 eps2 = vec_madd(eps,eps,vec_zero());
2381 if(((unsigned int)VFtab)%16) {
2382 /* not 16 byte aligned, i.e. must be 8 byte. */
2383 /* use Y,F,G,H as temp storage */
2384 tabd1 = vec_ld( 0, VFtab+idx1);
2385 tabr1 = vec_ld(16, VFtab+idx1);
2386 Y = vec_ld(32, VFtab+idx1);
2387 tabd1 = vec_sld(tabd1,tabr1,8);
2388 tabr1 = vec_sld(tabr1,Y,8);
2390 tabd2 = vec_ld( 0, VFtab+idx2);
2391 tabr2 = vec_ld(16, VFtab+idx2);
2392 F = vec_ld(32, VFtab+idx2);
2393 tabd2 = vec_sld(tabd2,tabr2,8);
2394 tabr2 = vec_sld(tabr2,F,8);
2396 tabd3 = vec_ld( 0, VFtab+idx3);
2397 tabr3 = vec_ld(16, VFtab+idx3);
2398 G = vec_ld(32, VFtab+idx3);
2399 tabd3 = vec_sld(tabd3,tabr3,8);
2400 tabr3 = vec_sld(tabr3,G,8);
2402 tabd4 = vec_ld( 0, VFtab+idx4);
2403 tabr4 = vec_ld(16, VFtab+idx4);
2404 H = vec_ld(32, VFtab+idx4);
2405 tabd4 = vec_sld(tabd4,tabr4,8);
2406 tabr4 = vec_sld(tabr4,H,8);
2407 } else { /* 16 byte aligned */
2408 tabd1 = vec_ld( 0, VFtab+idx1);
2409 tabr1 = vec_ld(16, VFtab+idx1);
2410 tabd2 = vec_ld( 0, VFtab+idx2);
2411 tabr2 = vec_ld(16, VFtab+idx2);
2412 tabd3 = vec_ld( 0, VFtab+idx3);
2413 tabr3 = vec_ld(16, VFtab+idx3);
2414 tabd4 = vec_ld( 0, VFtab+idx4);
2415 tabr4 = vec_ld(16, VFtab+idx4);
2418 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
2420 F = vec_madd(G,eps,F); /* F + Geps */
2421 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2422 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2423 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2425 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
2427 F = vec_madd(G,eps,F); /* F + Geps */
2428 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2429 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2430 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2434 static inline void do_vonly_4_ljctable_coul_and_lj(float *VFtab,
2435 vector float rtab,
2436 vector float *VVcoul,
2437 vector float *VVdisp,
2438 vector float *VVrep)
2440 vector signed int vidx;
2441 vector float Y,F,G,H,eps,eps2;
2442 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2443 vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
2444 int idx1,idx2,idx3,idx4;
2446 vidx = vec_cts(rtab,0);
2447 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2448 vidx = vec_sl(vidx,vec_splat_u32(2));
2450 idx1 = *((int *)&vidx);
2451 idx2 = *(((int *)&vidx)+1);
2452 idx3 = *(((int *)&vidx)+2);
2453 idx4 = *(((int *)&vidx)+3);
2455 eps = vec_sub(rtab,vec_floor(rtab));
2456 eps2 = vec_madd(eps,eps,vec_zero());
2458 if(((unsigned int)VFtab)%16) {
2459 /* not 16-byte aligned, but must be 8 byte. */
2460 /* use Y,F,G,H as temp storage */
2461 tabc1 = vec_ld( 0, VFtab+idx1);
2462 tabd1 = vec_ld(16, VFtab+idx1);
2463 tabr1 = vec_ld(32, VFtab+idx1);
2464 Y = vec_ld(48, VFtab+idx1);
2465 tabc1 = vec_sld(tabc1,tabd1,8);
2466 tabd1 = vec_sld(tabd1,tabr1,8);
2467 tabr1 = vec_sld(tabr1,Y,8);
2469 tabc2 = vec_ld( 0, VFtab+idx2);
2470 tabd2 = vec_ld(16, VFtab+idx2);
2471 tabr2 = vec_ld(32, VFtab+idx2);
2472 F = vec_ld(48, VFtab+idx2);
2473 tabc2 = vec_sld(tabc2,tabd2,8);
2474 tabd2 = vec_sld(tabd2,tabr2,8);
2475 tabr2 = vec_sld(tabr2,F,8);
2477 tabc3 = vec_ld( 0, VFtab+idx3);
2478 tabd3 = vec_ld(16, VFtab+idx3);
2479 tabr3 = vec_ld(32, VFtab+idx3);
2480 G = vec_ld(48, VFtab+idx3);
2481 tabc3 = vec_sld(tabc3,tabd3,8);
2482 tabd3 = vec_sld(tabd3,tabr3,8);
2483 tabr3 = vec_sld(tabr3,G,8);
2485 tabc4 = vec_ld( 0, VFtab+idx4);
2486 tabd4 = vec_ld(16, VFtab+idx4);
2487 tabr4 = vec_ld(32, VFtab+idx4);
2488 H = vec_ld(48, VFtab+idx4);
2489 tabc4 = vec_sld(tabc4,tabd4,8);
2490 tabd4 = vec_sld(tabd4,tabr4,8);
2491 tabr4 = vec_sld(tabr4,H,8);
2492 } else { /* 16 byte aligned */
2493 tabc1 = vec_ld( 0, VFtab+idx1);
2494 tabd1 = vec_ld(16, VFtab+idx1);
2495 tabr1 = vec_ld(32, VFtab+idx1);
2496 tabc2 = vec_ld( 0, VFtab+idx2);
2497 tabd2 = vec_ld(16, VFtab+idx2);
2498 tabr2 = vec_ld(32, VFtab+idx2);
2499 tabc3 = vec_ld( 0, VFtab+idx3);
2500 tabd3 = vec_ld(16, VFtab+idx3);
2501 tabr3 = vec_ld(32, VFtab+idx3);
2502 tabc4 = vec_ld( 0, VFtab+idx4);
2503 tabd4 = vec_ld(16, VFtab+idx4);
2504 tabr4 = vec_ld(32, VFtab+idx4);
2506 transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);
2508 F = vec_madd(G,eps,F); /* F + Geps */
2509 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2510 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2511 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2513 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
2515 F = vec_madd(G,eps,F); /* F + Geps */
2516 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2517 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2518 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2520 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
2522 F = vec_madd(G,eps,F); /* F + Geps */
2523 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2524 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2525 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2529 static inline void do_vonly_3_ctable_coul(float *VFtab,
2530 vector float rtab,
2531 vector float *VV)
2533 vector signed int vidx;
2534 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
2535 int idx1,idx2,idx3;
2537 vidx = vec_cts(rtab,0);
2538 vidx = vec_sl(vidx,vec_splat_u32(2));
2540 idx1 = *((int *)&vidx);
2541 idx2 = *(((int *)&vidx)+1);
2542 idx3 = *(((int *)&vidx)+2);
2544 eps = vec_sub(rtab,vec_floor(rtab));
2545 eps2 = vec_madd(eps,eps,vec_zero());
2547 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2548 tab1 = vec_ld( 0, VFtab+idx1);
2549 Y = vec_ld(16, VFtab+idx1);
2550 tab1 = vec_sld(tab1,Y,8);
2551 tab2 = vec_ld( 0, VFtab+idx2);
2552 F = vec_ld(16, VFtab+idx2);
2553 tab2 = vec_sld(tab2,F,8);
2554 tab3 = vec_ld( 0, VFtab+idx3);
2555 G = vec_ld(16, VFtab+idx3);
2556 tab3 = vec_sld(tab3,G,8);
2557 } else { /* aligned */
2558 tab1=vec_ld(0, VFtab+idx1);
2559 tab2=vec_ld(0, VFtab+idx2);
2560 tab3=vec_ld(0, VFtab+idx3);
2563 /* table data is aligned */
2564 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
2567 F = vec_madd(G,eps,F); /* F + Geps */
2568 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2569 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2570 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2573 /* do only coulomb, but on a table with both coulomb and lj data */
2574 static inline void do_vonly_3_ljctable_coul(float *VFtab,
2575 vector float rtab,
2576 vector float *VV)
2578 vector signed int vidx;
2579 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
2580 int idx1,idx2,idx3;
2582 vidx = vec_cts(rtab,0);
2583 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2584 vidx = vec_sl(vidx,vec_splat_u32(2));
2586 idx1 = *((int *)&vidx);
2587 idx2 = *(((int *)&vidx)+1);
2588 idx3 = *(((int *)&vidx)+2);
2590 eps = vec_sub(rtab,vec_floor(rtab));
2591 eps2 = vec_madd(eps,eps,vec_zero());
2594 if(((unsigned int)VFtab)%16) {
2595 /* not 16-byte aligned, but must be 8 byte. */
2596 tab1 = vec_ld( 0, VFtab+idx1);
2597 Y = vec_ld(16, VFtab+idx1);
2598 tab1 = vec_sld(tab1,Y,8);
2599 tab2 = vec_ld( 0, VFtab+idx2);
2600 F = vec_ld(16, VFtab+idx2);
2601 tab2 = vec_sld(tab2,F,8);
2602 tab3 = vec_ld( 0, VFtab+idx3);
2603 G = vec_ld(16, VFtab+idx3);
2604 tab3 = vec_sld(tab3,G,8);
2605 } else { /* aligned */
2606 tab1=vec_ld(0, VFtab+idx1);
2607 tab2=vec_ld(0, VFtab+idx2);
2608 tab3=vec_ld(0, VFtab+idx3);
2611 /* table data is aligned */
2612 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
2614 F = vec_madd(G,eps,F); /* F + Geps */
2615 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2616 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2617 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2621 static inline void do_vonly_3_ljtable_lj(float *VFtab,
2622 vector float rtab,
2623 vector float *VVdisp,
2624 vector float *VVrep)
2626 vector signed int vidx;
2627 vector float Y,F,G,H,eps,eps2;
2628 int idx1,idx2,idx3;
2629 vector float tabd1,tabd2,tabd3;
2630 vector float tabr1,tabr2,tabr3;
2632 vidx = vec_cts(rtab,0);
2633 vidx = vec_sl(vidx,vec_splat_u32(3));
2635 idx1 = *((int *)&vidx);
2636 idx2 = *(((int *)&vidx)+1);
2637 idx3 = *(((int *)&vidx)+2);
2639 eps = vec_sub(rtab,vec_floor(rtab));
2640 eps2 = vec_madd(eps,eps,vec_zero());
2642 if(((unsigned int)VFtab)%16) {
2643 /* not 16 byte aligned, i.e. must be 8 byte. */
2644 /* use Y,F,G,H as temp storage */
2645 tabd1 = vec_ld( 0, VFtab+idx1);
2646 tabr1 = vec_ld(16, VFtab+idx1);
2647 Y = vec_ld(32, VFtab+idx1);
2648 tabd1 = vec_sld(tabd1,tabr1,8);
2649 tabr1 = vec_sld(tabr1,Y,8);
2651 tabd2 = vec_ld( 0, VFtab+idx2);
2652 tabr2 = vec_ld(16, VFtab+idx2);
2653 F = vec_ld(32, VFtab+idx2);
2654 tabd2 = vec_sld(tabd2,tabr2,8);
2655 tabr2 = vec_sld(tabr2,F,8);
2657 tabd3 = vec_ld( 0, VFtab+idx3);
2658 tabr3 = vec_ld(16, VFtab+idx3);
2659 G = vec_ld(32, VFtab+idx3);
2660 tabd3 = vec_sld(tabd3,tabr3,8);
2661 tabr3 = vec_sld(tabr3,G,8);
2662 } else { /* 16 byte aligned */
2663 tabd1 = vec_ld( 0, VFtab+idx1);
2664 tabr1 = vec_ld(16, VFtab+idx1);
2665 tabd2 = vec_ld( 0, VFtab+idx2);
2666 tabr2 = vec_ld(16, VFtab+idx2);
2667 tabd3 = vec_ld( 0, VFtab+idx3);
2668 tabr3 = vec_ld(16, VFtab+idx3);
2671 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
2673 F = vec_madd(G,eps,F); /* F + Geps */
2674 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2675 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2676 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2678 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
2680 F = vec_madd(G,eps,F); /* F + Geps */
2681 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2682 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2683 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2687 static inline void do_vonly_3_ljctable_coul_and_lj(float *VFtab,
2688 vector float rtab,
2689 vector float *VVcoul,
2690 vector float *VVdisp,
2691 vector float *VVrep)
2693 vector signed int vidx;
2694 vector float Y,F,G,H,eps,eps2;
2695 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2696 vector float tabd3,tabr3,tabc3;
2697 int idx1,idx2,idx3;
2699 vidx = vec_cts(rtab,0);
2700 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2701 vidx = vec_sl(vidx,vec_splat_u32(2));
2703 idx1 = *((int *)&vidx);
2704 idx2 = *(((int *)&vidx)+1);
2705 idx3 = *(((int *)&vidx)+2);
2707 eps = vec_sub(rtab,vec_floor(rtab));
2708 eps2 = vec_madd(eps,eps,vec_zero());
2710 if(((unsigned int)VFtab)%16) {
2711 /* not 16-byte aligned, but must be 8 byte. */
2712 /* use Y,F,G,H as temp storage */
2713 tabc1 = vec_ld( 0, VFtab+idx1);
2714 tabd1 = vec_ld(16, VFtab+idx1);
2715 tabr1 = vec_ld(32, VFtab+idx1);
2716 Y = vec_ld(48, VFtab+idx1);
2717 tabc1 = vec_sld(tabc1,tabd1,8);
2718 tabd1 = vec_sld(tabd1,tabr1,8);
2719 tabr1 = vec_sld(tabr1,Y,8);
2721 tabc2 = vec_ld( 0, VFtab+idx2);
2722 tabd2 = vec_ld(16, VFtab+idx2);
2723 tabr2 = vec_ld(32, VFtab+idx2);
2724 F = vec_ld(48, VFtab+idx2);
2725 tabc2 = vec_sld(tabc2,tabd2,8);
2726 tabd2 = vec_sld(tabd2,tabr2,8);
2727 tabr2 = vec_sld(tabr2,F,8);
2729 tabc3 = vec_ld( 0, VFtab+idx3);
2730 tabd3 = vec_ld(16, VFtab+idx3);
2731 tabr3 = vec_ld(32, VFtab+idx3);
2732 G = vec_ld(48, VFtab+idx3);
2733 tabc3 = vec_sld(tabc3,tabd3,8);
2734 tabd3 = vec_sld(tabd3,tabr3,8);
2735 tabr3 = vec_sld(tabr3,G,8);
2736 } else { /* 16 byte aligned */
2737 tabc1 = vec_ld( 0, VFtab+idx1);
2738 tabd1 = vec_ld(16, VFtab+idx1);
2739 tabr1 = vec_ld(32, VFtab+idx1);
2740 tabc2 = vec_ld( 0, VFtab+idx2);
2741 tabd2 = vec_ld(16, VFtab+idx2);
2742 tabr2 = vec_ld(32, VFtab+idx2);
2743 tabc3 = vec_ld( 0, VFtab+idx3);
2744 tabd3 = vec_ld(16, VFtab+idx3);
2745 tabr3 = vec_ld(32, VFtab+idx3);
2747 transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);
2749 F = vec_madd(G,eps,F); /* F + Geps */
2750 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2751 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2752 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2754 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
2756 F = vec_madd(G,eps,F); /* F + Geps */
2757 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2758 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2759 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2761 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
2763 F = vec_madd(G,eps,F); /* F + Geps */
2764 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2765 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2766 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2772 static inline void do_vonly_2_ctable_coul(float *VFtab,
2773 vector float rtab,
2774 vector float *VV)
2776 vector signed int vidx;
2777 vector float Y,F,G,H,eps,eps2,tab1,tab2;
2778 int idx1,idx2;
2780 vidx = vec_cts(rtab,0);
2781 vidx = vec_sl(vidx,vec_splat_u32(2));
2783 idx1 = *((int *)&vidx);
2784 idx2 = *(((int *)&vidx)+1);
2786 eps = vec_sub(rtab,vec_floor(rtab));
2787 eps2 = vec_madd(eps,eps,vec_zero());
2789 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2790 tab1 = vec_ld( 0, VFtab+idx1);
2791 Y = vec_ld(16, VFtab+idx1);
2792 tab1 = vec_sld(tab1,Y,8);
2793 tab2 = vec_ld( 0, VFtab+idx2);
2794 F = vec_ld(16, VFtab+idx2);
2795 tab2 = vec_sld(tab2,F,8);
2796 } else { /* aligned */
2797 tab1=vec_ld(0, VFtab+idx1);
2798 tab2=vec_ld(0, VFtab+idx2);
2801 /* table data is aligned */
2802 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
2804 F = vec_madd(G,eps,F); /* F + Geps */
2805 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2806 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2807 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2810 /* do only coulomb, but on a table with both coulomb and lj data */
2811 static inline void do_vonly_2_ljctable_coul(float *VFtab,
2812 vector float rtab,
2813 vector float *VV)
2815 vector signed int vidx;
2816 vector float Y,F,G,H,eps,eps2,tab1,tab2;
2817 int idx1,idx2;
2819 vidx = vec_cts(rtab,0);
2820 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2821 vidx = vec_sl(vidx,vec_splat_u32(2));
2823 idx1 = *((int *)&vidx);
2824 idx2 = *(((int *)&vidx)+1);
2826 eps = vec_sub(rtab,vec_floor(rtab));
2827 eps2 = vec_madd(eps,eps,vec_zero());
2830 if(((unsigned int)VFtab)%16) {
2831 /* not 16-byte aligned, but must be 8 byte. */
2832 tab1 = vec_ld( 0, VFtab+idx1);
2833 Y = vec_ld(16, VFtab+idx1);
2834 tab1 = vec_sld(tab1,Y,8);
2835 tab2 = vec_ld( 0, VFtab+idx2);
2836 F = vec_ld(16, VFtab+idx2);
2837 tab2 = vec_sld(tab2,F,8);
2838 } else { /* aligned */
2839 tab1=vec_ld(0, VFtab+idx1);
2840 tab2=vec_ld(0, VFtab+idx2);
2843 /* table data is aligned */
2844 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
2846 F = vec_madd(G,eps,F); /* F + Geps */
2847 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2848 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2849 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2853 static inline void do_vonly_2_ljtable_lj(float *VFtab,
2854 vector float rtab,
2855 vector float *VVdisp,
2856 vector float *VVrep)
2858 vector signed int vidx;
2859 vector float Y,F,G,H,eps,eps2;
2860 int idx1,idx2;
2861 vector float tabd1,tabd2;
2862 vector float tabr1,tabr2;
2864 vidx = vec_cts(rtab,0);
2865 vidx = vec_sl(vidx,vec_splat_u32(3));
2867 idx1 = *((int *)&vidx);
2868 idx2 = *(((int *)&vidx)+1);
2870 eps = vec_sub(rtab,vec_floor(rtab));
2871 eps2 = vec_madd(eps,eps,vec_zero());
2873 if(((unsigned int)VFtab)%16) {
2874 /* not 16 byte aligned, i.e. must be 8 byte. */
2875 /* use Y,F,G,H as temp storage */
2876 tabd1 = vec_ld( 0, VFtab+idx1);
2877 tabr1 = vec_ld(16, VFtab+idx1);
2878 Y = vec_ld(32, VFtab+idx1);
2879 tabd1 = vec_sld(tabd1,tabr1,8);
2880 tabr1 = vec_sld(tabr1,Y,8);
2882 tabd2 = vec_ld( 0, VFtab+idx2);
2883 tabr2 = vec_ld(16, VFtab+idx2);
2884 F = vec_ld(32, VFtab+idx2);
2885 tabd2 = vec_sld(tabd2,tabr2,8);
2886 tabr2 = vec_sld(tabr2,F,8);
2887 } else { /* 16 byte aligned */
2888 tabd1 = vec_ld( 0, VFtab+idx1);
2889 tabr1 = vec_ld(16, VFtab+idx1);
2890 tabd2 = vec_ld( 0, VFtab+idx2);
2891 tabr2 = vec_ld(16, VFtab+idx2);
2894 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2896 F = vec_madd(G,eps,F); /* F + Geps */
2897 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2898 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2899 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2901 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2903 F = vec_madd(G,eps,F); /* F + Geps */
2904 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2905 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2906 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2910 static inline void do_vonly_2_ljctable_coul_and_lj(float *VFtab,
2911 vector float rtab,
2912 vector float *VVcoul,
2913 vector float *VVdisp,
2914 vector float *VVrep)
2916 vector signed int vidx;
2917 vector float Y,F,G,H,eps,eps2;
2918 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2919 int idx1,idx2;
2921 vidx = vec_cts(rtab,0);
2922 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2923 vidx = vec_sl(vidx,vec_splat_u32(2));
2925 idx1 = *((int *)&vidx);
2926 idx2 = *(((int *)&vidx)+1);
2928 eps = vec_sub(rtab,vec_floor(rtab));
2929 eps2 = vec_madd(eps,eps,vec_zero());
2931 if(((unsigned int)VFtab)%16) {
2932 /* not 16-byte aligned, but must be 8 byte. */
2933 /* use Y,F,G,H as temp storage */
2934 tabc1 = vec_ld( 0, VFtab+idx1);
2935 tabd1 = vec_ld(16, VFtab+idx1);
2936 tabr1 = vec_ld(32, VFtab+idx1);
2937 Y = vec_ld(48, VFtab+idx1);
2938 tabc1 = vec_sld(tabc1,tabd1,8);
2939 tabd1 = vec_sld(tabd1,tabr1,8);
2940 tabr1 = vec_sld(tabr1,Y,8);
2942 tabc2 = vec_ld( 0, VFtab+idx2);
2943 tabd2 = vec_ld(16, VFtab+idx2);
2944 tabr2 = vec_ld(32, VFtab+idx2);
2945 F = vec_ld(48, VFtab+idx2);
2946 tabc2 = vec_sld(tabc2,tabd2,8);
2947 tabd2 = vec_sld(tabd2,tabr2,8);
2948 tabr2 = vec_sld(tabr2,F,8);
2949 } else { /* 16 byte aligned */
2950 tabc1 = vec_ld( 0, VFtab+idx1);
2951 tabd1 = vec_ld(16, VFtab+idx1);
2952 tabr1 = vec_ld(32, VFtab+idx1);
2953 tabc2 = vec_ld( 0, VFtab+idx2);
2954 tabd2 = vec_ld(16, VFtab+idx2);
2955 tabr2 = vec_ld(32, VFtab+idx2);
2957 transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);
2959 F = vec_madd(G,eps,F); /* F + Geps */
2960 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2961 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2962 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2964 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2966 F = vec_madd(G,eps,F); /* F + Geps */
2967 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2968 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2969 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2971 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2973 F = vec_madd(G,eps,F); /* F + Geps */
2974 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2975 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2976 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2981 static inline void do_vonly_1_ctable_coul(float *VFtab,
2982 vector float rtab,
2983 vector float *VV)
2985 vector signed int vidx;
2986 vector float Y,F,G,H,eps,eps2,tab1;
2987 int idx1;
2989 vidx = vec_cts(rtab,0);
2990 vidx = vec_sl(vidx,vec_splat_u32(2));
2992 idx1 = *((int *)&vidx);
2994 eps = vec_sub(rtab,vec_floor(rtab));
2995 eps2 = vec_madd(eps,eps,vec_zero());
2997 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2998 tab1 = vec_ld( 0, VFtab+idx1);
2999 Y = vec_ld(16, VFtab+idx1);
3000 tab1 = vec_sld(tab1,Y,8);
3001 } else { /* aligned */
3002 tab1=vec_ld(0, VFtab+idx1);
3005 /* table data is aligned */
3006 transpose_1_to_4(tab1,&Y,&F,&G,&H);
3008 F = vec_madd(G,eps,F); /* F + Geps */
3009 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3010 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3011 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3014 /* do only coulomb, but on a table with both coulomb and lj data */
3015 static inline void do_vonly_1_ljctable_coul(float *VFtab,
3016 vector float rtab,
3017 vector float *VV)
3019 vector signed int vidx;
3020 vector float Y,F,G,H,eps,eps2,tab1;
3021 int idx1;
3023 vidx = vec_cts(rtab,0);
3024 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
3025 vidx = vec_sl(vidx,vec_splat_u32(2));
3027 idx1 = *((int *)&vidx);
3029 eps = vec_sub(rtab,vec_floor(rtab));
3030 eps2 = vec_madd(eps,eps,vec_zero());
3033 if(((unsigned int)VFtab)%16) {
3034 /* not 16-byte aligned, but must be 8 byte. */
3035 tab1 = vec_ld( 0, VFtab+idx1);
3036 Y = vec_ld(16, VFtab+idx1);
3037 tab1 = vec_sld(tab1,Y,8);
3038 } else { /* aligned */
3039 tab1=vec_ld(0, VFtab+idx1);
3042 /* table data is aligned */
3043 transpose_1_to_4(tab1,&Y,&F,&G,&H);
3045 F = vec_madd(G,eps,F); /* F + Geps */
3046 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3047 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3048 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3052 static inline void do_vonly_1_ljtable_lj(float *VFtab,
3053 vector float rtab,
3054 vector float *VVdisp,
3055 vector float *VVrep)
3057 vector signed int vidx;
3058 vector float Y,F,G,H,eps,eps2;
3059 int idx1;
3060 vector float tabd1;
3061 vector float tabr1;
3063 vidx = vec_cts(rtab,0);
3064 vidx = vec_sl(vidx,vec_splat_u32(3));
3066 idx1 = *((int *)&vidx);
3068 eps = vec_sub(rtab,vec_floor(rtab));
3069 eps2 = vec_madd(eps,eps,vec_zero());
3071 if(((unsigned int)VFtab)%16) {
3072 /* not 16 byte aligned, i.e. must be 8 byte. */
3073 /* use Y,F,G,H as temp storage */
3074 tabd1 = vec_ld( 0, VFtab+idx1);
3075 tabr1 = vec_ld(16, VFtab+idx1);
3076 Y = vec_ld(32, VFtab+idx1);
3077 tabd1 = vec_sld(tabd1,tabr1,8);
3078 tabr1 = vec_sld(tabr1,Y,8);
3079 } else { /* 16 byte aligned */
3080 tabd1 = vec_ld( 0, VFtab+idx1);
3081 tabr1 = vec_ld(16, VFtab+idx1);
3084 transpose_1_to_4(tabd1,&Y,&F,&G,&H);
3086 F = vec_madd(G,eps,F); /* F + Geps */
3087 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3088 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3089 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3091 transpose_1_to_4(tabr1,&Y,&F,&G,&H);
3093 F = vec_madd(G,eps,F); /* F + Geps */
3094 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3095 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3096 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3100 static inline void do_vonly_1_ljctable_coul_and_lj(float *VFtab,
3101 vector float rtab,
3102 vector float *VVcoul,
3103 vector float *VVdisp,
3104 vector float *VVrep)
3106 vector signed int vidx;
3107 vector float Y,F,G,H,eps,eps2;
3108 vector float tabd1,tabr1,tabc1;
3109 int idx1;
3111 vidx = vec_cts(rtab,0);
3112 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
3113 vidx = vec_sl(vidx,vec_splat_u32(2));
3115 idx1 = *((int *)&vidx);
3117 eps = vec_sub(rtab,vec_floor(rtab));
3118 eps2 = vec_madd(eps,eps,vec_zero());
3120 if(((unsigned int)VFtab)%16) {
3121 /* not 16-byte aligned, but must be 8 byte. */
3122 /* use Y,F,G,H as temp storage */
3123 tabc1 = vec_ld( 0, VFtab+idx1);
3124 tabd1 = vec_ld(16, VFtab+idx1);
3125 tabr1 = vec_ld(32, VFtab+idx1);
3126 Y = vec_ld(48, VFtab+idx1);
3127 tabc1 = vec_sld(tabc1,tabd1,8);
3128 tabd1 = vec_sld(tabd1,tabr1,8);
3129 tabr1 = vec_sld(tabr1,Y,8);
3130 } else { /* 16 byte aligned */
3131 tabc1 = vec_ld( 0, VFtab+idx1);
3132 tabd1 = vec_ld(16, VFtab+idx1);
3133 tabr1 = vec_ld(32, VFtab+idx1);
3135 transpose_1_to_4(tabc1,&Y,&F,&G,&H);
3137 F = vec_madd(G,eps,F); /* F + Geps */
3138 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3139 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3140 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3142 transpose_1_to_4(tabd1,&Y,&F,&G,&H);
3144 F = vec_madd(G,eps,F); /* F + Geps */
3145 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3146 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3147 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3149 transpose_1_to_4(tabr1,&Y,&F,&G,&H);
3151 F = vec_madd(G,eps,F); /* F + Geps */
3152 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3153 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
3154 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
3160 static inline vector float do_recip(vector float rsq)
3162 vector float tmp,lu;
3164 lu = vec_re(rsq);
3165 tmp = vec_nmsub(rsq,lu,vec_two());
3166 return vec_madd(lu,tmp,vec_zero());
3169 static inline vector float do_invsqrt(vector float rsq)
3171 vector float lu,tmpA,tmpB;
3173 lu = vec_rsqrte(rsq);
3174 tmpA = vec_madd(lu,lu,vec_zero());
3175 tmpB = vec_madd(lu,vec_half(),vec_zero());
3176 tmpA = vec_nmsub(rsq,tmpA,vec_one());
3177 return vec_madd(tmpA,tmpB,lu);
3180 static inline void do_3_invsqrt(vector float rsq1,
3181 vector float rsq2,
3182 vector float rsq3,
3183 vector float *rinvsq1,
3184 vector float *rinvsq2,
3185 vector float *rinvsq3)
3187 vector float lu1,lu2,lu3;
3188 vector float tmpA1,tmpA2,tmpA3,tmpB1,tmpB2,tmpB3;
3189 vector float nul=vec_zero();
3190 vector float half=vec_half();
3191 vector float one=vec_one();
3193 lu1 = vec_rsqrte(rsq1);
3194 lu2 = vec_rsqrte(rsq2);
3195 lu3 = vec_rsqrte(rsq3);
3196 tmpA1 = vec_madd(lu1,lu1,nul);
3197 tmpA2 = vec_madd(lu2,lu2,nul);
3198 tmpA3 = vec_madd(lu3,lu3,nul);
3199 tmpB1 = vec_madd(lu1,half,nul);
3200 tmpB2 = vec_madd(lu2,half,nul);
3201 tmpB3 = vec_madd(lu3,half,nul);
3202 tmpA1 = vec_nmsub(rsq1,tmpA1,one);
3203 tmpA2 = vec_nmsub(rsq2,tmpA2,one);
3204 tmpA3 = vec_nmsub(rsq3,tmpA3,one);
3205 *rinvsq1 = vec_madd(tmpA1,tmpB1,lu1);
3206 *rinvsq2 = vec_madd(tmpA2,tmpB2,lu2);
3207 *rinvsq3 = vec_madd(tmpA3,tmpB3,lu3);
/* Thirty-six inverse square roots at once (nine vectors of four).
 * Same Newton-Raphson refinement as do_invsqrt():
 *   rinv = lu + 0.5*lu*(1 - rsq*lu*lu), with lu = vec_rsqrte(rsq).
 * The nine refinements are deliberately interleaved stage-by-stage so
 * the independent FMAs can fill the AltiVec pipeline; keep this
 * statement order intact.
 */
static inline void do_9_invsqrt(vector float rsq1,
				vector float rsq2,
				vector float rsq3,
				vector float rsq4,
				vector float rsq5,
				vector float rsq6,
				vector float rsq7,
				vector float rsq8,
				vector float rsq9,
				vector float *rinv1,
				vector float *rinv2,
				vector float *rinv3,
				vector float *rinv4,
				vector float *rinv5,
				vector float *rinv6,
				vector float *rinv7,
				vector float *rinv8,
				vector float *rinv9)
{
  vector float lu1,lu2,lu3,lu4,lu5,lu6,lu7,lu8,lu9;
  vector float tmpA1,tmpA2,tmpA3,tmpA4,tmpA5,tmpA6,tmpA7,tmpA8,tmpA9;
  vector float tmpB1,tmpB2,tmpB3,tmpB4,tmpB5,tmpB6,tmpB7,tmpB8,tmpB9;
  vector float nul=vec_zero();
  vector float half=vec_half();
  vector float one=vec_one();

  /* Stage 1: hardware estimates */
  lu1 = vec_rsqrte(rsq1);
  lu2 = vec_rsqrte(rsq2);
  lu3 = vec_rsqrte(rsq3);
  lu4 = vec_rsqrte(rsq4);
  lu5 = vec_rsqrte(rsq5);
  lu6 = vec_rsqrte(rsq6);
  lu7 = vec_rsqrte(rsq7);
  lu8 = vec_rsqrte(rsq8);
  lu9 = vec_rsqrte(rsq9);
  /* Stage 2: tmpA = lu^2 */
  tmpA1 = vec_madd(lu1,lu1,nul);
  tmpA2 = vec_madd(lu2,lu2,nul);
  tmpA3 = vec_madd(lu3,lu3,nul);
  tmpA4 = vec_madd(lu4,lu4,nul);
  tmpA5 = vec_madd(lu5,lu5,nul);
  tmpA6 = vec_madd(lu6,lu6,nul);
  tmpA7 = vec_madd(lu7,lu7,nul);
  tmpA8 = vec_madd(lu8,lu8,nul);
  tmpA9 = vec_madd(lu9,lu9,nul);
  /* Stage 3: tmpB = 0.5*lu */
  tmpB1 = vec_madd(lu1,half,nul);
  tmpB2 = vec_madd(lu2,half,nul);
  tmpB3 = vec_madd(lu3,half,nul);
  tmpB4 = vec_madd(lu4,half,nul);
  tmpB5 = vec_madd(lu5,half,nul);
  tmpB6 = vec_madd(lu6,half,nul);
  tmpB7 = vec_madd(lu7,half,nul);
  tmpB8 = vec_madd(lu8,half,nul);
  tmpB9 = vec_madd(lu9,half,nul);
  /* Stage 4: tmpA = 1 - rsq*lu^2 */
  tmpA1 = vec_nmsub(rsq1,tmpA1,one);
  tmpA2 = vec_nmsub(rsq2,tmpA2,one);
  tmpA3 = vec_nmsub(rsq3,tmpA3,one);
  tmpA4 = vec_nmsub(rsq4,tmpA4,one);
  tmpA5 = vec_nmsub(rsq5,tmpA5,one);
  tmpA6 = vec_nmsub(rsq6,tmpA6,one);
  tmpA7 = vec_nmsub(rsq7,tmpA7,one);
  tmpA8 = vec_nmsub(rsq8,tmpA8,one);
  tmpA9 = vec_nmsub(rsq9,tmpA9,one);
  /* Stage 5: rinv = lu + tmpA*tmpB */
  *rinv1 = vec_madd(tmpA1,tmpB1,lu1);
  *rinv2 = vec_madd(tmpA2,tmpB2,lu2);
  *rinv3 = vec_madd(tmpA3,tmpB3,lu3);
  *rinv4 = vec_madd(tmpA4,tmpB4,lu4);
  *rinv5 = vec_madd(tmpA5,tmpB5,lu5);
  *rinv6 = vec_madd(tmpA6,tmpB6,lu6);
  *rinv7 = vec_madd(tmpA7,tmpB7,lu7);
  *rinv8 = vec_madd(tmpA8,tmpB8,lu8);
  *rinv9 = vec_madd(tmpA9,tmpB9,lu9);
  /* 36 invsqrt in about 48 cycles due to pipelining ... pretty fast :-) */
}
3285 static inline void zero_highest_element_in_vector(vector float *v)
3287 vector signed int zero = (vector signed int) vec_zero();
3289 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4));
3292 static inline void zero_highest_2_elements_in_vector(vector float *v)
3294 vector signed int zero = (vector signed int) vec_zero();
3296 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8));
3299 static inline void zero_highest_3_elements_in_vector(vector float *v)
3301 vector signed int zero = (vector signed int) vec_zero();
3303 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12));
3307 static inline void zero_highest_element_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3309 vector signed int zero = (vector signed int) vec_zero();
3310 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);
3312 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3313 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3314 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3317 static inline void zero_highest_2_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3319 vector signed int zero = (vector signed int) vec_zero();
3320 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);
3322 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3323 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3324 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3327 static inline void zero_highest_3_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3329 vector signed int zero = (vector signed int) vec_zero();
3330 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);
3332 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3333 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3334 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3337 static inline void zero_highest_element_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3338 vector float *v4,vector float *v5,vector float *v6,
3339 vector float *v7,vector float *v8,vector float *v9)
3341 vector signed int zero = (vector signed int) vec_zero();
3342 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);
3344 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3345 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3346 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3347 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3348 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3349 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3350 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3351 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3352 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
3355 static inline void zero_highest_2_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3356 vector float *v4,vector float *v5,vector float *v6,
3357 vector float *v7,vector float *v8,vector float *v9)
3359 vector signed int zero = (vector signed int) vec_zero();
3360 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);
3362 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3363 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3364 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3365 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3366 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3367 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3368 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3369 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3370 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
3373 static inline void zero_highest_3_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3374 vector float *v4,vector float *v5,vector float *v6,
3375 vector float *v7,vector float *v8,vector float *v9)
3377 vector signed int zero = (vector signed int) vec_zero();
3378 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);
3380 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3381 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3382 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3383 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3384 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3385 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3386 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3387 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3388 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
3392 void inl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3393 float shiftvec[],float fshift[],int gid[],float pos[],
3394 float faction[],int type[],int ntype,float nbfp[],
3395 float Vnb[]);
3396 void inl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3397 float shiftvec[],float fshift[],int gid[],float pos[],
3398 float faction[],int type[],int ntype,float nbfp[],
3399 float Vnb[],float tabscale,float VFtab[]);
3400 void inl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3401 float shiftvec[],float fshift[],int gid[],float pos[],
3402 float faction[],float charge[],float facel,float Vc[]);
3403 void inl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3404 float shiftvec[],float fshift[],int gid[],float pos[],
3405 float faction[],float charge[],float facel,float Vc[]);
3406 void inl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3407 float shiftvec[],float fshift[],int gid[],float pos[],
3408 float faction[],float charge[],float facel,float Vc[]);
3409 void inl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3410 float shiftvec[],float fshift[],int gid[],float pos[],
3411 float faction[],float charge[],float facel,float Vc[],
3412 int type[],int ntype,float nbfp[],float Vnb[]);
3413 void inl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3414 float shiftvec[],float fshift[],int gid[],float pos[],
3415 float faction[],float charge[],float facel,float Vc[],
3416 int type[],int ntype,float nbfp[],float Vnb[]);
3417 void inl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3418 float shiftvec[],float fshift[],int gid[],float pos[],
3419 float faction[],float charge[],float facel,float Vc[],
3420 int type[],int ntype,float nbfp[],float Vnb[]);
3421 void inl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3422 float shiftvec[],float fshift[],int gid[],float pos[],
3423 float faction[],float charge[],float facel,float Vc[],
3424 float krf, float crf);
3425 void inl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3426 float shiftvec[],float fshift[],int gid[],float pos[],
3427 float faction[],float charge[],float facel,float Vc[],
3428 float krf, float crf);
3429 void inl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3430 float shiftvec[],float fshift[],int gid[],float pos[],
3431 float faction[],float charge[],float facel,float Vc[],
3432 float krf, float crf);
3433 void inl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3434 float shiftvec[],float fshift[],int gid[],float pos[],
3435 float faction[],float charge[],float facel,float Vc[],
3436 float krf, float crf, int type[],int ntype,
3437 float nbfp[],float Vnb[]);
3438 void inl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3439 float shiftvec[],float fshift[],int gid[],float pos[],
3440 float faction[],float charge[],float facel,float Vc[],
3441 float krf, float crf, int type[],int ntype,
3442 float nbfp[],float Vnb[]);
3443 void inl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3444 float shiftvec[],float fshift[],int gid[],float pos[],
3445 float faction[],float charge[],float facel,float Vc[],
3446 float krf, float crf, int type[],int ntype,
3447 float nbfp[],float Vnb[]);
3448 void inl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3449 float shiftvec[],float fshift[],int gid[],float pos[],
3450 float faction[],float charge[],float facel,float Vc[],
3451 float tabscale,float VFtab[]);
3452 void inl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3453 float shiftvec[],float fshift[],int gid[],float pos[],
3454 float faction[],float charge[],float facel,float Vc[],
3455 float tabscale,float VFtab[]);
3456 void inl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3457 float shiftvec[],float fshift[],int gid[],float pos[],
3458 float faction[],float charge[],float facel,float Vc[],
3459 float tabscale,float VFtab[]);
3460 void inl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3461 float shiftvec[],float fshift[],int gid[],float pos[],
3462 float faction[],float charge[],float facel,float Vc[],
3463 int type[],int ntype,float nbfp[],float Vnb[],
3464 float tabscale, float VFtab[]);
3465 void inl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3466 float shiftvec[],float fshift[],int gid[],float pos[],
3467 float faction[],float charge[],float facel,float Vc[],
3468 int type[],int ntype,float nbfp[],float Vnb[],
3469 float tabscale, float VFtab[]);
3470 void inl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3471 float shiftvec[],float fshift[],int gid[],float pos[],
3472 float faction[],float charge[],float facel,float Vc[],
3473 int type[],int ntype,float nbfp[],float Vnb[],
3474 float tabscale, float VFtab[]);
3475 void inl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3476 float shiftvec[],float fshift[],int gid[],float pos[],
3477 float faction[],float charge[],float facel,float Vc[],
3478 int type[],int ntype,float nbfp[],float Vnb[],
3479 float tabscale,float VFtab[]);
3480 void inl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3481 float shiftvec[],float fshift[],int gid[],float pos[],
3482 float faction[],float charge[],float facel,float Vc[],
3483 int type[],int ntype,float nbfp[],float Vnb[],
3484 float tabscale,float VFtab[]);
3485 void inl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3486 float shiftvec[],float fshift[],int gid[],float pos[],
3487 float faction[],float charge[],float facel,float Vc[],
3488 int type[],int ntype,float nbfp[],float Vnb[],
3489 float tabscale,float VFtab[]);
3491 /*---*/
3493 void mcinl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3494 float shiftvec[],int gid[],float pos[],int type[],
3495 int ntype,float nbfp[],float Vnb[]);
3496 void mcinl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3497 float shiftvec[],int gid[],float pos[],int type[],
3498 int ntype,float nbfp[],float Vnb[],
3499 float tabscale,float VFtab[]);
3500 void mcinl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3501 float shiftvec[],int gid[],float pos[],
3502 float charge[],float facel,float Vc[]);
3503 void mcinl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3504 float shiftvec[],int gid[],float pos[],
3505 float charge[],float facel,float Vc[]);
3506 void mcinl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3507 float shiftvec[],int gid[],float pos[],
3508 float charge[],float facel,float Vc[]);
3509 void mcinl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3510 float shiftvec[],int gid[],float pos[],
3511 float charge[],float facel,float Vc[],
3512 int type[],int ntype,float nbfp[],float Vnb[]);
3513 void mcinl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3514 float shiftvec[],int gid[],float pos[],
3515 float charge[],float facel,float Vc[],
3516 int type[],int ntype,float nbfp[],float Vnb[]);
3517 void mcinl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3518 float shiftvec[],int gid[],float pos[],
3519 float charge[],float facel,float Vc[],
3520 int type[],int ntype,float nbfp[],float Vnb[]);
3521 void mcinl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3522 float shiftvec[],int gid[],float pos[],
3523 float charge[],float facel,float Vc[],
3524 float krf, float crf);
3525 void mcinl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3526 float shiftvec[],int gid[],float pos[],
3527 float charge[],float facel,float Vc[],
3528 float krf, float crf);
3529 void mcinl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3530 float shiftvec[],int gid[],float pos[],
3531 float charge[],float facel,float Vc[],
3532 float krf, float crf);
3533 void mcinl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3534 float shiftvec[],int gid[],float pos[],
3535 float charge[],float facel,float Vc[],
3536 float krf, float crf, int type[],int ntype,
3537 float nbfp[],float Vnb[]);
3538 void mcinl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3539 float shiftvec[],int gid[],float pos[],
3540 float charge[],float facel,float Vc[],
3541 float krf, float crf, int type[],int ntype,
3542 float nbfp[],float Vnb[]);
3543 void mcinl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3544 float shiftvec[],int gid[],float pos[],
3545 float charge[],float facel,float Vc[],
3546 float krf, float crf, int type[],int ntype,
3547 float nbfp[],float Vnb[]);
3548 void mcinl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3549 float shiftvec[],int gid[],float pos[],
3550 float charge[],float facel,float Vc[],
3551 float tabscale,float VFtab[]);
3552 void mcinl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3553 float shiftvec[],int gid[],float pos[],
3554 float charge[],float facel,float Vc[],
3555 float tabscale,float VFtab[]);
3556 void mcinl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3557 float shiftvec[],int gid[],float pos[],
3558 float charge[],float facel,float Vc[],
3559 float tabscale,float VFtab[]);
3560 void mcinl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3561 float shiftvec[],int gid[],float pos[],
3562 float charge[],float facel,float Vc[],
3563 int type[],int ntype,float nbfp[],float Vnb[],
3564 float tabscale, float VFtab[]);
3565 void mcinl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3566 float shiftvec[],int gid[],float pos[],
3567 float charge[],float facel,float Vc[],
3568 int type[],int ntype,float nbfp[],float Vnb[],
3569 float tabscale, float VFtab[]);
3570 void mcinl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3571 float shiftvec[],int gid[],float pos[],
3572 float charge[],float facel,float Vc[],
3573 int type[],int ntype,float nbfp[],float Vnb[],
3574 float tabscale, float VFtab[]);
3575 void mcinl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3576 float shiftvec[],int gid[],float pos[],
3577 float charge[],float facel,float Vc[],
3578 int type[],int ntype,float nbfp[],float Vnb[],
3579 float tabscale,float VFtab[]);
3580 void mcinl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3581 float shiftvec[],int gid[],float pos[],
3582 float charge[],float facel,float Vc[],
3583 int type[],int ntype,float nbfp[],float Vnb[],
3584 float tabscale,float VFtab[]);
3585 void mcinl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3586 float shiftvec[],int gid[],float pos[],
3587 float charge[],float facel,float Vc[],
3588 int type[],int ntype,float nbfp[],float Vnb[],
3589 float tabscale,float VFtab[]);
3593 #endif /* ppc_altivec.h */