Partial commit of the project to remove all static variables.
[gromacs.git] / include / ppc_altivec.h
blob1e4201924623baa337cca3faf80d79f14317051e
1 /*
2 * $Id$
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.1
11 * Copyright (c) 1991-2001, University of Groningen, The Netherlands
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * If you want to redistribute modifications, please consider that
18 * scientific software is very special. Version control is crucial -
19 * bugs must be traceable. We will be happy to consider code for
20 * inclusion in the official distribution, but derived work must not
21 * be called official GROMACS. Details are found in the README & COPYING
22 * files - if they are missing, get the official version at www.gromacs.org.
24 * To help us fund GROMACS development, we humbly ask that you cite
25 * the papers on the package - you can find them in the top README file.
27 * For more info, check our website at http://www.gromacs.org
29 * And Hey:
30 * Getting the Right Output Means no Artefacts in Calculating Stuff
33 #ifndef _ppc_altivec_h
34 #define _ppc_altivec_h
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39 #include<stdio.h>
43 static void printvec(vector float v)
45 int i;
46 printf(" ");
48 for(i=0;i<4;i++)
49 printf("%8.5f ",*(((float *)&v)+i));
50 printf("\n");
54 /* Altivec utility routines */
55 static void set_non_java_mode(void)
57 vector unsigned short vsr1,vsr2;
58 vector unsigned int tmp;
60 vsr1=vec_mfvscr();
61 tmp=vec_sl(vec_splat_u32(1),vec_splat_u32(8));
62 vsr2=(vector unsigned short)vec_sl(tmp,vec_splat_u32(8));
63 vsr1=vec_or(vsr1,vsr2);
64 vec_mtvscr(vsr1);
67 /* Simple numerical constants */
68 static inline vector float vec_zero(void)
70 return vec_ctf(vec_splat_u32(0),0);
73 static inline vector float vec_half(void)
74 { /* 0.5 */
75 return vec_ctf(vec_splat_u32(1),1);
78 static inline vector float vec_one(void)
80 return vec_ctf(vec_splat_u32(1),0);
83 static inline vector float vec_two(void)
85 return vec_ctf(vec_splat_u32(2),0);
88 static inline vector float vec_three(void)
90 return vec_ctf(vec_splat_u32(3),0);
93 static inline vector float vec_six(void)
95 return vec_ctf(vec_splat_u32(6),0);
98 static inline vector float vec_twelve(void)
100 return vec_ctf(vec_splat_u32(12),0);
104 /* load three consecutive floats. We never access the fourth,
105 * so this is safe even at the end of an array.
107 static inline vector float load_xyz(float *address)
109 vector float c1,c2,c3;
110 vector unsigned char perm;
112 perm = vec_lvsl( 0, address );
113 c1 = vec_lde( 0, address );
114 c2 = vec_lde( 4, address );
115 c3 = vec_lde( 8, address );
116 c1 = vec_perm(c1,c1,perm);
117 c2 = vec_perm(c2,c2,perm);
118 c3 = vec_perm(c3,c3,perm);
119 c2 = vec_sld(c2,c2,4);
120 c3 = vec_sld(c3,c3,8);
121 c1 = vec_mergeh(c1,c3);
123 return vec_mergeh(c1,c2);
127 /* load four consecutive floats (from unaligned address) */
128 static inline vector float load_vector_unaligned(float *address)
130 vector unsigned char perm;
131 vector float low,high;
133 perm = vec_lvsl( 0, (int *) address );
134 low = vec_ld( 0, address );
135 high = vec_ld( 16, address );
137 return vec_perm(low,high,perm);
140 /* load a single float and spread it to all vector elements */
141 static inline vector float load_float_and_splat(float *address)
143 vector unsigned char perm;
144 vector float tmp;
146 tmp = vec_lde(0,address);
147 perm = vec_lvsl(0,address);
148 tmp = vec_perm(tmp,tmp,perm);
150 return vec_splat(tmp,0);
154 /* load four non-consecutive floats into vector [mem1 mem2 mem3 mem4] */
155 static inline vector float load_4_float(float *float1,
156 float *float2,
157 float *float3,
158 float *float4)
160 vector unsigned char xshift = vec_lvsl( 12, float1 );
161 vector unsigned char yshift = vec_lvsl( 12, float2 );
162 vector unsigned char zshift = vec_lvsl( 0, float3 );
163 vector unsigned char wshift = vec_lvsl( 0, float4 );
165 vector float X = vec_lde( 0, float1 );
166 vector float Y = vec_lde( 0, float2 );
167 vector float Z = vec_lde( 0, float3 );
168 vector float W = vec_lde( 0, float4 );
170 X = vec_perm( X, X, xshift);
171 Y = vec_perm( Y, Y, yshift);
172 Z = vec_perm( Z, Z, zshift);
173 W = vec_perm( W, W, wshift);
175 X = vec_mergeh( X, Y );
176 Z = vec_mergeh( Z, W );
178 return vec_sld( X, Z, 8 );
181 /* load four non-consecutive floats into vector [mem1 mem2 mem3 mem4] */
182 static inline vector float load_3_float(float *float1,
183 float *float2,
184 float *float3)
186 vector unsigned char xshift = vec_lvsl( 12, float1 );
187 vector unsigned char yshift = vec_lvsl( 12, float2 );
188 vector unsigned char zshift = vec_lvsl( 0, float3 );
190 vector float X = vec_lde( 0, float1 );
191 vector float Y = vec_lde( 0, float2 );
192 vector float Z = vec_lde( 0, float3 );
194 X = vec_perm( X, X, xshift);
195 Y = vec_perm( Y, Y, yshift);
196 Z = vec_perm( Z, Z, zshift);
198 X = vec_mergeh( X, Y );
200 return vec_sld( X, Z, 8 );
204 /* load two non-consecutive floats into vector [mem1 mem2 0 0] */
205 static inline vector float load_2_float(float *float1,
206 float *float2)
208 vector unsigned char xshift = vec_lvsl( 8, float1 );
209 vector unsigned char yshift = vec_lvsl( 8, float2 );
211 vector float X = vec_lde( 0, float1 );
212 vector float Y = vec_lde( 0, float2 );
214 X = vec_perm( X, X, xshift);
215 Y = vec_perm( Y, Y, yshift);
217 return vec_mergel( X, Y );
220 /* load one float into vector [mem1 0 0 0] */
221 static inline vector float load_1_float(float *float1)
223 vector unsigned char xshift = vec_lvsl( 4, float1 );
225 vector float X = vec_lde( 0, float1 );
226 X = vec_perm( X, X, xshift);
228 return vec_sld(X,vec_zero(),12);
234 /* load lj parameters for four atoms and store in two vectors.
235 * The nbfp memory is aligned on an 8-byte boundary and consists
236 * of pairs, so the two parameters are either in the upper or
237 * lower half of a 16-byte structure.
239 static inline void load_4_pair(float *pair1,
240 float *pair2,
241 float *pair3,
242 float *pair4,
243 vector float *c6,
244 vector float *c12)
246 vector float X = vec_ld( 0,pair1); /* c6a c12a */
247 vector float Y = vec_ld( 0,pair2); /* c6b c12b */
248 vector float Z = vec_ld( 0,pair3); /* c6c c12c */
249 vector float W = vec_ld( 0,pair4); /* c6d c12d */
250 vector unsigned char perm1 = vec_lvsl(0,pair1);
251 vector unsigned char perm2 = vec_lvsl(0,pair2);
252 vector unsigned char perm3 = vec_lvsl(0,pair3);
253 vector unsigned char perm4 = vec_lvsl(0,pair4);
254 X = vec_perm(X,X,perm1);
255 Y = vec_perm(Y,Y,perm2);
256 Z = vec_perm(Z,Z,perm3);
257 W = vec_perm(W,W,perm4);
259 X = vec_mergeh(X,Z); /* c6a c6c c12a c12c */
260 Y = vec_mergeh(Y,W); /* c6b c6d c12b c12d */
262 *c6 = vec_mergeh(X,Y);
263 *c12 = vec_mergel(X,Y);
267 /* load lj parameters for three atoms and store in two vectors
268 * See load_4_pair for alignment requirements.
270 static inline void load_3_pair(float *pair1,
271 float *pair2,
272 float *pair3,
273 vector float *c6,
274 vector float *c12)
276 vector float X = vec_ld( 0,pair1); /* c6a c12a */
277 vector float Y = vec_ld( 0,pair2); /* c6b c12b */
278 vector float Z = vec_ld( 0,pair3); /* c6c c12c */
279 vector unsigned char perm1 = vec_lvsl(0,pair1);
280 vector unsigned char perm2 = vec_lvsl(0,pair2);
281 vector unsigned char perm3 = vec_lvsl(0,pair3);
282 X = vec_perm(X,X,perm1);
283 Y = vec_perm(Y,Y,perm2);
284 Z = vec_perm(Z,Z,perm3);
286 X = vec_mergeh(X,Z); /* c6a c6c c12a c12c */
287 Y = vec_mergeh(Y,vec_zero()); /* c6b 0 c12b 0 */
289 *c6 = vec_mergeh(X,Y);
290 *c12 = vec_mergel(X,Y);
294 /* load lj parameters for two atoms and store in two vectors
295 * See load_4_pair for alignment requirements.
297 static inline void load_2_pair(float *pair1,
298 float *pair2,
299 vector float *c6,
300 vector float *c12)
302 vector float X = vec_ld( 0,pair1); /* c6a c12a */
303 vector float Y = vec_ld( 0,pair2); /* c6b c12b */
304 vector unsigned char perm1 = vec_lvsl(0,pair1);
305 vector unsigned char perm2 = vec_lvsl(0,pair2);
306 X = vec_perm(X,X,perm1);
307 Y = vec_perm(Y,Y,perm2);
309 X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */
310 Y = vec_mergeh(Y,vec_zero()); /* c6b 0 c12b 0 */
312 *c6 = vec_mergeh(X,Y);
313 *c12 = vec_mergel(X,Y);
316 /* load lj parameters for one atom and store in two vectors
317 * See load_4_pair for alignment requirements.
319 static inline void load_1_pair(float *pair1,
320 vector float *c6,
321 vector float *c12)
323 vector float X = vec_ld( 0,pair1); /* c6a c12a */
324 vector unsigned char perm1 = vec_lvsl(0,pair1);
325 X = vec_perm(X,X,perm1);
326 X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */
328 *c6 = vec_mergeh(X,vec_zero());
329 *c12 = vec_mergel(X,vec_zero());
334 * permute [x y z ?] into [x x x x] [y y y y] [z z z z]
335 * xreg can be the same as xyzreg, but not yreg/zreg!
337 static inline void splat_xyz_to_vectors(vector float xyzreg,
338 vector float *xreg,
339 vector float *yreg,
340 vector float *zreg)
342 *zreg = vec_splat(xyzreg,2);
343 *yreg = vec_splat(xyzreg,1);
344 *xreg = vec_splat(xyzreg,0);
348 /* The following transpose routines do not fill with zero padding:
350 * 1_to_3
351 * 2_to_3
352 * 1_to_4
353 * 2_to_4
355 * ...but the rest do.
359 /* move 4*[x y z ?] into [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4]
361 static inline void transpose_4_to_3(vector float xyz1,
362 vector float xyz2,
363 vector float xyz3,
364 vector float xyz4,
365 vector float *xvector,
366 vector float *yvector,
367 vector float *zvector)
369 *yvector = vec_mergeh(xyz1,xyz3); /* [x1 x3 y1 y3] */
370 *zvector = vec_mergeh(xyz2,xyz4); /* [x2 x4 y2 y4] */
371 xyz1 = vec_mergel(xyz1,xyz3); /* [z1 z3 ? ?] */
372 xyz2 = vec_mergel(xyz2,xyz4); /* [z2 z4 ? ?] */
373 *xvector = vec_mergeh(*yvector,*zvector); /* [x1 x2 x3 x4] */
374 *yvector = vec_mergel(*yvector,*zvector); /* [y1 y2 y3 y4] */
375 *zvector = vec_mergeh(xyz1,xyz2); /* [z1 z2 z3 z4] */
379 /* move 2*[x y z ?] into [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?]
381 static inline void transpose_2_to_3(vector float xyz1,
382 vector float xyz2,
383 vector float *xvector,
384 vector float *yvector,
385 vector float *zvector)
387 *xvector = vec_mergeh(xyz1,xyz2); /* x1 x2 y1 y2 */
388 *zvector = vec_mergel(xyz1,xyz2); /* z1 z2 ? ? */
389 *yvector = vec_sld(*xvector,*xvector,8); /* y1 y2 0 0 */
393 /* move [x y z ?] into [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?]
395 static inline void transpose_1_to_3(vector float xyz1,
396 vector float *xvector,
397 vector float *yvector,
398 vector float *zvector)
400 /* simply use splat, since elem 2,3,4 dont matter. */
401 *xvector = vec_splat(xyz1,0); /* x1 x1 x1 x1 */
402 *yvector = vec_splat(xyz1,1); /* y1 y1 y1 y1 */
403 *zvector = vec_splat(xyz1,2); /* z1 z1 z1 z1 */
409 /* move [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] to 4* [x y z 0]
411 static inline void transpose_3_to_4(vector float xvector,
412 vector float yvector,
413 vector float zvector,
414 vector float *xyz1,
415 vector float *xyz2,
416 vector float *xyz3,
417 vector float *xyz4)
419 vector float tmp1,tmp2;
420 vector float nul=vec_zero();
422 *xyz2 = vec_mergeh(xvector,zvector); /* [x1 z1 x2 z2 ] */
423 tmp1 = vec_mergeh(yvector,nul); /* [y1 0 y2 0 ] */
424 *xyz4 = vec_mergel(xvector,zvector); /* [x3 z3 x4 z4 ] */
425 tmp2 = vec_mergel(yvector,nul); /* [y3 0 y4 0 ] */
427 *xyz1 = vec_mergeh(*xyz2,tmp1); /* [x1 y1 z1 0 ] */
428 *xyz2 = vec_mergel(*xyz2,tmp1); /* [x2 y2 z2 0 ] */
429 *xyz3 = vec_mergeh(*xyz4,tmp2); /* [x3 y3 z3 0 ] */
430 *xyz4 = vec_mergel(*xyz4,tmp2); /* [x4 y4 z4 0 ] */
435 /* move [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] to 2* [x y z 0]
437 static inline void transpose_3_to_2(vector float xvector,
438 vector float yvector,
439 vector float zvector,
440 vector float *xyz1,
441 vector float *xyz2)
443 vector float tmp1;
445 *xyz2 = vec_mergeh(xvector,zvector); /* [x1 z1 x2 z2 ] */
446 tmp1 = vec_mergeh(yvector,vec_zero()); /* [y1 0 y2 0 ] */
448 *xyz1 = vec_mergeh(*xyz2,tmp1); /* [x1 y1 z1 0 ] */
449 *xyz2 = vec_mergel(*xyz2,tmp1); /* [x2 y2 z2 0 ] */
453 /* move [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?] to 1* [x y z 0]
455 static inline void transpose_3_to_1(vector float xvector,
456 vector float yvector,
457 vector float zvector,
458 vector float *xyz1)
460 *xyz1 = vec_mergeh(xvector,zvector); /* [x1 z1 ? ? ] */
461 yvector = vec_mergeh(yvector,vec_zero()); /* [y1 0 ? 0] */
462 *xyz1 = vec_mergeh(*xyz1,yvector); /* [x1 y1 z1 0] */
467 static inline void transpose_4_to_4(vector float in1,
468 vector float in2,
469 vector float in3,
470 vector float in4,
471 vector float *out1,
472 vector float *out2,
473 vector float *out3,
474 vector float *out4)
476 *out2 = vec_mergeh(in1,in3); /* [x1 x3 y1 y3] */
477 *out3 = vec_mergeh(in2,in4); /* [x2 x4 y2 y4] */
478 in1 = vec_mergel(in1,in3); /* [z1 z3 w1 w3] */
479 in2 = vec_mergel(in2,in4); /* [z2 z4 w2 w4] */
481 *out1 = vec_mergeh(*out2,*out3); /* [x1 x2 x3 x4] */
482 *out2 = vec_mergel(*out2,*out3); /* [y1 y2 y3 y4] */
483 *out3 = vec_mergeh(in1,in2); /* [z1 z2 z3 z4] */
484 *out4 = vec_mergel(in1,in2); /* [w1 w2 w3 w4] */
488 static inline void transpose_4_to_2(vector float in1,
489 vector float in2,
490 vector float in3,
491 vector float in4,
492 vector float *out1,
493 vector float *out2)
495 vector float tmp;
497 tmp = vec_mergeh(in1,in3); /* [x1 x3 y1 y3] */
498 *out2 = vec_mergeh(in2,in4); /* [x2 x4 y2 y4] */
499 *out1 = vec_mergeh(tmp,*out2); /* [x1 x2 x3 x4] */
500 *out2 = vec_mergel(tmp,*out2); /* [y1 y2 y3 y4] */
503 static inline void transpose_4_to_1(vector float in1,
504 vector float in2,
505 vector float in3,
506 vector float in4,
507 vector float *out1)
509 vector float tmp;
511 tmp = vec_mergeh(in1,in3); /* [x1 x3 ? ?] */
512 *out1 = vec_mergeh(in2,in4); /* [x2 x4 ? ?] */
513 *out1 = vec_mergeh(tmp,*out1); /* [x1 x2 x3 x4] */
518 static inline void transpose_2_to_4(vector float in1,
519 vector float in2,
520 vector float *out1,
521 vector float *out2,
522 vector float *out3,
523 vector float *out4)
525 *out1 = vec_mergeh(in1,in2); /* [x1 x2 y1 y2] */
526 *out3 = vec_mergel(in1,in2); /* [z1 z2 w1 w2] */
527 *out2 = vec_sld(*out1,*out1,8);/* [y1 y2 0 0] */
528 *out4 = vec_sld(*out3,*out3,8);/* [w1 w2 0 0] */
532 static inline void transpose_1_to_4(vector float in1,
533 vector float *out1,
534 vector float *out2,
535 vector float *out3,
536 vector float *out4)
538 *out1 = vec_splat(in1,0); /* [x1 x1 x1 x1] */
539 *out2 = vec_splat(in1,1); /* [y1 y1 y1 y1] */
540 *out3 = vec_splat(in1,2); /* [z1 z1 z1 z1] */
541 *out4 = vec_splat(in1,3); /* [w1 w1 w1 w1] */
545 /* Add the contents in xyz to an unaligned address location.
547 static inline void add_xyz_to_mem(float *address,vector float vdata)
549 vector float c1,c2,c3;
551 c1 = vec_lde( 0, address);
552 c2 = vec_lde( 4, address);
553 c3 = vec_lde( 8, address);
554 c1 = vec_add(c1,vec_splat(vdata,0));
555 c2 = vec_add(c2,vec_splat(vdata,1));
556 c3 = vec_add(c3,vec_splat(vdata,2));
557 vec_ste( c1, 0, address);
558 vec_ste( c2, 4, address);
559 vec_ste( c3, 8, address);
563 static inline void add_vector_to_float(float *address,vector float vdata)
565 vector float tmp;
567 tmp = vec_sld(vdata,vdata,8);
568 vdata = vec_add(vdata,tmp);
569 tmp = vec_sld(vdata,vdata,4);
570 vdata = vec_add(vdata,tmp);
571 tmp = vec_lde(0,address);
572 /* all four positions in vdata contain the sum */
573 tmp = vec_add(tmp,vdata);
574 vec_ste(tmp,0,address);
579 * load entire water molecule to vectors. Water is special;
580 * the allocated memory is 8-byte aligned, so with nine atoms
581 * we can always read 3 16-byte chunks (rounded down by vec_ld)
582 * without going outside the current page. The extra elements
583 * before and after the water dont matter as long as we dont
584 * try to write to them.
586 static inline void load_1_water_shift_and_splat(float *address,
587 float *shiftvec,
588 vector float *Ox,
589 vector float *Oy,
590 vector float *Oz,
591 vector float *H1x,
592 vector float *H1y,
593 vector float *H1z,
594 vector float *H2x,
595 vector float *H2y,
596 vector float *H2z)
598 vector unsigned char perm;
599 vector float c1,c2,c3,sha,shb,sh1,sh2,sh3;
601 /* load shift */
602 shb = load_xyz(shiftvec); /* [ shX shY shZ -] */
603 /* load the coordinates */
604 perm = vec_lvsl( 0, (int *) address );
605 c1 = vec_ld( 0, address );
606 c2 = vec_ld( 16, address );
607 c3 = vec_ld( 32, address );
609 sha = vec_sld(shb,shb,12); /* [ - shX shY shZ ] */
611 sh1 = vec_sld(sha,shb,4); /* [ shX shY shZ shX ] */
612 sh2 = vec_sld(sha,shb,8); /* [ shY shZ shX shY ] */
613 sh3 = vec_sld(sha,shb,12); /* [ shZ shX shY shZ ] */
615 c1 = vec_perm(c1,c2,perm); /* Ox Oy Oz H1x */
616 c2 = vec_perm(c2,c3,perm); /* H1y H1z H2x H2y */
617 c3 = vec_perm(c3,c3,perm); /* H2z - - - */
618 c1 = vec_add(c1,sh1);
619 c2 = vec_add(c2,sh2);
620 c3 = vec_add(c3,sh3);
622 *Ox = vec_splat(c1,0);
623 *Oy = vec_splat(c1,1);
624 *Oz = vec_splat(c1,2);
625 *H1x = vec_splat(c1,3);
626 *H1y = vec_splat(c2,0);
627 *H1z = vec_splat(c2,1);
628 *H2x = vec_splat(c2,2);
629 *H2y = vec_splat(c2,3);
630 *H2z = vec_splat(c3,0);
633 static inline void load_4_water(float *address1,
634 float *address2,
635 float *address3,
636 float *address4,
637 vector float *Ox,
638 vector float *Oy,
639 vector float *Oz,
640 vector float *H1x,
641 vector float *H1y,
642 vector float *H1z,
643 vector float *H2x,
644 vector float *H2y,
645 vector float *H2z)
647 vector unsigned char perm;
648 vector float tmp1,tmp2,tmp3;
650 vector float c1a,c2a,c3a;
651 vector float c1b,c2b,c3b;
652 vector float c1c,c2c,c3c;
653 vector float c1d,c2d,c3d;
655 /* load the coordinates */
656 perm = vec_lvsl( 0, address1 );
657 tmp1 = vec_ld( 0, address1 );
658 tmp2 = vec_ld( 16, address1 );
659 tmp3 = vec_ld( 32, address1 );
660 c1a = vec_perm(tmp1,tmp2,perm); /* Oxa Oya Oza H1xa */
661 c2a = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
662 c3a = vec_perm(tmp3,tmp3,perm); /* H2za - - - */
664 perm = vec_lvsl( 0, address2 );
665 tmp1 = vec_ld( 0, address2 );
666 tmp2 = vec_ld( 16, address2 );
667 tmp3 = vec_ld( 32, address2 );
668 c1b = vec_perm(tmp1,tmp2,perm); /* Oxb Oyb Ozb H1xb */
669 c2b = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
670 c3b = vec_perm(tmp3,tmp3,perm); /* H2zb - - - */
672 perm = vec_lvsl( 0, address3 );
673 tmp1 = vec_ld( 0, address3 );
674 tmp2 = vec_ld( 16, address3 );
675 tmp3 = vec_ld( 32, address3 );
676 c1c = vec_perm(tmp1,tmp2,perm); /* Oxc Oyc Ozc H1xc */
677 c2c = vec_perm(tmp2,tmp3,perm); /* H1yc H1zc H2xc H2yc */
678 c3c = vec_perm(tmp3,tmp3,perm); /* H2zc - - - */
680 perm = vec_lvsl( 0, address4 );
681 tmp1 = vec_ld( 0, address4 );
682 tmp2 = vec_ld( 16, address4 );
683 tmp3 = vec_ld( 32, address4 );
684 c1d = vec_perm(tmp1,tmp2,perm); /* Oxd Oyd Ozd H1xd */
685 c2d = vec_perm(tmp2,tmp3,perm); /* H1yd H1zd H2xd H2yd */
686 c3d = vec_perm(tmp3,tmp3,perm); /* H2zd - - - */
688 /* permute things */
689 tmp1 = vec_mergeh(c1a,c1c); /* Oxa Oxc Oya Oyc */
690 c1a = vec_mergel(c1a,c1c); /* Oza Ozc H1xa H1xc */
691 c1c = vec_mergeh(c1b,c1d); /* Oxb Oxd Oyb Oyd */
692 c1b = vec_mergel(c1b,c1d); /* Ozb Ozd H1xb H1xd */
694 c1d = vec_mergeh(c2a,c2c); /* H1ya H1yc H1za H1zc */
695 c2a = vec_mergel(c2a,c2c); /* H2xa H2xc H2ya H2yc */
696 c2c = vec_mergeh(c2b,c2d); /* H1yb H1yd H1zb H1zd */
697 c2b = vec_mergel(c2b,c2d); /* H2xb H2xd H2yb H2yd */
699 c3a = vec_mergeh(c3a,c3c); /* H2za H2zc - - */
700 c3b = vec_mergeh(c3b,c3d); /* H2zb H2zd - - */
702 *Ox = vec_mergeh(tmp1,c1c); /* Oxa Oxb Oxc Oxd */
703 *Oy = vec_mergel(tmp1,c1c); /* Oya Oyb Oyc Oyd */
704 *Oz = vec_mergeh(c1a,c1b); /* Oza Ozb Ozc Ozd */
705 *H1x = vec_mergel(c1a,c1b); /* H1xa H1xb H1xc H1xd */
706 *H1y = vec_mergeh(c1d,c2c); /* H1ya H1yb H1yc H1yd */
707 *H1z = vec_mergel(c1d,c2c); /* H1za H1zb H1zc H1zd */
708 *H2x = vec_mergeh(c2a,c2b); /* H2xa H2xb H2xc H2xd */
709 *H2y = vec_mergel(c2a,c2b); /* H2ya H2yb H2yc H2yd */
710 *H2z = vec_mergeh(c3a,c3b); /* H2za H2zb H2zc H2zd */
715 static inline void load_3_water(float *address1,
716 float *address2,
717 float *address3,
718 vector float *Ox,
719 vector float *Oy,
720 vector float *Oz,
721 vector float *H1x,
722 vector float *H1y,
723 vector float *H1z,
724 vector float *H2x,
725 vector float *H2y,
726 vector float *H2z)
728 vector unsigned char perm;
729 vector float tmp1,tmp2,tmp3;
730 vector float c1a,c2a,c3a;
731 vector float c1b,c2b,c3b;
732 vector float c1c,c2c,c3c;
734 /* load the coordinates */
735 perm = vec_lvsl( 0, (int *) address1 );
736 tmp1 = vec_ld( 0, address1 );
737 tmp2 = vec_ld( 16, address1 );
738 tmp3 = vec_ld( 32, address1 );
739 c1a = vec_perm(tmp1,tmp2,perm); /* Oxa Oya Oza H1xa */
740 c2a = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
741 c3a = vec_perm(tmp3,tmp3,perm); /* H2za - - - */
743 perm = vec_lvsl( 0, (int *) address3 );
744 tmp1 = vec_ld( 0, address3 );
745 tmp2 = vec_ld( 16, address3 );
746 tmp3 = vec_ld( 32, address3 );
747 c1c = vec_perm(tmp1,tmp2,perm); /* Oxc Oyc Ozc H1xc */
748 c2c = vec_perm(tmp2,tmp3,perm); /* H1yc H1zc H2xc H2yc */
749 c3c = vec_perm(tmp3,tmp3,perm); /* H2zc - - - */
751 perm = vec_lvsl( 0, (int *) address2 );
752 tmp1 = vec_ld( 0, address2 );
753 tmp2 = vec_ld( 16, address2 );
754 tmp3 = vec_ld( 32, address2 );
755 c1b = vec_perm(tmp1,tmp2,perm); /* Oxb Oyb Ozb H1xb */
756 c2b = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
757 c3b = vec_perm(tmp3,tmp3,perm); /* H2zb - - - */
759 /* permute things */
760 tmp1 = vec_mergeh(c1a,c1c); /* Oxa Oxc Oya Oyc */
761 c1a = vec_mergel(c1a,c1c); /* Oza Ozc H1xa H1xc */
762 c1c = vec_mergeh(c1b,c1b); /* Oxb - Oyb - */
763 c1b = vec_mergel(c1b,c1b); /* Ozb - H1xb - */
765 tmp2 = vec_mergeh(c2a,c2c); /* H1ya H1yc H1za H1zc */
766 c2a = vec_mergel(c2a,c2c); /* H2xa H2xc H2ya H2yc */
767 c2c = vec_mergeh(c2b,c2b); /* H1yb - H1zb - */
768 c2b = vec_mergel(c2b,c2b); /* H2xb - H2yb - */
770 c3a = vec_mergeh(c3a,c3c); /* H2za H2zc - - */
772 *Ox = vec_mergeh(tmp1,c1c); /* Oxa Oxb Oxc - */
773 *Oy = vec_mergel(tmp1,c1c); /* Oya Oyb Oyc - */
774 *Oz = vec_mergeh(c1a,c1b); /* Oza Ozb Ozc - */
775 *H1x = vec_mergel(c1a,c1b); /* H1xa H1xb H1xc - */
776 *H1y = vec_mergeh(tmp2,c2c); /* H1ya H1yb H1yc - */
777 *H1z = vec_mergel(tmp2,c2c); /* H1za H1zb H1zc - */
778 *H2x = vec_mergeh(c2a,c2b); /* H2xa H2xb H2xc - */
779 *H2y = vec_mergel(c2a,c2b); /* H2ya H2yb H2yc - */
780 *H2z = vec_mergeh(c3a,c3b); /* H2za H2zb H2zc - */
785 static inline void load_2_water(float *address1,
786 float *address2,
787 vector float *Ox,
788 vector float *Oy,
789 vector float *Oz,
790 vector float *H1x,
791 vector float *H1y,
792 vector float *H1z,
793 vector float *H2x,
794 vector float *H2y,
795 vector float *H2z)
797 vector unsigned char perm;
798 vector float tmp1,tmp2,tmp3;
799 vector float c1a,c2a,c3a;
800 vector float c1b,c2b,c3b;
802 /* load the coordinates */
803 perm = vec_lvsl( 0, (int *) address1 );
804 tmp1 = vec_ld( 0, address1 );
805 tmp2 = vec_ld( 16, address1 );
806 tmp3 = vec_ld( 32, address1 );
807 c1a = vec_perm(tmp1,tmp2,perm); /* Oxa Oya Oza H1xa */
808 c2a = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
809 c3a = vec_perm(tmp3,tmp3,perm); /* H2za - - - */
811 perm = vec_lvsl( 0, (int *) address2 );
812 tmp1 = vec_ld( 0, address2 );
813 tmp2 = vec_ld( 16, address2 );
814 tmp3 = vec_ld( 32, address2 );
815 c1b = vec_perm(tmp1,tmp2,perm); /* Oxb Oyb Ozb H1xb */
816 c2b = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
817 c3b = vec_perm(tmp3,tmp3,perm); /* H2zb - - - */
819 /* never mind what we get in the two remaining elements */
820 *Ox = vec_mergeh(c1a,c1b); /* Oxa Oxb Oya Oyb */
821 *H1y = vec_mergeh(c2a,c2b); /* H1ya H1yb H1za H1zb */
822 *H2z = vec_mergeh(c3a,c3b); /* H2za H2zb - - */
823 *Oz = vec_mergel(c1a,c1b); /* Oza Ozb H1xa H1xb */
824 *H2x = vec_mergel(c2a,c2b); /* H2xa H2xb H2ya H2yb */
825 *Oy = vec_sld(*Ox,*Ox,8);
826 *H1z = vec_sld(*H1y,*H1y,8);
827 *H1x = vec_sld(*Oz,*Oz,8);
828 *H2y = vec_sld(*H2x,*H2x,8);
832 static inline void load_1_water(float *address1,
833 vector float *Ox,
834 vector float *Oy,
835 vector float *Oz,
836 vector float *H1x,
837 vector float *H1y,
838 vector float *H1z,
839 vector float *H2x,
840 vector float *H2y,
841 vector float *H2z)
843 vector unsigned char perm;
844 vector float tmp1,tmp2,tmp3;
846 /* load the coordinates */
847 perm = vec_lvsl( 0, (int *) address1 );
848 tmp1 = vec_ld( 0, address1 );
849 tmp2 = vec_ld( 16, address1 );
850 tmp3 = vec_ld( 32, address1 );
851 *Ox = vec_perm(tmp1,tmp2,perm); /* Ox Oy Oz H1x */
852 *H1y = vec_perm(tmp2,tmp3,perm); /* H1y H1z H2x H2y */
853 *H2z = vec_perm(tmp3,tmp3,perm); /* H2z - - - */
855 /* just splat things... never mind that we fill all cells :-) */
856 *Oy = vec_splat(*Ox,1);
857 *Oz = vec_splat(*Ox,2);
858 *H1x = vec_splat(*Ox,3);
859 *H1z = vec_splat(*H1y,1);
860 *H2x = vec_splat(*H1y,2);
861 *H2y = vec_splat(*H1y,3);
866 static inline void update_i_water_forces(float *water,
867 float *shiftvec,
868 vector float Ox,
869 vector float Oy,
870 vector float Oz,
871 vector float H1x,
872 vector float H1y,
873 vector float H1z,
874 vector float H2x,
875 vector float H2y,
876 vector float H2z)
878 vector float l1,l2,l3;
879 vector unsigned char perm;
880 vector unsigned int mask,ox00;
881 vector float nul=vec_zero();
883 ox00=(vector unsigned int)vec_splat_s32(0);
884 /* load */
885 perm = vec_lvsr( 0, water );
886 l1 = vec_ld( 0, water);
887 l2 = vec_ld( 16, water);
888 l3 = vec_ld( 32, water);
889 mask = vec_perm(ox00,
890 (vector unsigned int)vec_splat_s32(-1),perm);
892 /* accumulate the forces */
893 Ox = vec_add(Ox,vec_sld(Ox,Ox,8)); /* Ox Ox' - - */
894 Oy = vec_add(Oy,vec_sld(Oy,Oy,8)); /* Oy Oy' - - */
895 Oz = vec_add(Oz,vec_sld(Oz,Oz,8)); /* Oz Oz' - - */
896 H1x = vec_add(H1x,vec_sld(H1x,H1x,8)); /* H1x H1x' - - */
897 H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H1y' - - */
898 H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H1z' - - */
899 H2x = vec_add(H2x,vec_sld(H2x,H2x,8)); /* H2x H2x' - - */
900 H2y = vec_add(H2y,vec_sld(H2y,H2y,8)); /* H2y H2y' - - */
901 H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z H2z' - - */
903 Ox = vec_mergeh(Ox,Oz); /* Ox Oz Ox' Oz' */
904 Oy = vec_mergeh(Oy,H1x); /* Oy H1x Oy' H1x' */
905 H1y = vec_mergeh(H1y,H2x); /* H1y H2x H1y' H2x' */
906 H1z = vec_mergeh(H1z,H2y); /* H1z H2y H1z' H2y' */
907 H2z = vec_mergeh(H2z,nul); /* H2z 0 H2z' 0 */
909 Ox = vec_add(Ox,vec_sld(Ox,Ox,8)); /* Ox Oz - - */
910 Oy = vec_add(Oy,vec_sld(Oy,Oy,8)); /* Oy H1x - - */
911 H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H2x - - */
912 H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H2y - - */
913 H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z 0 ? 0 */
915 Ox = vec_mergeh(Ox,Oy); /* Ox Oy Oz H1x */
916 H1y = vec_mergeh(H1y,H1z); /* H1y H1z H2x H2y */
917 H2z = vec_mergeh(H2z,nul); /* H2z 0 0 0 */
919 Oy = vec_sld(nul,Ox,12); /* 0 Ox Oy Oz */
920 Oz = vec_sld(Ox,H1y,8); /* - H1x H1y H1z */
921 H1x = vec_sld(H1y,H2z,4); /* - H2x H2y H2z */
923 H2x = vec_perm(nul,Ox,perm); /* The part to add to l1 */
924 H2y = vec_perm(Ox,H1y,perm); /* The part to add to l2 */
925 H2z = vec_perm(H1y,H2z,perm); /* The part to add to l3 */
927 H2x = vec_add(l1,H2x);
928 H2y = vec_add(l2,H2y);
929 H2z = vec_add(l3,H2z);
931 H2x = vec_sel(l1,H2x,mask);
932 mask = vec_sld(ox00,mask,12);
933 H2z = vec_sel(H2z,l3,mask);
935 /* store */
936 vec_st(H2x, 0, water);
937 vec_st(H2y, 16, water);
938 vec_st(H2z, 32, water);
940 Oy = vec_add(Oy,Oz);
941 Oy = vec_add(Oy,H1x);
942 Oy = vec_sld(Oy,nul,4); /* x y z 0 */
944 add_xyz_to_mem(shiftvec,Oy);
947 static inline void add_force_to_4_water(float *address1,
948 float *address2,
949 float *address3,
950 float *address4,
951 vector float Ox,
952 vector float Oy,
953 vector float Oz,
954 vector float H1x,
955 vector float H1y,
956 vector float H1z,
957 vector float H2x,
958 vector float H2y,
959 vector float H2z)
961 vector float low,medium,high,low2,medium2,high2;
962 vector unsigned char perm,perm2;
963 vector unsigned int mask,mask2,oxFF,ox00;
964 vector float tmp1,tmp2,tmp3;
965 vector float nul=vec_zero();
967 oxFF=(vector unsigned int)vec_splat_s32(-1);
968 ox00=(vector unsigned int)vec_splat_s32(0);
970 /* We have to be careful here since water molecules will sometimes
971 * be adjacent in memory. When we update the forces on water 1,
972 * we usually read & write back some extra neighboring cells in
973 * the vector. This means we cannot first read all data, update
974 * it and write it out again.
976 * Instead of doing the molecules one by one, we rely on the
977 * fact that the list will be ordered, so water molecules
978 * 1 & 3 can be done first (they cannot be adjacent), and then
979 * waters 2 & 4.
982 tmp1 = vec_mergeh(Ox,Oz); /* Oxa Oza Oxb Ozb */
983 Ox = vec_mergel(Ox,Oz); /* Oxc Ozc Oxd Ozd */
984 Oz = vec_mergeh(Oy,H1x); /* Oya H1xa Oyb H1xb */
985 Oy = vec_mergel(Oy,H1x); /* Oyc H1xc Oyd H1xd */
986 H1x = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
987 H1y = vec_mergel(H1y,H2x); /* H1yc H2xc H1yd H2xd */
989 H2x = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
990 H1z = vec_mergel(H1z,H2y); /* H1zc H2yc H1zd H2yd */
991 H2y = vec_mergeh(H2z,nul); /* H2za 0 H2zb 0 */
992 H2z = vec_mergel(H2z,nul); /* H2zc 0 H2zd 0 */
994 tmp2 = vec_mergeh(tmp1,Oz); /* Oxa Oya Oza H1xa */
995 Oz = vec_mergel(tmp1,Oz); /* Oxb Oyb Ozb H1xb */
996 tmp1 = vec_mergeh(Ox,Oy); /* Oxc Oyc Ozc H1xc */
997 Ox = vec_mergel(Ox,Oy); /* Oxd Oyd Ozd H1xd */
998 Oy = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
999 H1x = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
1000 H2x = vec_mergeh(H1y,H1z); /* H1yc H1zc H2xc H2yc */
1001 H1y = vec_mergel(H1y,H1z); /* H1yd H1zd H2xd H2yd */
1002 H1z = vec_mergeh(H2y,nul); /* H2za 0 0 0 */
1003 H2y = vec_mergel(H2y,nul); /* H2zb 0 0 0 */
1004 tmp3 = vec_mergeh(H2z,nul); /* H2zc 0 0 0 */
1005 H2z = vec_mergel(H2z,nul); /* H2zd 0 0 0 */
1007 /* move into position, load and add */
1008 perm = vec_lvsr( 0, (int *) address1 );
1009 perm2 = vec_lvsr( 0, (int *) address3 );
1010 low = vec_ld( 0, address1);
1011 low2 = vec_ld( 0, address3);
1013 medium = vec_ld( 16, address1);
1014 medium2 = vec_ld( 16, address3);
1015 high = vec_ld( 32, address1);
1016 high2 = vec_ld( 32, address3);
1017 mask = vec_perm(ox00,oxFF,perm);
1018 mask2 = vec_perm(ox00,oxFF,perm2);
1020 low = vec_add(vec_perm(nul,tmp2,perm),low);
1021 low2 = vec_add(vec_perm(nul,tmp1,perm2),low2);
1023 tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
1024 tmp1 = vec_add(vec_perm(tmp1,H2x,perm2),medium2);
1026 Oy = vec_add(vec_perm(Oy,H1z,perm),high);
1027 H2x = vec_add(vec_perm(H2x,tmp3,perm2),high2);
1029 vec_st(vec_sel(low,low,mask), 0, address1);
1030 vec_st(vec_sel(low2,low2,mask2), 0, address3);
1032 mask = vec_sld(ox00,mask,12);
1033 mask2 = vec_sld(ox00,mask2,12);
1035 vec_st(tmp2, 16, address1);
1036 vec_st(tmp1, 16, address3);
1038 vec_st(vec_sel(Oy,high,mask), 32, address1);
1039 vec_st(vec_sel(H2x,high2,mask2), 32, address3);
1041 /* Finished 1 & 3 - now do 2 & 4 */
1043 perm = vec_lvsr( 0, (int *) address2 );
1044 perm2 = vec_lvsr( 0, (int *) address4 );
1046 low = vec_ld( 0, address2);
1047 low2 = vec_ld( 0, address4);
1048 medium = vec_ld( 16, address2);
1049 medium2 = vec_ld( 16, address4);
1050 high = vec_ld( 32, address2);
1051 high2 = vec_ld( 32, address4);
1052 mask = vec_perm(ox00,oxFF,perm);
1053 mask2 = vec_perm(ox00,oxFF,perm2);
1054 H1z = vec_add(vec_perm(nul,Oz,perm),low);
1055 H2x = vec_add(vec_perm(nul,Ox,perm2),low2);
1056 Oz = vec_add(vec_perm(Oz,H1x,perm),medium);
1057 Ox = vec_add(vec_perm(Ox,H1y,perm2),medium2);
1058 H1x = vec_add(vec_perm(H1x,H2y,perm),high);
1059 H1y = vec_add(vec_perm(H1y,H2z,perm2),high2);
1060 vec_st(vec_sel(low,H1z,mask), 0, address2);
1061 vec_st(vec_sel(low2,H2x,mask2), 0, address4);
1062 mask = vec_sld(ox00,mask,12);
1063 mask2 = vec_sld(ox00,mask2,12);
1064 vec_st(Oz, 16, address2);
1065 vec_st(Ox, 16, address4);
1066 vec_st(vec_sel(H1x,high,mask), 32, address2);
1067 vec_st(vec_sel(H1y,high2,mask2), 32, address4);
1071 static inline void add_force_to_3_water(float *address1,
1072 float *address2,
1073 float *address3,
1074 vector float Ox,
1075 vector float Oy,
1076 vector float Oz,
1077 vector float H1x,
1078 vector float H1y,
1079 vector float H1z,
1080 vector float H2x,
1081 vector float H2y,
1082 vector float H2z)
1084 vector float low,medium,high;
1085 vector unsigned char perm;
1086 vector unsigned int mask,oxFF,ox00;
1087 vector float tmp1,tmp2,tmp3;
1089 vector float nul=vec_zero();
1090 oxFF=(vector unsigned int)vec_splat_s32(-1);
1091 ox00=(vector unsigned int)vec_splat_s32(0);
1093 tmp1 = vec_mergeh(Ox,Oz); /* Oxa Oza Oxb Ozb */
1094 Ox = vec_mergel(Ox,Oz); /* Oxc Ozc ? ? */
1095 Oz = vec_mergeh(Oy,H1x); /* Oya H1xa Oyb H1xb */
1096 Oy = vec_mergel(Oy,H1x); /* Oyc H1xc ? ? */
1097 H1x = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
1098 H1y = vec_mergel(H1y,H2x); /* H1yc H2xc ? ? */
1099 H2x = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
1100 H1z = vec_mergel(H1z,H2y); /* H1zc H2yc ? ? */
1101 H2y = vec_mergeh(H2z,nul); /* H2za 0 H2zb 0 */
1102 H2z = vec_mergel(H2z,nul); /* H2zc 0 ? 0 */
1104 tmp2 = vec_mergeh(tmp1,Oz); /* Oxa Oya Oza H1xa */
1105 Oz = vec_mergel(tmp1,Oz); /* Oxb Oyb Ozb H1xb */
1106 tmp1 = vec_mergeh(Ox,Oy); /* Oxc Oyc Ozc H1xc */
1107 Oy = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
1108 H1x = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
1109 H2x = vec_mergeh(H1y,H1z); /* H1yc H1zc H2xc H2yc */
1110 H1z = vec_mergeh(H2y,nul); /* H2za 0 0 0 */
1111 H2y = vec_mergel(H2y,nul); /* H2zb 0 0 0 */
1112 tmp3 = vec_mergeh(H2z,nul); /* H2zc 0 0 0 */
1114 /* move into position, load and add */
1115 perm = vec_lvsr( 0, (int *) address1 );
1116 low = vec_ld( 0, address1);
1117 medium = vec_ld( 16, address1);
1118 high = vec_ld( 32, address1);
1119 mask = vec_perm(ox00,oxFF,perm);
1120 H1y = vec_add(vec_perm(nul,tmp2,perm),low);
1121 tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
1122 Oy = vec_add(vec_perm(Oy,H1z,perm),high);
1123 vec_st(vec_sel(low,H1y,mask), 0, address1);
1124 mask = vec_sld(ox00,mask,12);
1125 vec_st(tmp2, 16, address1);
1126 vec_st(vec_sel(Oy,high,mask), 32, address1);
1128 perm = vec_lvsr( 0, (int *) address2 );
1129 low = vec_ld( 0, address2);
1130 medium = vec_ld( 16, address2);
1131 high = vec_ld( 32, address2);
1132 mask = vec_perm(ox00,oxFF,perm);
1133 H1z = vec_add(vec_perm(nul,Oz,perm),low);
1134 Oz = vec_add(vec_perm(Oz,H1x,perm),medium);
1135 H1x = vec_add(vec_perm(H1x,H2y,perm),high);
1136 vec_st(vec_sel(low,H1z,mask), 0, address2);
1137 mask = vec_sld(ox00,mask,12);
1138 vec_st(Oz, 16, address2);
1139 vec_st(vec_sel(H1x,high,mask), 32, address2);
1141 perm = vec_lvsr( 0, (int *) address3 );
1142 low = vec_ld( 0, address3);
1143 medium = vec_ld( 16, address3);
1144 high = vec_ld( 32, address3);
1145 mask = vec_perm(ox00,oxFF,perm);
1146 H2y = vec_add(vec_perm(nul,tmp1,perm),low);
1147 tmp1 = vec_add(vec_perm(tmp1,H2x,perm),medium);
1148 H2x = vec_add(vec_perm(H2x,tmp3,perm),high);
1149 vec_st(vec_sel(low,H2y,mask), 0, address3);
1150 mask = vec_sld(ox00,mask,12);
1151 vec_st(tmp1, 16, address3);
1152 vec_st(vec_sel(H2x,high,mask), 32, address3);
1158 static inline void add_force_to_2_water(float *address1,
1159 float *address2,
1160 vector float Ox,
1161 vector float Oy,
1162 vector float Oz,
1163 vector float H1x,
1164 vector float H1y,
1165 vector float H1z,
1166 vector float H2x,
1167 vector float H2y,
1168 vector float H2z)
1170 vector float low,medium,high;
1171 vector unsigned char perm;
1172 vector unsigned int mask,oxFF,ox00;
1173 vector float tmp1,tmp2,tmp3;
1175 vector float nul=vec_zero();
1176 oxFF=(vector unsigned int)vec_splat_s32(-1);
1177 ox00=(vector unsigned int)vec_splat_s32(0);
1179 tmp1 = vec_mergeh(Ox,Oz); /* Oxa Oza Oxb Ozb */
1180 Oz = vec_mergeh(Oy,H1x); /* Oya H1xa Oyb H1xb */
1181 H1x = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
1182 H2x = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
1183 H2y = vec_mergeh(H2z,nul); /* H2za 0 H2zb 0 */
1185 tmp2 = vec_mergeh(tmp1,Oz); /* Oxa Oya Oza H1xa */
1186 Oz = vec_mergel(tmp1,Oz); /* Oxb Oyb Ozb H1xb */
1187 Oy = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
1188 H1x = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
1189 H1z = vec_mergeh(H2y,nul); /* H2za 0 0 0 */
1190 H2y = vec_mergel(H2y,nul); /* H2zb 0 0 0 */
1192 /* move into position and add */
1193 perm = vec_lvsr( 0, (int *) address1 );
1194 low = vec_ld( 0, address1);
1195 medium = vec_ld( 16, address1);
1196 high = vec_ld( 32, address1);
1197 mask = vec_perm(ox00,oxFF,perm);
1198 H2x = vec_add(vec_perm(nul,tmp2,perm),low);
1199 tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
1200 Oy = vec_add(vec_perm(Oy,H1z,perm),high);
1201 vec_st(vec_sel(low,H2x,mask), 0, address1);
1202 mask = vec_sld(ox00,mask,12);
1203 vec_st(tmp2, 16, address1);
1204 vec_st(vec_sel(Oy,high,mask), 32, address1);
1206 perm = vec_lvsr( 0, (int *) address2 );
1207 low = vec_ld( 0, address2);
1208 medium = vec_ld( 16, address2);
1209 high = vec_ld( 32, address2);
1210 mask = vec_perm(ox00,oxFF,perm);
1211 H1z = vec_add(vec_perm(nul,Oz,perm),low);
1212 Oz = vec_add(vec_perm(Oz,H1x,perm),medium);
1213 H1x = vec_add(vec_perm(H1x,H2y,perm),high);
1214 vec_st(vec_sel(low,H1z,mask), 0, address2);
1215 mask = vec_sld(ox00,mask,12);
1216 vec_st(Oz, 16, address2);
1217 vec_st(vec_sel(H1x,high,mask), 32, address2);
1222 static inline void add_force_to_1_water(float *address1,
1223 vector float Ox,
1224 vector float Oy,
1225 vector float Oz,
1226 vector float H1x,
1227 vector float H1y,
1228 vector float H1z,
1229 vector float H2x,
1230 vector float H2y,
1231 vector float H2z)
1233 vector float low,medium,high;
1234 vector unsigned char perm;
1235 vector unsigned int mask,oxFF,ox00;
1236 vector float tmp1,tmp2,tmp3;
1238 vector float nul=vec_zero();
1239 oxFF=(vector unsigned int)vec_splat_s32(-1);
1240 ox00=(vector unsigned int)vec_splat_s32(0);
1242 /* load */
1243 perm = vec_lvsr( 0, (int *) address1 );
1244 low = vec_ld( 0, address1);
1245 medium = vec_ld( 16, address1);
1246 high = vec_ld( 32, address1);
1248 tmp1 = vec_mergeh(Ox,Oz); /* Oxa Oza ? ? */
1249 Oz = vec_mergeh(Oy,H1x); /* Oya H1xa ? ? */
1250 H1x = vec_mergeh(H1y,H2x); /* H1ya H2xa ? ? */
1251 H2x = vec_mergeh(H1z,H2y); /* H1za H2ya ? ? */
1252 H2y = vec_mergeh(H2z,nul); /* H2za 0 ? 0 */
1254 tmp2 = vec_mergeh(tmp1,Oz); /* Oxa Oya Oza H1xa */
1255 Oy = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
1256 H1z = vec_mergeh(H2y,nul); /* H2za 0 0 0 */
1258 /* move into position and add */
1259 perm = vec_lvsr( 0, (int *) address1 );
1260 low = vec_ld( 0, address1);
1261 medium = vec_ld( 16, address1);
1262 high = vec_ld( 32, address1);
1263 mask = vec_perm(ox00,oxFF,perm);
1264 H2x = vec_add(vec_perm(nul,tmp2,perm),low);
1265 tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
1266 Oy = vec_add(vec_perm(Oy,H1z,perm),high);
1267 vec_st(vec_sel(low,H2x,mask), 0, address1);
1268 mask = vec_sld(ox00,mask,12);
1269 vec_st(tmp2, 16, address1);
1270 vec_st(vec_sel(Oy,high,mask), 32, address1);
1275 static inline void do_4_ctable_coul(float *VFtab,
1276 vector float rtab,
1277 vector float *VV,
1278 vector float *FF)
1280 vector signed int vidx;
1281 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
1282 int idx1,idx2,idx3,idx4;
1284 vidx = vec_cts(rtab,0);
1285 vidx = vec_sl(vidx,vec_splat_u32(2));
1287 idx1 = *((int *)&vidx);
1288 idx2 = *(((int *)&vidx)+1);
1289 idx3 = *(((int *)&vidx)+2);
1290 idx4 = *(((int *)&vidx)+3);
1292 eps = vec_sub(rtab,vec_floor(rtab));
1293 eps2 = vec_madd(eps,eps,vec_zero());
1295 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1296 tab1 = vec_ld( 0, VFtab+idx1);
1297 Y = vec_ld(16, VFtab+idx1);
1298 tab1 = vec_sld(tab1,Y,8);
1299 tab2 = vec_ld( 0, VFtab+idx2);
1300 F = vec_ld(16, VFtab+idx2);
1301 tab2 = vec_sld(tab2,F,8);
1302 tab3 = vec_ld( 0, VFtab+idx3);
1303 G = vec_ld(16, VFtab+idx3);
1304 tab3 = vec_sld(tab3,G,8);
1305 tab4 = vec_ld( 0, VFtab+idx4);
1306 H = vec_ld(16, VFtab+idx4);
1307 tab4 = vec_sld(tab4,H,8);
1308 } else { /* aligned */
1309 tab1=vec_ld(0, VFtab+idx1);
1310 tab2=vec_ld(0, VFtab+idx2);
1311 tab3=vec_ld(0, VFtab+idx3);
1312 tab4=vec_ld(0, VFtab+idx4);
1315 /* table data is aligned */
1316 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
1318 F = vec_madd(G,eps,F); /* F + Geps */
1319 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1320 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1321 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1322 F = vec_madd(G,eps,F); /* Fp + Geps */
1323 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1326 /* do only coulomb, but on a table with both coulomb and lj data */
1327 static inline void do_4_ljctable_coul(float *VFtab,
1328 vector float rtab,
1329 vector float *VV,
1330 vector float *FF)
1332 vector signed int vidx;
1333 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
1334 int idx1,idx2,idx3,idx4;
1336 vidx = vec_cts(rtab,0);
1337 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1338 vidx = vec_sl(vidx,vec_splat_u32(2));
1340 idx1 = *((int *)&vidx);
1341 idx2 = *(((int *)&vidx)+1);
1342 idx3 = *(((int *)&vidx)+2);
1343 idx4 = *(((int *)&vidx)+3);
1345 eps = vec_sub(rtab,vec_floor(rtab));
1346 eps2 = vec_madd(eps,eps,vec_zero());
1348 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1349 tab1 = vec_ld( 0, VFtab+idx1);
1350 Y = vec_ld(16, VFtab+idx1);
1351 tab1 = vec_sld(tab1,Y,8);
1352 tab2 = vec_ld( 0, VFtab+idx2);
1353 F = vec_ld(16, VFtab+idx2);
1354 tab2 = vec_sld(tab2,F,8);
1355 tab3 = vec_ld( 0, VFtab+idx3);
1356 G = vec_ld(16, VFtab+idx3);
1357 tab3 = vec_sld(tab3,G,8);
1358 tab4 = vec_ld( 0, VFtab+idx4);
1359 H = vec_ld(16, VFtab+idx4);
1360 tab4 = vec_sld(tab4,H,8);
1361 } else { /* aligned */
1362 tab1=vec_ld(0, VFtab+idx1);
1363 tab2=vec_ld(0, VFtab+idx2);
1364 tab3=vec_ld(0, VFtab+idx3);
1365 tab4=vec_ld(0, VFtab+idx4);
1368 transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
1370 F = vec_madd(G,eps,F); /* F + Geps */
1371 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1372 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1373 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1374 F = vec_madd(G,eps,F); /* Fp + Geps */
1375 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1379 static inline void do_4_ljtable_lj(float *VFtab,
1380 vector float rtab,
1381 vector float *VVdisp,
1382 vector float *FFdisp,
1383 vector float *VVrep,
1384 vector float *FFrep)
1386 vector signed int vidx;
1387 vector float Y,F,G,H,eps,eps2;
1388 vector float tabd1,tabd2,tabd3,tabd4;
1389 vector float tabr1,tabr2,tabr3,tabr4;
1391 int idx1,idx2,idx3,idx4;
1393 vidx = vec_cts(rtab,0);
1394 vidx = vec_sl(vidx,vec_splat_u32(3));
1396 idx1 = *((int *)&vidx);
1397 idx2 = *(((int *)&vidx)+1);
1398 idx3 = *(((int *)&vidx)+2);
1399 idx4 = *(((int *)&vidx)+3);
1401 eps = vec_sub(rtab,vec_floor(rtab));
1402 eps2 = vec_madd(eps,eps,vec_zero());
1404 if(((unsigned int)VFtab)%16) {
1405 /* not 16 byte aligned, i.e. must be 8 byte. */
1406 /* use Y,F,G,H as temp storage */
1407 tabd1 = vec_ld( 0, VFtab+idx1);
1408 tabr1 = vec_ld(16, VFtab+idx1);
1409 Y = vec_ld(32, VFtab+idx1);
1410 tabd1 = vec_sld(tabd1,tabr1,8);
1411 tabr1 = vec_sld(tabr1,Y,8);
1413 tabd2 = vec_ld( 0, VFtab+idx2);
1414 tabr2 = vec_ld(16, VFtab+idx2);
1415 F = vec_ld(32, VFtab+idx2);
1416 tabd2 = vec_sld(tabd2,tabr2,8);
1417 tabr2 = vec_sld(tabr2,F,8);
1419 tabd3 = vec_ld( 0, VFtab+idx3);
1420 tabr3 = vec_ld(16, VFtab+idx3);
1421 G = vec_ld(32, VFtab+idx3);
1422 tabd3 = vec_sld(tabd3,tabr3,8);
1423 tabr3 = vec_sld(tabr3,G,8);
1425 tabd4 = vec_ld( 0, VFtab+idx4);
1426 tabr4 = vec_ld(16, VFtab+idx4);
1427 H = vec_ld(32, VFtab+idx4);
1428 tabd4 = vec_sld(tabd4,tabr4,8);
1429 tabr4 = vec_sld(tabr4,H,8);
1430 } else { /* 16 byte aligned */
1431 tabd1 = vec_ld( 0, VFtab+idx1);
1432 tabr1 = vec_ld(16, VFtab+idx1);
1433 tabd2 = vec_ld( 0, VFtab+idx2);
1434 tabr2 = vec_ld(16, VFtab+idx2);
1435 tabd3 = vec_ld( 0, VFtab+idx3);
1436 tabr3 = vec_ld(16, VFtab+idx3);
1437 tabd4 = vec_ld( 0, VFtab+idx4);
1438 tabr4 = vec_ld(16, VFtab+idx4);
1441 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
1443 F = vec_madd(G,eps,F); /* F + Geps */
1444 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1445 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1446 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1447 F = vec_madd(G,eps,F); /* Fp + Geps */
1448 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1450 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
1452 F = vec_madd(G,eps,F); /* F + Geps */
1453 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1454 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1455 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1456 F = vec_madd(G,eps,F); /* Fp + Geps */
1457 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1461 static inline void do_4_ljctable_coul_and_lj(float *VFtab,
1462 vector float rtab,
1463 vector float *VVcoul,
1464 vector float *FFcoul,
1465 vector float *VVdisp,
1466 vector float *FFdisp,
1467 vector float *VVrep,
1468 vector float *FFrep)
1470 vector signed int vidx;
1471 vector float Y,F,G,H,eps,eps2;
1472 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1473 vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
1474 int idx1,idx2,idx3,idx4;
1476 vidx = vec_cts(rtab,0);
1477 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1478 vidx = vec_sl(vidx,vec_splat_u32(2));
1480 idx1 = *((int *)&vidx);
1481 idx2 = *(((int *)&vidx)+1);
1482 idx3 = *(((int *)&vidx)+2);
1483 idx4 = *(((int *)&vidx)+3);
1485 eps = vec_sub(rtab,vec_floor(rtab));
1486 eps2 = vec_madd(eps,eps,vec_zero());
1488 if(((unsigned int)VFtab)%16) {
1489 /* not 16-byte aligned, but must be 8 byte. */
1490 /* use Y,F,G,H as temp storage */
1491 tabc1 = vec_ld( 0, VFtab+idx1);
1492 tabd1 = vec_ld(16, VFtab+idx1);
1493 tabr1 = vec_ld(32, VFtab+idx1);
1494 Y = vec_ld(48, VFtab+idx1);
1495 tabc1 = vec_sld(tabc1,tabd1,8);
1496 tabd1 = vec_sld(tabd1,tabr1,8);
1497 tabr1 = vec_sld(tabr1,Y,8);
1499 tabc2 = vec_ld( 0, VFtab+idx2);
1500 tabd2 = vec_ld(16, VFtab+idx2);
1501 tabr2 = vec_ld(32, VFtab+idx2);
1502 F = vec_ld(48, VFtab+idx2);
1503 tabc2 = vec_sld(tabc2,tabd2,8);
1504 tabd2 = vec_sld(tabd2,tabr2,8);
1505 tabr2 = vec_sld(tabr2,F,8);
1507 tabc3 = vec_ld( 0, VFtab+idx3);
1508 tabd3 = vec_ld(16, VFtab+idx3);
1509 tabr3 = vec_ld(32, VFtab+idx3);
1510 G = vec_ld(48, VFtab+idx3);
1511 tabc3 = vec_sld(tabc3,tabd3,8);
1512 tabd3 = vec_sld(tabd3,tabr3,8);
1513 tabr3 = vec_sld(tabr3,G,8);
1515 tabc4 = vec_ld( 0, VFtab+idx4);
1516 tabd4 = vec_ld(16, VFtab+idx4);
1517 tabr4 = vec_ld(32, VFtab+idx4);
1518 H = vec_ld(48, VFtab+idx4);
1519 tabc4 = vec_sld(tabc4,tabd4,8);
1520 tabd4 = vec_sld(tabd4,tabr4,8);
1521 tabr4 = vec_sld(tabr4,H,8);
1522 } else { /* 16 byte aligned */
1523 tabc1 = vec_ld( 0, VFtab+idx1);
1524 tabd1 = vec_ld(16, VFtab+idx1);
1525 tabr1 = vec_ld(32, VFtab+idx1);
1526 tabc2 = vec_ld( 0, VFtab+idx2);
1527 tabd2 = vec_ld(16, VFtab+idx2);
1528 tabr2 = vec_ld(32, VFtab+idx2);
1529 tabc3 = vec_ld( 0, VFtab+idx3);
1530 tabd3 = vec_ld(16, VFtab+idx3);
1531 tabr3 = vec_ld(32, VFtab+idx3);
1532 tabc4 = vec_ld( 0, VFtab+idx4);
1533 tabd4 = vec_ld(16, VFtab+idx4);
1534 tabr4 = vec_ld(32, VFtab+idx4);
1536 transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);
1538 F = vec_madd(G,eps,F); /* F + Geps */
1539 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1540 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1541 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1542 F = vec_madd(G,eps,F); /* Fp + Geps */
1543 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1545 transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
1547 F = vec_madd(G,eps,F); /* F + Geps */
1548 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1549 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1550 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1551 F = vec_madd(G,eps,F); /* Fp + Geps */
1552 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1554 transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
1556 F = vec_madd(G,eps,F); /* F + Geps */
1557 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1558 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1559 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1560 F = vec_madd(G,eps,F); /* Fp + Geps */
1561 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1565 static inline void do_3_ctable_coul(float *VFtab,
1566 vector float rtab,
1567 vector float *VV,
1568 vector float *FF)
1570 vector signed int vidx;
1571 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
1572 int idx1,idx2,idx3;
1574 vidx = vec_cts(rtab,0);
1575 vidx = vec_sl(vidx,vec_splat_u32(2));
1577 idx1 = *((int *)&vidx);
1578 idx2 = *(((int *)&vidx)+1);
1579 idx3 = *(((int *)&vidx)+2);
1581 eps = vec_sub(rtab,vec_floor(rtab));
1582 eps2 = vec_madd(eps,eps,vec_zero());
1584 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1585 tab1 = vec_ld( 0, VFtab+idx1);
1586 Y = vec_ld(16, VFtab+idx1);
1587 tab1 = vec_sld(tab1,Y,8);
1588 tab2 = vec_ld( 0, VFtab+idx2);
1589 F = vec_ld(16, VFtab+idx2);
1590 tab2 = vec_sld(tab2,F,8);
1591 tab3 = vec_ld( 0, VFtab+idx3);
1592 G = vec_ld(16, VFtab+idx3);
1593 tab3 = vec_sld(tab3,G,8);
1594 } else { /* aligned */
1595 tab1=vec_ld(0, VFtab+idx1);
1596 tab2=vec_ld(0, VFtab+idx2);
1597 tab3=vec_ld(0, VFtab+idx3);
1600 /* table data is aligned */
1601 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
1604 F = vec_madd(G,eps,F); /* F + Geps */
1605 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1606 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1607 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1608 F = vec_madd(G,eps,F); /* Fp + Geps */
1609 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1612 /* do only coulomb, but on a table with both coulomb and lj data */
1613 static inline void do_3_ljctable_coul(float *VFtab,
1614 vector float rtab,
1615 vector float *VV,
1616 vector float *FF)
1618 vector signed int vidx;
1619 vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
1620 int idx1,idx2,idx3;
1622 vidx = vec_cts(rtab,0);
1623 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1624 vidx = vec_sl(vidx,vec_splat_u32(2));
1626 idx1 = *((int *)&vidx);
1627 idx2 = *(((int *)&vidx)+1);
1628 idx3 = *(((int *)&vidx)+2);
1630 eps = vec_sub(rtab,vec_floor(rtab));
1631 eps2 = vec_madd(eps,eps,vec_zero());
1634 if(((unsigned int)VFtab)%16) {
1635 /* not 16-byte aligned, but must be 8 byte. */
1636 tab1 = vec_ld( 0, VFtab+idx1);
1637 Y = vec_ld(16, VFtab+idx1);
1638 tab1 = vec_sld(tab1,Y,8);
1639 tab2 = vec_ld( 0, VFtab+idx2);
1640 F = vec_ld(16, VFtab+idx2);
1641 tab2 = vec_sld(tab2,F,8);
1642 tab3 = vec_ld( 0, VFtab+idx3);
1643 G = vec_ld(16, VFtab+idx3);
1644 tab3 = vec_sld(tab3,G,8);
1645 } else { /* aligned */
1646 tab1=vec_ld(0, VFtab+idx1);
1647 tab2=vec_ld(0, VFtab+idx2);
1648 tab3=vec_ld(0, VFtab+idx3);
1651 /* table data is aligned */
1652 transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
1654 F = vec_madd(G,eps,F); /* F + Geps */
1655 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1656 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1657 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1658 F = vec_madd(G,eps,F); /* Fp + Geps */
1659 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1663 static inline void do_3_ljtable_lj(float *VFtab,
1664 vector float rtab,
1665 vector float *VVdisp,
1666 vector float *FFdisp,
1667 vector float *VVrep,
1668 vector float *FFrep)
1670 vector signed int vidx;
1671 vector float Y,F,G,H,eps,eps2;
1672 int idx1,idx2,idx3;
1673 vector float tabd1,tabd2,tabd3;
1674 vector float tabr1,tabr2,tabr3;
1676 vidx = vec_cts(rtab,0);
1677 vidx = vec_sl(vidx,vec_splat_u32(3));
1679 idx1 = *((int *)&vidx);
1680 idx2 = *(((int *)&vidx)+1);
1681 idx3 = *(((int *)&vidx)+2);
1683 eps = vec_sub(rtab,vec_floor(rtab));
1684 eps2 = vec_madd(eps,eps,vec_zero());
1686 if(((unsigned int)VFtab)%16) {
1687 /* not 16 byte aligned, i.e. must be 8 byte. */
1688 /* use Y,F,G,H as temp storage */
1689 tabd1 = vec_ld( 0, VFtab+idx1);
1690 tabr1 = vec_ld(16, VFtab+idx1);
1691 Y = vec_ld(32, VFtab+idx1);
1692 tabd1 = vec_sld(tabd1,tabr1,8);
1693 tabr1 = vec_sld(tabr1,Y,8);
1695 tabd2 = vec_ld( 0, VFtab+idx2);
1696 tabr2 = vec_ld(16, VFtab+idx2);
1697 F = vec_ld(32, VFtab+idx2);
1698 tabd2 = vec_sld(tabd2,tabr2,8);
1699 tabr2 = vec_sld(tabr2,F,8);
1701 tabd3 = vec_ld( 0, VFtab+idx3);
1702 tabr3 = vec_ld(16, VFtab+idx3);
1703 G = vec_ld(32, VFtab+idx3);
1704 tabd3 = vec_sld(tabd3,tabr3,8);
1705 tabr3 = vec_sld(tabr3,G,8);
1706 } else { /* 16 byte aligned */
1707 tabd1 = vec_ld( 0, VFtab+idx1);
1708 tabr1 = vec_ld(16, VFtab+idx1);
1709 tabd2 = vec_ld( 0, VFtab+idx2);
1710 tabr2 = vec_ld(16, VFtab+idx2);
1711 tabd3 = vec_ld( 0, VFtab+idx3);
1712 tabr3 = vec_ld(16, VFtab+idx3);
1715 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
1717 F = vec_madd(G,eps,F); /* F + Geps */
1718 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1719 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1720 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1721 F = vec_madd(G,eps,F); /* Fp + Geps */
1722 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1724 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
1726 F = vec_madd(G,eps,F); /* F + Geps */
1727 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1728 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1729 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1730 F = vec_madd(G,eps,F); /* Fp + Geps */
1731 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1735 static inline void do_3_ljctable_coul_and_lj(float *VFtab,
1736 vector float rtab,
1737 vector float *VVcoul,
1738 vector float *FFcoul,
1739 vector float *VVdisp,
1740 vector float *FFdisp,
1741 vector float *VVrep,
1742 vector float *FFrep)
1744 vector signed int vidx;
1745 vector float Y,F,G,H,eps,eps2;
1746 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1747 vector float tabd3,tabr3,tabc3;
1748 int idx1,idx2,idx3;
1750 vidx = vec_cts(rtab,0);
1751 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1752 vidx = vec_sl(vidx,vec_splat_u32(2));
1754 idx1 = *((int *)&vidx);
1755 idx2 = *(((int *)&vidx)+1);
1756 idx3 = *(((int *)&vidx)+2);
1758 eps = vec_sub(rtab,vec_floor(rtab));
1759 eps2 = vec_madd(eps,eps,vec_zero());
1761 if(((unsigned int)VFtab)%16) {
1762 /* not 16-byte aligned, but must be 8 byte. */
1763 /* use Y,F,G,H as temp storage */
1764 tabc1 = vec_ld( 0, VFtab+idx1);
1765 tabd1 = vec_ld(16, VFtab+idx1);
1766 tabr1 = vec_ld(32, VFtab+idx1);
1767 Y = vec_ld(48, VFtab+idx1);
1768 tabc1 = vec_sld(tabc1,tabd1,8);
1769 tabd1 = vec_sld(tabd1,tabr1,8);
1770 tabr1 = vec_sld(tabr1,Y,8);
1772 tabc2 = vec_ld( 0, VFtab+idx2);
1773 tabd2 = vec_ld(16, VFtab+idx2);
1774 tabr2 = vec_ld(32, VFtab+idx2);
1775 F = vec_ld(48, VFtab+idx2);
1776 tabc2 = vec_sld(tabc2,tabd2,8);
1777 tabd2 = vec_sld(tabd2,tabr2,8);
1778 tabr2 = vec_sld(tabr2,F,8);
1780 tabc3 = vec_ld( 0, VFtab+idx3);
1781 tabd3 = vec_ld(16, VFtab+idx3);
1782 tabr3 = vec_ld(32, VFtab+idx3);
1783 G = vec_ld(48, VFtab+idx3);
1784 tabc3 = vec_sld(tabc3,tabd3,8);
1785 tabd3 = vec_sld(tabd3,tabr3,8);
1786 tabr3 = vec_sld(tabr3,G,8);
1787 } else { /* 16 byte aligned */
1788 tabc1 = vec_ld( 0, VFtab+idx1);
1789 tabd1 = vec_ld(16, VFtab+idx1);
1790 tabr1 = vec_ld(32, VFtab+idx1);
1791 tabc2 = vec_ld( 0, VFtab+idx2);
1792 tabd2 = vec_ld(16, VFtab+idx2);
1793 tabr2 = vec_ld(32, VFtab+idx2);
1794 tabc3 = vec_ld( 0, VFtab+idx3);
1795 tabd3 = vec_ld(16, VFtab+idx3);
1796 tabr3 = vec_ld(32, VFtab+idx3);
1798 transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);
1800 F = vec_madd(G,eps,F); /* F + Geps */
1801 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1802 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1803 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1804 F = vec_madd(G,eps,F); /* Fp + Geps */
1805 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1807 transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
1809 F = vec_madd(G,eps,F); /* F + Geps */
1810 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1811 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1812 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1813 F = vec_madd(G,eps,F); /* Fp + Geps */
1814 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1816 transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
1818 F = vec_madd(G,eps,F); /* F + Geps */
1819 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1820 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1821 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1822 F = vec_madd(G,eps,F); /* Fp + Geps */
1823 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1829 static inline void do_2_ctable_coul(float *VFtab,
1830 vector float rtab,
1831 vector float *VV,
1832 vector float *FF)
1834 vector signed int vidx;
1835 vector float Y,F,G,H,eps,eps2,tab1,tab2;
1836 int idx1,idx2;
1838 vidx = vec_cts(rtab,0);
1839 vidx = vec_sl(vidx,vec_splat_u32(2));
1841 idx1 = *((int *)&vidx);
1842 idx2 = *(((int *)&vidx)+1);
1844 eps = vec_sub(rtab,vec_floor(rtab));
1845 eps2 = vec_madd(eps,eps,vec_zero());
1847 if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
1848 tab1 = vec_ld( 0, VFtab+idx1);
1849 Y = vec_ld(16, VFtab+idx1);
1850 tab1 = vec_sld(tab1,Y,8);
1851 tab2 = vec_ld( 0, VFtab+idx2);
1852 F = vec_ld(16, VFtab+idx2);
1853 tab2 = vec_sld(tab2,F,8);
1854 } else { /* aligned */
1855 tab1=vec_ld(0, VFtab+idx1);
1856 tab2=vec_ld(0, VFtab+idx2);
1859 /* table data is aligned */
1860 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
1862 F = vec_madd(G,eps,F); /* F + Geps */
1863 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1864 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1865 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1866 F = vec_madd(G,eps,F); /* Fp + Geps */
1867 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1870 /* do only coulomb, but on a table with both coulomb and lj data */
1871 static inline void do_2_ljctable_coul(float *VFtab,
1872 vector float rtab,
1873 vector float *VV,
1874 vector float *FF)
1876 vector signed int vidx;
1877 vector float Y,F,G,H,eps,eps2,tab1,tab2;
1878 int idx1,idx2;
1880 vidx = vec_cts(rtab,0);
1881 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1882 vidx = vec_sl(vidx,vec_splat_u32(2));
1884 idx1 = *((int *)&vidx);
1885 idx2 = *(((int *)&vidx)+1);
1887 eps = vec_sub(rtab,vec_floor(rtab));
1888 eps2 = vec_madd(eps,eps,vec_zero());
1891 if(((unsigned int)VFtab)%16) {
1892 /* not 16-byte aligned, but must be 8 byte. */
1893 tab1 = vec_ld( 0, VFtab+idx1);
1894 Y = vec_ld(16, VFtab+idx1);
1895 tab1 = vec_sld(tab1,Y,8);
1896 tab2 = vec_ld( 0, VFtab+idx2);
1897 F = vec_ld(16, VFtab+idx2);
1898 tab2 = vec_sld(tab2,F,8);
1899 } else { /* aligned */
1900 tab1=vec_ld(0, VFtab+idx1);
1901 tab2=vec_ld(0, VFtab+idx2);
1904 /* table data is aligned */
1905 transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
1907 F = vec_madd(G,eps,F); /* F + Geps */
1908 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1909 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1910 *VV = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1911 F = vec_madd(G,eps,F); /* Fp + Geps */
1912 *FF = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1916 static inline void do_2_ljtable_lj(float *VFtab,
1917 vector float rtab,
1918 vector float *VVdisp,
1919 vector float *FFdisp,
1920 vector float *VVrep,
1921 vector float *FFrep)
1923 vector signed int vidx;
1924 vector float Y,F,G,H,eps,eps2;
1925 int idx1,idx2;
1926 vector float tabd1,tabd2;
1927 vector float tabr1,tabr2;
1929 vidx = vec_cts(rtab,0);
1930 vidx = vec_sl(vidx,vec_splat_u32(3));
1932 idx1 = *((int *)&vidx);
1933 idx2 = *(((int *)&vidx)+1);
1935 eps = vec_sub(rtab,vec_floor(rtab));
1936 eps2 = vec_madd(eps,eps,vec_zero());
1938 if(((unsigned int)VFtab)%16) {
1939 /* not 16 byte aligned, i.e. must be 8 byte. */
1940 /* use Y,F,G,H as temp storage */
1941 tabd1 = vec_ld( 0, VFtab+idx1);
1942 tabr1 = vec_ld(16, VFtab+idx1);
1943 Y = vec_ld(32, VFtab+idx1);
1944 tabd1 = vec_sld(tabd1,tabr1,8);
1945 tabr1 = vec_sld(tabr1,Y,8);
1947 tabd2 = vec_ld( 0, VFtab+idx2);
1948 tabr2 = vec_ld(16, VFtab+idx2);
1949 F = vec_ld(32, VFtab+idx2);
1950 tabd2 = vec_sld(tabd2,tabr2,8);
1951 tabr2 = vec_sld(tabr2,F,8);
1952 } else { /* 16 byte aligned */
1953 tabd1 = vec_ld( 0, VFtab+idx1);
1954 tabr1 = vec_ld(16, VFtab+idx1);
1955 tabd2 = vec_ld( 0, VFtab+idx2);
1956 tabr2 = vec_ld(16, VFtab+idx2);
1959 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
1961 F = vec_madd(G,eps,F); /* F + Geps */
1962 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1963 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1964 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1965 F = vec_madd(G,eps,F); /* Fp + Geps */
1966 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1968 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
1970 F = vec_madd(G,eps,F); /* F + Geps */
1971 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
1972 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
1973 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
1974 F = vec_madd(G,eps,F); /* Fp + Geps */
1975 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
1979 static inline void do_2_ljctable_coul_and_lj(float *VFtab,
1980 vector float rtab,
1981 vector float *VVcoul,
1982 vector float *FFcoul,
1983 vector float *VVdisp,
1984 vector float *FFdisp,
1985 vector float *VVrep,
1986 vector float *FFrep)
1988 vector signed int vidx;
1989 vector float Y,F,G,H,eps,eps2;
1990 vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
1991 int idx1,idx2;
1993 vidx = vec_cts(rtab,0);
1994 vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
1995 vidx = vec_sl(vidx,vec_splat_u32(2));
1997 idx1 = *((int *)&vidx);
1998 idx2 = *(((int *)&vidx)+1);
2000 eps = vec_sub(rtab,vec_floor(rtab));
2001 eps2 = vec_madd(eps,eps,vec_zero());
2003 if(((unsigned int)VFtab)%16) {
2004 /* not 16-byte aligned, but must be 8 byte. */
2005 /* use Y,F,G,H as temp storage */
2006 tabc1 = vec_ld( 0, VFtab+idx1);
2007 tabd1 = vec_ld(16, VFtab+idx1);
2008 tabr1 = vec_ld(32, VFtab+idx1);
2009 Y = vec_ld(48, VFtab+idx1);
2010 tabc1 = vec_sld(tabc1,tabd1,8);
2011 tabd1 = vec_sld(tabd1,tabr1,8);
2012 tabr1 = vec_sld(tabr1,Y,8);
2014 tabc2 = vec_ld( 0, VFtab+idx2);
2015 tabd2 = vec_ld(16, VFtab+idx2);
2016 tabr2 = vec_ld(32, VFtab+idx2);
2017 F = vec_ld(48, VFtab+idx2);
2018 tabc2 = vec_sld(tabc2,tabd2,8);
2019 tabd2 = vec_sld(tabd2,tabr2,8);
2020 tabr2 = vec_sld(tabr2,F,8);
2021 } else { /* 16 byte aligned */
2022 tabc1 = vec_ld( 0, VFtab+idx1);
2023 tabd1 = vec_ld(16, VFtab+idx1);
2024 tabr1 = vec_ld(32, VFtab+idx1);
2025 tabc2 = vec_ld( 0, VFtab+idx2);
2026 tabd2 = vec_ld(16, VFtab+idx2);
2027 tabr2 = vec_ld(32, VFtab+idx2);
2029 transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);
2031 F = vec_madd(G,eps,F); /* F + Geps */
2032 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2033 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2034 *VVcoul = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2035 F = vec_madd(G,eps,F); /* Fp + Geps */
2036 *FFcoul = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2038 transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2040 F = vec_madd(G,eps,F); /* F + Geps */
2041 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2042 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2043 *VVdisp = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2044 F = vec_madd(G,eps,F); /* Fp + Geps */
2045 *FFdisp = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
2047 transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2049 F = vec_madd(G,eps,F); /* F + Geps */
2050 H = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2051 F = vec_add(F,H); /* F + Geps + Heps2 (=Fp) */
2052 *VVrep = vec_madd(eps,F,Y); /* VV = Y + eps*Fp */
2053 F = vec_madd(G,eps,F); /* Fp + Geps */
2054 *FFrep = vec_madd(vec_two(),H,F); /* Fp + Geps + 2.0*Heps2 */
/* Cubic-spline lookup of ONE point in a Coulomb-only table.
 * Table layout: 4 floats (Y,F,G,H) per point, so the float index is
 * point*4 (vec_sl by 2). Only lane 0 of rtab is used.
 * Outputs: *VV = interpolated potential, *FF = force factor.
 * The %16 branch merges two 16-byte loads with vec_sld(,,8) when the
 * table is only 8-byte aligned.
 * NOTE(review): (unsigned int)VFtab assumes 32-bit pointers (PPC32-era
 * code); extracting idx1 via *(int*)&vidx aliases the vector register.
 */
2059 static inline void do_1_ctable_coul(float *VFtab,
2060                                     vector float rtab,
2061                                     vector float *VV,
2062                                     vector float *FF)
2064   vector signed int vidx;
2065   vector float Y,F,G,H,eps,eps2,tab1;
2066   int idx1;
2068   vidx      = vec_cts(rtab,0);
2069   vidx      = vec_sl(vidx,vec_splat_u32(2));
2071   idx1     = *((int *)&vidx);
2073   eps       = vec_sub(rtab,vec_floor(rtab));
2074   eps2      = vec_madd(eps,eps,vec_zero());
2076   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2077     tab1   = vec_ld( 0, VFtab+idx1);
2078     Y      = vec_ld(16, VFtab+idx1);
2079     tab1   = vec_sld(tab1,Y,8);
2080   } else { /* aligned */
2081     tab1=vec_ld(0, VFtab+idx1);
2084   /* table data is aligned */
2085   transpose_1_to_4(tab1,&Y,&F,&G,&H);
2087   F         = vec_madd(G,eps,F);           /* F + Geps  */
2088   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2089   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2090   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2091   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2092   *FF       = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
2095 /* do only coulomb, but on a table with both coulomb and lj data */
/* Cubic-spline lookup of ONE Coulomb point in a combined coul+LJ table.
 * Table stride is 12 floats per point (index*3, then *4), Coulomb data
 * first; returns both potential (*VV) and force factor (*FF).
 * Alignment handling mirrors do_1_ctable_coul.
 */
2096 static inline void do_1_ljctable_coul(float *VFtab,
2097                                       vector float rtab,
2098                                       vector float *VV,
2099                                       vector float *FF)
2101   vector signed int vidx;
2102   vector float Y,F,G,H,eps,eps2,tab1;
2103   int idx1;
2105   vidx      = vec_cts(rtab,0);
2106   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2107   vidx      = vec_sl(vidx,vec_splat_u32(2));
2109   idx1     = *((int *)&vidx);
2111   eps       = vec_sub(rtab,vec_floor(rtab));
2112   eps2      = vec_madd(eps,eps,vec_zero());
2115   if(((unsigned int)VFtab)%16) {
2116     /* not 16-byte aligned, but must be 8 byte. */
2117     tab1   = vec_ld( 0, VFtab+idx1);
2118     Y      = vec_ld(16, VFtab+idx1);
2119     tab1   = vec_sld(tab1,Y,8);
2120   } else { /* aligned */
2121     tab1=vec_ld(0, VFtab+idx1);
2124   /* table data is aligned */
2125   transpose_1_to_4(tab1,&Y,&F,&G,&H);
2127   F         = vec_madd(G,eps,F);           /* F + Geps  */
2128   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2129   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2130   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2131   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2132   *FF       = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
/* Cubic-spline lookup of ONE point in an LJ-only table: dispersion row
 * at byte offset 0 and repulsion row at +16 within the 8-float stride
 * (index shifted left by 3). Produces potential and force factor for
 * both dispersion (*VVdisp,*FFdisp) and repulsion (*VVrep,*FFrep).
 */
2136 static inline void do_1_ljtable_lj(float *VFtab,
2137                                    vector float rtab,
2138                                    vector float *VVdisp,
2139                                    vector float *FFdisp,
2140                                    vector float *VVrep,
2141                                    vector float *FFrep)
2143   vector signed int vidx;
2144   vector float Y,F,G,H,eps,eps2;
2145   int idx1;
2146   vector float tabd1;
2147   vector float tabr1;
2149   vidx      = vec_cts(rtab,0);
2150   vidx      = vec_sl(vidx,vec_splat_u32(3));
2152   idx1     = *((int *)&vidx);
2154   eps       = vec_sub(rtab,vec_floor(rtab));
2155   eps2      = vec_madd(eps,eps,vec_zero());
2157   if(((unsigned int)VFtab)%16) {
2158     /* not 16 byte aligned, i.e. must be 8 byte. */
2159     /* use Y,F,G,H as temp storage */
2160     tabd1  = vec_ld( 0, VFtab+idx1);
2161     tabr1  = vec_ld(16, VFtab+idx1);
2162     Y      = vec_ld(32, VFtab+idx1);
2163     tabd1  = vec_sld(tabd1,tabr1,8);
2164     tabr1  = vec_sld(tabr1,Y,8);
2165   } else { /* 16 byte aligned */
2166     tabd1  = vec_ld( 0, VFtab+idx1);
2167     tabr1  = vec_ld(16, VFtab+idx1);
2170   transpose_1_to_4(tabd1,&Y,&F,&G,&H);
2172   F         = vec_madd(G,eps,F);           /* F + Geps  */
2173   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2174   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2175   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2176   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2177   *FFdisp   = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
2179   transpose_1_to_4(tabr1,&Y,&F,&G,&H);
2181   F         = vec_madd(G,eps,F);           /* F + Geps  */
2182   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2183   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2184   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2185   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2186   *FFrep    = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
/* Cubic-spline lookup of ONE point in a combined coul+LJ table
 * (stride 12 floats: Coulomb at +0, dispersion at +16, repulsion at
 * +32 bytes). Returns potential and force factor for all three terms.
 */
2190 static inline void do_1_ljctable_coul_and_lj(float *VFtab,
2191                                              vector float rtab,
2192                                              vector float *VVcoul,
2193                                              vector float *FFcoul,
2194                                              vector float *VVdisp,
2195                                              vector float *FFdisp,
2196                                              vector float *VVrep,
2197                                              vector float *FFrep)
2199   vector signed int vidx;
2200   vector float Y,F,G,H,eps,eps2;
2201   vector float tabd1,tabr1,tabc1;
2202   int idx1;
2204   vidx      = vec_cts(rtab,0);
2205   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2206   vidx      = vec_sl(vidx,vec_splat_u32(2));
2208   idx1      = *((int *)&vidx);
2210   eps       = vec_sub(rtab,vec_floor(rtab));
2211   eps2      = vec_madd(eps,eps,vec_zero());
2213   if(((unsigned int)VFtab)%16) {
2214     /* not 16-byte aligned, but must be 8 byte. */
2215     /* use Y,F,G,H as temp storage */
2216     tabc1  = vec_ld( 0, VFtab+idx1);
2217     tabd1  = vec_ld(16, VFtab+idx1);
2218     tabr1  = vec_ld(32, VFtab+idx1);
2219     Y      = vec_ld(48, VFtab+idx1);
2220     tabc1  = vec_sld(tabc1,tabd1,8);
2221     tabd1  = vec_sld(tabd1,tabr1,8);
2222     tabr1  = vec_sld(tabr1,Y,8);
2223   } else { /* 16 byte aligned */
2224     tabc1  = vec_ld( 0, VFtab+idx1);
2225     tabd1  = vec_ld(16, VFtab+idx1);
2226     tabr1  = vec_ld(32, VFtab+idx1);
2228   transpose_1_to_4(tabc1,&Y,&F,&G,&H);
2230   F         = vec_madd(G,eps,F);           /* F + Geps  */
2231   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2232   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2233   *VVcoul   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2234   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2235   *FFcoul   = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
2237   transpose_1_to_4(tabd1,&Y,&F,&G,&H);
2239   F         = vec_madd(G,eps,F);           /* F + Geps  */
2240   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2241   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2242   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2243   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2244   *FFdisp   = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
2246   transpose_1_to_4(tabr1,&Y,&F,&G,&H);
2248   F         = vec_madd(G,eps,F);           /* F + Geps  */
2249   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2250   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2251   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2252   F         = vec_madd(G,eps,F);           /* Fp + Geps */
2253   *FFrep    = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
/* Potential-only ("vonly") variant: FOUR simultaneous Coulomb-table
 * lookups (one per lane of rtab), 4-float stride; writes only *VV,
 * skipping the force-factor computation.
 */
2259 static inline void do_vonly_4_ctable_coul(float *VFtab,
2260                                           vector float rtab,
2261                                           vector float *VV)
2263   vector signed int vidx;
2264   vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
2265   int idx1,idx2,idx3,idx4;
2267   vidx      = vec_cts(rtab,0);
2268   vidx      = vec_sl(vidx,vec_splat_u32(2));
2270   idx1      = *((int *)&vidx);
2271   idx2      = *(((int *)&vidx)+1);
2272   idx3      = *(((int *)&vidx)+2);
2273   idx4      = *(((int *)&vidx)+3);
2275   eps       = vec_sub(rtab,vec_floor(rtab));
2276   eps2      = vec_madd(eps,eps,vec_zero());
2278   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2279     tab1   = vec_ld( 0, VFtab+idx1);
2280     Y      = vec_ld(16, VFtab+idx1);
2281     tab1   = vec_sld(tab1,Y,8);
2282     tab2   = vec_ld( 0, VFtab+idx2);
2283     F      = vec_ld(16, VFtab+idx2);
2284     tab2   = vec_sld(tab2,F,8);
2285     tab3   = vec_ld( 0, VFtab+idx3);
2286     G      = vec_ld(16, VFtab+idx3);
2287     tab3   = vec_sld(tab3,G,8);
2288     tab4   = vec_ld( 0, VFtab+idx4);
2289     H      = vec_ld(16, VFtab+idx4);
2290     tab4   = vec_sld(tab4,H,8);
2291   } else { /* aligned */
2292     tab1=vec_ld(0, VFtab+idx1);
2293     tab2=vec_ld(0, VFtab+idx2);
2294     tab3=vec_ld(0, VFtab+idx3);
2295     tab4=vec_ld(0, VFtab+idx4);
2298   /* table data is aligned */
2299   transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
2301   F         = vec_madd(G,eps,F);           /* F + Geps  */
2302   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2303   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2304   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2307 /* do only coulomb, but on a table with both coulomb and lj data */
/* Potential-only: FOUR Coulomb lookups in a combined coul+LJ table
 * (stride 12 floats; index*3 then *4). Writes only *VV.
 */
2308 static inline void do_vonly_4_ljctable_coul(float *VFtab,
2309                                             vector float rtab,
2310                                             vector float *VV)
2312   vector signed int vidx;
2313   vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
2314   int idx1,idx2,idx3,idx4;
2316   vidx      = vec_cts(rtab,0);
2317   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2318   vidx      = vec_sl(vidx,vec_splat_u32(2));
2320   idx1      = *((int *)&vidx);
2321   idx2      = *(((int *)&vidx)+1);
2322   idx3      = *(((int *)&vidx)+2);
2323   idx4      = *(((int *)&vidx)+3);
2325   eps       = vec_sub(rtab,vec_floor(rtab));
2326   eps2      = vec_madd(eps,eps,vec_zero());
2328   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2329     tab1   = vec_ld( 0, VFtab+idx1);
2330     Y      = vec_ld(16, VFtab+idx1);
2331     tab1   = vec_sld(tab1,Y,8);
2332     tab2   = vec_ld( 0, VFtab+idx2);
2333     F      = vec_ld(16, VFtab+idx2);
2334     tab2   = vec_sld(tab2,F,8);
2335     tab3   = vec_ld( 0, VFtab+idx3);
2336     G      = vec_ld(16, VFtab+idx3);
2337     tab3   = vec_sld(tab3,G,8);
2338     tab4   = vec_ld( 0, VFtab+idx4);
2339     H      = vec_ld(16, VFtab+idx4);
2340     tab4   = vec_sld(tab4,H,8);
2341   } else { /* aligned */
2342     tab1=vec_ld(0, VFtab+idx1);
2343     tab2=vec_ld(0, VFtab+idx2);
2344     tab3=vec_ld(0, VFtab+idx3);
2345     tab4=vec_ld(0, VFtab+idx4);
2348   transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);
2350   F         = vec_madd(G,eps,F);           /* F + Geps  */
2351   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2352   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2353   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: FOUR lookups in an LJ-only table (stride 8 floats;
 * dispersion at +0, repulsion at +16 bytes). Writes *VVdisp, *VVrep.
 */
2357 static inline void do_vonly_4_ljtable_lj(float *VFtab,
2358                                          vector float rtab,
2359                                          vector float *VVdisp,
2360                                          vector float *VVrep)
2362   vector signed int vidx;
2363   vector float Y,F,G,H,eps,eps2;
2364   vector float tabd1,tabd2,tabd3,tabd4;
2365   vector float tabr1,tabr2,tabr3,tabr4;
2367   int idx1,idx2,idx3,idx4;
2369   vidx      = vec_cts(rtab,0);
2370   vidx      = vec_sl(vidx,vec_splat_u32(3));
2372   idx1      = *((int *)&vidx);
2373   idx2      = *(((int *)&vidx)+1);
2374   idx3      = *(((int *)&vidx)+2);
2375   idx4      = *(((int *)&vidx)+3);
2377   eps       = vec_sub(rtab,vec_floor(rtab));
2378   eps2      = vec_madd(eps,eps,vec_zero());
2380   if(((unsigned int)VFtab)%16) {
2381     /* not 16 byte aligned, i.e. must be 8 byte. */
2382     /* use Y,F,G,H as temp storage */
2383     tabd1  = vec_ld( 0, VFtab+idx1);
2384     tabr1  = vec_ld(16, VFtab+idx1);
2385     Y      = vec_ld(32, VFtab+idx1);
2386     tabd1  = vec_sld(tabd1,tabr1,8);
2387     tabr1  = vec_sld(tabr1,Y,8);
2389     tabd2  = vec_ld( 0, VFtab+idx2);
2390     tabr2  = vec_ld(16, VFtab+idx2);
2391     F      = vec_ld(32, VFtab+idx2);
2392     tabd2  = vec_sld(tabd2,tabr2,8);
2393     tabr2  = vec_sld(tabr2,F,8);
2395     tabd3  = vec_ld( 0, VFtab+idx3);
2396     tabr3  = vec_ld(16, VFtab+idx3);
2397     G      = vec_ld(32, VFtab+idx3);
2398     tabd3  = vec_sld(tabd3,tabr3,8);
2399     tabr3  = vec_sld(tabr3,G,8);
2401     tabd4  = vec_ld( 0, VFtab+idx4);
2402     tabr4  = vec_ld(16, VFtab+idx4);
2403     H      = vec_ld(32, VFtab+idx4);
2404     tabd4  = vec_sld(tabd4,tabr4,8);
2405     tabr4  = vec_sld(tabr4,H,8);
2406   } else { /* 16 byte aligned */
2407     tabd1  = vec_ld( 0, VFtab+idx1);
2408     tabr1  = vec_ld(16, VFtab+idx1);
2409     tabd2  = vec_ld( 0, VFtab+idx2);
2410     tabr2  = vec_ld(16, VFtab+idx2);
2411     tabd3  = vec_ld( 0, VFtab+idx3);
2412     tabr3  = vec_ld(16, VFtab+idx3);
2413     tabd4  = vec_ld( 0, VFtab+idx4);
2414     tabr4  = vec_ld(16, VFtab+idx4);
2417   transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
2419   F         = vec_madd(G,eps,F);           /* F + Geps  */
2420   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2421   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2422   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2424   transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
2426   F         = vec_madd(G,eps,F);           /* F + Geps  */
2427   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2428   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2429   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: FOUR lookups in a combined coul+LJ table (stride 12
 * floats; Coulomb +0, dispersion +16, repulsion +32 bytes). Writes
 * *VVcoul, *VVdisp and *VVrep; no force factors.
 */
2433 static inline void do_vonly_4_ljctable_coul_and_lj(float *VFtab,
2434                                                    vector float rtab,
2435                                                    vector float *VVcoul,
2436                                                    vector float *VVdisp,
2437                                                    vector float *VVrep)
2439   vector signed int vidx;
2440   vector float Y,F,G,H,eps,eps2;
2441   vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2442   vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
2443   int idx1,idx2,idx3,idx4;
2445   vidx      = vec_cts(rtab,0);
2446   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2447   vidx      = vec_sl(vidx,vec_splat_u32(2));
2449   idx1      = *((int *)&vidx);
2450   idx2      = *(((int *)&vidx)+1);
2451   idx3      = *(((int *)&vidx)+2);
2452   idx4      = *(((int *)&vidx)+3);
2454   eps       = vec_sub(rtab,vec_floor(rtab));
2455   eps2      = vec_madd(eps,eps,vec_zero());
2457   if(((unsigned int)VFtab)%16) {
2458     /* not 16-byte aligned, but must be 8 byte. */
2459     /* use Y,F,G,H as temp storage */
2460     tabc1  = vec_ld( 0, VFtab+idx1);
2461     tabd1  = vec_ld(16, VFtab+idx1);
2462     tabr1  = vec_ld(32, VFtab+idx1);
2463     Y      = vec_ld(48, VFtab+idx1);
2464     tabc1  = vec_sld(tabc1,tabd1,8);
2465     tabd1  = vec_sld(tabd1,tabr1,8);
2466     tabr1  = vec_sld(tabr1,Y,8);
2468     tabc2  = vec_ld( 0, VFtab+idx2);
2469     tabd2  = vec_ld(16, VFtab+idx2);
2470     tabr2  = vec_ld(32, VFtab+idx2);
2471     F      = vec_ld(48, VFtab+idx2);
2472     tabc2  = vec_sld(tabc2,tabd2,8);
2473     tabd2  = vec_sld(tabd2,tabr2,8);
2474     tabr2  = vec_sld(tabr2,F,8);
2476     tabc3  = vec_ld( 0, VFtab+idx3);
2477     tabd3  = vec_ld(16, VFtab+idx3);
2478     tabr3  = vec_ld(32, VFtab+idx3);
2479     G      = vec_ld(48, VFtab+idx3);
2480     tabc3  = vec_sld(tabc3,tabd3,8);
2481     tabd3  = vec_sld(tabd3,tabr3,8);
2482     tabr3  = vec_sld(tabr3,G,8);
2484     tabc4  = vec_ld( 0, VFtab+idx4);
2485     tabd4  = vec_ld(16, VFtab+idx4);
2486     tabr4  = vec_ld(32, VFtab+idx4);
2487     H      = vec_ld(48, VFtab+idx4);
2488     tabc4  = vec_sld(tabc4,tabd4,8);
2489     tabd4  = vec_sld(tabd4,tabr4,8);
2490     tabr4  = vec_sld(tabr4,H,8);
2491   } else { /* 16 byte aligned */
2492     tabc1  = vec_ld( 0, VFtab+idx1);
2493     tabd1  = vec_ld(16, VFtab+idx1);
2494     tabr1  = vec_ld(32, VFtab+idx1);
2495     tabc2  = vec_ld( 0, VFtab+idx2);
2496     tabd2  = vec_ld(16, VFtab+idx2);
2497     tabr2  = vec_ld(32, VFtab+idx2);
2498     tabc3  = vec_ld( 0, VFtab+idx3);
2499     tabd3  = vec_ld(16, VFtab+idx3);
2500     tabr3  = vec_ld(32, VFtab+idx3);
2501     tabc4  = vec_ld( 0, VFtab+idx4);
2502     tabd4  = vec_ld(16, VFtab+idx4);
2503     tabr4  = vec_ld(32, VFtab+idx4);
2505   transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);
2507   F         = vec_madd(G,eps,F);           /* F + Geps  */
2508   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2509   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2510   *VVcoul   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2512   transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);
2514   F         = vec_madd(G,eps,F);           /* F + Geps  */
2515   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2516   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2517   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2519   transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);
2521   F         = vec_madd(G,eps,F);           /* F + Geps  */
2522   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2523   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2524   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: THREE Coulomb-table lookups (lanes 0-2 of rtab;
 * lane 3 unused). 4-float stride; writes only *VV.
 */
2528 static inline void do_vonly_3_ctable_coul(float *VFtab,
2529                                           vector float rtab,
2530                                           vector float *VV)
2532   vector signed int vidx;
2533   vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
2534   int idx1,idx2,idx3;
2536   vidx      = vec_cts(rtab,0);
2537   vidx      = vec_sl(vidx,vec_splat_u32(2));
2539   idx1      = *((int *)&vidx);
2540   idx2      = *(((int *)&vidx)+1);
2541   idx3      = *(((int *)&vidx)+2);
2543   eps       = vec_sub(rtab,vec_floor(rtab));
2544   eps2      = vec_madd(eps,eps,vec_zero());
2546   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2547     tab1   = vec_ld( 0, VFtab+idx1);
2548     Y      = vec_ld(16, VFtab+idx1);
2549     tab1   = vec_sld(tab1,Y,8);
2550     tab2   = vec_ld( 0, VFtab+idx2);
2551     F      = vec_ld(16, VFtab+idx2);
2552     tab2   = vec_sld(tab2,F,8);
2553     tab3   = vec_ld( 0, VFtab+idx3);
2554     G      = vec_ld(16, VFtab+idx3);
2555     tab3   = vec_sld(tab3,G,8);
2556   } else { /* aligned */
2557     tab1=vec_ld(0, VFtab+idx1);
2558     tab2=vec_ld(0, VFtab+idx2);
2559     tab3=vec_ld(0, VFtab+idx3);
2562   /* table data is aligned */
2563   transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
2566   F         = vec_madd(G,eps,F);           /* F + Geps  */
2567   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2568   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2569   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2572 /* do only coulomb, but on a table with both coulomb and lj data */
/* Potential-only: THREE Coulomb lookups in a combined coul+LJ table
 * (stride 12 floats). Writes only *VV.
 */
2573 static inline void do_vonly_3_ljctable_coul(float *VFtab,
2574                                             vector float rtab,
2575                                             vector float *VV)
2577   vector signed int vidx;
2578   vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
2579   int idx1,idx2,idx3;
2581   vidx      = vec_cts(rtab,0);
2582   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2583   vidx      = vec_sl(vidx,vec_splat_u32(2));
2585   idx1      = *((int *)&vidx);
2586   idx2      = *(((int *)&vidx)+1);
2587   idx3      = *(((int *)&vidx)+2);
2589   eps       = vec_sub(rtab,vec_floor(rtab));
2590   eps2      = vec_madd(eps,eps,vec_zero());
2593   if(((unsigned int)VFtab)%16) {
2594     /* not 16-byte aligned, but must be 8 byte. */
2595     tab1   = vec_ld( 0, VFtab+idx1);
2596     Y      = vec_ld(16, VFtab+idx1);
2597     tab1   = vec_sld(tab1,Y,8);
2598     tab2   = vec_ld( 0, VFtab+idx2);
2599     F      = vec_ld(16, VFtab+idx2);
2600     tab2   = vec_sld(tab2,F,8);
2601     tab3   = vec_ld( 0, VFtab+idx3);
2602     G      = vec_ld(16, VFtab+idx3);
2603     tab3   = vec_sld(tab3,G,8);
2604   } else { /* aligned */
2605     tab1=vec_ld(0, VFtab+idx1);
2606     tab2=vec_ld(0, VFtab+idx2);
2607     tab3=vec_ld(0, VFtab+idx3);
2610   /* table data is aligned */
2611   transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);
2613   F         = vec_madd(G,eps,F);           /* F + Geps  */
2614   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2615   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2616   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: THREE lookups in an LJ-only table (stride 8 floats;
 * dispersion +0, repulsion +16 bytes). Writes *VVdisp and *VVrep.
 */
2620 static inline void do_vonly_3_ljtable_lj(float *VFtab,
2621                                          vector float rtab,
2622                                          vector float *VVdisp,
2623                                          vector float *VVrep)
2625   vector signed int vidx;
2626   vector float Y,F,G,H,eps,eps2;
2627   int idx1,idx2,idx3;
2628   vector float tabd1,tabd2,tabd3;
2629   vector float tabr1,tabr2,tabr3;
2631   vidx      = vec_cts(rtab,0);
2632   vidx      = vec_sl(vidx,vec_splat_u32(3));
2634   idx1      = *((int *)&vidx);
2635   idx2      = *(((int *)&vidx)+1);
2636   idx3      = *(((int *)&vidx)+2);
2638   eps       = vec_sub(rtab,vec_floor(rtab));
2639   eps2      = vec_madd(eps,eps,vec_zero());
2641   if(((unsigned int)VFtab)%16) {
2642     /* not 16 byte aligned, i.e. must be 8 byte. */
2643     /* use Y,F,G,H as temp storage */
2644     tabd1  = vec_ld( 0, VFtab+idx1);
2645     tabr1  = vec_ld(16, VFtab+idx1);
2646     Y      = vec_ld(32, VFtab+idx1);
2647     tabd1  = vec_sld(tabd1,tabr1,8);
2648     tabr1  = vec_sld(tabr1,Y,8);
2650     tabd2  = vec_ld( 0, VFtab+idx2);
2651     tabr2  = vec_ld(16, VFtab+idx2);
2652     F      = vec_ld(32, VFtab+idx2);
2653     tabd2  = vec_sld(tabd2,tabr2,8);
2654     tabr2  = vec_sld(tabr2,F,8);
2656     tabd3  = vec_ld( 0, VFtab+idx3);
2657     tabr3  = vec_ld(16, VFtab+idx3);
2658     G      = vec_ld(32, VFtab+idx3);
2659     tabd3  = vec_sld(tabd3,tabr3,8);
2660     tabr3  = vec_sld(tabr3,G,8);
2661   } else { /* 16 byte aligned */
2662     tabd1  = vec_ld( 0, VFtab+idx1);
2663     tabr1  = vec_ld(16, VFtab+idx1);
2664     tabd2  = vec_ld( 0, VFtab+idx2);
2665     tabr2  = vec_ld(16, VFtab+idx2);
2666     tabd3  = vec_ld( 0, VFtab+idx3);
2667     tabr3  = vec_ld(16, VFtab+idx3);
2670   transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
2672   F         = vec_madd(G,eps,F);           /* F + Geps  */
2673   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2674   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2675   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2677   transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
2679   F         = vec_madd(G,eps,F);           /* F + Geps  */
2680   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2681   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2682   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: THREE lookups in a combined coul+LJ table (stride 12
 * floats; Coulomb +0, dispersion +16, repulsion +32 bytes). Writes
 * *VVcoul, *VVdisp and *VVrep.
 */
2686 static inline void do_vonly_3_ljctable_coul_and_lj(float *VFtab,
2687                                                    vector float rtab,
2688                                                    vector float *VVcoul,
2689                                                    vector float *VVdisp,
2690                                                    vector float *VVrep)
2692   vector signed int vidx;
2693   vector float Y,F,G,H,eps,eps2;
2694   vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2695   vector float tabd3,tabr3,tabc3;
2696   int idx1,idx2,idx3;
2698   vidx      = vec_cts(rtab,0);
2699   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2700   vidx      = vec_sl(vidx,vec_splat_u32(2));
2702   idx1      = *((int *)&vidx);
2703   idx2      = *(((int *)&vidx)+1);
2704   idx3      = *(((int *)&vidx)+2);
2706   eps       = vec_sub(rtab,vec_floor(rtab));
2707   eps2      = vec_madd(eps,eps,vec_zero());
2709   if(((unsigned int)VFtab)%16) {
2710     /* not 16-byte aligned, but must be 8 byte. */
2711     /* use Y,F,G,H as temp storage */
2712     tabc1  = vec_ld( 0, VFtab+idx1);
2713     tabd1  = vec_ld(16, VFtab+idx1);
2714     tabr1  = vec_ld(32, VFtab+idx1);
2715     Y      = vec_ld(48, VFtab+idx1);
2716     tabc1  = vec_sld(tabc1,tabd1,8);
2717     tabd1  = vec_sld(tabd1,tabr1,8);
2718     tabr1  = vec_sld(tabr1,Y,8);
2720     tabc2  = vec_ld( 0, VFtab+idx2);
2721     tabd2  = vec_ld(16, VFtab+idx2);
2722     tabr2  = vec_ld(32, VFtab+idx2);
2723     F      = vec_ld(48, VFtab+idx2);
2724     tabc2  = vec_sld(tabc2,tabd2,8);
2725     tabd2  = vec_sld(tabd2,tabr2,8);
2726     tabr2  = vec_sld(tabr2,F,8);
2728     tabc3  = vec_ld( 0, VFtab+idx3);
2729     tabd3  = vec_ld(16, VFtab+idx3);
2730     tabr3  = vec_ld(32, VFtab+idx3);
2731     G      = vec_ld(48, VFtab+idx3);
2732     tabc3  = vec_sld(tabc3,tabd3,8);
2733     tabd3  = vec_sld(tabd3,tabr3,8);
2734     tabr3  = vec_sld(tabr3,G,8);
2735   } else { /* 16 byte aligned */
2736     tabc1  = vec_ld( 0, VFtab+idx1);
2737     tabd1  = vec_ld(16, VFtab+idx1);
2738     tabr1  = vec_ld(32, VFtab+idx1);
2739     tabc2  = vec_ld( 0, VFtab+idx2);
2740     tabd2  = vec_ld(16, VFtab+idx2);
2741     tabr2  = vec_ld(32, VFtab+idx2);
2742     tabc3  = vec_ld( 0, VFtab+idx3);
2743     tabd3  = vec_ld(16, VFtab+idx3);
2744     tabr3  = vec_ld(32, VFtab+idx3);
2746   transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);
2748   F         = vec_madd(G,eps,F);           /* F + Geps  */
2749   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2750   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2751   *VVcoul   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2753   transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);
2755   F         = vec_madd(G,eps,F);           /* F + Geps  */
2756   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2757   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2758   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2760   transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);
2762   F         = vec_madd(G,eps,F);           /* F + Geps  */
2763   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2764   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2765   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: TWO Coulomb-table lookups (lanes 0-1 of rtab).
 * 4-float stride; writes only *VV.
 */
2771 static inline void do_vonly_2_ctable_coul(float *VFtab,
2772                                           vector float rtab,
2773                                           vector float *VV)
2775   vector signed int vidx;
2776   vector float Y,F,G,H,eps,eps2,tab1,tab2;
2777   int idx1,idx2;
2779   vidx      = vec_cts(rtab,0);
2780   vidx      = vec_sl(vidx,vec_splat_u32(2));
2782   idx1      = *((int *)&vidx);
2783   idx2      = *(((int *)&vidx)+1);
2785   eps       = vec_sub(rtab,vec_floor(rtab));
2786   eps2      = vec_madd(eps,eps,vec_zero());
2788   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2789     tab1   = vec_ld( 0, VFtab+idx1);
2790     Y      = vec_ld(16, VFtab+idx1);
2791     tab1   = vec_sld(tab1,Y,8);
2792     tab2   = vec_ld( 0, VFtab+idx2);
2793     F      = vec_ld(16, VFtab+idx2);
2794     tab2   = vec_sld(tab2,F,8);
2795   } else { /* aligned */
2796     tab1=vec_ld(0, VFtab+idx1);
2797     tab2=vec_ld(0, VFtab+idx2);
2800   /* table data is aligned */
2801   transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
2803   F         = vec_madd(G,eps,F);           /* F + Geps  */
2804   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2805   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2806   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2809 /* do only coulomb, but on a table with both coulomb and lj data */
/* Potential-only: TWO Coulomb lookups in a combined coul+LJ table
 * (stride 12 floats). Writes only *VV.
 */
2810 static inline void do_vonly_2_ljctable_coul(float *VFtab,
2811                                             vector float rtab,
2812                                             vector float *VV)
2814   vector signed int vidx;
2815   vector float Y,F,G,H,eps,eps2,tab1,tab2;
2816   int idx1,idx2;
2818   vidx      = vec_cts(rtab,0);
2819   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2820   vidx      = vec_sl(vidx,vec_splat_u32(2));
2822   idx1      = *((int *)&vidx);
2823   idx2      = *(((int *)&vidx)+1);
2825   eps       = vec_sub(rtab,vec_floor(rtab));
2826   eps2      = vec_madd(eps,eps,vec_zero());
2829   if(((unsigned int)VFtab)%16) {
2830     /* not 16-byte aligned, but must be 8 byte. */
2831     tab1   = vec_ld( 0, VFtab+idx1);
2832     Y      = vec_ld(16, VFtab+idx1);
2833     tab1   = vec_sld(tab1,Y,8);
2834     tab2   = vec_ld( 0, VFtab+idx2);
2835     F      = vec_ld(16, VFtab+idx2);
2836     tab2   = vec_sld(tab2,F,8);
2837   } else { /* aligned */
2838     tab1=vec_ld(0, VFtab+idx1);
2839     tab2=vec_ld(0, VFtab+idx2);
2842   /* table data is aligned */
2843   transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);
2845   F         = vec_madd(G,eps,F);           /* F + Geps  */
2846   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2847   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2848   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: TWO lookups in an LJ-only table (stride 8 floats;
 * dispersion +0, repulsion +16 bytes). Writes *VVdisp and *VVrep.
 */
2852 static inline void do_vonly_2_ljtable_lj(float *VFtab,
2853                                          vector float rtab,
2854                                          vector float *VVdisp,
2855                                          vector float *VVrep)
2857   vector signed int vidx;
2858   vector float Y,F,G,H,eps,eps2;
2859   int idx1,idx2;
2860   vector float tabd1,tabd2;
2861   vector float tabr1,tabr2;
2863   vidx      = vec_cts(rtab,0);
2864   vidx      = vec_sl(vidx,vec_splat_u32(3));
2866   idx1      = *((int *)&vidx);
2867   idx2      = *(((int *)&vidx)+1);
2869   eps       = vec_sub(rtab,vec_floor(rtab));
2870   eps2      = vec_madd(eps,eps,vec_zero());
2872   if(((unsigned int)VFtab)%16) {
2873     /* not 16 byte aligned, i.e. must be 8 byte. */
2874     /* use Y,F,G,H as temp storage */
2875     tabd1  = vec_ld( 0, VFtab+idx1);
2876     tabr1  = vec_ld(16, VFtab+idx1);
2877     Y      = vec_ld(32, VFtab+idx1);
2878     tabd1  = vec_sld(tabd1,tabr1,8);
2879     tabr1  = vec_sld(tabr1,Y,8);
2881     tabd2  = vec_ld( 0, VFtab+idx2);
2882     tabr2  = vec_ld(16, VFtab+idx2);
2883     F      = vec_ld(32, VFtab+idx2);
2884     tabd2  = vec_sld(tabd2,tabr2,8);
2885     tabr2  = vec_sld(tabr2,F,8);
2886   } else { /* 16 byte aligned */
2887     tabd1  = vec_ld( 0, VFtab+idx1);
2888     tabr1  = vec_ld(16, VFtab+idx1);
2889     tabd2  = vec_ld( 0, VFtab+idx2);
2890     tabr2  = vec_ld(16, VFtab+idx2);
2893   transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2895   F         = vec_madd(G,eps,F);           /* F + Geps  */
2896   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2897   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2898   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2900   transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2902   F         = vec_madd(G,eps,F);           /* F + Geps  */
2903   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2904   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2905   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: TWO lookups in a combined coul+LJ table (stride 12
 * floats; Coulomb +0, dispersion +16, repulsion +32 bytes). Writes
 * *VVcoul, *VVdisp and *VVrep.
 */
2909 static inline void do_vonly_2_ljctable_coul_and_lj(float *VFtab,
2910                                                    vector float rtab,
2911                                                    vector float *VVcoul,
2912                                                    vector float *VVdisp,
2913                                                    vector float *VVrep)
2915   vector signed int vidx;
2916   vector float Y,F,G,H,eps,eps2;
2917   vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
2918   int idx1,idx2;
2920   vidx      = vec_cts(rtab,0);
2921   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
2922   vidx      = vec_sl(vidx,vec_splat_u32(2));
2924   idx1      = *((int *)&vidx);
2925   idx2      = *(((int *)&vidx)+1);
2927   eps       = vec_sub(rtab,vec_floor(rtab));
2928   eps2      = vec_madd(eps,eps,vec_zero());
2930   if(((unsigned int)VFtab)%16) {
2931     /* not 16-byte aligned, but must be 8 byte. */
2932     /* use Y,F,G,H as temp storage */
2933     tabc1  = vec_ld( 0, VFtab+idx1);
2934     tabd1  = vec_ld(16, VFtab+idx1);
2935     tabr1  = vec_ld(32, VFtab+idx1);
2936     Y      = vec_ld(48, VFtab+idx1);
2937     tabc1  = vec_sld(tabc1,tabd1,8);
2938     tabd1  = vec_sld(tabd1,tabr1,8);
2939     tabr1  = vec_sld(tabr1,Y,8);
2941     tabc2  = vec_ld( 0, VFtab+idx2);
2942     tabd2  = vec_ld(16, VFtab+idx2);
2943     tabr2  = vec_ld(32, VFtab+idx2);
2944     F      = vec_ld(48, VFtab+idx2);
2945     tabc2  = vec_sld(tabc2,tabd2,8);
2946     tabd2  = vec_sld(tabd2,tabr2,8);
2947     tabr2  = vec_sld(tabr2,F,8);
2948   } else { /* 16 byte aligned */
2949     tabc1  = vec_ld( 0, VFtab+idx1);
2950     tabd1  = vec_ld(16, VFtab+idx1);
2951     tabr1  = vec_ld(32, VFtab+idx1);
2952     tabc2  = vec_ld( 0, VFtab+idx2);
2953     tabd2  = vec_ld(16, VFtab+idx2);
2954     tabr2  = vec_ld(32, VFtab+idx2);
2956   transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);
2958   F         = vec_madd(G,eps,F);           /* F + Geps  */
2959   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2960   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2961   *VVcoul   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2963   transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);
2965   F         = vec_madd(G,eps,F);           /* F + Geps  */
2966   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2967   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2968   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
2970   transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);
2972   F         = vec_madd(G,eps,F);           /* F + Geps  */
2973   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
2974   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
2975   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: ONE Coulomb-table lookup (lane 0 of rtab).
 * 4-float stride; writes only *VV.
 */
2980 static inline void do_vonly_1_ctable_coul(float *VFtab,
2981                                           vector float rtab,
2982                                           vector float *VV)
2984   vector signed int vidx;
2985   vector float Y,F,G,H,eps,eps2,tab1;
2986   int idx1;
2988   vidx      = vec_cts(rtab,0);
2989   vidx      = vec_sl(vidx,vec_splat_u32(2));
2991   idx1     = *((int *)&vidx);
2993   eps       = vec_sub(rtab,vec_floor(rtab));
2994   eps2      = vec_madd(eps,eps,vec_zero());
2996   if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
2997     tab1   = vec_ld( 0, VFtab+idx1);
2998     Y      = vec_ld(16, VFtab+idx1);
2999     tab1   = vec_sld(tab1,Y,8);
3000   } else { /* aligned */
3001     tab1=vec_ld(0, VFtab+idx1);
3004   /* table data is aligned */
3005   transpose_1_to_4(tab1,&Y,&F,&G,&H);
3007   F         = vec_madd(G,eps,F);           /* F + Geps  */
3008   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3009   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3010   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
3013 /* do only coulomb, but on a table with both coulomb and lj data */
/* Potential-only: ONE Coulomb lookup in a combined coul+LJ table
 * (stride 12 floats). Writes only *VV.
 */
3014 static inline void do_vonly_1_ljctable_coul(float *VFtab,
3015                                             vector float rtab,
3016                                             vector float *VV)
3018   vector signed int vidx;
3019   vector float Y,F,G,H,eps,eps2,tab1;
3020   int idx1;
3022   vidx      = vec_cts(rtab,0);
3023   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
3024   vidx      = vec_sl(vidx,vec_splat_u32(2));
3026   idx1     = *((int *)&vidx);
3028   eps       = vec_sub(rtab,vec_floor(rtab));
3029   eps2      = vec_madd(eps,eps,vec_zero());
3032   if(((unsigned int)VFtab)%16) {
3033     /* not 16-byte aligned, but must be 8 byte. */
3034     tab1   = vec_ld( 0, VFtab+idx1);
3035     Y      = vec_ld(16, VFtab+idx1);
3036     tab1   = vec_sld(tab1,Y,8);
3037   } else { /* aligned */
3038     tab1=vec_ld(0, VFtab+idx1);
3041   /* table data is aligned */
3042   transpose_1_to_4(tab1,&Y,&F,&G,&H);
3044   F         = vec_madd(G,eps,F);           /* F + Geps  */
3045   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3046   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3047   *VV       = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: ONE lookup in an LJ-only table (stride 8 floats;
 * dispersion +0, repulsion +16 bytes). Writes *VVdisp and *VVrep.
 */
3051 static inline void do_vonly_1_ljtable_lj(float *VFtab,
3052                                          vector float rtab,
3053                                          vector float *VVdisp,
3054                                          vector float *VVrep)
3056   vector signed int vidx;
3057   vector float Y,F,G,H,eps,eps2;
3058   int idx1;
3059   vector float tabd1;
3060   vector float tabr1;
3062   vidx      = vec_cts(rtab,0);
3063   vidx      = vec_sl(vidx,vec_splat_u32(3));
3065   idx1      = *((int *)&vidx);
3067   eps       = vec_sub(rtab,vec_floor(rtab));
3068   eps2      = vec_madd(eps,eps,vec_zero());
3070   if(((unsigned int)VFtab)%16) {
3071     /* not 16 byte aligned, i.e. must be 8 byte. */
3072     /* use Y,F,G,H as temp storage */
3073     tabd1  = vec_ld( 0, VFtab+idx1);
3074     tabr1  = vec_ld(16, VFtab+idx1);
3075     Y      = vec_ld(32, VFtab+idx1);
3076     tabd1  = vec_sld(tabd1,tabr1,8);
3077     tabr1  = vec_sld(tabr1,Y,8);
3078   } else { /* 16 byte aligned */
3079     tabd1  = vec_ld( 0, VFtab+idx1);
3080     tabr1  = vec_ld(16, VFtab+idx1);
3083   transpose_1_to_4(tabd1,&Y,&F,&G,&H);
3085   F         = vec_madd(G,eps,F);           /* F + Geps  */
3086   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3087   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3088   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
3090   transpose_1_to_4(tabr1,&Y,&F,&G,&H);
3092   F         = vec_madd(G,eps,F);           /* F + Geps  */
3093   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3094   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3095   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
/* Potential-only: ONE lookup in a combined coul+LJ table (stride 12
 * floats; Coulomb +0, dispersion +16, repulsion +32 bytes). Writes
 * *VVcoul, *VVdisp and *VVrep.
 */
3099 static inline void do_vonly_1_ljctable_coul_and_lj(float *VFtab,
3100                                                    vector float rtab,
3101                                                    vector float *VVcoul,
3102                                                    vector float *VVdisp,
3103                                                    vector float *VVrep)
3105   vector signed int vidx;
3106   vector float Y,F,G,H,eps,eps2;
3107   vector float tabd1,tabr1,tabc1;
3108   int idx1;
3110   vidx      = vec_cts(rtab,0);
3111   vidx      = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
3112   vidx      = vec_sl(vidx,vec_splat_u32(2));
3114   idx1      = *((int *)&vidx);
3116   eps       = vec_sub(rtab,vec_floor(rtab));
3117   eps2      = vec_madd(eps,eps,vec_zero());
3119   if(((unsigned int)VFtab)%16) {
3120     /* not 16-byte aligned, but must be 8 byte. */
3121     /* use Y,F,G,H as temp storage */
3122     tabc1  = vec_ld( 0, VFtab+idx1);
3123     tabd1  = vec_ld(16, VFtab+idx1);
3124     tabr1  = vec_ld(32, VFtab+idx1);
3125     Y      = vec_ld(48, VFtab+idx1);
3126     tabc1  = vec_sld(tabc1,tabd1,8);
3127     tabd1  = vec_sld(tabd1,tabr1,8);
3128     tabr1  = vec_sld(tabr1,Y,8);
3129   } else { /* 16 byte aligned */
3130     tabc1  = vec_ld( 0, VFtab+idx1);
3131     tabd1  = vec_ld(16, VFtab+idx1);
3132     tabr1  = vec_ld(32, VFtab+idx1);
3134   transpose_1_to_4(tabc1,&Y,&F,&G,&H);
3136   F         = vec_madd(G,eps,F);           /* F + Geps  */
3137   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3138   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3139   *VVcoul   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
3141   transpose_1_to_4(tabd1,&Y,&F,&G,&H);
3143   F         = vec_madd(G,eps,F);           /* F + Geps  */
3144   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3145   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3146   *VVdisp   = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
3148   transpose_1_to_4(tabr1,&Y,&F,&G,&H);
3150   F         = vec_madd(G,eps,F);           /* F + Geps  */
3151   H         = vec_madd(H,eps2,vec_zero()); /* Heps2 */
3152   F         = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
3153   *VVrep    = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
3159 static inline vector float do_recip(vector float rsq)
3161 vector float tmp,lu;
3163 lu = vec_re(rsq);
3164 tmp = vec_nmsub(rsq,lu,vec_two());
3165 return vec_madd(lu,tmp,vec_zero());
3168 static inline vector float do_invsqrt(vector float rsq)
3170 vector float lu,tmpA,tmpB;
3172 lu = vec_rsqrte(rsq);
3173 tmpA = vec_madd(lu,lu,vec_zero());
3174 tmpB = vec_madd(lu,vec_half(),vec_zero());
3175 tmpA = vec_nmsub(rsq,tmpA,vec_one());
3176 return vec_madd(tmpA,tmpB,lu);
3179 static inline void do_3_invsqrt(vector float rsq1,
3180 vector float rsq2,
3181 vector float rsq3,
3182 vector float *rinvsq1,
3183 vector float *rinvsq2,
3184 vector float *rinvsq3)
3186 vector float lu1,lu2,lu3;
3187 vector float tmpA1,tmpA2,tmpA3,tmpB1,tmpB2,tmpB3;
3188 vector float nul=vec_zero();
3189 vector float half=vec_half();
3190 vector float one=vec_one();
3192 lu1 = vec_rsqrte(rsq1);
3193 lu2 = vec_rsqrte(rsq2);
3194 lu3 = vec_rsqrte(rsq3);
3195 tmpA1 = vec_madd(lu1,lu1,nul);
3196 tmpA2 = vec_madd(lu2,lu2,nul);
3197 tmpA3 = vec_madd(lu3,lu3,nul);
3198 tmpB1 = vec_madd(lu1,half,nul);
3199 tmpB2 = vec_madd(lu2,half,nul);
3200 tmpB3 = vec_madd(lu3,half,nul);
3201 tmpA1 = vec_nmsub(rsq1,tmpA1,one);
3202 tmpA2 = vec_nmsub(rsq2,tmpA2,one);
3203 tmpA3 = vec_nmsub(rsq3,tmpA3,one);
3204 *rinvsq1 = vec_madd(tmpA1,tmpB1,lu1);
3205 *rinvsq2 = vec_madd(tmpA2,tmpB2,lu2);
3206 *rinvsq3 = vec_madd(tmpA3,tmpB3,lu3);
3209 static inline void do_9_invsqrt(vector float rsq1,
3210 vector float rsq2,
3211 vector float rsq3,
3212 vector float rsq4,
3213 vector float rsq5,
3214 vector float rsq6,
3215 vector float rsq7,
3216 vector float rsq8,
3217 vector float rsq9,
3218 vector float *rinv1,
3219 vector float *rinv2,
3220 vector float *rinv3,
3221 vector float *rinv4,
3222 vector float *rinv5,
3223 vector float *rinv6,
3224 vector float *rinv7,
3225 vector float *rinv8,
3226 vector float *rinv9)
3228 vector float lu1,lu2,lu3,lu4,lu5,lu6,lu7,lu8,lu9;
3229 vector float tmpA1,tmpA2,tmpA3,tmpA4,tmpA5,tmpA6,tmpA7,tmpA8,tmpA9;
3230 vector float tmpB1,tmpB2,tmpB3,tmpB4,tmpB5,tmpB6,tmpB7,tmpB8,tmpB9;
3231 vector float nul=vec_zero();
3232 vector float half=vec_half();
3233 vector float one=vec_one();
3235 lu1 = vec_rsqrte(rsq1);
3236 lu2 = vec_rsqrte(rsq2);
3237 lu3 = vec_rsqrte(rsq3);
3238 lu4 = vec_rsqrte(rsq4);
3239 lu5 = vec_rsqrte(rsq5);
3240 lu6 = vec_rsqrte(rsq6);
3241 lu7 = vec_rsqrte(rsq7);
3242 lu8 = vec_rsqrte(rsq8);
3243 lu9 = vec_rsqrte(rsq9);
3244 tmpA1 = vec_madd(lu1,lu1,nul);
3245 tmpA2 = vec_madd(lu2,lu2,nul);
3246 tmpA3 = vec_madd(lu3,lu3,nul);
3247 tmpA4 = vec_madd(lu4,lu4,nul);
3248 tmpA5 = vec_madd(lu5,lu5,nul);
3249 tmpA6 = vec_madd(lu6,lu6,nul);
3250 tmpA7 = vec_madd(lu7,lu7,nul);
3251 tmpA8 = vec_madd(lu8,lu8,nul);
3252 tmpA9 = vec_madd(lu9,lu9,nul);
3253 tmpB1 = vec_madd(lu1,half,nul);
3254 tmpB2 = vec_madd(lu2,half,nul);
3255 tmpB3 = vec_madd(lu3,half,nul);
3256 tmpB4 = vec_madd(lu4,half,nul);
3257 tmpB5 = vec_madd(lu5,half,nul);
3258 tmpB6 = vec_madd(lu6,half,nul);
3259 tmpB7 = vec_madd(lu7,half,nul);
3260 tmpB8 = vec_madd(lu8,half,nul);
3261 tmpB9 = vec_madd(lu9,half,nul);
3262 tmpA1 = vec_nmsub(rsq1,tmpA1,one);
3263 tmpA2 = vec_nmsub(rsq2,tmpA2,one);
3264 tmpA3 = vec_nmsub(rsq3,tmpA3,one);
3265 tmpA4 = vec_nmsub(rsq4,tmpA4,one);
3266 tmpA5 = vec_nmsub(rsq5,tmpA5,one);
3267 tmpA6 = vec_nmsub(rsq6,tmpA6,one);
3268 tmpA7 = vec_nmsub(rsq7,tmpA7,one);
3269 tmpA8 = vec_nmsub(rsq8,tmpA8,one);
3270 tmpA9 = vec_nmsub(rsq9,tmpA9,one);
3271 *rinv1 = vec_madd(tmpA1,tmpB1,lu1);
3272 *rinv2 = vec_madd(tmpA2,tmpB2,lu2);
3273 *rinv3 = vec_madd(tmpA3,tmpB3,lu3);
3274 *rinv4 = vec_madd(tmpA4,tmpB4,lu4);
3275 *rinv5 = vec_madd(tmpA5,tmpB5,lu5);
3276 *rinv6 = vec_madd(tmpA6,tmpB6,lu6);
3277 *rinv7 = vec_madd(tmpA7,tmpB7,lu7);
3278 *rinv8 = vec_madd(tmpA8,tmpB8,lu8);
3279 *rinv9 = vec_madd(tmpA9,tmpB9,lu9);
3280 /* 36 invsqrt in about 48 cycles due to pipelining ... pretty fast :-) */
3284 static inline void zero_highest_element_in_vector(vector float *v)
3286 vector signed int zero = (vector signed int) vec_zero();
3288 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4));
3291 static inline void zero_highest_2_elements_in_vector(vector float *v)
3293 vector signed int zero = (vector signed int) vec_zero();
3295 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8));
3298 static inline void zero_highest_3_elements_in_vector(vector float *v)
3300 vector signed int zero = (vector signed int) vec_zero();
3302 *v = (vector float)vec_sel((vector signed int)*v,zero,(vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12));
3306 static inline void zero_highest_element_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3308 vector signed int zero = (vector signed int) vec_zero();
3309 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);
3311 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3312 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3313 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3316 static inline void zero_highest_2_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3318 vector signed int zero = (vector signed int) vec_zero();
3319 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);
3321 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3322 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3323 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3326 static inline void zero_highest_3_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
3328 vector signed int zero = (vector signed int) vec_zero();
3329 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);
3331 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3332 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3333 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3336 static inline void zero_highest_element_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3337 vector float *v4,vector float *v5,vector float *v6,
3338 vector float *v7,vector float *v8,vector float *v9)
3340 vector signed int zero = (vector signed int) vec_zero();
3341 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);
3343 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3344 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3345 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3346 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3347 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3348 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3349 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3350 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3351 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
3354 static inline void zero_highest_2_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3355 vector float *v4,vector float *v5,vector float *v6,
3356 vector float *v7,vector float *v8,vector float *v9)
3358 vector signed int zero = (vector signed int) vec_zero();
3359 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);
3361 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3362 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3363 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3364 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3365 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3366 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3367 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3368 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3369 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
3372 static inline void zero_highest_3_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
3373 vector float *v4,vector float *v5,vector float *v6,
3374 vector float *v7,vector float *v8,vector float *v9)
3376 vector signed int zero = (vector signed int) vec_zero();
3377 vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);
3379 *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
3380 *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
3381 *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
3382 *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
3383 *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
3384 *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
3385 *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
3386 *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
3387 *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
/* Prototypes for the Altivec force+energy inner loops.
 * Naming follows the GROMACS inner-loop convention inl<coul><vdw><water>:
 * first digit = coulomb type (0 none, 1 plain, 2 reaction field,
 * 3 tabulated), second = vdw type, third = water optimization.
 */
void inl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],int type[],int ntype,float nbfp[],
		     float Vnb[]);
void inl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],int type[],int ntype,float nbfp[],
		     float Vnb[],float tabscale,float VFtab[]);
void inl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[]);
void inl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[]);
void inl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[]);
void inl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[]);
void inl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf);
void inl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf);
void inl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf);
void inl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf, int type[],int ntype,
		     float nbfp[],float Vnb[]);
void inl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf, int type[],int ntype,
		     float nbfp[],float Vnb[]);
void inl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float krf, float crf, int type[],int ntype,
		     float nbfp[],float Vnb[]);
void inl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float tabscale,float VFtab[]);
void inl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float tabscale,float VFtab[]);
void inl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     float tabscale,float VFtab[]);
void inl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale, float VFtab[]);
void inl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale, float VFtab[]);
void inl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale, float VFtab[]);
void inl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale,float VFtab[]);
void inl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale,float VFtab[]);
void inl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
		     float shiftvec[],float fshift[],int gid[],float pos[],
		     float faction[],float charge[],float facel,float Vc[],
		     int type[],int ntype,float nbfp[],float Vnb[],
		     float tabscale,float VFtab[]);
3490 /*---*/
3492 void mcinl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3493 float shiftvec[],int gid[],float pos[],int type[],
3494 int ntype,float nbfp[],float Vnb[]);
3495 void mcinl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3496 float shiftvec[],int gid[],float pos[],int type[],
3497 int ntype,float nbfp[],float Vnb[],
3498 float tabscale,float VFtab[]);
3499 void mcinl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3500 float shiftvec[],int gid[],float pos[],
3501 float charge[],float facel,float Vc[]);
3502 void mcinl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3503 float shiftvec[],int gid[],float pos[],
3504 float charge[],float facel,float Vc[]);
3505 void mcinl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3506 float shiftvec[],int gid[],float pos[],
3507 float charge[],float facel,float Vc[]);
3508 void mcinl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3509 float shiftvec[],int gid[],float pos[],
3510 float charge[],float facel,float Vc[],
3511 int type[],int ntype,float nbfp[],float Vnb[]);
3512 void mcinl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3513 float shiftvec[],int gid[],float pos[],
3514 float charge[],float facel,float Vc[],
3515 int type[],int ntype,float nbfp[],float Vnb[]);
3516 void mcinl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3517 float shiftvec[],int gid[],float pos[],
3518 float charge[],float facel,float Vc[],
3519 int type[],int ntype,float nbfp[],float Vnb[]);
3520 void mcinl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3521 float shiftvec[],int gid[],float pos[],
3522 float charge[],float facel,float Vc[],
3523 float krf, float crf);
3524 void mcinl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3525 float shiftvec[],int gid[],float pos[],
3526 float charge[],float facel,float Vc[],
3527 float krf, float crf);
3528 void mcinl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3529 float shiftvec[],int gid[],float pos[],
3530 float charge[],float facel,float Vc[],
3531 float krf, float crf);
3532 void mcinl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3533 float shiftvec[],int gid[],float pos[],
3534 float charge[],float facel,float Vc[],
3535 float krf, float crf, int type[],int ntype,
3536 float nbfp[],float Vnb[]);
3537 void mcinl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3538 float shiftvec[],int gid[],float pos[],
3539 float charge[],float facel,float Vc[],
3540 float krf, float crf, int type[],int ntype,
3541 float nbfp[],float Vnb[]);
3542 void mcinl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3543 float shiftvec[],int gid[],float pos[],
3544 float charge[],float facel,float Vc[],
3545 float krf, float crf, int type[],int ntype,
3546 float nbfp[],float Vnb[]);
3547 void mcinl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3548 float shiftvec[],int gid[],float pos[],
3549 float charge[],float facel,float Vc[],
3550 float tabscale,float VFtab[]);
3551 void mcinl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3552 float shiftvec[],int gid[],float pos[],
3553 float charge[],float facel,float Vc[],
3554 float tabscale,float VFtab[]);
3555 void mcinl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3556 float shiftvec[],int gid[],float pos[],
3557 float charge[],float facel,float Vc[],
3558 float tabscale,float VFtab[]);
3559 void mcinl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3560 float shiftvec[],int gid[],float pos[],
3561 float charge[],float facel,float Vc[],
3562 int type[],int ntype,float nbfp[],float Vnb[],
3563 float tabscale, float VFtab[]);
3564 void mcinl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3565 float shiftvec[],int gid[],float pos[],
3566 float charge[],float facel,float Vc[],
3567 int type[],int ntype,float nbfp[],float Vnb[],
3568 float tabscale, float VFtab[]);
3569 void mcinl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3570 float shiftvec[],int gid[],float pos[],
3571 float charge[],float facel,float Vc[],
3572 int type[],int ntype,float nbfp[],float Vnb[],
3573 float tabscale, float VFtab[]);
3574 void mcinl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3575 float shiftvec[],int gid[],float pos[],
3576 float charge[],float facel,float Vc[],
3577 int type[],int ntype,float nbfp[],float Vnb[],
3578 float tabscale,float VFtab[]);
3579 void mcinl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3580 float shiftvec[],int gid[],float pos[],
3581 float charge[],float facel,float Vc[],
3582 int type[],int ntype,float nbfp[],float Vnb[],
3583 float tabscale,float VFtab[]);
3584 void mcinl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
3585 float shiftvec[],int gid[],float pos[],
3586 float charge[],float facel,float Vc[],
3587 int type[],int ntype,float nbfp[],float Vnb[],
3588 float tabscale,float VFtab[]);
3592 #endif /* ppc_altivec.h */