/*
 * This source code is part of
 *
 * GROningen MAchine for Chemical Simulations
 *
 * Copyright (c) 1991-2001, University of Groningen, The Netherlands
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * Getting the Right Output Means no Artefacts in Calculating Stuff
 */
#ifndef _ppc_altivec_h
#define _ppc_altivec_h

#include <stdio.h>
static void printvec(vector float v)
{
  int i;

  for(i=0;i<4;i++)
    printf("%8.5f ",*(((float *)&v)+i));
}
/* Altivec utility routines */
static void set_non_java_mode(void)
{
  vector unsigned short vsr1,vsr2;
  vector unsigned int tmp;

  vsr1 = vec_mfvscr();
  tmp  = vec_sl(vec_splat_u32(1),vec_splat_u32(8));
  vsr2 = (vector unsigned short)vec_sl(tmp,vec_splat_u32(8));
  vsr1 = vec_or(vsr1,vsr2);
  vec_mtvscr(vsr1);
}
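/* Illustrative note (not part of the original header): the constant
 * assembled above is (1 << 8) << 8 = 0x00010000, the VSCR non-Java (NJ)
 * bit, so OR-ing it into the saved status register makes AltiVec flush
 * denormals to zero instead of taking the slower Java-conformant path.
 */
enum { GMX_VSCR_NJ_BIT_SKETCH = (1 << 8) << 8 }; /* hypothetical name, 0x00010000 */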
/* Simple numerical constants */
static inline vector float vec_zero(void)
{
  return vec_ctf(vec_splat_u32(0),0);
}

static inline vector float vec_half(void)
{
  return vec_ctf(vec_splat_u32(1),1);
}

static inline vector float vec_one(void)
{
  return vec_ctf(vec_splat_u32(1),0);
}

static inline vector float vec_two(void)
{
  return vec_ctf(vec_splat_u32(2),0);
}

static inline vector float vec_three(void)
{
  return vec_ctf(vec_splat_u32(3),0);
}

static inline vector float vec_six(void)
{
  return vec_ctf(vec_splat_u32(6),0);
}

static inline vector float vec_twelve(void)
{
  return vec_ctf(vec_splat_u32(12),0);
}
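/* Illustrative sketch (not in the original header): vec_ctf(v,s) converts
 * each signed integer element to float and scales by 2^-s, which is how
 * vec_half() above yields 0.5f from the integer 1. A hypothetical
 * vec_quarter() built the same way:
 */
static inline vector float vec_quarter(void)
{
  return vec_ctf(vec_splat_u32(1),2); /* 1 * 2^-2 = 0.25 in all elements */
}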
/* load three consecutive floats. We never access the fourth,
 * so this is safe even at the end of an array.
 */
static inline vector float load_xyz(float *address)
{
  vector float c1,c2,c3;
  vector unsigned char perm;

  perm = vec_lvsl( 0, address );
  c1   = vec_lde( 0, address );
  c2   = vec_lde( 4, address );
  c3   = vec_lde( 8, address );
  c1   = vec_perm(c1,c1,perm);
  c2   = vec_perm(c2,c2,perm);
  c3   = vec_perm(c3,c3,perm);
  c2   = vec_sld(c2,c2,4);
  c3   = vec_sld(c3,c3,8);
  c1   = vec_mergeh(c1,c3);

  return vec_mergeh(c1,c2);
}
/* load four consecutive floats (from an unaligned address) */
static inline vector float load_vector_unaligned(float *address)
{
  vector unsigned char perm;
  vector float low,high;

  perm = vec_lvsl( 0, (int *) address );
  low  = vec_ld( 0, address );
  high = vec_ld( 16, address );

  return vec_perm(low,high,perm);
}
/* load a single float and spread it to all vector elements */
static inline vector float load_float_and_splat(float *address)
{
  vector unsigned char perm;
  vector float tmp;

  tmp  = vec_lde(0,address);
  perm = vec_lvsl(0,address);
  tmp  = vec_perm(tmp,tmp,perm);

  return vec_splat(tmp,0);
}
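/* Usage sketch (illustrative, assuming an AltiVec target): broadcast a
 * scalar charge with load_float_and_splat() and scale a coordinate
 * triplet loaded with load_xyz(). The function name is hypothetical.
 */
static inline vector float scale_xyz_sketch(float *charge,float *xyz)
{
  vector float vq = load_float_and_splat(charge); /* [q q q q] */
  vector float vx = load_xyz(xyz);                /* [x y z ?] */

  return vec_madd(vq,vx,vec_zero());              /* [qx qy qz ?] */
}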
/* load four non-consecutive floats into vector [mem1 mem2 mem3 mem4] */
static inline vector float load_4_float(float *float1,
                                        float *float2,
                                        float *float3,
                                        float *float4)
{
  vector unsigned char xshift = vec_lvsl( 12, float1 );
  vector unsigned char yshift = vec_lvsl( 12, float2 );
  vector unsigned char zshift = vec_lvsl( 0, float3 );
  vector unsigned char wshift = vec_lvsl( 0, float4 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );
  vector float Z = vec_lde( 0, float3 );
  vector float W = vec_lde( 0, float4 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);
  Z = vec_perm( Z, Z, zshift);
  W = vec_perm( W, W, wshift);

  X = vec_mergeh( X, Y );
  Z = vec_mergeh( Z, W );

  return vec_sld( X, Z, 8 );
}
/* load three non-consecutive floats into vector [mem1 mem2 mem3 -] */
static inline vector float load_3_float(float *float1,
                                        float *float2,
                                        float *float3)
{
  vector unsigned char xshift = vec_lvsl( 12, float1 );
  vector unsigned char yshift = vec_lvsl( 12, float2 );
  vector unsigned char zshift = vec_lvsl( 0, float3 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );
  vector float Z = vec_lde( 0, float3 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);
  Z = vec_perm( Z, Z, zshift);

  X = vec_mergeh( X, Y );

  return vec_sld( X, Z, 8 );
}
/* load two non-consecutive floats into vector [mem1 mem2 - -] */
static inline vector float load_2_float(float *float1,
                                        float *float2)
{
  vector unsigned char xshift = vec_lvsl( 8, float1 );
  vector unsigned char yshift = vec_lvsl( 8, float2 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);

  return vec_mergel( X, Y );
}
/* load one float into vector [mem1 0 0 0] */
static inline vector float load_1_float(float *float1)
{
  vector unsigned char xshift = vec_lvsl( 4, float1 );

  vector float X = vec_lde( 0, float1 );
  X = vec_perm( X, X, xshift);

  return vec_sld(X,vec_zero(),12);
}
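/* Worked instance of the vec_lvsl trick above (illustrative, not part of
 * the original header): vec_perm(X,X,vec_lvsl(n,p)) rotates X left by
 * (p&15)+n bytes, so the element loaded by vec_lde can be steered into
 * any lane. With offset 8 the float lands in element 2 instead of the
 * element 3 used by load_1_float(). The function name is hypothetical.
 */
static inline vector float load_1_float_to_elem2_sketch(float *float1)
{
  vector unsigned char xshift = vec_lvsl( 8, float1 );

  vector float X = vec_lde( 0, float1 );

  return vec_perm( X, X, xshift); /* [ ? ? mem1 ? ] */
}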
/* load lj parameters for four atoms and store in two vectors.
 * The nbfp memory is aligned on an 8-byte boundary and consists
 * of pairs, so the two parameters are either in the upper or
 * lower half of a 16-byte structure.
 */
static inline void load_4_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               float *pair4,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1);  /* c6a c12a */
  vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
  vector float Z = vec_ld( 0,pair3);  /* c6c c12c */
  vector float W = vec_ld( 0,pair4);  /* c6d c12d */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);
  vector unsigned char perm3 = vec_lvsl(0,pair3);
  vector unsigned char perm4 = vec_lvsl(0,pair4);
  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);
  Z = vec_perm(Z,Z,perm3);
  W = vec_perm(W,W,perm4);

  X = vec_mergeh(X,Z);          /* c6a c6c c12a c12c */
  Y = vec_mergeh(Y,W);          /* c6b c6d c12b c12d */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
/* load lj parameters for three atoms and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_3_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1);  /* c6a c12a */
  vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
  vector float Z = vec_ld( 0,pair3);  /* c6c c12c */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);
  vector unsigned char perm3 = vec_lvsl(0,pair3);
  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);
  Z = vec_perm(Z,Z,perm3);

  X = vec_mergeh(X,Z);          /* c6a c6c c12a c12c */
  Y = vec_mergeh(Y,vec_zero()); /* c6b 0 c12b 0 */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
/* load lj parameters for two atoms and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_2_pair(float *pair1,
                               float *pair2,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1);  /* c6a c12a */
  vector float Y = vec_ld( 0,pair2);  /* c6b c12b */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);
  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);

  X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */
  Y = vec_mergeh(Y,vec_zero()); /* c6b 0 c12b 0 */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
/* load lj parameters for one atom and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_1_pair(float *pair1,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1);  /* c6a c12a */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  X = vec_perm(X,X,perm1);
  X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */

  *c6  = vec_mergeh(X,vec_zero());
  *c12 = vec_mergel(X,vec_zero());
}
/*
 * permute [x y z ?] into [x x x x] [y y y y] [z z z z]
 * xreg can be the same as xyzreg, but not yreg/zreg!
 */
static inline void splat_xyz_to_vectors(vector float xyzreg,
                                        vector float *xreg,
                                        vector float *yreg,
                                        vector float *zreg)
{
  *zreg = vec_splat(xyzreg,2);
  *yreg = vec_splat(xyzreg,1);
  *xreg = vec_splat(xyzreg,0);
}
/* Several of the following transpose routines do not fill the unused
 * elements with zero padding - the rest do.
 */
/* move 4*[x y z ?] into [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4]
 */
static inline void transpose_4_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float xyz3,
                                    vector float xyz4,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  *yvector = vec_mergeh(xyz1,xyz3);         /* [x1 x3 y1 y3] */
  *zvector = vec_mergeh(xyz2,xyz4);         /* [x2 x4 y2 y4] */
  xyz1     = vec_mergel(xyz1,xyz3);         /* [z1 z3 ? ?]   */
  xyz2     = vec_mergel(xyz2,xyz4);         /* [z2 z4 ? ?]   */
  *xvector = vec_mergeh(*yvector,*zvector); /* [x1 x2 x3 x4] */
  *yvector = vec_mergel(*yvector,*zvector); /* [y1 y2 y3 y4] */
  *zvector = vec_mergeh(xyz1,xyz2);         /* [z1 z2 z3 z4] */
}
/* move 2*[x y z ?] into [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?]
 */
static inline void transpose_2_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  *xvector = vec_mergeh(xyz1,xyz2);          /* x1 x2 y1 y2 */
  *zvector = vec_mergel(xyz1,xyz2);          /* z1 z2 ? ? */
  *yvector = vec_sld(*xvector,*xvector,8);   /* y1 y2 x1 x2 */
}
/* move [x y z ?] into [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?]
 */
static inline void transpose_1_to_3(vector float xyz1,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  /* simply use splat, since elements 2,3,4 dont matter. */
  *xvector = vec_splat(xyz1,0);   /* x1 x1 x1 x1 */
  *yvector = vec_splat(xyz1,1);   /* y1 y1 y1 y1 */
  *zvector = vec_splat(xyz1,2);   /* z1 z1 z1 z1 */
}
/* move [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] to 4* [x y z 0]
 */
static inline void transpose_3_to_4(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2,
                                    vector float *xyz3,
                                    vector float *xyz4)
{
  vector float tmp1,tmp2;
  vector float nul=vec_zero();

  *xyz2 = vec_mergeh(xvector,zvector);   /* [x1 z1 x2 z2 ] */
  tmp1  = vec_mergeh(yvector,nul);       /* [y1 0  y2 0  ] */
  *xyz4 = vec_mergel(xvector,zvector);   /* [x3 z3 x4 z4 ] */
  tmp2  = vec_mergel(yvector,nul);       /* [y3 0  y4 0  ] */

  *xyz1 = vec_mergeh(*xyz2,tmp1);        /* [x1 y1 z1 0 ] */
  *xyz2 = vec_mergel(*xyz2,tmp1);        /* [x2 y2 z2 0 ] */
  *xyz3 = vec_mergeh(*xyz4,tmp2);        /* [x3 y3 z3 0 ] */
  *xyz4 = vec_mergel(*xyz4,tmp2);        /* [x4 y4 z4 0 ] */
}
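/* Usage sketch (illustrative, not part of the original header):
 * transpose_4_to_3 and transpose_3_to_4 are inverses up to the padding
 * lane, so they bracket a typical gather-compute-scatter step on four
 * xyz-packed coordinates. The function name is hypothetical.
 */
static inline void increment_4_xyz_sketch(vector float *xyz1,vector float *xyz2,
                                          vector float *xyz3,vector float *xyz4,
                                          vector float dx,vector float dy,
                                          vector float dz)
{
  vector float x,y,z;

  transpose_4_to_3(*xyz1,*xyz2,*xyz3,*xyz4,&x,&y,&z);
  x = vec_add(x,dx);
  y = vec_add(y,dy);
  z = vec_add(z,dz);
  transpose_3_to_4(x,y,z,xyz1,xyz2,xyz3,xyz4);
}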
/* move [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] to 2* [x y z 0]
 */
static inline void transpose_3_to_2(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2)
{
  vector float tmp1;

  *xyz2 = vec_mergeh(xvector,zvector);    /* [x1 z1 x2 z2 ] */
  tmp1  = vec_mergeh(yvector,vec_zero()); /* [y1 0  y2 0  ] */

  *xyz1 = vec_mergeh(*xyz2,tmp1);         /* [x1 y1 z1 0 ] */
  *xyz2 = vec_mergel(*xyz2,tmp1);         /* [x2 y2 z2 0 ] */
}
/* move [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?] to 1* [x y z 0]
 */
static inline void transpose_3_to_1(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1)
{
  *xyz1   = vec_mergeh(xvector,zvector);    /* [x1 z1 ? ? ] */
  yvector = vec_mergeh(yvector,vec_zero()); /* [y1 0  ? 0 ] */
  *xyz1   = vec_mergeh(*xyz1,yvector);      /* [x1 y1 z1 0] */
}
/* transpose [x1 y1 z1 w1] [x2 y2 z2 w2] [x3 y3 z3 w3] [x4 y4 z4 w4]
 * into [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] [w1 w2 w3 w4]
 */
static inline void transpose_4_to_4(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out2 = vec_mergeh(in1,in3);       /* [x1 x3 y1 y3] */
  *out3 = vec_mergeh(in2,in4);       /* [x2 x4 y2 y4] */
  in1   = vec_mergel(in1,in3);       /* [z1 z3 w1 w3] */
  in2   = vec_mergel(in2,in4);       /* [z2 z4 w2 w4] */

  *out1 = vec_mergeh(*out2,*out3);   /* [x1 x2 x3 x4] */
  *out2 = vec_mergel(*out2,*out3);   /* [y1 y2 y3 y4] */
  *out3 = vec_mergeh(in1,in2);       /* [z1 z2 z3 z4] */
  *out4 = vec_mergel(in1,in2);       /* [w1 w2 w3 w4] */
}
/* transpose the first two elements of four vectors into
 * [x1 x2 x3 x4] [y1 y2 y3 y4]
 */
static inline void transpose_4_to_2(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2)
{
  vector float tmp;

  tmp   = vec_mergeh(in1,in3);   /* [x1 x3 y1 y3] */
  *out2 = vec_mergeh(in2,in4);   /* [x2 x4 y2 y4] */
  *out1 = vec_mergeh(tmp,*out2); /* [x1 x2 x3 x4] */
  *out2 = vec_mergel(tmp,*out2); /* [y1 y2 y3 y4] */
}
/* collect element 0 of four vectors into [x1 x2 x3 x4] */
static inline void transpose_4_to_1(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1)
{
  vector float tmp;

  tmp   = vec_mergeh(in1,in3);   /* [x1 x3 ? ?] */
  *out1 = vec_mergeh(in2,in4);   /* [x2 x4 ? ?] */
  *out1 = vec_mergeh(tmp,*out1); /* [x1 x2 x3 x4] */
}
/* transpose [x1 y1 z1 w1] [x2 y2 z2 w2] into
 * [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] [w1 w2 ? ?]
 */
static inline void transpose_2_to_4(vector float in1,
                                    vector float in2,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out1 = vec_mergeh(in1,in2);       /* [x1 x2 y1 y2] */
  *out3 = vec_mergel(in1,in2);       /* [z1 z2 w1 w2] */
  *out2 = vec_sld(*out1,*out1,8);    /* [y1 y2 x1 x2] */
  *out4 = vec_sld(*out3,*out3,8);    /* [w1 w2 z1 z2] */
}
/* splat [x1 y1 z1 w1] into four vectors of identical elements */
static inline void transpose_1_to_4(vector float in1,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out1 = vec_splat(in1,0);   /* [x1 x1 x1 x1] */
  *out2 = vec_splat(in1,1);   /* [y1 y1 y1 y1] */
  *out3 = vec_splat(in1,2);   /* [z1 z1 z1 z1] */
  *out4 = vec_splat(in1,3);   /* [w1 w1 w1 w1] */
}
/* Add the contents in xyz to an unaligned address location.
 */
static inline void add_xyz_to_mem(float *address,vector float vdata)
{
  vector float c1,c2,c3;

  c1 = vec_lde( 0, address);
  c2 = vec_lde( 4, address);
  c3 = vec_lde( 8, address);
  c1 = vec_add(c1,vec_splat(vdata,0));
  c2 = vec_add(c2,vec_splat(vdata,1));
  c3 = vec_add(c3,vec_splat(vdata,2));
  vec_ste( c1, 0, address);
  vec_ste( c2, 4, address);
  vec_ste( c3, 8, address);
}
/* sum the four elements of a vector and add the result to a single
 * float in memory
 */
static inline void add_vector_to_float(float *address,vector float vdata)
{
  vector float tmp;

  tmp   = vec_sld(vdata,vdata,8);
  vdata = vec_add(vdata,tmp);
  tmp   = vec_sld(vdata,vdata,4);
  vdata = vec_add(vdata,tmp);
  tmp   = vec_lde(0,address);
  /* all four positions in vdata contain the sum */
  tmp   = vec_add(tmp,vdata);
  vec_ste(tmp,0,address);
}
/*
 * load entire water molecule to vectors. Water is special;
 * the allocated memory is 8-byte aligned, so with nine atoms
 * we can always read 3 16-byte chunks (rounded down by vec_ld)
 * without going outside the current page. The extra elements
 * before and after the water dont matter as long as we dont
 * try to write to them.
 */
static inline void load_1_water_shift_and_splat(float *address,
                                                float *shiftvec,
                                                vector float *Ox,
                                                vector float *Oy,
                                                vector float *Oz,
                                                vector float *H1x,
                                                vector float *H1y,
                                                vector float *H1z,
                                                vector float *H2x,
                                                vector float *H2y,
                                                vector float *H2z)
{
  vector unsigned char perm;
  vector float c1,c2,c3,sha,shb,sh1,sh2,sh3;

  /* load the shift vector */
  shb  = load_xyz(shiftvec);       /* [ shX shY shZ - ] */
  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address );
  c1   = vec_ld( 0, address );
  c2   = vec_ld( 16, address );
  c3   = vec_ld( 32, address );

  sha  = vec_sld(shb,shb,12);      /* [ - shX shY shZ ] */

  sh1  = vec_sld(sha,shb,4);       /* [ shX shY shZ shX ] */
  sh2  = vec_sld(sha,shb,8);       /* [ shY shZ shX shY ] */
  sh3  = vec_sld(sha,shb,12);      /* [ shZ shX shY shZ ] */

  c1   = vec_perm(c1,c2,perm);     /* Ox  Oy  Oz  H1x */
  c2   = vec_perm(c2,c3,perm);     /* H1y H1z H2x H2y */
  c3   = vec_perm(c3,c3,perm);     /* H2z -   -   -   */
  c1   = vec_add(c1,sh1);
  c2   = vec_add(c2,sh2);
  c3   = vec_add(c3,sh3);

  *Ox  = vec_splat(c1,0);
  *Oy  = vec_splat(c1,1);
  *Oz  = vec_splat(c1,2);
  *H1x = vec_splat(c1,3);
  *H1y = vec_splat(c2,0);
  *H1z = vec_splat(c2,1);
  *H2x = vec_splat(c2,2);
  *H2y = vec_splat(c2,3);
  *H2z = vec_splat(c3,0);
}
static inline void load_4_water(float *address1,
                                float *address2,
                                float *address3,
                                float *address4,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;
  vector float c1c,c2c,c3c;
  vector float c1d,c2d,c3d;

  /* load the coordinates */
  perm = vec_lvsl( 0, address1 );
  tmp1 = vec_ld( 0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa Oya Oza H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za - - - */

  perm = vec_lvsl( 0, address2 );
  tmp1 = vec_ld( 0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb Oyb Ozb H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb - - - */

  perm = vec_lvsl( 0, address3 );
  tmp1 = vec_ld( 0, address3 );
  tmp2 = vec_ld( 16, address3 );
  tmp3 = vec_ld( 32, address3 );
  c1c  = vec_perm(tmp1,tmp2,perm);   /* Oxc Oyc Ozc H1xc */
  c2c  = vec_perm(tmp2,tmp3,perm);   /* H1yc H1zc H2xc H2yc */
  c3c  = vec_perm(tmp3,tmp3,perm);   /* H2zc - - - */

  perm = vec_lvsl( 0, address4 );
  tmp1 = vec_ld( 0, address4 );
  tmp2 = vec_ld( 16, address4 );
  tmp3 = vec_ld( 32, address4 );
  c1d  = vec_perm(tmp1,tmp2,perm);   /* Oxd Oyd Ozd H1xd */
  c2d  = vec_perm(tmp2,tmp3,perm);   /* H1yd H1zd H2xd H2yd */
  c3d  = vec_perm(tmp3,tmp3,perm);   /* H2zd - - - */

  /* permute things */
  tmp1 = vec_mergeh(c1a,c1c);        /* Oxa Oxc Oya Oyc */
  c1a  = vec_mergel(c1a,c1c);        /* Oza Ozc H1xa H1xc */
  c1c  = vec_mergeh(c1b,c1d);        /* Oxb Oxd Oyb Oyd */
  c1b  = vec_mergel(c1b,c1d);        /* Ozb Ozd H1xb H1xd */

  c1d  = vec_mergeh(c2a,c2c);        /* H1ya H1yc H1za H1zc */
  c2a  = vec_mergel(c2a,c2c);        /* H2xa H2xc H2ya H2yc */
  c2c  = vec_mergeh(c2b,c2d);        /* H1yb H1yd H1zb H1zd */
  c2b  = vec_mergel(c2b,c2d);        /* H2xb H2xd H2yb H2yd */

  c3a  = vec_mergeh(c3a,c3c);        /* H2za H2zc - - */
  c3b  = vec_mergeh(c3b,c3d);        /* H2zb H2zd - - */

  *Ox  = vec_mergeh(tmp1,c1c);       /* Oxa Oxb Oxc Oxd */
  *Oy  = vec_mergel(tmp1,c1c);       /* Oya Oyb Oyc Oyd */
  *Oz  = vec_mergeh(c1a,c1b);        /* Oza Ozb Ozc Ozd */
  *H1x = vec_mergel(c1a,c1b);        /* H1xa H1xb H1xc H1xd */
  *H1y = vec_mergeh(c1d,c2c);        /* H1ya H1yb H1yc H1yd */
  *H1z = vec_mergel(c1d,c2c);        /* H1za H1zb H1zc H1zd */
  *H2x = vec_mergeh(c2a,c2b);        /* H2xa H2xb H2xc H2xd */
  *H2y = vec_mergel(c2a,c2b);        /* H2ya H2yb H2yc H2yd */
  *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb H2zc H2zd */
}
static inline void load_3_water(float *address1,
                                float *address2,
                                float *address3,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;
  vector float c1c,c2c,c3c;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld( 0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa Oya Oza H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za - - - */

  perm = vec_lvsl( 0, (int *) address3 );
  tmp1 = vec_ld( 0, address3 );
  tmp2 = vec_ld( 16, address3 );
  tmp3 = vec_ld( 32, address3 );
  c1c  = vec_perm(tmp1,tmp2,perm);   /* Oxc Oyc Ozc H1xc */
  c2c  = vec_perm(tmp2,tmp3,perm);   /* H1yc H1zc H2xc H2yc */
  c3c  = vec_perm(tmp3,tmp3,perm);   /* H2zc - - - */

  perm = vec_lvsl( 0, (int *) address2 );
  tmp1 = vec_ld( 0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb Oyb Ozb H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb - - - */

  /* permute things */
  tmp1 = vec_mergeh(c1a,c1c);        /* Oxa Oxc Oya Oyc */
  c1a  = vec_mergel(c1a,c1c);        /* Oza Ozc H1xa H1xc */
  c1c  = vec_mergeh(c1b,c1b);        /* Oxb - Oyb - */
  c1b  = vec_mergel(c1b,c1b);        /* Ozb - H1xb - */

  tmp2 = vec_mergeh(c2a,c2c);        /* H1ya H1yc H1za H1zc */
  c2a  = vec_mergel(c2a,c2c);        /* H2xa H2xc H2ya H2yc */
  c2c  = vec_mergeh(c2b,c2b);        /* H1yb - H1zb - */
  c2b  = vec_mergel(c2b,c2b);        /* H2xb - H2yb - */

  c3a  = vec_mergeh(c3a,c3c);        /* H2za H2zc - - */

  *Ox  = vec_mergeh(tmp1,c1c);       /* Oxa Oxb Oxc - */
  *Oy  = vec_mergel(tmp1,c1c);       /* Oya Oyb Oyc - */
  *Oz  = vec_mergeh(c1a,c1b);        /* Oza Ozb Ozc - */
  *H1x = vec_mergel(c1a,c1b);        /* H1xa H1xb H1xc - */
  *H1y = vec_mergeh(tmp2,c2c);       /* H1ya H1yb H1yc - */
  *H1z = vec_mergel(tmp2,c2c);       /* H1za H1zb H1zc - */
  *H2x = vec_mergeh(c2a,c2b);        /* H2xa H2xb H2xc - */
  *H2y = vec_mergel(c2a,c2b);        /* H2ya H2yb H2yc - */
  *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb H2zc - */
}
static inline void load_2_water(float *address1,
                                float *address2,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld( 0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm);   /* Oxa Oya Oza H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm);   /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm);   /* H2za - - - */

  perm = vec_lvsl( 0, (int *) address2 );
  tmp1 = vec_ld( 0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm);   /* Oxb Oyb Ozb H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm);   /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm);   /* H2zb - - - */

  /* never mind what we get in the two remaining elements */
  *Ox  = vec_mergeh(c1a,c1b);        /* Oxa Oxb Oya Oyb */
  *H1y = vec_mergeh(c2a,c2b);        /* H1ya H1yb H1za H1zb */
  *H2z = vec_mergeh(c3a,c3b);        /* H2za H2zb - - */
  *Oz  = vec_mergel(c1a,c1b);        /* Oza Ozb H1xa H1xb */
  *H2x = vec_mergel(c2a,c2b);        /* H2xa H2xb H2ya H2yb */
  *Oy  = vec_sld(*Ox,*Ox,8);
  *H1z = vec_sld(*H1y,*H1y,8);
  *H1x = vec_sld(*Oz,*Oz,8);
  *H2y = vec_sld(*H2x,*H2x,8);
}
static inline void load_1_water(float *address1,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld( 0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  *Ox  = vec_perm(tmp1,tmp2,perm);   /* Ox Oy Oz H1x */
  *H1y = vec_perm(tmp2,tmp3,perm);   /* H1y H1z H2x H2y */
  *H2z = vec_perm(tmp3,tmp3,perm);   /* H2z - - - */

  /* just splat things... never mind that we fill all cells :-) */
  *Oy  = vec_splat(*Ox,1);
  *Oz  = vec_splat(*Ox,2);
  *H1x = vec_splat(*Ox,3);
  *H1z = vec_splat(*H1y,1);
  *H2x = vec_splat(*H1y,2);
  *H2y = vec_splat(*H1y,3);
}
static inline void update_i_water_forces(float *water,
                                         float *shiftvec,
                                         vector float Ox,
                                         vector float Oy,
                                         vector float Oz,
                                         vector float H1x,
                                         vector float H1y,
                                         vector float H1z,
                                         vector float H2x,
                                         vector float H2y,
                                         vector float H2z)
{
  vector float l1,l2,l3;
  vector unsigned char perm;
  vector unsigned int mask,ox00;
  vector float nul=vec_zero();

  ox00=(vector unsigned int)vec_splat_s32(0);
  /* load the current force values */
  perm = vec_lvsr( 0, water );
  l1   = vec_ld( 0, water);
  l2   = vec_ld( 16, water);
  l3   = vec_ld( 32, water);
  mask = vec_perm(ox00,
                  (vector unsigned int)vec_splat_s32(-1),perm);

  /* accumulate the forces */
  Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));    /* Ox  Ox'  -  - */
  Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));    /* Oy  Oy'  -  - */
  Oz  = vec_add(Oz,vec_sld(Oz,Oz,8));    /* Oz  Oz'  -  - */
  H1x = vec_add(H1x,vec_sld(H1x,H1x,8)); /* H1x H1x' -  - */
  H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H1y' -  - */
  H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H1z' -  - */
  H2x = vec_add(H2x,vec_sld(H2x,H2x,8)); /* H2x H2x' -  - */
  H2y = vec_add(H2y,vec_sld(H2y,H2y,8)); /* H2y H2y' -  - */
  H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z H2z' -  - */

  Ox  = vec_mergeh(Ox,Oz);    /* Ox Oz Ox' Oz' */
  Oy  = vec_mergeh(Oy,H1x);   /* Oy H1x Oy' H1x' */
  H1y = vec_mergeh(H1y,H2x);  /* H1y H2x H1y' H2x' */
  H1z = vec_mergeh(H1z,H2y);  /* H1z H2y H1z' H2y' */
  H2z = vec_mergeh(H2z,nul);  /* H2z 0 H2z' 0 */

  Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));    /* Ox Oz - - */
  Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));    /* Oy H1x - - */
  H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H2x - - */
  H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H2y - - */
  H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z 0 ? 0 */

  Ox  = vec_mergeh(Ox,Oy);    /* Ox Oy Oz H1x */
  H1y = vec_mergeh(H1y,H1z);  /* H1y H1z H2x H2y */
  H2z = vec_mergeh(H2z,nul);  /* H2z 0 0 0 */

  Oy  = vec_sld(nul,Ox,12);   /* 0 Ox Oy Oz */
  Oz  = vec_sld(Ox,H1y,8);    /* - H1x H1y H1z */
  H1x = vec_sld(H1y,H2z,4);   /* - H2x H2y H2z */

  H2x = vec_perm(nul,Ox,perm);   /* The part to add to l1 */
  H2y = vec_perm(Ox,H1y,perm);   /* The part to add to l2 */
  H2z = vec_perm(H1y,H2z,perm);  /* The part to add to l3 */

  H2x = vec_add(l1,H2x);
  H2y = vec_add(l2,H2y);
  H2z = vec_add(l3,H2z);

  H2x  = vec_sel(l1,H2x,mask);
  mask = vec_sld(ox00,mask,12);
  H2z  = vec_sel(H2z,l3,mask);

  /* store the updated forces */
  vec_st(H2x, 0, water);
  vec_st(H2y, 16, water);
  vec_st(H2z, 32, water);

  /* update the shift force */
  Oy = vec_add(Oy,Oz);
  Oy = vec_add(Oy,H1x);
  Oy = vec_sld(Oy,nul,4);   /* x y z 0 */

  add_xyz_to_mem(shiftvec,Oy);
}
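/* Minimal sketch (illustrative, not part of the original header) of the
 * unaligned read-modify-write pattern used by update_i_water_forces()
 * above and the add_force_to_*_water() routines below: rotate the payload
 * into place with vec_lvsr, then blend the edges with a select mask built
 * by vec_perm so neighbouring memory is written back unchanged. The
 * function name is hypothetical.
 */
static inline void add_vector_to_mem_unaligned_sketch(float *address,
                                                      vector float vdata)
{
  vector unsigned char perm = vec_lvsr(0, address);
  vector unsigned int  mask = vec_perm((vector unsigned int)vec_splat_s32(0),
                                       (vector unsigned int)vec_splat_s32(-1),
                                       perm);
  vector float low  = vec_ld( 0, address);
  vector float high = vec_ld(16, address);
  vector float nul  = vec_zero();

  /* shift vdata right so its elements line up with the two memory words */
  vector float vlow  = vec_add(vec_perm(nul,vdata,perm),low);
  vector float vhigh = vec_add(vec_perm(vdata,nul,perm),high);

  vec_st(vec_sel(low,vlow,mask), 0, address);    /* leading bytes untouched */
  vec_st(vec_sel(vhigh,high,mask), 16, address); /* trailing bytes untouched */
}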
static inline void add_force_to_4_water(float *address1,
                                        float *address2,
                                        float *address3,
                                        float *address4,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high,low2,medium2,high2;
  vector unsigned char perm,perm2;
  vector unsigned int mask,mask2,oxFF,ox00;
  vector float tmp1,tmp2,tmp3;
  vector float nul=vec_zero();

  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  /* We have to be careful here since water molecules will sometimes
   * be adjacent in memory. When we update the forces on water 1,
   * we usually read & write back some extra neighboring cells in
   * the vector. This means we cannot first read all data, update
   * it and write it out again.
   *
   * Instead of doing the molecules one by one, we rely on the
   * fact that the list will be ordered, so water molecules
   * 1 & 3 can be done first (they cannot be adjacent), and then
   * 2 & 4.
   */
  tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza Oxb Ozb */
  Ox   = vec_mergel(Ox,Oz);    /* Oxc Ozc Oxd Ozd */
  Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa Oyb H1xb */
  Oy   = vec_mergel(Oy,H1x);   /* Oyc H1xc Oyd H1xd */
  H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
  H1y  = vec_mergel(H1y,H2x);  /* H1yc H2xc H1yd H2xd */

  H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
  H1z  = vec_mergel(H1z,H2y);  /* H1zc H2yc H1zd H2yd */
  H2y  = vec_mergeh(H2z,nul);  /* H2za 0 H2zb 0 */
  H2z  = vec_mergel(H2z,nul);  /* H2zc 0 H2zd 0 */

  tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
  Oz   = vec_mergel(tmp1,Oz);  /* Oxb Oyb Ozb H1xb */
  tmp1 = vec_mergeh(Ox,Oy);    /* Oxc Oyc Ozc H1xc */
  Ox   = vec_mergel(Ox,Oy);    /* Oxd Oyd Ozd H1xd */
  Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
  H2x  = vec_mergeh(H1y,H1z);  /* H1yc H1zc H2xc H2yc */
  H1y  = vec_mergel(H1y,H1z);  /* H1yd H1zd H2xd H2yd */
  H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
  H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */
  tmp3 = vec_mergeh(H2z,nul);  /* H2zc 0 0 0 */
  H2z  = vec_mergel(H2z,nul);  /* H2zd 0 0 0 */

  /* move into position, load and add */
  perm    = vec_lvsr( 0, (int *) address1 );
  perm2   = vec_lvsr( 0, (int *) address3 );
  low     = vec_ld( 0, address1);
  low2    = vec_ld( 0, address3);

  medium  = vec_ld( 16, address1);
  medium2 = vec_ld( 16, address3);
  high    = vec_ld( 32, address1);
  high2   = vec_ld( 32, address3);
  mask    = vec_perm(ox00,oxFF,perm);
  mask2   = vec_perm(ox00,oxFF,perm2);

  low   = vec_add(vec_perm(nul,tmp2,perm),low);
  low2  = vec_add(vec_perm(nul,tmp1,perm2),low2);

  tmp2  = vec_add(vec_perm(tmp2,Oy,perm),medium);
  tmp1  = vec_add(vec_perm(tmp1,H2x,perm2),medium2);

  Oy    = vec_add(vec_perm(Oy,H1z,perm),high);
  H2x   = vec_add(vec_perm(H2x,tmp3,perm2),high2);

  vec_st(vec_sel(low,low,mask), 0, address1);
  vec_st(vec_sel(low2,low2,mask2), 0, address3);

  mask  = vec_sld(ox00,mask,12);
  mask2 = vec_sld(ox00,mask2,12);

  vec_st(tmp2, 16, address1);
  vec_st(tmp1, 16, address3);

  vec_st(vec_sel(Oy,high,mask), 32, address1);
  vec_st(vec_sel(H2x,high2,mask2), 32, address3);

  /* Finished 1 & 3 - now do 2 & 4 */

  perm    = vec_lvsr( 0, (int *) address2 );
  perm2   = vec_lvsr( 0, (int *) address4 );

  low     = vec_ld( 0, address2);
  low2    = vec_ld( 0, address4);
  medium  = vec_ld( 16, address2);
  medium2 = vec_ld( 16, address4);
  high    = vec_ld( 32, address2);
  high2   = vec_ld( 32, address4);
  mask    = vec_perm(ox00,oxFF,perm);
  mask2   = vec_perm(ox00,oxFF,perm2);
  H1z     = vec_add(vec_perm(nul,Oz,perm),low);
  H2x     = vec_add(vec_perm(nul,Ox,perm2),low2);
  Oz      = vec_add(vec_perm(Oz,H1x,perm),medium);
  Ox      = vec_add(vec_perm(Ox,H1y,perm2),medium2);
  H1x     = vec_add(vec_perm(H1x,H2y,perm),high);
  H1y     = vec_add(vec_perm(H1y,H2z,perm2),high2);
  vec_st(vec_sel(low,H1z,mask), 0, address2);
  vec_st(vec_sel(low2,H2x,mask2), 0, address4);
  mask    = vec_sld(ox00,mask,12);
  mask2   = vec_sld(ox00,mask2,12);
  vec_st(Oz, 16, address2);
  vec_st(Ox, 16, address4);
  vec_st(vec_sel(H1x,high,mask), 32, address2);
  vec_st(vec_sel(H1y,high2,mask2), 32, address4);
}
static inline void add_force_to_3_water(float *address1,
                                        float *address2,
                                        float *address3,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2,tmp3;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza Oxb Ozb */
  Ox   = vec_mergel(Ox,Oz);    /* Oxc Ozc ? ? */
  Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa Oyb H1xb */
  Oy   = vec_mergel(Oy,H1x);   /* Oyc H1xc ? ? */
  H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
  H1y  = vec_mergel(H1y,H2x);  /* H1yc H2xc ? ? */
  H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
  H1z  = vec_mergel(H1z,H2y);  /* H1zc H2yc ? ? */
  H2y  = vec_mergeh(H2z,nul);  /* H2za 0 H2zb 0 */
  H2z  = vec_mergel(H2z,nul);  /* H2zc 0 ? 0 */

  tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
  Oz   = vec_mergel(tmp1,Oz);  /* Oxb Oyb Ozb H1xb */
  tmp1 = vec_mergeh(Ox,Oy);    /* Oxc Oyc Ozc H1xc */
  Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
  H2x  = vec_mergeh(H1y,H1z);  /* H1yc H1zc H2xc H2yc */
  H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
  H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */
  tmp3 = vec_mergeh(H2z,nul);  /* H2zc 0 0 0 */

  /* move into position, load and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld( 0, address1);
  medium = vec_ld( 16, address1);
  high   = vec_ld( 32, address1);
  mask   = vec_perm(ox00,oxFF,perm);
  H1y    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H1y,mask), 0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);

  perm   = vec_lvsr( 0, (int *) address2 );
  low    = vec_ld( 0, address2);
  medium = vec_ld( 16, address2);
  high   = vec_ld( 32, address2);
  mask   = vec_perm(ox00,oxFF,perm);
  H1z    = vec_add(vec_perm(nul,Oz,perm),low);
  Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
  H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
  vec_st(vec_sel(low,H1z,mask), 0, address2);
  mask   = vec_sld(ox00,mask,12);
  vec_st(Oz, 16, address2);
  vec_st(vec_sel(H1x,high,mask), 32, address2);

  perm   = vec_lvsr( 0, (int *) address3 );
  low    = vec_ld( 0, address3);
  medium = vec_ld( 16, address3);
  high   = vec_ld( 32, address3);
  mask   = vec_perm(ox00,oxFF,perm);
  H2y    = vec_add(vec_perm(nul,tmp1,perm),low);
  tmp1   = vec_add(vec_perm(tmp1,H2x,perm),medium);
  H2x    = vec_add(vec_perm(H2x,tmp3,perm),high);
  vec_st(vec_sel(low,H2y,mask), 0, address3);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp1, 16, address3);
  vec_st(vec_sel(H2x,high,mask), 32, address3);
}
static inline void add_force_to_2_water(float *address1,
                                        float *address2,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2,tmp3;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza Oxb Ozb */
  Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa Oyb H1xb */
  H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa H1yb H2xb */
  H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya H1zb H2yb */
  H2y  = vec_mergeh(H2z,nul);  /* H2za 0 H2zb 0 */

  tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
  Oz   = vec_mergel(tmp1,Oz);  /* Oxb Oyb Ozb H1xb */
  Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x);  /* H1yb H1zb H2xb H2yb */
  H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */
  H2y  = vec_mergel(H2y,nul);  /* H2zb 0 0 0 */

  /* move into position and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld( 0, address1);
  medium = vec_ld( 16, address1);
  high   = vec_ld( 32, address1);
  mask   = vec_perm(ox00,oxFF,perm);
  H2x    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H2x,mask), 0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);

  perm   = vec_lvsr( 0, (int *) address2 );
  low    = vec_ld( 0, address2);
  medium = vec_ld( 16, address2);
  high   = vec_ld( 32, address2);
  mask   = vec_perm(ox00,oxFF,perm);
  H1z    = vec_add(vec_perm(nul,Oz,perm),low);
  Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
  H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
  vec_st(vec_sel(low,H1z,mask), 0, address2);
  mask   = vec_sld(ox00,mask,12);
  vec_st(Oz, 16, address2);
  vec_st(vec_sel(H1x,high,mask), 32, address2);
}
static inline void add_force_to_1_water(float *address1,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);    /* Oxa Oza ? ? */
  Oz   = vec_mergeh(Oy,H1x);   /* Oya H1xa ? ? */
  H1x  = vec_mergeh(H1y,H2x);  /* H1ya H2xa ? ? */
  H2x  = vec_mergeh(H1z,H2y);  /* H1za H2ya ? ? */
  H2y  = vec_mergeh(H2z,nul);  /* H2za 0 ? 0 */

  tmp2 = vec_mergeh(tmp1,Oz);  /* Oxa Oya Oza H1xa */
  Oy   = vec_mergeh(H1x,H2x);  /* H1ya H1za H2xa H2ya */
  H1z  = vec_mergeh(H2y,nul);  /* H2za 0 0 0 */

  /* move into position and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld( 0, address1);
  medium = vec_ld( 16, address1);
  high   = vec_ld( 32, address1);
  mask   = vec_perm(ox00,oxFF,perm);
  H2x    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H2x,mask), 0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);
}
/* interpolate a coulomb-only table for four table lookup values */
static inline void do_4_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  /* table data is aligned */
  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
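/* Scalar reference for the spline evaluation above (illustrative, not part
 * of the original header): each table point stores Y,F,G,H with
 * V(eps) = Y + F*eps + G*eps^2 + H*eps^3, so VV = Y + eps*Fp where
 * Fp = F + G*eps + H*eps^2, and FF = dV/deps = Fp + G*eps + 2*H*eps^2.
 * The function name is hypothetical.
 */
static inline void do_1_ctable_coul_scalar_sketch(float *VFtab,float rtab,
                                                  float *VV,float *FF)
{
  int   n    = (int)rtab;     /* table point, matching vec_cts() truncation */
  int   idx  = 4*n;           /* four floats per table point */
  float eps  = rtab - n;      /* fractional distance into the interval */
  float eps2 = eps*eps;
  float Y  = VFtab[idx];
  float F  = VFtab[idx+1];
  float G  = VFtab[idx+2];
  float H  = VFtab[idx+3];
  float Fp = F + G*eps + H*eps2;

  *VV = Y + eps*Fp;
  *FF = Fp + G*eps + 2.0f*H*eps2;
}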
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_4_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_4_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3,tabd4;
  vector float tabr1,tabr2,tabr3,tabr4;

  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
    H     = vec_ld(32, VFtab+idx4);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
  }

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_4_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
    H     = vec_ld(48, VFtab+idx4);
    tabc4 = vec_sld(tabc4,tabd4,8);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16 byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
  }

  transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_3_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is aligned */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_3_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is aligned */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_3_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3;
  vector float tabr1,tabr2,tabr3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
  }

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_3_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16 byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
  }

  transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_2_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is aligned */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_2_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is aligned */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F   = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_2_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2;
  vector float tabr1,tabr2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
  }

  transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_2_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
    int idx1,idx2;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        F     = vec_ld(48, VFtab+idx2);
        tabc2 = vec_sld(tabc2,tabd2,8);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
    }

    transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFcoul  = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */

    transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFdisp  = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */

    transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFrep   = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
    }

    /* table data is aligned */
    transpose_1_to_4(tab1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FF      = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_1_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
    }

    /* table data is aligned */
    transpose_1_to_4(tab1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FF      = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(3));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16 byte aligned, i.e. must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        Y     = vec_ld(32, VFtab+idx1);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);
    } else { /* 16 byte aligned */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
    }

    transpose_1_to_4(tabd1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFdisp  = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */

    transpose_1_to_4(tabr1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFrep   = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
    }

    transpose_1_to_4(tabc1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFcoul  = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */

    transpose_1_to_4(tabd1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFdisp  = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */

    transpose_1_to_4(tabr1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
    F        = vec_madd(G,eps,F);            /* Fp + Geps */
    *FFrep   = vec_madd(vec_two(),H,F);      /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_vonly_4_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
    int idx1,idx2,idx3,idx4;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);
    idx4     = *(((int *)&vidx)+3);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
        tab3 = vec_ld( 0, VFtab+idx3);
        G    = vec_ld(16, VFtab+idx3);
        tab3 = vec_sld(tab3,G,8);
        tab4 = vec_ld( 0, VFtab+idx4);
        H    = vec_ld(16, VFtab+idx4);
        tab4 = vec_sld(tab4,H,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
        tab3 = vec_ld(0, VFtab+idx3);
        tab4 = vec_ld(0, VFtab+idx4);
    }

    /* table data is aligned */
    transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_4_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
    int idx1,idx2,idx3,idx4;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);
    idx4     = *(((int *)&vidx)+3);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
        tab3 = vec_ld( 0, VFtab+idx3);
        G    = vec_ld(16, VFtab+idx3);
        tab3 = vec_sld(tab3,G,8);
        tab4 = vec_ld( 0, VFtab+idx4);
        H    = vec_ld(16, VFtab+idx4);
        tab4 = vec_sld(tab4,H,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
        tab3 = vec_ld(0, VFtab+idx3);
        tab4 = vec_ld(0, VFtab+idx4);
    }

    transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_4_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabd2,tabd3,tabd4;
    vector float tabr1,tabr2,tabr3,tabr4;
    int idx1,idx2,idx3,idx4;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(3));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);
    idx4     = *(((int *)&vidx)+3);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16 byte aligned, i.e. must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        Y     = vec_ld(32, VFtab+idx1);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
        F     = vec_ld(32, VFtab+idx2);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);

        tabd3 = vec_ld( 0, VFtab+idx3);
        tabr3 = vec_ld(16, VFtab+idx3);
        G     = vec_ld(32, VFtab+idx3);
        tabd3 = vec_sld(tabd3,tabr3,8);
        tabr3 = vec_sld(tabr3,G,8);

        tabd4 = vec_ld( 0, VFtab+idx4);
        tabr4 = vec_ld(16, VFtab+idx4);
        H     = vec_ld(32, VFtab+idx4);
        tabd4 = vec_sld(tabd4,tabr4,8);
        tabr4 = vec_sld(tabr4,H,8);
    } else { /* 16 byte aligned */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
        tabd3 = vec_ld( 0, VFtab+idx3);
        tabr3 = vec_ld(16, VFtab+idx3);
        tabd4 = vec_ld( 0, VFtab+idx4);
        tabr4 = vec_ld(16, VFtab+idx4);
    }

    transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_4_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
    vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
    int idx1,idx2,idx3,idx4;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);
    idx4     = *(((int *)&vidx)+3);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        F     = vec_ld(48, VFtab+idx2);
        tabc2 = vec_sld(tabc2,tabd2,8);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);

        tabc3 = vec_ld( 0, VFtab+idx3);
        tabd3 = vec_ld(16, VFtab+idx3);
        tabr3 = vec_ld(32, VFtab+idx3);
        G     = vec_ld(48, VFtab+idx3);
        tabc3 = vec_sld(tabc3,tabd3,8);
        tabd3 = vec_sld(tabd3,tabr3,8);
        tabr3 = vec_sld(tabr3,G,8);

        tabc4 = vec_ld( 0, VFtab+idx4);
        tabd4 = vec_ld(16, VFtab+idx4);
        tabr4 = vec_ld(32, VFtab+idx4);
        H     = vec_ld(48, VFtab+idx4);
        tabc4 = vec_sld(tabc4,tabd4,8);
        tabd4 = vec_sld(tabd4,tabr4,8);
        tabr4 = vec_sld(tabr4,H,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        tabc3 = vec_ld( 0, VFtab+idx3);
        tabd3 = vec_ld(16, VFtab+idx3);
        tabr3 = vec_ld(32, VFtab+idx3);
        tabc4 = vec_ld( 0, VFtab+idx4);
        tabd4 = vec_ld(16, VFtab+idx4);
        tabr4 = vec_ld(32, VFtab+idx4);
    }

    transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
    int idx1,idx2,idx3;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
        tab3 = vec_ld( 0, VFtab+idx3);
        G    = vec_ld(16, VFtab+idx3);
        tab3 = vec_sld(tab3,G,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
        tab3 = vec_ld(0, VFtab+idx3);
    }

    /* table data is aligned */
    transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_3_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
    int idx1,idx2,idx3;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
        tab3 = vec_ld( 0, VFtab+idx3);
        G    = vec_ld(16, VFtab+idx3);
        tab3 = vec_sld(tab3,G,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
        tab3 = vec_ld(0, VFtab+idx3);
    }

    /* table data is aligned */
    transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabd2,tabd3;
    vector float tabr1,tabr2,tabr3;
    int idx1,idx2,idx3;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(3));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16 byte aligned, i.e. must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        Y     = vec_ld(32, VFtab+idx1);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
        F     = vec_ld(32, VFtab+idx2);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);

        tabd3 = vec_ld( 0, VFtab+idx3);
        tabr3 = vec_ld(16, VFtab+idx3);
        G     = vec_ld(32, VFtab+idx3);
        tabd3 = vec_sld(tabd3,tabr3,8);
        tabr3 = vec_sld(tabr3,G,8);
    } else { /* 16 byte aligned */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
        tabd3 = vec_ld( 0, VFtab+idx3);
        tabr3 = vec_ld(16, VFtab+idx3);
    }

    transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
    vector float tabd3,tabr3,tabc3;
    int idx1,idx2,idx3;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);
    idx3     = *(((int *)&vidx)+2);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        F     = vec_ld(48, VFtab+idx2);
        tabc2 = vec_sld(tabc2,tabd2,8);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);

        tabc3 = vec_ld( 0, VFtab+idx3);
        tabd3 = vec_ld(16, VFtab+idx3);
        tabr3 = vec_ld(32, VFtab+idx3);
        G     = vec_ld(48, VFtab+idx3);
        tabc3 = vec_sld(tabc3,tabd3,8);
        tabd3 = vec_sld(tabd3,tabr3,8);
        tabr3 = vec_sld(tabr3,G,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        tabc3 = vec_ld( 0, VFtab+idx3);
        tabd3 = vec_ld(16, VFtab+idx3);
        tabr3 = vec_ld(32, VFtab+idx3);
    }

    transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2;
    int idx1,idx2;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
    }

    /* table data is aligned */
    transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_2_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1,tab2;
    int idx1,idx2;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
        tab2 = vec_ld( 0, VFtab+idx2);
        F    = vec_ld(16, VFtab+idx2);
        tab2 = vec_sld(tab2,F,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
        tab2 = vec_ld(0, VFtab+idx2);
    }

    /* table data is aligned */
    transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabd2;
    vector float tabr1,tabr2;
    int idx1,idx2;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(3));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16 byte aligned, i.e. must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        Y     = vec_ld(32, VFtab+idx1);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
        F     = vec_ld(32, VFtab+idx2);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);
    } else { /* 16 byte aligned */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        tabd2 = vec_ld( 0, VFtab+idx2);
        tabr2 = vec_ld(16, VFtab+idx2);
    }

    transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
    int idx1,idx2;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);
    idx2     = *(((int *)&vidx)+1);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);

        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
        F     = vec_ld(48, VFtab+idx2);
        tabc2 = vec_sld(tabc2,tabd2,8);
        tabd2 = vec_sld(tabd2,tabr2,8);
        tabr2 = vec_sld(tabr2,F,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        tabc2 = vec_ld( 0, VFtab+idx2);
        tabd2 = vec_ld(16, VFtab+idx2);
        tabr2 = vec_ld(32, VFtab+idx2);
    }

    transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
    }

    /* table data is aligned */
    transpose_1_to_4(tab1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_1_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2,tab1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        tab1 = vec_ld( 0, VFtab+idx1);
        Y    = vec_ld(16, VFtab+idx1);
        tab1 = vec_sld(tab1,Y,8);
    } else { /* aligned */
        tab1 = vec_ld(0, VFtab+idx1);
    }

    /* table data is aligned */
    transpose_1_to_4(tab1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VV      = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_sl(vidx,vec_splat_u32(3));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16 byte aligned, i.e. must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
        Y     = vec_ld(32, VFtab+idx1);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);
    } else { /* 16 byte aligned */
        tabd1 = vec_ld( 0, VFtab+idx1);
        tabr1 = vec_ld(16, VFtab+idx1);
    }

    transpose_1_to_4(tabd1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_1_to_4(tabr1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
    vector signed int vidx;
    vector float Y,F,G,H,eps,eps2;
    vector float tabd1,tabr1,tabc1;
    int idx1;

    vidx     = vec_cts(rtab,0);
    vidx     = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
    vidx     = vec_sl(vidx,vec_splat_u32(2));

    idx1     = *((int *)&vidx);

    eps      = vec_sub(rtab,vec_floor(rtab));
    eps2     = vec_madd(eps,eps,vec_zero());

    if(((unsigned int)VFtab)%16) {
        /* not 16-byte aligned, but must be 8 byte. */
        /* use Y,F,G,H as temp storage */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
        Y     = vec_ld(48, VFtab+idx1);
        tabc1 = vec_sld(tabc1,tabd1,8);
        tabd1 = vec_sld(tabd1,tabr1,8);
        tabr1 = vec_sld(tabr1,Y,8);
    } else { /* 16 byte aligned */
        tabc1 = vec_ld( 0, VFtab+idx1);
        tabd1 = vec_ld(16, VFtab+idx1);
        tabr1 = vec_ld(32, VFtab+idx1);
    }

    transpose_1_to_4(tabc1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVcoul  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_1_to_4(tabd1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVdisp  = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */

    transpose_1_to_4(tabr1,&Y,&F,&G,&H);

    F        = vec_madd(G,eps,F);            /* F + Geps   */
    H        = vec_madd(H,eps2,vec_zero());  /* Heps2 */
    F        = vec_add(F,H);                 /* F + Geps + Heps2 (=Fp) */
    *VVrep   = vec_madd(eps,F,Y);            /* VV = Y + eps*Fp */
}
static inline vector float do_recip(vector float rsq)
{
    vector float tmp,lu;

    lu        = vec_re(rsq);
    tmp       = vec_nmsub(rsq,lu,vec_two());
    return vec_madd(lu,tmp,vec_zero());
}
static inline vector float do_invsqrt(vector float rsq)
{
    vector float lu,tmpA,tmpB;

    lu        = vec_rsqrte(rsq);
    tmpA      = vec_madd(lu,lu,vec_zero());
    tmpB      = vec_madd(lu,vec_half(),vec_zero());
    tmpA      = vec_nmsub(rsq,tmpA,vec_one());
    return vec_madd(tmpA,tmpB,lu);
}
static inline void do_3_invsqrt(vector float rsq1,
                                vector float rsq2,
                                vector float rsq3,
                                vector float *rinvsq1,
                                vector float *rinvsq2,
                                vector float *rinvsq3)
{
    vector float lu1,lu2,lu3;
    vector float tmpA1,tmpA2,tmpA3,tmpB1,tmpB2,tmpB3;
    vector float nul  = vec_zero();
    vector float half = vec_half();
    vector float one  = vec_one();

    lu1       = vec_rsqrte(rsq1);
    lu2       = vec_rsqrte(rsq2);
    lu3       = vec_rsqrte(rsq3);
    tmpA1     = vec_madd(lu1,lu1,nul);
    tmpA2     = vec_madd(lu2,lu2,nul);
    tmpA3     = vec_madd(lu3,lu3,nul);
    tmpB1     = vec_madd(lu1,half,nul);
    tmpB2     = vec_madd(lu2,half,nul);
    tmpB3     = vec_madd(lu3,half,nul);
    tmpA1     = vec_nmsub(rsq1,tmpA1,one);
    tmpA2     = vec_nmsub(rsq2,tmpA2,one);
    tmpA3     = vec_nmsub(rsq3,tmpA3,one);
    *rinvsq1  = vec_madd(tmpA1,tmpB1,lu1);
    *rinvsq2  = vec_madd(tmpA2,tmpB2,lu2);
    *rinvsq3  = vec_madd(tmpA3,tmpB3,lu3);
}
static inline void do_9_invsqrt(vector float rsq1,
                                vector float rsq2,
                                vector float rsq3,
                                vector float rsq4,
                                vector float rsq5,
                                vector float rsq6,
                                vector float rsq7,
                                vector float rsq8,
                                vector float rsq9,
                                vector float *rinv1,
                                vector float *rinv2,
                                vector float *rinv3,
                                vector float *rinv4,
                                vector float *rinv5,
                                vector float *rinv6,
                                vector float *rinv7,
                                vector float *rinv8,
                                vector float *rinv9)
{
    vector float lu1,lu2,lu3,lu4,lu5,lu6,lu7,lu8,lu9;
    vector float tmpA1,tmpA2,tmpA3,tmpA4,tmpA5,tmpA6,tmpA7,tmpA8,tmpA9;
    vector float tmpB1,tmpB2,tmpB3,tmpB4,tmpB5,tmpB6,tmpB7,tmpB8,tmpB9;
    vector float nul  = vec_zero();
    vector float half = vec_half();
    vector float one  = vec_one();

    lu1       = vec_rsqrte(rsq1);
    lu2       = vec_rsqrte(rsq2);
    lu3       = vec_rsqrte(rsq3);
    lu4       = vec_rsqrte(rsq4);
    lu5       = vec_rsqrte(rsq5);
    lu6       = vec_rsqrte(rsq6);
    lu7       = vec_rsqrte(rsq7);
    lu8       = vec_rsqrte(rsq8);
    lu9       = vec_rsqrte(rsq9);
    tmpA1     = vec_madd(lu1,lu1,nul);
    tmpA2     = vec_madd(lu2,lu2,nul);
    tmpA3     = vec_madd(lu3,lu3,nul);
    tmpA4     = vec_madd(lu4,lu4,nul);
    tmpA5     = vec_madd(lu5,lu5,nul);
    tmpA6     = vec_madd(lu6,lu6,nul);
    tmpA7     = vec_madd(lu7,lu7,nul);
    tmpA8     = vec_madd(lu8,lu8,nul);
    tmpA9     = vec_madd(lu9,lu9,nul);
    tmpB1     = vec_madd(lu1,half,nul);
    tmpB2     = vec_madd(lu2,half,nul);
    tmpB3     = vec_madd(lu3,half,nul);
    tmpB4     = vec_madd(lu4,half,nul);
    tmpB5     = vec_madd(lu5,half,nul);
    tmpB6     = vec_madd(lu6,half,nul);
    tmpB7     = vec_madd(lu7,half,nul);
    tmpB8     = vec_madd(lu8,half,nul);
    tmpB9     = vec_madd(lu9,half,nul);
    tmpA1     = vec_nmsub(rsq1,tmpA1,one);
    tmpA2     = vec_nmsub(rsq2,tmpA2,one);
    tmpA3     = vec_nmsub(rsq3,tmpA3,one);
    tmpA4     = vec_nmsub(rsq4,tmpA4,one);
    tmpA5     = vec_nmsub(rsq5,tmpA5,one);
    tmpA6     = vec_nmsub(rsq6,tmpA6,one);
    tmpA7     = vec_nmsub(rsq7,tmpA7,one);
    tmpA8     = vec_nmsub(rsq8,tmpA8,one);
    tmpA9     = vec_nmsub(rsq9,tmpA9,one);
    *rinv1    = vec_madd(tmpA1,tmpB1,lu1);
    *rinv2    = vec_madd(tmpA2,tmpB2,lu2);
    *rinv3    = vec_madd(tmpA3,tmpB3,lu3);
    *rinv4    = vec_madd(tmpA4,tmpB4,lu4);
    *rinv5    = vec_madd(tmpA5,tmpB5,lu5);
    *rinv6    = vec_madd(tmpA6,tmpB6,lu6);
    *rinv7    = vec_madd(tmpA7,tmpB7,lu7);
    *rinv8    = vec_madd(tmpA8,tmpB8,lu8);
    *rinv9    = vec_madd(tmpA9,tmpB9,lu9);
    /* 36 invsqrt in about 48 cycles due to pipelining ... pretty fast :-) */
}
static inline void zero_highest_element_in_vector(vector float *v)
{
    vector signed int zero = (vector signed int) vec_zero();

    *v = (vector float)vec_sel((vector signed int)*v,zero,
                               (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4));
}
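/*
 * Mask construction in these zeroing helpers: vec_sld(zero,vec_splat_s32(-1),4)
 * concatenates an all-zero and an all-ones vector and shifts the pair left by
 * 4 bytes, leaving ones only in the last element (shifts of 8 or 12 bytes
 * select the last two or three elements). vec_sel then substitutes zero
 * wherever the mask is set, clearing those elements and keeping the rest.
 */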
static inline void zero_highest_2_elements_in_vector(vector float *v)
{
    vector signed int zero = (vector signed int) vec_zero();

    *v = (vector float)vec_sel((vector signed int)*v,zero,
                               (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8));
}

static inline void zero_highest_3_elements_in_vector(vector float *v)
{
    vector signed int zero = (vector signed int) vec_zero();

    *v = (vector float)vec_sel((vector signed int)*v,zero,
                               (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12));
}

static inline void zero_highest_element_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}

static inline void zero_highest_2_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}

static inline void zero_highest_3_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}
static inline void zero_highest_element_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                     vector float *v4,vector float *v5,vector float *v6,
                                                     vector float *v7,vector float *v8,vector float *v9)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
    *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
    *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
    *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
    *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
    *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
    *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}

static inline void zero_highest_2_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                        vector float *v4,vector float *v5,vector float *v6,
                                                        vector float *v7,vector float *v8,vector float *v9)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
    *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
    *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
    *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
    *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
    *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
    *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}

static inline void zero_highest_3_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                        vector float *v4,vector float *v5,vector float *v6,
                                                        vector float *v7,vector float *v8,vector float *v9)
{
    vector signed int zero = (vector signed int) vec_zero();
    vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);

    *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
    *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
    *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
    *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
    *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
    *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
    *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
    *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
    *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}
void inl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],int type[],int ntype,float nbfp[],
                     float Vnb[]);
void inl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],int type[],int ntype,float nbfp[],
                     float Vnb[],float tabscale,float VFtab[]);
void inl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void inl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void inl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void mcinl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],int type[],
                       int ntype,float nbfp[],float Vnb[]);
void mcinl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],int type[],
                       int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);

#endif /* ppc_altivec.h */