/*
 *                 This source code is part of
 *
 *                  G   R   O   M   A   C   S
 *
 *           GROningen MAchine for Chemical Simulations
 *
 * Copyright (c) 1991-2001, University of Groningen, The Netherlands
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Getting the Right Output Means no Artefacts in Calculating Stuff
 */
#ifndef _ppc_altivec_h
#define _ppc_altivec_h

static char *SRCID_ppc_altivec_h = "$Id$";

#include <stdio.h> /* for the printf() in printvec() below */
/* print the contents of a float vector (for debugging) */
static void printvec(vector float v)
{
  int i;

  for(i=0;i<4;i++)
    printf("%8.5f ",*(((float *)&v)+i));
  printf("\n");
}
/* Altivec utility routines */
static void set_non_java_mode(void)
{
  vector unsigned short vsr1,vsr2;
  vector unsigned int tmp;

  /* read the vector status register, set the non-Java (NJ) bit and
   * write it back; non-Java mode flushes denormals to zero, which is
   * much faster on most PowerPC implementations.
   */
  vsr1 = vec_mfvscr();
  tmp  = vec_sl(vec_splat_u32(1),vec_splat_u32(8));
  vsr2 = (vector unsigned short)vec_sl(tmp,vec_splat_u32(8));
  vsr1 = vec_or(vsr1,vsr2);
  vec_mtvscr(vsr1);
}
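
/* Usage sketch (added for exposition, not part of the original header):
 * a hypothetical caller would switch to non-Java mode once at startup,
 * before entering any Altivec kernels.
 */
static inline void example_altivec_init(void)
{
  set_non_java_mode(); /* subsequent kernels run with denormals flushed */
}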
/* Simple numerical constants */
static inline vector float vec_zero(void)
{
  return vec_ctf(vec_splat_u32(0),0);
}

static inline vector float vec_half(void)
{
  return vec_ctf(vec_splat_u32(1),1);
}

static inline vector float vec_one(void)
{
  return vec_ctf(vec_splat_u32(1),0);
}

static inline vector float vec_two(void)
{
  return vec_ctf(vec_splat_u32(2),0);
}

static inline vector float vec_three(void)
{
  return vec_ctf(vec_splat_u32(3),0);
}

static inline vector float vec_six(void)
{
  return vec_ctf(vec_splat_u32(6),0);
}

static inline vector float vec_twelve(void)
{
  return vec_ctf(vec_splat_u32(12),0);
}
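
/* Note (added for exposition): vec_ctf(v,b) converts each integer
 * element to float and scales by 2^-b, so these constants are built
 * entirely in registers - e.g. vec_half() is 1*2^-1 = 0.5 - without
 * touching memory.
 */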
/* load three consecutive floats. We never access the fourth,
 * so this is safe even at the end of an array.
 */
static inline vector float load_xyz(float *address)
{
  vector float c1,c2,c3;
  vector unsigned char perm;

  perm = vec_lvsl( 0, address );
  c1   = vec_lde( 0, address );
  c2   = vec_lde( 4, address );
  c3   = vec_lde( 8, address );
  c1   = vec_perm(c1,c1,perm);
  c2   = vec_perm(c2,c2,perm);
  c3   = vec_perm(c3,c3,perm);
  c2   = vec_sld(c2,c2,4);
  c3   = vec_sld(c3,c3,8);
  c1   = vec_mergeh(c1,c3);

  return vec_mergeh(c1,c2);
}
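
/* Note (added for exposition): vec_lde only fills the vector lane that
 * matches the address alignment, so the vec_lvsl/vec_perm pair rotates
 * each loaded float to the front before the merges assemble [x y z ?];
 * this works for any 4-byte aligned address.
 */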
/* load four consecutive floats (from an unaligned address) */
static inline vector float load_vector_unaligned(float *address)
{
  vector unsigned char perm;
  vector float low,high;

  perm = vec_lvsl( 0, (int *) address );
  low  = vec_ld( 0, address );
  high = vec_ld( 16, address );

  return vec_perm(low,high,perm);
}
/* load a single float and spread it to all vector elements */
static inline vector float load_float_and_splat(float *address)
{
  vector unsigned char perm;
  vector float tmp;

  tmp  = vec_lde(0,address);
  perm = vec_lvsl(0,address);
  tmp  = vec_perm(tmp,tmp,perm);

  return vec_splat(tmp,0);
}
/* load four non-consecutive floats into vector [mem1 mem2 mem3 mem4] */
static inline vector float load_4_float(float *float1,
                                        float *float2,
                                        float *float3,
                                        float *float4)
{
  vector unsigned char xshift = vec_lvsl( 12, float1 );
  vector unsigned char yshift = vec_lvsl( 12, float2 );
  vector unsigned char zshift = vec_lvsl( 0, float3 );
  vector unsigned char wshift = vec_lvsl( 0, float4 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );
  vector float Z = vec_lde( 0, float3 );
  vector float W = vec_lde( 0, float4 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);
  Z = vec_perm( Z, Z, zshift);
  W = vec_perm( W, W, wshift);

  X = vec_mergeh( X, Y );
  Z = vec_mergeh( Z, W );

  return vec_sld( X, Z, 8 );
}
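
/* Usage sketch (added for exposition, not part of the original header):
 * gathering the charges of four neighbour-list atoms into one vector.
 * 'charge' and the jnr indices are hypothetical kernel variables.
 */
static inline vector float example_gather_4_charges(float *charge,
                                                    int jnr1,int jnr2,
                                                    int jnr3,int jnr4)
{
  return load_4_float(charge+jnr1,charge+jnr2,
                      charge+jnr3,charge+jnr4);
}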
/* load three non-consecutive floats into vector [mem1 mem2 mem3 -] */
static inline vector float load_3_float(float *float1,
                                        float *float2,
                                        float *float3)
{
  vector unsigned char xshift = vec_lvsl( 12, float1 );
  vector unsigned char yshift = vec_lvsl( 12, float2 );
  vector unsigned char zshift = vec_lvsl( 0, float3 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );
  vector float Z = vec_lde( 0, float3 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);
  Z = vec_perm( Z, Z, zshift);

  X = vec_mergeh( X, Y );

  return vec_sld( X, Z, 8 );
}
/* load two non-consecutive floats into vector [mem1 mem2 0 0] */
static inline vector float load_2_float(float *float1,
                                        float *float2)
{
  vector unsigned char xshift = vec_lvsl( 8, float1 );
  vector unsigned char yshift = vec_lvsl( 8, float2 );

  vector float X = vec_lde( 0, float1 );
  vector float Y = vec_lde( 0, float2 );

  X = vec_perm( X, X, xshift);
  Y = vec_perm( Y, Y, yshift);

  return vec_mergel( X, Y );
}
/* load one float into vector [mem1 0 0 0] */
static inline vector float load_1_float(float *float1)
{
  vector unsigned char xshift = vec_lvsl( 4, float1 );

  vector float X = vec_lde( 0, float1 );
  X = vec_perm( X, X, xshift);

  return vec_sld(X,vec_zero(),12);
}
/* load lj parameters for four atoms and store in two vectors.
 * The nbfp memory is aligned on an 8-byte boundary and consists
 * of pairs, so the two parameters are either in the upper or
 * lower half of a 16-byte structure.
 */
static inline void load_4_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               float *pair4,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1); /* c6a c12a */
  vector float Y = vec_ld( 0,pair2); /* c6b c12b */
  vector float Z = vec_ld( 0,pair3); /* c6c c12c */
  vector float W = vec_ld( 0,pair4); /* c6d c12d */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);
  vector unsigned char perm3 = vec_lvsl(0,pair3);
  vector unsigned char perm4 = vec_lvsl(0,pair4);

  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);
  Z = vec_perm(Z,Z,perm3);
  W = vec_perm(W,W,perm4);

  X = vec_mergeh(X,Z); /* c6a c6c c12a c12c */
  Y = vec_mergeh(Y,W); /* c6b c6d c12b c12d */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
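
/* Usage sketch (added for exposition, not part of the original header):
 * fetching the LJ c6/c12 parameters of four atom pairs from the nbfp
 * table. 'nbfp' and the tj offsets are hypothetical kernel variables.
 */
static inline void example_gather_4_lj(float *nbfp,
                                       int tj1,int tj2,int tj3,int tj4,
                                       vector float *c6,vector float *c12)
{
  load_4_pair(nbfp+tj1,nbfp+tj2,nbfp+tj3,nbfp+tj4,c6,c12);
}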
/* load lj parameters for three atoms and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_3_pair(float *pair1,
                               float *pair2,
                               float *pair3,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1); /* c6a c12a */
  vector float Y = vec_ld( 0,pair2); /* c6b c12b */
  vector float Z = vec_ld( 0,pair3); /* c6c c12c */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);
  vector unsigned char perm3 = vec_lvsl(0,pair3);

  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);
  Z = vec_perm(Z,Z,perm3);

  X = vec_mergeh(X,Z);          /* c6a c6c c12a c12c */
  Y = vec_mergeh(Y,vec_zero()); /* c6b 0   c12b 0    */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
/* load lj parameters for two atoms and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_2_pair(float *pair1,
                               float *pair2,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1); /* c6a c12a */
  vector float Y = vec_ld( 0,pair2); /* c6b c12b */
  vector unsigned char perm1 = vec_lvsl(0,pair1);
  vector unsigned char perm2 = vec_lvsl(0,pair2);

  X = vec_perm(X,X,perm1);
  Y = vec_perm(Y,Y,perm2);

  X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */
  Y = vec_mergeh(Y,vec_zero()); /* c6b 0 c12b 0 */

  *c6  = vec_mergeh(X,Y);
  *c12 = vec_mergel(X,Y);
}
/* load lj parameters for one atom and store in two vectors.
 * See load_4_pair for alignment requirements.
 */
static inline void load_1_pair(float *pair1,
                               vector float *c6,
                               vector float *c12)
{
  vector float X = vec_ld( 0,pair1); /* c6a c12a */
  vector unsigned char perm1 = vec_lvsl(0,pair1);

  X = vec_perm(X,X,perm1);
  X = vec_mergeh(X,vec_zero()); /* c6a 0 c12a 0 */

  *c6  = vec_mergeh(X,vec_zero());
  *c12 = vec_mergel(X,vec_zero());
}
/* permute [x y z ?] into [x x x x] [y y y y] [z z z z].
 * xreg can be the same as xyzreg, but not yreg/zreg!
 */
static inline void splat_xyz_to_vectors(vector float xyzreg,
                                        vector float *xreg,
                                        vector float *yreg,
                                        vector float *zreg)
{
  *zreg = vec_splat(xyzreg,2);
  *yreg = vec_splat(xyzreg,1);
  *xreg = vec_splat(xyzreg,0);
}
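
/* Usage sketch (added for exposition, not part of the original header):
 * gather one coordinate triplet and broadcast its components, e.g. for
 * the i-particle of an inner loop.
 */
static inline void example_splat_iatom(float *xyz,
                                       vector float *ix,
                                       vector float *iy,
                                       vector float *iz)
{
  vector float v = load_xyz(xyz);   /* [ x y z ? ]    */
  splat_xyz_to_vectors(v,ix,iy,iz); /* [x x x x] etc. */
}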
/* Some of the following transpose routines do not fill the unused
 * output elements with zero padding (those elements are marked
 * '?' or '-' in the comments), but the rest do.
 */
/* move 4*[x y z ?] into [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] */
static inline void transpose_4_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float xyz3,
                                    vector float xyz4,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  *yvector = vec_mergeh(xyz1,xyz3);         /* [x1 x3 y1 y3] */
  *zvector = vec_mergeh(xyz2,xyz4);         /* [x2 x4 y2 y4] */
  xyz1     = vec_mergel(xyz1,xyz3);         /* [z1 z3  ?  ?] */
  xyz2     = vec_mergel(xyz2,xyz4);         /* [z2 z4  ?  ?] */
  *xvector = vec_mergeh(*yvector,*zvector); /* [x1 x2 x3 x4] */
  *yvector = vec_mergel(*yvector,*zvector); /* [y1 y2 y3 y4] */
  *zvector = vec_mergeh(xyz1,xyz2);         /* [z1 z2 z3 z4] */
}
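
/* Usage sketch (added for exposition, not part of the original header):
 * convert four coordinate triplets, loaded with load_xyz, from
 * array-of-structures to structure-of-arrays form so that distances
 * can be computed for four atoms at once.
 */
static inline void example_4_xyz_to_soa(float *p1,float *p2,
                                        float *p3,float *p4,
                                        vector float *x,
                                        vector float *y,
                                        vector float *z)
{
  transpose_4_to_3(load_xyz(p1),load_xyz(p2),
                   load_xyz(p3),load_xyz(p4),x,y,z);
}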
/* move 2*[x y z ?] into [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] */
static inline void transpose_2_to_3(vector float xyz1,
                                    vector float xyz2,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  *xvector = vec_mergeh(xyz1,xyz2);        /* x1 x2 y1 y2 */
  *zvector = vec_mergel(xyz1,xyz2);        /* z1 z2  ?  ? */
  *yvector = vec_sld(*xvector,*xvector,8); /* y1 y2 0 0   */
}
/* move [x y z ?] into [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?] */
static inline void transpose_1_to_3(vector float xyz1,
                                    vector float *xvector,
                                    vector float *yvector,
                                    vector float *zvector)
{
  /* simply use splat, since elements 2,3,4 don't matter. */
  *xvector = vec_splat(xyz1,0); /* x1 x1 x1 x1 */
  *yvector = vec_splat(xyz1,1); /* y1 y1 y1 y1 */
  *zvector = vec_splat(xyz1,2); /* z1 z1 z1 z1 */
}
/* move [x1 x2 x3 x4] [y1 y2 y3 y4] [z1 z2 z3 z4] to 4*[x y z 0] */
static inline void transpose_3_to_4(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2,
                                    vector float *xyz3,
                                    vector float *xyz4)
{
  vector float tmp1,tmp2;
  vector float nul=vec_zero();

  *xyz2 = vec_mergeh(xvector,zvector); /* [x1 z1 x2 z2 ] */
  tmp1  = vec_mergeh(yvector,nul);     /* [y1 0  y2 0  ] */
  *xyz4 = vec_mergel(xvector,zvector); /* [x3 z3 x4 z4 ] */
  tmp2  = vec_mergel(yvector,nul);     /* [y3 0  y4 0  ] */

  *xyz1 = vec_mergeh(*xyz2,tmp1);      /* [x1 y1 z1 0  ] */
  *xyz2 = vec_mergel(*xyz2,tmp1);      /* [x2 y2 z2 0  ] */
  *xyz3 = vec_mergeh(*xyz4,tmp2);      /* [x3 y3 z3 0  ] */
  *xyz4 = vec_mergel(*xyz4,tmp2);      /* [x4 y4 z4 0  ] */
}
/* move [x1 x2 ? ?] [y1 y2 ? ?] [z1 z2 ? ?] to 2*[x y z 0] */
static inline void transpose_3_to_2(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1,
                                    vector float *xyz2)
{
  vector float tmp1;

  *xyz2 = vec_mergeh(xvector,zvector);    /* [x1 z1 x2 z2 ] */
  tmp1  = vec_mergeh(yvector,vec_zero()); /* [y1 0  y2 0  ] */

  *xyz1 = vec_mergeh(*xyz2,tmp1);         /* [x1 y1 z1 0  ] */
  *xyz2 = vec_mergel(*xyz2,tmp1);         /* [x2 y2 z2 0  ] */
}
/* move [x1 ? ? ?] [y1 ? ? ?] [z1 ? ? ?] to 1*[x y z 0] */
static inline void transpose_3_to_1(vector float xvector,
                                    vector float yvector,
                                    vector float zvector,
                                    vector float *xyz1)
{
  *xyz1   = vec_mergeh(xvector,zvector);    /* [x1 z1 ? ? ] */
  yvector = vec_mergeh(yvector,vec_zero()); /* [y1 0  ? 0 ] */
  *xyz1   = vec_mergeh(*xyz1,yvector);      /* [x1 y1 z1 0] */
}
/* full 4x4 transpose of 4*[x y z w] */
static inline void transpose_4_to_4(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out2 = vec_mergeh(in1,in3);     /* [x1 x3 y1 y3] */
  *out3 = vec_mergeh(in2,in4);     /* [x2 x4 y2 y4] */
  in1   = vec_mergel(in1,in3);     /* [z1 z3 w1 w3] */
  in2   = vec_mergel(in2,in4);     /* [z2 z4 w2 w4] */

  *out1 = vec_mergeh(*out2,*out3); /* [x1 x2 x3 x4] */
  *out2 = vec_mergel(*out2,*out3); /* [y1 y2 y3 y4] */
  *out3 = vec_mergeh(in1,in2);     /* [z1 z2 z3 z4] */
  *out4 = vec_mergel(in1,in2);     /* [w1 w2 w3 w4] */
}
/* move 4*[x y ? ?] into [x1 x2 x3 x4] [y1 y2 y3 y4] */
static inline void transpose_4_to_2(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1,
                                    vector float *out2)
{
  vector float tmp;

  tmp   = vec_mergeh(in1,in3);   /* [x1 x3 y1 y3] */
  *out2 = vec_mergeh(in2,in4);   /* [x2 x4 y2 y4] */
  *out1 = vec_mergeh(tmp,*out2); /* [x1 x2 x3 x4] */
  *out2 = vec_mergel(tmp,*out2); /* [y1 y2 y3 y4] */
}
/* move 4*[x ? ? ?] into [x1 x2 x3 x4] */
static inline void transpose_4_to_1(vector float in1,
                                    vector float in2,
                                    vector float in3,
                                    vector float in4,
                                    vector float *out1)
{
  vector float tmp;

  tmp   = vec_mergeh(in1,in3);   /* [x1 x3 ? ?] */
  *out1 = vec_mergeh(in2,in4);   /* [x2 x4 ? ?] */
  *out1 = vec_mergeh(tmp,*out1); /* [x1 x2 x3 x4] */
}
/* move 2*[x y z w] into [x1 x2 . .] [y1 y2 . .] [z1 z2 . .] [w1 w2 . .] */
static inline void transpose_2_to_4(vector float in1,
                                    vector float in2,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out1 = vec_mergeh(in1,in2);    /* [x1 x2 y1 y2] */
  *out3 = vec_mergel(in1,in2);    /* [z1 z2 w1 w2] */
  *out2 = vec_sld(*out1,*out1,8); /* [y1 y2 0 0] */
  *out4 = vec_sld(*out3,*out3,8); /* [w1 w2 0 0] */
}
/* splat [x y z w] into [x x x x] [y y y y] [z z z z] [w w w w] */
static inline void transpose_1_to_4(vector float in1,
                                    vector float *out1,
                                    vector float *out2,
                                    vector float *out3,
                                    vector float *out4)
{
  *out1 = vec_splat(in1,0); /* [x1 x1 x1 x1] */
  *out2 = vec_splat(in1,1); /* [y1 y1 y1 y1] */
  *out3 = vec_splat(in1,2); /* [z1 z1 z1 z1] */
  *out4 = vec_splat(in1,3); /* [w1 w1 w1 w1] */
}
/* Add the contents in xyz to an unaligned address location. */
static inline void add_xyz_to_mem(float *address,vector float vdata)
{
  vector float c1,c2,c3;

  c1 = vec_lde( 0, address);
  c2 = vec_lde( 4, address);
  c3 = vec_lde( 8, address);
  c1 = vec_add(c1,vec_splat(vdata,0));
  c2 = vec_add(c2,vec_splat(vdata,1));
  c3 = vec_add(c3,vec_splat(vdata,2));
  vec_ste( c1, 0, address);
  vec_ste( c2, 4, address);
  vec_ste( c3, 8, address);
}
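
/* Usage sketch (added for exposition, not part of the original header):
 * write back per-atom forces held in SoA vectors - transpose them to
 * [x y z 0] quadruplets, then scatter-add each triplet into its
 * (unaligned) slot of the force array.
 */
static inline void example_scatter_4_forces(float *f1,float *f2,
                                            float *f3,float *f4,
                                            vector float fx,
                                            vector float fy,
                                            vector float fz)
{
  vector float v1,v2,v3,v4;

  transpose_3_to_4(fx,fy,fz,&v1,&v2,&v3,&v4);
  add_xyz_to_mem(f1,v1);
  add_xyz_to_mem(f2,v2);
  add_xyz_to_mem(f3,v3);
  add_xyz_to_mem(f4,v4);
}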
/* add all four elements of a vector to a single float in memory */
static inline void add_vector_to_float(float *address,vector float vdata)
{
  vector float tmp;

  tmp   = vec_sld(vdata,vdata,8); /* rotate by two elements and add:   */
  vdata = vec_add(vdata,tmp);     /* [a+c b+d c+a d+b]                 */
  tmp   = vec_sld(vdata,vdata,4); /* rotate by one more element and add */
  vdata = vec_add(vdata,tmp);
  tmp   = vec_lde(0,address);
  /* all four positions in vdata contain the sum */
  tmp   = vec_add(tmp,vdata);
  vec_ste(tmp,0,address);
}
/* load an entire water molecule to vectors. Water is special;
 * the allocated memory is 8-byte aligned, so with nine atoms
 * we can always read 3 16-byte chunks (rounded down by vec_ld)
 * without going outside the current page. The extra elements
 * before and after the water don't matter as long as we don't
 * try to write to them.
 */
static inline void load_1_water_shift_and_splat(float *address,
                                                float *shiftvec,
                                                vector float *Ox,
                                                vector float *Oy,
                                                vector float *Oz,
                                                vector float *H1x,
                                                vector float *H1y,
                                                vector float *H1z,
                                                vector float *H2x,
                                                vector float *H2y,
                                                vector float *H2z)
{
  vector unsigned char perm;
  vector float c1,c2,c3,sha,shb,sh1,sh2,sh3;

  /* load the shift vector */
  shb  = load_xyz(shiftvec);           /* [ shX shY shZ  -  ] */
  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address );
  c1   = vec_ld(  0, address );
  c2   = vec_ld( 16, address );
  c3   = vec_ld( 32, address );

  sha  = vec_sld(shb,shb,12);          /* [  -  shX shY shZ ] */

  sh1  = vec_sld(sha,shb,4);           /* [ shX shY shZ shX ] */
  sh2  = vec_sld(sha,shb,8);           /* [ shY shZ shX shY ] */
  sh3  = vec_sld(sha,shb,12);          /* [ shZ shX shY shZ ] */

  c1   = vec_perm(c1,c2,perm);         /* Ox  Oy  Oz  H1x */
  c2   = vec_perm(c2,c3,perm);         /* H1y H1z H2x H2y */
  c3   = vec_perm(c3,c3,perm);         /* H2z  -   -   -  */
  c1   = vec_add(c1,sh1);
  c2   = vec_add(c2,sh2);
  c3   = vec_add(c3,sh3);

  *Ox  = vec_splat(c1,0);
  *Oy  = vec_splat(c1,1);
  *Oz  = vec_splat(c1,2);
  *H1x = vec_splat(c1,3);
  *H1y = vec_splat(c2,0);
  *H1z = vec_splat(c2,1);
  *H2x = vec_splat(c2,2);
  *H2y = vec_splat(c2,3);
  *H2z = vec_splat(c3,0);
}
static inline void load_4_water(float *address1,
                                float *address2,
                                float *address3,
                                float *address4,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;
  vector float c1c,c2c,c3c;
  vector float c1d,c2d,c3d;

  /* load the coordinates */
  perm = vec_lvsl( 0, address1 );
  tmp1 = vec_ld(  0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm); /* Oxa  Oya  Oza  H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm); /* H2za  -    -    -   */

  perm = vec_lvsl( 0, address2 );
  tmp1 = vec_ld(  0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm); /* Oxb  Oyb  Ozb  H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm); /* H2zb  -    -    -   */

  perm = vec_lvsl( 0, address3 );
  tmp1 = vec_ld(  0, address3 );
  tmp2 = vec_ld( 16, address3 );
  tmp3 = vec_ld( 32, address3 );
  c1c  = vec_perm(tmp1,tmp2,perm); /* Oxc  Oyc  Ozc  H1xc */
  c2c  = vec_perm(tmp2,tmp3,perm); /* H1yc H1zc H2xc H2yc */
  c3c  = vec_perm(tmp3,tmp3,perm); /* H2zc  -    -    -   */

  perm = vec_lvsl( 0, address4 );
  tmp1 = vec_ld(  0, address4 );
  tmp2 = vec_ld( 16, address4 );
  tmp3 = vec_ld( 32, address4 );
  c1d  = vec_perm(tmp1,tmp2,perm); /* Oxd  Oyd  Ozd  H1xd */
  c2d  = vec_perm(tmp2,tmp3,perm); /* H1yd H1zd H2xd H2yd */
  c3d  = vec_perm(tmp3,tmp3,perm); /* H2zd  -    -    -   */

  /* permute into SoA form */
  tmp1 = vec_mergeh(c1a,c1c);  /* Oxa Oxc Oya Oyc     */
  c1a  = vec_mergel(c1a,c1c);  /* Oza Ozc H1xa H1xc   */
  c1c  = vec_mergeh(c1b,c1d);  /* Oxb Oxd Oyb Oyd     */
  c1b  = vec_mergel(c1b,c1d);  /* Ozb Ozd H1xb H1xd   */

  c1d  = vec_mergeh(c2a,c2c);  /* H1ya H1yc H1za H1zc */
  c2a  = vec_mergel(c2a,c2c);  /* H2xa H2xc H2ya H2yc */
  c2c  = vec_mergeh(c2b,c2d);  /* H1yb H1yd H1zb H1zd */
  c2b  = vec_mergel(c2b,c2d);  /* H2xb H2xd H2yb H2yd */

  c3a  = vec_mergeh(c3a,c3c);  /* H2za H2zc - -       */
  c3b  = vec_mergeh(c3b,c3d);  /* H2zb H2zd - -       */

  *Ox  = vec_mergeh(tmp1,c1c); /* Oxa Oxb Oxc Oxd     */
  *Oy  = vec_mergel(tmp1,c1c); /* Oya Oyb Oyc Oyd     */
  *Oz  = vec_mergeh(c1a,c1b);  /* Oza Ozb Ozc Ozd     */
  *H1x = vec_mergel(c1a,c1b);  /* H1xa H1xb H1xc H1xd */
  *H1y = vec_mergeh(c1d,c2c);  /* H1ya H1yb H1yc H1yd */
  *H1z = vec_mergel(c1d,c2c);  /* H1za H1zb H1zc H1zd */
  *H2x = vec_mergeh(c2a,c2b);  /* H2xa H2xb H2xc H2xd */
  *H2y = vec_mergel(c2a,c2b);  /* H2ya H2yb H2yc H2yd */
  *H2z = vec_mergeh(c3a,c3b);  /* H2za H2zb H2zc H2zd */
}
static inline void load_3_water(float *address1,
                                float *address2,
                                float *address3,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;
  vector float c1c,c2c,c3c;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld(  0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm); /* Oxa  Oya  Oza  H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm); /* H2za  -    -    -   */

  perm = vec_lvsl( 0, (int *) address3 );
  tmp1 = vec_ld(  0, address3 );
  tmp2 = vec_ld( 16, address3 );
  tmp3 = vec_ld( 32, address3 );
  c1c  = vec_perm(tmp1,tmp2,perm); /* Oxc  Oyc  Ozc  H1xc */
  c2c  = vec_perm(tmp2,tmp3,perm); /* H1yc H1zc H2xc H2yc */
  c3c  = vec_perm(tmp3,tmp3,perm); /* H2zc  -    -    -   */

  perm = vec_lvsl( 0, (int *) address2 );
  tmp1 = vec_ld(  0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm); /* Oxb  Oyb  Ozb  H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm); /* H2zb  -    -    -   */

  /* permute into SoA form */
  tmp1 = vec_mergeh(c1a,c1c);  /* Oxa Oxc Oya Oyc     */
  c1a  = vec_mergel(c1a,c1c);  /* Oza Ozc H1xa H1xc   */
  c1c  = vec_mergeh(c1b,c1b);  /* Oxb -   Oyb  -      */
  c1b  = vec_mergel(c1b,c1b);  /* Ozb -   H1xb -      */

  tmp2 = vec_mergeh(c2a,c2c);  /* H1ya H1yc H1za H1zc */
  c2a  = vec_mergel(c2a,c2c);  /* H2xa H2xc H2ya H2yc */
  c2c  = vec_mergeh(c2b,c2b);  /* H1yb -   H1zb -     */
  c2b  = vec_mergel(c2b,c2b);  /* H2xb -   H2yb -     */

  c3a  = vec_mergeh(c3a,c3c);  /* H2za H2zc - -       */

  *Ox  = vec_mergeh(tmp1,c1c); /* Oxa Oxb Oxc -       */
  *Oy  = vec_mergel(tmp1,c1c); /* Oya Oyb Oyc -       */
  *Oz  = vec_mergeh(c1a,c1b);  /* Oza Ozb Ozc -       */
  *H1x = vec_mergel(c1a,c1b);  /* H1xa H1xb H1xc -    */
  *H1y = vec_mergeh(tmp2,c2c); /* H1ya H1yb H1yc -    */
  *H1z = vec_mergel(tmp2,c2c); /* H1za H1zb H1zc -    */
  *H2x = vec_mergeh(c2a,c2b);  /* H2xa H2xb H2xc -    */
  *H2y = vec_mergel(c2a,c2b);  /* H2ya H2yb H2yc -    */
  *H2z = vec_mergeh(c3a,c3b);  /* H2za H2zb H2zc -    */
}
static inline void load_2_water(float *address1,
                                float *address2,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;
  vector float c1a,c2a,c3a;
  vector float c1b,c2b,c3b;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld(  0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  c1a  = vec_perm(tmp1,tmp2,perm); /* Oxa  Oya  Oza  H1xa */
  c2a  = vec_perm(tmp2,tmp3,perm); /* H1ya H1za H2xa H2ya */
  c3a  = vec_perm(tmp3,tmp3,perm); /* H2za  -    -    -   */

  perm = vec_lvsl( 0, (int *) address2 );
  tmp1 = vec_ld(  0, address2 );
  tmp2 = vec_ld( 16, address2 );
  tmp3 = vec_ld( 32, address2 );
  c1b  = vec_perm(tmp1,tmp2,perm); /* Oxb  Oyb  Ozb  H1xb */
  c2b  = vec_perm(tmp2,tmp3,perm); /* H1yb H1zb H2xb H2yb */
  c3b  = vec_perm(tmp3,tmp3,perm); /* H2zb  -    -    -   */

  /* never mind what we get in the two remaining elements */
  *Ox  = vec_mergeh(c1a,c1b); /* Oxa Oxb Oya Oyb     */
  *H1y = vec_mergeh(c2a,c2b); /* H1ya H1yb H1za H1zb */
  *H2z = vec_mergeh(c3a,c3b); /* H2za H2zb - -       */
  *Oz  = vec_mergel(c1a,c1b); /* Oza Ozb H1xa H1xb   */
  *H2x = vec_mergel(c2a,c2b); /* H2xa H2xb H2ya H2yb */
  *Oy  = vec_sld(*Ox,*Ox,8);
  *H1z = vec_sld(*H1y,*H1y,8);
  *H1x = vec_sld(*Oz,*Oz,8);
  *H2y = vec_sld(*H2x,*H2x,8);
}
static inline void load_1_water(float *address1,
                                vector float *Ox,
                                vector float *Oy,
                                vector float *Oz,
                                vector float *H1x,
                                vector float *H1y,
                                vector float *H1z,
                                vector float *H2x,
                                vector float *H2y,
                                vector float *H2z)
{
  vector unsigned char perm;
  vector float tmp1,tmp2,tmp3;

  /* load the coordinates */
  perm = vec_lvsl( 0, (int *) address1 );
  tmp1 = vec_ld(  0, address1 );
  tmp2 = vec_ld( 16, address1 );
  tmp3 = vec_ld( 32, address1 );
  *Ox  = vec_perm(tmp1,tmp2,perm); /* Ox  Oy  Oz  H1x */
  *H1y = vec_perm(tmp2,tmp3,perm); /* H1y H1z H2x H2y */
  *H2z = vec_perm(tmp3,tmp3,perm); /* H2z  -   -   -  */

  /* just splat things... never mind that we fill all cells :-) */
  *Oy  = vec_splat(*Ox,1);
  *Oz  = vec_splat(*Ox,2);
  *H1x = vec_splat(*Ox,3);
  *H1z = vec_splat(*H1y,1);
  *H2x = vec_splat(*H1y,2);
  *H2y = vec_splat(*H1y,3);
}
static inline void update_i_water_forces(float *water,
                                         float *shiftvec,
                                         vector float Ox,
                                         vector float Oy,
                                         vector float Oz,
                                         vector float H1x,
                                         vector float H1y,
                                         vector float H1z,
                                         vector float H2x,
                                         vector float H2y,
                                         vector float H2z)
{
  vector float l1,l2,l3;
  vector unsigned char perm;
  vector unsigned int mask,ox00;
  vector float nul=vec_zero();

  ox00=(vector unsigned int)vec_splat_s32(0);

  perm = vec_lvsr( 0, water );
  l1   = vec_ld(  0, water );
  l2   = vec_ld( 16, water );
  l3   = vec_ld( 32, water );
  mask = vec_perm(ox00,
                  (vector unsigned int)vec_splat_s32(-1),perm);

  /* accumulate the forces */
  Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));    /* Ox  Ox'  -  -  */
  Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));    /* Oy  Oy'  -  -  */
  Oz  = vec_add(Oz,vec_sld(Oz,Oz,8));    /* Oz  Oz'  -  -  */
  H1x = vec_add(H1x,vec_sld(H1x,H1x,8)); /* H1x H1x' -  -  */
  H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H1y' -  -  */
  H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H1z' -  -  */
  H2x = vec_add(H2x,vec_sld(H2x,H2x,8)); /* H2x H2x' -  -  */
  H2y = vec_add(H2y,vec_sld(H2y,H2y,8)); /* H2y H2y' -  -  */
  H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z H2z' -  -  */

  Ox  = vec_mergeh(Ox,Oz);   /* Ox  Oz  Ox'  Oz'  */
  Oy  = vec_mergeh(Oy,H1x);  /* Oy  H1x Oy'  H1x' */
  H1y = vec_mergeh(H1y,H2x); /* H1y H2x H1y' H2x' */
  H1z = vec_mergeh(H1z,H2y); /* H1z H2y H1z' H2y' */
  H2z = vec_mergeh(H2z,nul); /* H2z 0   H2z' 0    */

  Ox  = vec_add(Ox,vec_sld(Ox,Ox,8));    /* Ox  Oz  - - */
  Oy  = vec_add(Oy,vec_sld(Oy,Oy,8));    /* Oy  H1x - - */
  H1y = vec_add(H1y,vec_sld(H1y,H1y,8)); /* H1y H2x - - */
  H1z = vec_add(H1z,vec_sld(H1z,H1z,8)); /* H1z H2y - - */
  H2z = vec_add(H2z,vec_sld(H2z,H2z,8)); /* H2z 0   ? 0 */

  Ox  = vec_mergeh(Ox,Oy);   /* Ox  Oy  Oz  H1x */
  H1y = vec_mergeh(H1y,H1z); /* H1y H1z H2x H2y */
  H2z = vec_mergeh(H2z,nul); /* H2z 0   0   0   */

  Oy  = vec_sld(nul,Ox,12);  /* 0   Ox  Oy  Oz  */
  Oz  = vec_sld(Ox,H1y,8);   /* -   H1x H1y H1z */
  H1x = vec_sld(H1y,H2z,4);  /* -   H2x H2y H2z */

  H2x = vec_perm(nul,Ox,perm);  /* The part to add to l1 */
  H2y = vec_perm(Ox,H1y,perm);  /* The part to add to l2 */
  H2z = vec_perm(H1y,H2z,perm); /* The part to add to l3 */

  H2x = vec_add(l1,H2x);
  H2y = vec_add(l2,H2y);
  H2z = vec_add(l3,H2z);

  H2x  = vec_sel(l1,H2x,mask);
  mask = vec_sld(ox00,mask,12);
  H2z  = vec_sel(H2z,l3,mask);

  /* store the updated forces */
  vec_st(H2x,  0, water);
  vec_st(H2y, 16, water);
  vec_st(H2z, 32, water);

  /* add the three per-atom rows (O, H1, H2) into the shift force */
  Oy = vec_add(Oy,Oz);
  Oy = vec_add(Oy,H1x);
  Oy = vec_sld(Oy,nul,4); /* x y z 0 */

  add_xyz_to_mem(shiftvec,Oy);
}
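
/* Note (added for exposition): the first and last 16-byte stores in
 * update_i_water_forces() above can cover floats that belong to
 * neighbouring atoms, so vec_sel() with a mask built via vec_lvsr()
 * splices the nine updated values into the previously loaded contents
 * instead of overwriting the whole vectors.
 */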
static inline void add_force_to_4_water(float *address1,
                                        float *address2,
                                        float *address3,
                                        float *address4,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high,low2,medium2,high2;
  vector unsigned char perm,perm2;
  vector unsigned int mask,mask2,oxFF,ox00;
  vector float tmp1,tmp2,tmp3;
  vector float nul=vec_zero();

  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  /* We have to be careful here since water molecules will sometimes
   * be adjacent in memory. When we update the forces on water 1,
   * we usually read & write back some extra neighboring cells in
   * the vector. This means we cannot first read all data, update
   * it and write it out again.
   *
   * Instead of doing the molecules one by one, we rely on the
   * fact that the list will be ordered, so water molecules
   * 1 & 3 can be done first (they cannot be adjacent), and then
   * waters 2 & 4.
   */
  tmp1 = vec_mergeh(Ox,Oz);   /* Oxa  Oza  Oxb  Ozb  */
  Ox   = vec_mergel(Ox,Oz);   /* Oxc  Ozc  Oxd  Ozd  */
  Oz   = vec_mergeh(Oy,H1x);  /* Oya  H1xa Oyb  H1xb */
  Oy   = vec_mergel(Oy,H1x);  /* Oyc  H1xc Oyd  H1xd */
  H1x  = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
  H1y  = vec_mergel(H1y,H2x); /* H1yc H2xc H1yd H2xd */

  H2x  = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
  H1z  = vec_mergel(H1z,H2y); /* H1zc H2yc H1zd H2yd */
  H2y  = vec_mergeh(H2z,nul); /* H2za 0    H2zb 0    */
  H2z  = vec_mergel(H2z,nul); /* H2zc 0    H2zd 0    */

  tmp2 = vec_mergeh(tmp1,Oz); /* Oxa  Oya  Oza  H1xa */
  Oz   = vec_mergel(tmp1,Oz); /* Oxb  Oyb  Ozb  H1xb */
  tmp1 = vec_mergeh(Ox,Oy);   /* Oxc  Oyc  Ozc  H1xc */
  Ox   = vec_mergel(Ox,Oy);   /* Oxd  Oyd  Ozd  H1xd */
  Oy   = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
  H2x  = vec_mergeh(H1y,H1z); /* H1yc H1zc H2xc H2yc */
  H1y  = vec_mergel(H1y,H1z); /* H1yd H1zd H2xd H2yd */
  H1z  = vec_mergeh(H2y,nul); /* H2za 0    0    0    */
  H2y  = vec_mergel(H2y,nul); /* H2zb 0    0    0    */
  tmp3 = vec_mergeh(H2z,nul); /* H2zc 0    0    0    */
  H2z  = vec_mergel(H2z,nul); /* H2zd 0    0    0    */

  /* move into position, load and add */
  perm    = vec_lvsr( 0, (int *) address1 );
  perm2   = vec_lvsr( 0, (int *) address3 );
  low     = vec_ld(  0, address1 );
  low2    = vec_ld(  0, address3 );
  medium  = vec_ld( 16, address1 );
  medium2 = vec_ld( 16, address3 );
  high    = vec_ld( 32, address1 );
  high2   = vec_ld( 32, address3 );
  mask    = vec_perm(ox00,oxFF,perm);
  mask2   = vec_perm(ox00,oxFF,perm2);

  low  = vec_add(vec_perm(nul,tmp2,perm),low);
  low2 = vec_add(vec_perm(nul,tmp1,perm2),low2);

  tmp2 = vec_add(vec_perm(tmp2,Oy,perm),medium);
  tmp1 = vec_add(vec_perm(tmp1,H2x,perm2),medium2);

  Oy   = vec_add(vec_perm(Oy,H1z,perm),high);
  H2x  = vec_add(vec_perm(H2x,tmp3,perm2),high2);

  vec_st(vec_sel(low,low,mask),    0, address1);
  vec_st(vec_sel(low2,low2,mask2), 0, address3);

  mask  = vec_sld(ox00,mask,12);
  mask2 = vec_sld(ox00,mask2,12);

  vec_st(tmp2, 16, address1);
  vec_st(tmp1, 16, address3);

  vec_st(vec_sel(Oy,high,mask),    32, address1);
  vec_st(vec_sel(H2x,high2,mask2), 32, address3);

  /* Finished 1 & 3 - now do 2 & 4 */
  perm    = vec_lvsr( 0, (int *) address2 );
  perm2   = vec_lvsr( 0, (int *) address4 );

  low     = vec_ld(  0, address2 );
  low2    = vec_ld(  0, address4 );
  medium  = vec_ld( 16, address2 );
  medium2 = vec_ld( 16, address4 );
  high    = vec_ld( 32, address2 );
  high2   = vec_ld( 32, address4 );
  mask    = vec_perm(ox00,oxFF,perm);
  mask2   = vec_perm(ox00,oxFF,perm2);
  H1z     = vec_add(vec_perm(nul,Oz,perm),low);
  H2x     = vec_add(vec_perm(nul,Ox,perm2),low2);
  Oz      = vec_add(vec_perm(Oz,H1x,perm),medium);
  Ox      = vec_add(vec_perm(Ox,H1y,perm2),medium2);
  H1x     = vec_add(vec_perm(H1x,H2y,perm),high);
  H1y     = vec_add(vec_perm(H1y,H2z,perm2),high2);
  vec_st(vec_sel(low,H1z,mask),   0, address2);
  vec_st(vec_sel(low2,H2x,mask2), 0, address4);
  mask  = vec_sld(ox00,mask,12);
  mask2 = vec_sld(ox00,mask2,12);
  vec_st(Oz, 16, address2);
  vec_st(Ox, 16, address4);
  vec_st(vec_sel(H1x,high,mask),   32, address2);
  vec_st(vec_sel(H1y,high2,mask2), 32, address4);
}
static inline void add_force_to_3_water(float *address1,
                                        float *address2,
                                        float *address3,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2,tmp3;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);   /* Oxa  Oza  Oxb  Ozb  */
  Ox   = vec_mergel(Ox,Oz);   /* Oxc  Ozc  ?    ?    */
  Oz   = vec_mergeh(Oy,H1x);  /* Oya  H1xa Oyb  H1xb */
  Oy   = vec_mergel(Oy,H1x);  /* Oyc  H1xc ?    ?    */
  H1x  = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
  H1y  = vec_mergel(H1y,H2x); /* H1yc H2xc ?    ?    */
  H2x  = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
  H1z  = vec_mergel(H1z,H2y); /* H1zc H2yc ?    ?    */
  H2y  = vec_mergeh(H2z,nul); /* H2za 0    H2zb 0    */
  H2z  = vec_mergel(H2z,nul); /* H2zc 0    ?    0    */

  tmp2 = vec_mergeh(tmp1,Oz); /* Oxa  Oya  Oza  H1xa */
  Oz   = vec_mergel(tmp1,Oz); /* Oxb  Oyb  Ozb  H1xb */
  tmp1 = vec_mergeh(Ox,Oy);   /* Oxc  Oyc  Ozc  H1xc */
  Oy   = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
  H2x  = vec_mergeh(H1y,H1z); /* H1yc H1zc H2xc H2yc */
  H1z  = vec_mergeh(H2y,nul); /* H2za 0    0    0    */
  H2y  = vec_mergel(H2y,nul); /* H2zb 0    0    0    */
  tmp3 = vec_mergeh(H2z,nul); /* H2zc 0    0    0    */

  /* move into position, load and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld(  0, address1 );
  medium = vec_ld( 16, address1 );
  high   = vec_ld( 32, address1 );
  mask   = vec_perm(ox00,oxFF,perm);
  H1y    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H1y,mask),  0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);

  perm   = vec_lvsr( 0, (int *) address2 );
  low    = vec_ld(  0, address2 );
  medium = vec_ld( 16, address2 );
  high   = vec_ld( 32, address2 );
  mask   = vec_perm(ox00,oxFF,perm);
  H1z    = vec_add(vec_perm(nul,Oz,perm),low);
  Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
  H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
  vec_st(vec_sel(low,H1z,mask),  0, address2);
  mask   = vec_sld(ox00,mask,12);
  vec_st(Oz, 16, address2);
  vec_st(vec_sel(H1x,high,mask), 32, address2);

  perm   = vec_lvsr( 0, (int *) address3 );
  low    = vec_ld(  0, address3 );
  medium = vec_ld( 16, address3 );
  high   = vec_ld( 32, address3 );
  mask   = vec_perm(ox00,oxFF,perm);
  H2y    = vec_add(vec_perm(nul,tmp1,perm),low);
  tmp1   = vec_add(vec_perm(tmp1,H2x,perm),medium);
  H2x    = vec_add(vec_perm(H2x,tmp3,perm),high);
  vec_st(vec_sel(low,H2y,mask),  0, address3);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp1, 16, address3);
  vec_st(vec_sel(H2x,high,mask), 32, address3);
}
static inline void add_force_to_2_water(float *address1,
                                        float *address2,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);   /* Oxa  Oza  Oxb  Ozb  */
  Oz   = vec_mergeh(Oy,H1x);  /* Oya  H1xa Oyb  H1xb */
  H1x  = vec_mergeh(H1y,H2x); /* H1ya H2xa H1yb H2xb */
  H2x  = vec_mergeh(H1z,H2y); /* H1za H2ya H1zb H2yb */
  H2y  = vec_mergeh(H2z,nul); /* H2za 0    H2zb 0    */

  tmp2 = vec_mergeh(tmp1,Oz); /* Oxa  Oya  Oza  H1xa */
  Oz   = vec_mergel(tmp1,Oz); /* Oxb  Oyb  Ozb  H1xb */
  Oy   = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
  H1x  = vec_mergel(H1x,H2x); /* H1yb H1zb H2xb H2yb */
  H1z  = vec_mergeh(H2y,nul); /* H2za 0    0    0    */
  H2y  = vec_mergel(H2y,nul); /* H2zb 0    0    0    */

  /* move into position and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld(  0, address1 );
  medium = vec_ld( 16, address1 );
  high   = vec_ld( 32, address1 );
  mask   = vec_perm(ox00,oxFF,perm);
  H2x    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H2x,mask),  0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);

  perm   = vec_lvsr( 0, (int *) address2 );
  low    = vec_ld(  0, address2 );
  medium = vec_ld( 16, address2 );
  high   = vec_ld( 32, address2 );
  mask   = vec_perm(ox00,oxFF,perm);
  H1z    = vec_add(vec_perm(nul,Oz,perm),low);
  Oz     = vec_add(vec_perm(Oz,H1x,perm),medium);
  H1x    = vec_add(vec_perm(H1x,H2y,perm),high);
  vec_st(vec_sel(low,H1z,mask),  0, address2);
  mask   = vec_sld(ox00,mask,12);
  vec_st(Oz, 16, address2);
  vec_st(vec_sel(H1x,high,mask), 32, address2);
}
static inline void add_force_to_1_water(float *address1,
                                        vector float Ox,
                                        vector float Oy,
                                        vector float Oz,
                                        vector float H1x,
                                        vector float H1y,
                                        vector float H1z,
                                        vector float H2x,
                                        vector float H2y,
                                        vector float H2z)
{
  vector float low,medium,high;
  vector unsigned char perm;
  vector unsigned int mask,oxFF,ox00;
  vector float tmp1,tmp2;

  vector float nul=vec_zero();
  oxFF=(vector unsigned int)vec_splat_s32(-1);
  ox00=(vector unsigned int)vec_splat_s32(0);

  tmp1 = vec_mergeh(Ox,Oz);   /* Oxa  Oza  ? ? */
  Oz   = vec_mergeh(Oy,H1x);  /* Oya  H1xa ? ? */
  H1x  = vec_mergeh(H1y,H2x); /* H1ya H2xa ? ? */
  H2x  = vec_mergeh(H1z,H2y); /* H1za H2ya ? ? */
  H2y  = vec_mergeh(H2z,nul); /* H2za 0    ? 0 */

  tmp2 = vec_mergeh(tmp1,Oz); /* Oxa  Oya  Oza  H1xa */
  Oy   = vec_mergeh(H1x,H2x); /* H1ya H1za H2xa H2ya */
  H1z  = vec_mergeh(H2y,nul); /* H2za 0    0    0    */

  /* move into position and add */
  perm   = vec_lvsr( 0, (int *) address1 );
  low    = vec_ld(  0, address1 );
  medium = vec_ld( 16, address1 );
  high   = vec_ld( 32, address1 );
  mask   = vec_perm(ox00,oxFF,perm);
  H2x    = vec_add(vec_perm(nul,tmp2,perm),low);
  tmp2   = vec_add(vec_perm(tmp2,Oy,perm),medium);
  Oy     = vec_add(vec_perm(Oy,H1z,perm),high);
  vec_st(vec_sel(low,H2x,mask),  0, address1);
  mask   = vec_sld(ox00,mask,12);
  vec_st(tmp2, 16, address1);
  vec_st(vec_sel(Oy,high,mask), 32, address1);
}
/* interpolate a Coulomb-only table at four table positions */
static inline void do_4_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 4 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  /* table data is aligned */
  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
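
/* Scalar reference (added for exposition; it mirrors the arithmetic of
 * do_4_ctable_coul above for one value, assuming the same table layout
 * of four floats Y,F,G,H per point). The cubic spline gives
 *   VV = Y + eps*(F + G*eps + H*eps^2)
 *   FF = F + 2*G*eps + 3*H*eps^2
 * which is exactly the Fp + Geps + 2.0*Heps2 expression in the code.
 */
static inline void example_scalar_ctable(const float *VFtab, float rtab,
                                         float *VV, float *FF)
{
  int   n    = (int)rtab;
  int   idx  = 4*n;               /* 4 floats per table point */
  float eps  = rtab - (float)n;
  float Y    = VFtab[idx];
  float F    = VFtab[idx+1];
  float G    = VFtab[idx+2];
  float H    = VFtab[idx+3];
  float Fp   = F + G*eps + H*eps*eps;

  *VV = Y + eps*Fp;
  *FF = Fp + G*eps + 2.0f*H*eps*eps;
}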
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_4_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 12 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do dispersion and repulsion from an lj-only table */
static inline void do_4_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3,tabd4;
  vector float tabr1,tabr2,tabr3,tabr4;

  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3)); /* 8 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
    H     = vec_ld(32, VFtab+idx4);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
  }

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do coulomb, dispersion and repulsion from a combined table */
static inline void do_4_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 12 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
    H     = vec_ld(48, VFtab+idx4);
    tabc4 = vec_sld(tabc4,tabd4,8);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16 byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
  }

  transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* interpolate a Coulomb-only table at three table positions */
static inline void do_3_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 4 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is aligned */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_3_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 12 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is aligned */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do dispersion and repulsion from an lj-only table */
static inline void do_3_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3;
  vector float tabr1,tabr2,tabr3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3)); /* 8 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
  }

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do coulomb, dispersion and repulsion from a combined table */
static inline void do_3_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 12 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16 byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
  }

  transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* interpolate a Coulomb-only table at two table positions */
static inline void do_2_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 4 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is aligned */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_2_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2)); /* 12 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8 byte. */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is aligned */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F   = vec_madd(G,eps,F);           /* F + Geps               */
  H   = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F   = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F   = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FF = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
/* do dispersion and repulsion from an lj-only table */
static inline void do_2_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2;
  vector float tabr1,tabr2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3)); /* 8 floats per table point */

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16 byte aligned, i.e. must be 8 byte. */
    /* use Y,F,G,H as temp storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);
  } else { /* 16 byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
  }

  transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */

  transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps               */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2                  */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp        */
  F       = vec_madd(G,eps,F);           /* Fp + Geps              */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2  */
}
static inline void do_2_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
  }

  transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ctable_coul(float *VFtab,
                                    vector float rtab,
                                    vector float *VV,
                                    vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
  }

  /* table data is now aligned in registers */
  transpose_1_to_4(tab1,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F    = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_1_ljctable_coul(float *VFtab,
                                      vector float rtab,
                                      vector float *VV,
                                      vector float *FF)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
  }

  /* table data is now aligned in registers */
  transpose_1_to_4(tab1,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F    = vec_madd(G,eps,F);           /* Fp + Geps */
  *FF  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ljtable_lj(float *VFtab,
                                   vector float rtab,
                                   vector float *VVdisp,
                                   vector float *FFdisp,
                                   vector float *VVrep,
                                   vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, i.e. must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);
  } else { /* 16-byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
  }

  transpose_1_to_4(tabd1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_1_to_4(tabr1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
static inline void do_1_ljctable_coul_and_lj(float *VFtab,
                                             vector float rtab,
                                             vector float *VVcoul,
                                             vector float *FFcoul,
                                             vector float *VVdisp,
                                             vector float *FFdisp,
                                             vector float *VVrep,
                                             vector float *FFrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
  }

  transpose_1_to_4(tabc1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFcoul = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_1_to_4(tabd1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFdisp = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */

  transpose_1_to_4(tabr1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
  F       = vec_madd(G,eps,F);           /* Fp + Geps */
  *FFrep  = vec_madd(vec_two(),H,F);     /* Fp + Geps + 2.0*Heps2 */
}
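
/* The do_vonly_* routines below mirror the lookups above but evaluate only
 * the potential VV and skip the force output FF, saving the two trailing
 * multiply-adds per table.
 */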
static inline void do_vonly_4_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  /* table data is now aligned in registers */
  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_4_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3,tab4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
    tab4 = vec_ld( 0, VFtab+idx4);
    H    = vec_ld(16, VFtab+idx4);
    tab4 = vec_sld(tab4,H,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
    tab4 = vec_ld(0, VFtab+idx4);
  }

  transpose_4_to_4(tab1,tab2,tab3,tab4,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_4_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3,tabd4;
  vector float tabr1,tabr2,tabr3,tabr4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, i.e. must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
    H     = vec_ld(32, VFtab+idx4);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16-byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    tabd4 = vec_ld( 0, VFtab+idx4);
    tabr4 = vec_ld(16, VFtab+idx4);
  }

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_4_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3,tabd4,tabr4,tabc4;
  int idx1,idx2,idx3,idx4;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);
  idx4 = *(((int *)&vidx)+3);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);

    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
    H     = vec_ld(48, VFtab+idx4);
    tabc4 = vec_sld(tabc4,tabd4,8);
    tabd4 = vec_sld(tabd4,tabr4,8);
    tabr4 = vec_sld(tabr4,H,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    tabc4 = vec_ld( 0, VFtab+idx4);
    tabd4 = vec_ld(16, VFtab+idx4);
    tabr4 = vec_ld(32, VFtab+idx4);
  }

  transpose_4_to_4(tabc1,tabc2,tabc3,tabc4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_4_to_4(tabd1,tabd2,tabd3,tabd4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_4_to_4(tabr1,tabr2,tabr3,tabr4,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is now aligned in registers */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_3_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2,tab3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
    tab3 = vec_ld( 0, VFtab+idx3);
    G    = vec_ld(16, VFtab+idx3);
    tab3 = vec_sld(tab3,G,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
    tab3 = vec_ld(0, VFtab+idx3);
  }

  /* table data is now aligned in registers */
  transpose_3_to_4(tab1,tab2,tab3,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2,tabd3;
  vector float tabr1,tabr2,tabr3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, i.e. must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
    G     = vec_ld(32, VFtab+idx3);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16-byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    tabd3 = vec_ld( 0, VFtab+idx3);
    tabr3 = vec_ld(16, VFtab+idx3);
  }

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_3_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  vector float tabd3,tabr3,tabc3;
  int idx1,idx2,idx3;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);
  idx3 = *(((int *)&vidx)+2);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);

    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
    G     = vec_ld(48, VFtab+idx3);
    tabc3 = vec_sld(tabc3,tabd3,8);
    tabd3 = vec_sld(tabd3,tabr3,8);
    tabr3 = vec_sld(tabr3,G,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    tabc3 = vec_ld( 0, VFtab+idx3);
    tabd3 = vec_ld(16, VFtab+idx3);
    tabr3 = vec_ld(32, VFtab+idx3);
  }

  transpose_3_to_4(tabc1,tabc2,tabc3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_3_to_4(tabd1,tabd2,tabd3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_3_to_4(tabr1,tabr2,tabr3,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is now aligned in registers */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_2_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1,tab2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
    tab2 = vec_ld( 0, VFtab+idx2);
    F    = vec_ld(16, VFtab+idx2);
    tab2 = vec_sld(tab2,F,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
    tab2 = vec_ld(0, VFtab+idx2);
  }

  /* table data is now aligned in registers */
  transpose_2_to_4(tab1,tab2,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabd2;
  vector float tabr1,tabr2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, i.e. must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
    F     = vec_ld(32, VFtab+idx2);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);
  } else { /* 16-byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    tabd2 = vec_ld( 0, VFtab+idx2);
    tabr2 = vec_ld(16, VFtab+idx2);
  }

  transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_2_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1,tabd2,tabr2,tabc2;
  int idx1,idx2;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);
  idx2 = *(((int *)&vidx)+1);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);

    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
    F     = vec_ld(48, VFtab+idx2);
    tabc2 = vec_sld(tabc2,tabd2,8);
    tabd2 = vec_sld(tabd2,tabr2,8);
    tabr2 = vec_sld(tabr2,F,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    tabc2 = vec_ld( 0, VFtab+idx2);
    tabd2 = vec_ld(16, VFtab+idx2);
    tabr2 = vec_ld(32, VFtab+idx2);
  }

  transpose_2_to_4(tabc1,tabc2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_2_to_4(tabd1,tabd2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_2_to_4(tabr1,tabr2,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ctable_coul(float *VFtab,
                                          vector float rtab,
                                          vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) { /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
  }

  /* table data is now aligned in registers */
  transpose_1_to_4(tab1,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
/* do only coulomb, but on a table with both coulomb and lj data */
static inline void do_vonly_1_ljctable_coul(float *VFtab,
                                            vector float rtab,
                                            vector float *VV)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2,tab1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    tab1 = vec_ld( 0, VFtab+idx1);
    Y    = vec_ld(16, VFtab+idx1);
    tab1 = vec_sld(tab1,Y,8);
  } else { /* aligned */
    tab1 = vec_ld(0, VFtab+idx1);
  }

  /* table data is now aligned in registers */
  transpose_1_to_4(tab1,&Y,&F,&G,&H);

  F    = vec_madd(G,eps,F);           /* F + Geps */
  H    = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F    = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VV  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ljtable_lj(float *VFtab,
                                         vector float rtab,
                                         vector float *VVdisp,
                                         vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_sl(vidx,vec_splat_u32(3));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, i.e. must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
    Y     = vec_ld(32, VFtab+idx1);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);
  } else { /* 16-byte aligned */
    tabd1 = vec_ld( 0, VFtab+idx1);
    tabr1 = vec_ld(16, VFtab+idx1);
  }

  transpose_1_to_4(tabd1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_1_to_4(tabr1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline void do_vonly_1_ljctable_coul_and_lj(float *VFtab,
                                                   vector float rtab,
                                                   vector float *VVcoul,
                                                   vector float *VVdisp,
                                                   vector float *VVrep)
{
  vector signed int vidx;
  vector float Y,F,G,H,eps,eps2;
  vector float tabd1,tabr1,tabc1;
  int idx1;

  vidx = vec_cts(rtab,0);
  vidx = vec_add(vidx,vec_sl(vidx,vec_splat_u32(1))); /* multiply by 3 */
  vidx = vec_sl(vidx,vec_splat_u32(2));

  idx1 = *((int *)&vidx);

  eps  = vec_sub(rtab,vec_floor(rtab));
  eps2 = vec_madd(eps,eps,vec_zero());

  if(((unsigned int)VFtab)%16) {
    /* not 16-byte aligned, but must be 8-byte aligned */
    /* use Y,F,G,H as temporary storage */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
    Y     = vec_ld(48, VFtab+idx1);
    tabc1 = vec_sld(tabc1,tabd1,8);
    tabd1 = vec_sld(tabd1,tabr1,8);
    tabr1 = vec_sld(tabr1,Y,8);
  } else { /* 16-byte aligned */
    tabc1 = vec_ld( 0, VFtab+idx1);
    tabd1 = vec_ld(16, VFtab+idx1);
    tabr1 = vec_ld(32, VFtab+idx1);
  }

  transpose_1_to_4(tabc1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVcoul = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_1_to_4(tabd1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVdisp = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */

  transpose_1_to_4(tabr1,&Y,&F,&G,&H);

  F       = vec_madd(G,eps,F);           /* F + Geps */
  H       = vec_madd(H,eps2,vec_zero()); /* Heps2 */
  F       = vec_add(F,H);                /* F + Geps + Heps2 (=Fp) */
  *VVrep  = vec_madd(eps,F,Y);           /* VV = Y + eps*Fp */
}
static inline vector float do_recip(vector float rsq)
{
  vector float tmp,lu;

  lu  = vec_re(rsq);
  tmp = vec_nmsub(rsq,lu,vec_two());
  return vec_madd(lu,tmp,vec_zero());
}
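
/* Scalar sketch of the refinement in do_recip above: vec_re gives only a
 * rough estimate of 1/rsq, and one Newton-Raphson step, x' = x*(2 - r*x),
 * roughly doubles the number of correct bits. (Helper name hypothetical.)
 */
static inline float scalar_recip_refine(float r,float x)
{
  return x*(2.0f-r*x);  /* matches vec_madd(lu,vec_nmsub(rsq,lu,two),zero) */
}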
static inline vector float do_invsqrt(vector float rsq)
{
  vector float lu,tmpA,tmpB;

  lu   = vec_rsqrte(rsq);
  tmpA = vec_madd(lu,lu,vec_zero());
  tmpB = vec_madd(lu,vec_half(),vec_zero());
  tmpA = vec_nmsub(rsq,tmpA,vec_one());
  return vec_madd(tmpA,tmpB,lu);
}
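
/* Scalar sketch of do_invsqrt's Newton-Raphson step: with x an estimate of
 * 1/sqrt(r), x' = x + (x/2)*(1 - r*x*x), which is exactly tmpA*tmpB + lu
 * above. (Helper name hypothetical.)
 */
static inline float scalar_invsqrt_refine(float r,float x)
{
  return x+0.5f*x*(1.0f-r*x*x);
}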
static inline void do_3_invsqrt(vector float rsq1,
                                vector float rsq2,
                                vector float rsq3,
                                vector float *rinvsq1,
                                vector float *rinvsq2,
                                vector float *rinvsq3)
{
  vector float lu1,lu2,lu3;
  vector float tmpA1,tmpA2,tmpA3,tmpB1,tmpB2,tmpB3;
  vector float nul  = vec_zero();
  vector float half = vec_half();
  vector float one  = vec_one();

  lu1      = vec_rsqrte(rsq1);
  lu2      = vec_rsqrte(rsq2);
  lu3      = vec_rsqrte(rsq3);
  tmpA1    = vec_madd(lu1,lu1,nul);
  tmpA2    = vec_madd(lu2,lu2,nul);
  tmpA3    = vec_madd(lu3,lu3,nul);
  tmpB1    = vec_madd(lu1,half,nul);
  tmpB2    = vec_madd(lu2,half,nul);
  tmpB3    = vec_madd(lu3,half,nul);
  tmpA1    = vec_nmsub(rsq1,tmpA1,one);
  tmpA2    = vec_nmsub(rsq2,tmpA2,one);
  tmpA3    = vec_nmsub(rsq3,tmpA3,one);
  *rinvsq1 = vec_madd(tmpA1,tmpB1,lu1);
  *rinvsq2 = vec_madd(tmpA2,tmpB2,lu2);
  *rinvsq3 = vec_madd(tmpA3,tmpB3,lu3);
}
static inline void do_9_invsqrt(vector float rsq1,
                                vector float rsq2,
                                vector float rsq3,
                                vector float rsq4,
                                vector float rsq5,
                                vector float rsq6,
                                vector float rsq7,
                                vector float rsq8,
                                vector float rsq9,
                                vector float *rinv1,
                                vector float *rinv2,
                                vector float *rinv3,
                                vector float *rinv4,
                                vector float *rinv5,
                                vector float *rinv6,
                                vector float *rinv7,
                                vector float *rinv8,
                                vector float *rinv9)
{
  vector float lu1,lu2,lu3,lu4,lu5,lu6,lu7,lu8,lu9;
  vector float tmpA1,tmpA2,tmpA3,tmpA4,tmpA5,tmpA6,tmpA7,tmpA8,tmpA9;
  vector float tmpB1,tmpB2,tmpB3,tmpB4,tmpB5,tmpB6,tmpB7,tmpB8,tmpB9;
  vector float nul  = vec_zero();
  vector float half = vec_half();
  vector float one  = vec_one();

  lu1    = vec_rsqrte(rsq1);
  lu2    = vec_rsqrte(rsq2);
  lu3    = vec_rsqrte(rsq3);
  lu4    = vec_rsqrte(rsq4);
  lu5    = vec_rsqrte(rsq5);
  lu6    = vec_rsqrte(rsq6);
  lu7    = vec_rsqrte(rsq7);
  lu8    = vec_rsqrte(rsq8);
  lu9    = vec_rsqrte(rsq9);
  tmpA1  = vec_madd(lu1,lu1,nul);
  tmpA2  = vec_madd(lu2,lu2,nul);
  tmpA3  = vec_madd(lu3,lu3,nul);
  tmpA4  = vec_madd(lu4,lu4,nul);
  tmpA5  = vec_madd(lu5,lu5,nul);
  tmpA6  = vec_madd(lu6,lu6,nul);
  tmpA7  = vec_madd(lu7,lu7,nul);
  tmpA8  = vec_madd(lu8,lu8,nul);
  tmpA9  = vec_madd(lu9,lu9,nul);
  tmpB1  = vec_madd(lu1,half,nul);
  tmpB2  = vec_madd(lu2,half,nul);
  tmpB3  = vec_madd(lu3,half,nul);
  tmpB4  = vec_madd(lu4,half,nul);
  tmpB5  = vec_madd(lu5,half,nul);
  tmpB6  = vec_madd(lu6,half,nul);
  tmpB7  = vec_madd(lu7,half,nul);
  tmpB8  = vec_madd(lu8,half,nul);
  tmpB9  = vec_madd(lu9,half,nul);
  tmpA1  = vec_nmsub(rsq1,tmpA1,one);
  tmpA2  = vec_nmsub(rsq2,tmpA2,one);
  tmpA3  = vec_nmsub(rsq3,tmpA3,one);
  tmpA4  = vec_nmsub(rsq4,tmpA4,one);
  tmpA5  = vec_nmsub(rsq5,tmpA5,one);
  tmpA6  = vec_nmsub(rsq6,tmpA6,one);
  tmpA7  = vec_nmsub(rsq7,tmpA7,one);
  tmpA8  = vec_nmsub(rsq8,tmpA8,one);
  tmpA9  = vec_nmsub(rsq9,tmpA9,one);
  *rinv1 = vec_madd(tmpA1,tmpB1,lu1);
  *rinv2 = vec_madd(tmpA2,tmpB2,lu2);
  *rinv3 = vec_madd(tmpA3,tmpB3,lu3);
  *rinv4 = vec_madd(tmpA4,tmpB4,lu4);
  *rinv5 = vec_madd(tmpA5,tmpB5,lu5);
  *rinv6 = vec_madd(tmpA6,tmpB6,lu6);
  *rinv7 = vec_madd(tmpA7,tmpB7,lu7);
  *rinv8 = vec_madd(tmpA8,tmpB8,lu8);
  *rinv9 = vec_madd(tmpA9,tmpB9,lu9);
  /* 36 invsqrt in about 48 cycles due to pipelining ... pretty fast :-) */
}
static inline void zero_highest_element_in_vector(vector float *v)
{
  vector signed int zero = (vector signed int) vec_zero();

  *v = (vector float)vec_sel((vector signed int)*v,zero,
                             (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4));
}

static inline void zero_highest_2_elements_in_vector(vector float *v)
{
  vector signed int zero = (vector signed int) vec_zero();

  *v = (vector float)vec_sel((vector signed int)*v,zero,
                             (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8));
}

static inline void zero_highest_3_elements_in_vector(vector float *v)
{
  vector signed int zero = (vector signed int) vec_zero();

  *v = (vector float)vec_sel((vector signed int)*v,zero,
                             (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12));
}

static inline void zero_highest_element_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}

static inline void zero_highest_2_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}

static inline void zero_highest_3_elements_in_3_vectors(vector float *v1,vector float *v2,vector float *v3)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
}

static inline void zero_highest_element_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                     vector float *v4,vector float *v5,vector float *v6,
                                                     vector float *v7,vector float *v8,vector float *v9)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),4);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
  *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
  *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
  *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
  *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
  *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
  *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}

static inline void zero_highest_2_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                        vector float *v4,vector float *v5,vector float *v6,
                                                        vector float *v7,vector float *v8,vector float *v9)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),8);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
  *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
  *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
  *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
  *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
  *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
  *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}

static inline void zero_highest_3_elements_in_9_vectors(vector float *v1,vector float *v2,vector float *v3,
                                                        vector float *v4,vector float *v5,vector float *v6,
                                                        vector float *v7,vector float *v8,vector float *v9)
{
  vector signed int zero   = (vector signed int) vec_zero();
  vector unsigned int mask = (vector unsigned int)vec_sld(zero,vec_splat_s32(-1),12);

  *v1 = (vector float)vec_sel((vector signed int)*v1,zero,mask);
  *v2 = (vector float)vec_sel((vector signed int)*v2,zero,mask);
  *v3 = (vector float)vec_sel((vector signed int)*v3,zero,mask);
  *v4 = (vector float)vec_sel((vector signed int)*v4,zero,mask);
  *v5 = (vector float)vec_sel((vector signed int)*v5,zero,mask);
  *v6 = (vector float)vec_sel((vector signed int)*v6,zero,mask);
  *v7 = (vector float)vec_sel((vector signed int)*v7,zero,mask);
  *v8 = (vector float)vec_sel((vector signed int)*v8,zero,mask);
  *v9 = (vector float)vec_sel((vector signed int)*v9,zero,mask);
}
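
/* Note on the masking trick above: vec_sld(zero,vec_splat_s32(-1),N)
 * concatenates an all-zero and an all-ones vector and shifts left by N
 * bytes, leaving ones only in the last N bytes; vec_sel then zeroes just
 * those trailing float slots. Presumably this masks out unused lanes when
 * fewer than four entries remain in a loop. A scalar picture of the N=8
 * case (hypothetical helper):
 */
static inline void scalar_zero_highest_2(float v[4])
{
  v[2] = 0.0f;  /* the last two of the four vector slots are cleared */
  v[3] = 0.0f;
}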
void inl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],int type[],int ntype,float nbfp[],
                     float Vnb[]);
void inl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],int type[],int ntype,float nbfp[],
                     float Vnb[],float tabscale,float VFtab[]);
void inl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[]);
void inl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[]);
void inl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf);
void inl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float krf, float crf, int type[],int ntype,
                     float nbfp[],float Vnb[]);
void inl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     float tabscale,float VFtab[]);
void inl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale, float VFtab[]);
void inl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void inl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void inl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                     float shiftvec[],float fshift[],int gid[],float pos[],
                     float faction[],float charge[],float facel,float Vc[],
                     int type[],int ntype,float nbfp[],float Vnb[],
                     float tabscale,float VFtab[]);
void mcinl0100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],int type[],
                       int ntype,float nbfp[],float Vnb[]);
void mcinl0300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],int type[],
                       int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl1000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[]);
void mcinl1100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl1120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl1130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[]);
void mcinl2000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf);
void mcinl2100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl2120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl2130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float krf, float crf, int type[],int ntype,
                       float nbfp[],float Vnb[]);
void mcinl3000_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3020_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3030_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       float tabscale,float VFtab[]);
void mcinl3100_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3120_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3130_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale, float VFtab[]);
void mcinl3300_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl3320_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
void mcinl3330_altivec(int nri,int iinr[],int jindex[],int jjnr[],int shift[],
                       float shiftvec[],int gid[],float pos[],
                       float charge[],float facel,float Vc[],
                       int type[],int ntype,float nbfp[],float Vnb[],
                       float tabscale,float VFtab[]);
#endif /* ppc_altivec.h */