/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */
/* This include file has code between ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
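/* Usage sketch (illustrative; the including file name and surrounding code
 * are assumptions, not part of this file): a source file defines one of the
 * PME_* macros below and then includes this file, e.g.
 *
 *     #define PME_SPREAD_SSE_ORDER4
 *     #include "pme_sse_single.h"
 *
 * Each block #undef's its macro at the end, so the file can be included
 * several times with different macros (and thus different code) selected.
 */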
#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
{
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    /* Broadcast each of the four y-spline coefficients */
    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    /* Load the four z-spline coefficients with one unaligned load */
    tz_SSE  = _mm_loadu_ps(thz);
    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;

        /* valx is assumed to be set by the enclosing loop to the charge
         * times the x-spline coefficient thx[ithx].
         */
        vx_SSE    = _mm_load1_ps(&valx);

        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        /* Load 4 consecutive grid floats along z for each of the 4 y-lines */
        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);
        /* Add the spread charge contribution to the grid values */
        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif
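/* For reference (illustrative sketch, not part of the original file):
 * the vectorized block above is equivalent to this scalar triple loop,
 * spreading one charge onto its 4x4x4 stencil of grid points:
 *
 *     for(ithx=0; ithx<4; ithx++)
 *     {
 *         for(ithy=0; ithy<4; ithy++)
 *         {
 *             for(ithz=0; ithz<4; ithz++)
 *             {
 *                 grid[(i0+ithx)*pny*pnz+(j0+ithy)*pnz+k0+ithz] +=
 *                     valx*thy[ithy]*thz[ithz];
 *             }
 *         }
 *     }
 *
 * One loadu/mul/add/storeu sequence handles the four ithz values at once.
 */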
#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
{
    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    /* Clear the force accumulators */
    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Load the z-spline values and derivatives */
    tz_SSE = _mm_loadu_ps(thz);
    dz_SSE = _mm_loadu_ps(dthz);
    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            /* Load 4 consecutive grid values along z */
            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            /* Accumulate the force components: the spline derivative
             * enters in exactly one dimension per component.
             */
            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }
    /* Reduce the four partial sums per component to a scalar */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif
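/* For reference (illustrative, not part of the original file): over the
 * 4x4x4 stencil, with g = grid[index_xy+k0+ithz], the loop accumulates
 *
 *     fx += dthx[ithx]* thy[ithy]* thz[ithz]*g
 *     fy +=  thx[ithx]*dthy[ithy]* thz[ithz]*g
 *     fz +=  thx[ithx]* thy[ithy]*dthz[ithz]*g
 *
 * i.e. the B-spline derivative replaces the B-spline value in exactly the
 * dimension being differentiated; the caller turns these sums into forces.
 */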
#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int    offset;

    __m128 tz_SSE0,tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0,vx_tz_SSE1;
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    /* offset is the distance of k0 from the previous 16-byte (4-float)
     * boundary, so the aligned accesses below start on that boundary.
     */
    offset = k0 & 3;
    /* Broadcast the y-spline coefficients (only PME_ORDER are used) */
    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    /* Load the z-spline coefficients shifted down to the 16-byte boundary
     * and mask out the lanes that do not belong to this stencil.
     */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
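/* Illustrative sketch (an assumption, not part of this file): the caller
 * must fill work->mask_SSE0[offset] and work->mask_SSE1[offset] so that
 * lane l of vector v (element 4*v+l across the two vectors) is all-ones
 * exactly when offset <= 4*v+l < offset+PME_ORDER, zeroing the garbage
 * lanes pulled in by the shifted loads above.  A hypothetical setup using
 * SSE2 integer compares:
 */
#if 0
{
    int     of;
    __m128i lane0 = _mm_set_epi32(3,2,1,0);  /* lane indices 0..3 */
    __m128i lane1 = _mm_set_epi32(7,6,5,4);  /* lane indices 4..7 */

    for(of=0; of<4; of++)
    {
        __m128i lo = _mm_set1_epi32(of);
        __m128i hi = _mm_set1_epi32(of+PME_ORDER);

        /* all-ones where !(lane < lo) && (lane < hi) */
        work->mask_SSE0[of] =
            _mm_castsi128_ps(_mm_andnot_si128(_mm_cmplt_epi32(lane0,lo),
                                              _mm_cmplt_epi32(lane0,hi)));
        work->mask_SSE1[of] =
            _mm_castsi128_ps(_mm_andnot_si128(_mm_cmplt_epi32(lane1,lo),
                                              _mm_cmplt_epi32(lane1,hi)));
    }
}
#endif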
    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;

        /* As above, valx is assumed to be set by the enclosing loop to the
         * charge times the x-spline coefficient thx[ithx].
         */
        vx_SSE = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);
        /* Aligned loads: 2 vectors of 4 floats cover the masked z-stencil */
        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif
        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif
        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_SPREAD_SSE_ALIGNED
#endif
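/* Usage sketch (illustrative; names outside this file are assumptions):
 * the aligned blocks additionally require the including file to define
 * PME_ORDER as a compile-time constant (4 or 5), e.g.
 *
 *     #define PME_ORDER 5
 *     #define PME_SPREAD_SSE_ALIGNED
 *     #include "pme_sse_single.h"
 *     #undef PME_ORDER
 */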
#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int    offset;
    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0,gval_SSE1;
    __m128 fxy1_SSE0,fz1_SSE0,fxy1_SSE1,fz1_SSE1;
    __m128 fxy1_SSE,fz1_SSE;
    /* As in the aligned spread: offset is the distance of k0 from the
     * previous 16-byte boundary.
     */
    offset = k0 & 3;

    /* Clear the force accumulators */
    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Load the z-spline values and derivatives shifted down to the
     * 16-byte boundary and mask out the out-of-stencil lanes.
     */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);
    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            /* Two aligned loads cover the masked z-stencil */
            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }
    /* Reduce the partial sums; fx_tmp etc. are plain stack arrays, which
     * are not guaranteed to be 16-byte aligned, so use unaligned stores.
     */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ALIGNED
#endif
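/* Design note (illustrative sketch, not part of the original file): both
 * gather blocks reduce the SSE partial sums with a store to memory plus
 * four scalar adds.  On SSE3 hardware the same reduction could stay in
 * registers using _mm_hadd_ps (from <pmmintrin.h>), e.g.:
 */
#if 0
{
    float  f[4];
    __m128 fxy = _mm_hadd_ps(fx_SSE,fy_SSE);                  /* {fx01,fx23,fy01,fy23} */
    __m128 s   = _mm_hadd_ps(fxy,_mm_hadd_ps(fz_SSE,fz_SSE)); /* {fx,fy,fz,fz} */

    _mm_storeu_ps(f,s);
    fx += f[0];
    fy += f[1];
    fz += f[2];
}
#endif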