gromacs.git: src/mdlib/pme_sse_single.h
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G R O M A C S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.5
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */
/* This include file has code between ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
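/* Intended usage from the including source file (illustrative sketch;
 * the actual call sites are the PME spread and gather routines, not
 * part of this header):
 *
 *   #define PME_SPREAD_SSE_ORDER4
 *   #include "pme_sse_single.h"
 *
 * The aligned variants additionally expect PME_ORDER to be defined by
 * the includer.  Each block #undef's its trigger macro at the end, so
 * the file can be included repeatedly with different settings.
 */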
#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
{
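    /* Scalar equivalent of the vectorized update below (sketch written
     * in terms of the variables supplied by the including scope):
     *
     *   for(ithx=0; ithx<4; ithx++)
     *     for(ithy=0; ithy<4; ithy++)
     *       for(ithz=0; ithz<4; ithz++)
     *         grid[(i0+ithx)*pny*pnz+(j0+ithy)*pnz+k0+ithz] +=
     *             qn*thx[ithx]*thy[ithy]*thz[ithz];
     *
     * The SSE version handles the four ithz values of one (ithx,ithy)
     * row in a single __m128 and fully unrolls the ithy loop.
     */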
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    tz_SSE  = _mm_loadu_ps(thz);
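    /* thy[0..3] are broadcast once and thz[0..3] is loaded once here;
     * both are reused for every x row inside the loop below. */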
    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];

        vx_SSE  = _mm_load1_ps(&valx);

        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif
#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
{
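    /* Scalar equivalent of the force gather below (sketch; dthx/dthy/dthz
     * hold the spline derivatives corresponding to thx/thy/thz):
     *
     *   for(ithx=0; ithx<4; ithx++)
     *     for(ithy=0; ithy<4; ithy++)
     *       for(ithz=0; ithz<4; ithz++)
     *       {
     *           gval = grid[(i0+ithx)*pny*pnz+(j0+ithy)*pnz+k0+ithz];
     *           fx  += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
     *           fy  +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
     *           fz  +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
     *       }
     *
     * The SSE version vectorizes over ithz and reduces the four partial
     * sums per component after the loops.
     */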
    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;

    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE = _mm_loadu_ps(thz);
    dz_SSE = _mm_loadu_ps(dthz);
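    /* thz and dthz are loaded once; the force accumulators keep four
     * partial sums (one per z grid point) until the reduction below. */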
    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }
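    /* Each accumulator now holds four partial sums over ithz;
     * reduce them to scalars and add to the particle force. */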
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif
#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
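    /* Sketch of the alignment trick used below: k0 itself need not be a
     * multiple of 4, so the loads start at the 16-byte boundary below it
     * (k0 - offset, with offset = k0 & 3).  The thz entries that then
     * fall outside the spline support are zeroed with the precomputed
     * masks in work->mask_SSE0/mask_SSE1, so they add nothing to the
     * grid.  Two 4-wide registers cover the at most offset+PME_ORDER <= 8
     * z entries per (x,y) row.
     */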
    int    offset;
    int    index;
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 tz_SSE0;
    __m128 tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0;
    __m128 vx_tz_SSE1;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    offset = k0 & 3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];

        vx_SSE = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);

        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_ORDER
#undef PME_SPREAD_SSE_ALIGNED
#endif
#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
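    /* Same alignment and masking scheme as the aligned spread above:
     * load from the 16-byte boundary below k0 and zero the thz/dthz
     * entries outside the spline support, then accumulate both 4-wide
     * halves before the final reduction. */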
    int    offset;

    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0;
    __m128 gval_SSE1;

    __m128 fxy1_SSE0;
    __m128 fz1_SSE0;
    __m128 fxy1_SSE1;
    __m128 fz1_SSE1;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    offset = k0 & 3;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);
    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }
    /* The tmp arrays are plain stack arrays and are not guaranteed to be
     * 16-byte aligned, so use unaligned stores for the reduction. */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);
    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_ORDER
#undef PME_GATHER_F_SSE_ALIGNED
#endif