/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                        G R O M A C S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */

#ifndef _nbnxn_kernel_sse_utils_h_
#define _nbnxn_kernel_sse_utils_h_

/* This file contains all functions/macros for the SSE/AVX kernels
 * which have explicit dependencies on the j-size / SIMD-width, which
 * can be 2 (SSE-double), 4 (SSE-single, AVX-double) or 8 (AVX-single).
 * The functionality which depends on the j-cluster size is:
 *   LJ-parameter lookup
 *   force table lookup
 *   energy group pair energy storage
 */
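
/* Orientation note (added for readability; the mapping is implied by the
 * #if blocks below and by the j-widths listed above, and assumes UNROLLJ
 * equals the SIMD width in the including kernel):
 *
 *   GMX_MM128_HERE +  GMX_DOUBLE : j-width 2, __m128d  (SSE double)
 *   GMX_MM128_HERE + !GMX_DOUBLE : j-width 4, __m128   (SSE single)
 *   GMX_MM256_HERE +  GMX_DOUBLE : j-width 4, __m256d  (AVX double)
 *   GMX_MM256_HERE + !GMX_DOUBLE : j-width 8, __m256   (AVX single)
 */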

#define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1)                      \
{                                                                       \
    out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0));                   \
    out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1));                   \
}

#if defined GMX_MM128_HERE || !defined GMX_DOUBLE
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
{                                                                       \
    __m128 _c01,_c23;                                                   \
    _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0));                \
    _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0));                \
    out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0));              \
    out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1));              \
}
#else
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1)    \
{                                                                       \
    __m256d _c01,_c23;                                                  \
    _c01 = _mm256_shuffle_pd(in0,in1,_MM_SHUFFLE(1,0,1,0));             \
    _c23 = _mm256_shuffle_pd(in2,in3,_MM_SHUFFLE(1,0,1,0));             \
    out0 = _mm256_shuffle_pd(_c01,_c23,_MM_SHUFFLE(2,0,2,0));           \
    out1 = _mm256_shuffle_pd(_c01,_c23,_MM_SHUFFLE(3,1,3,1));           \
}
#endif

#define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out)           \
{                                                                       \
    __m128 _c01,_c23;                                                   \
    _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(3,2,3,2));                \
    _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(3,2,3,2));                \
    out  = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0));              \
}
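
/* Reading guide for the two gather macros above (descriptive note derived
 * from the shuffle patterns): given four 4-float entries in0..in3,
 * GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS packs element 0 of each input into out0
 * and element 1 of each input into out1, while
 * GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS packs element 2 of each input into out.
 */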

#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
{                                                                       \
    _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3);                     \
    i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1);                                 \
    i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3);                                 \
    o_SSE  = _mm_add_ps(i_SSE0,i_SSE2);                                 \
}
#else
#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE)                   \
{                                                                       \
    GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1);                                \
    o_SSE = _mm_add_pd(i_SSE0,i_SSE1);                                  \
}
#endif
#else
#ifndef GMX_DOUBLE
#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
{                                                                       \
    i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1);                             \
    i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3);                             \
    i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2);                             \
    o_SSE  = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \
}
#else
#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE)     \
{                                                                       \
    i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1);                             \
    i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3);                             \
    o_SSE  = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \
}
#endif
#endif

#ifdef GMX_MM128_HERE

static inline __m128
gmx_mm128_invsqrt_ps_single(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}

/* Do 2/4 double precision invsqrt operations.
 * Doing the SSE rsqrt and the first Newton-Raphson iteration
 * in single precision gives full double precision accuracy.
 * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls.
 */
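
/* For reference: both refinement steps here are the standard Newton-Raphson
 * update for 1/sqrt(x),
 *
 *     lu_new = 0.5*lu*(3 - x*lu*lu)
 *
 * applied once to the hardware rsqrt estimate in single precision (in the
 * helper above) and once more after conversion to double precision (below).
 */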

#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
{                                                                       \
    const __m128d half  = _mm_set1_pd(0.5);                             \
    const __m128d three = _mm_set1_pd(3.0);                             \
    __m128  s_SSE,ir_SSE;                                               \
    __m128d lu0,lu1;                                                    \
    \
    s_SSE  = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1));  \
    ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE);                        \
    lu0    = _mm_cvtps_pd(ir_SSE);                                      \
    lu1    = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE));                \
    o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
    o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
}

#define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD

#endif

#ifdef GMX_MM256_HERE

static inline __m256
gmx_mm256_invsqrt_ps_single(__m256 x)
{
    const __m256 half  = _mm256_set_ps(0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5);
    const __m256 three = _mm256_set_ps(3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0);

    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}

#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1)              \
{                                                                       \
    const __m256d half  = _mm256_set1_pd(0.5);                          \
    const __m256d three = _mm256_set1_pd(3.0);                          \
    __m256  s_SSE,ir_SSE;                                               \
    __m256d lu0,lu1;                                                    \
    \
    s_SSE  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
    ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE);                        \
    lu0    = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE));           \
    lu1    = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1));          \
    o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
    o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
}

#define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD

#endif

/* Force and energy table load and interpolation routines */

#if defined GMX_MM128_HERE && !defined GMX_DOUBLE

#define load_lj_pair_params(nbfp,type,aj,c6_SSE,c12_SSE)                \
{                                                                       \
    gmx_mm_pr clj_SSE[UNROLLJ];                                         \
    int p;                                                              \
    \
    for(p=0; p<UNROLLJ; p++)                                            \
    {                                                                   \
        /* Here we load 4 aligned floats, but we need just 2 */         \
        clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);          \
    }                                                                   \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6_SSE,c12_SSE); \
}

#endif

#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
#define GMX_2_MM_TO_M256(in0,in1,out)                                   \
{                                                                       \
    out = _mm256_insertf128_ps(_mm256_castps128_ps256(in0),in1,1);      \
}

#define load_lj_pair_params(nbfp,type,aj,c6_SSE,c12_SSE)                \
{                                                                       \
    __m128 clj_SSE[UNROLLJ],c6t_SSE[2],c12t_SSE[2];                     \
    int p;                                                              \
    \
    for(p=0; p<UNROLLJ; p++)                                            \
    {                                                                   \
        /* Here we load 4 aligned floats, but we need just 2 */         \
        clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);          \
    }                                                                   \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6t_SSE[0],c12t_SSE[0]); \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4],clj_SSE[5],clj_SSE[6],clj_SSE[7],c6t_SSE[1],c12t_SSE[1]); \
    \
    GMX_2_MM_TO_M256(c6t_SSE[0],c6t_SSE[1],c6_SSE);                     \
    GMX_2_MM_TO_M256(c12t_SSE[0],c12t_SSE[1],c12_SSE);                  \
}

#endif

#if defined GMX_MM128_HERE && defined GMX_DOUBLE

#define load_lj_pair_params(nbfp,type,aj,c6_SSE,c12_SSE)                \
{                                                                       \
    gmx_mm_pr clj_SSE[UNROLLJ];                                         \
    int p;                                                              \
    \
    for(p=0; p<UNROLLJ; p++)                                            \
    {                                                                   \
        clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);          \
    }                                                                   \
    GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0],clj_SSE[1],c6_SSE,c12_SSE);      \
}

#endif

#if defined GMX_MM256_HERE && defined GMX_DOUBLE

#define load_lj_pair_params(nbfp,type,aj,c6_SSE,c12_SSE)                \
{                                                                       \
    __m128d clj_SSE[UNROLLJ],c6t_SSE[2],c12t_SSE[2];                    \
    int p;                                                              \
    \
    for(p=0; p<UNROLLJ; p++)                                            \
    {                                                                   \
        clj_SSE[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);          \
    }                                                                   \
    GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0],clj_SSE[1],c6t_SSE[0],c12t_SSE[0]); \
    GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[2],clj_SSE[3],c6t_SSE[1],c12t_SSE[1]); \
    GMX_2_M128D_TO_M256D(c6t_SSE[0],c6t_SSE[1],c6_SSE);                 \
    GMX_2_M128D_TO_M256D(c12t_SSE[0],c12t_SSE[1],c12_SSE);              \
}

#endif

/* The load_table functions below are performance critical.
 * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
 * As these all have latencies, scheduling is crucial.
 * The Intel compilers and CPUs seem to do a good job at this.
 * But AMD CPUs perform significantly worse with gcc than with icc.
 * Performance is improved a bit by using the extract function UNROLLJ times,
 * instead of doing an _mm_store_si128 for every i-particle.
 * With AVX this significantly deteriorates performance (8 extracts instead of 4).
 * Because of this, the load_table_f macro always takes the ti parameter,
 * but it is only used with AVX.
 */
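
/* Note on the table layout (a descriptive reading of the indexing below, not
 * a statement from the table-generation code): tab_coul_FDV0 appears to hold
 * 4 floats per table point, { F[i], F[i+1]-F[i], V[i], 0 }, which is why the
 * single precision variants scale the index by 4 and gather fields 0/1 for
 * the force and field 2 for the energy, while the double precision variants
 * use separate tab_coul_F / tab_coul_V arrays and form the force difference
 * explicitly.
 */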

#if defined GMX_MM128_HERE && !defined GMX_DOUBLE

#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE)   \
{                                                                       \
    int    idx[4];                                                      \
    __m128 ctab_SSE[4];                                                 \
    \
    /* Table has 4 entries, left-shift index by 2 */                    \
    ti_SSE = _mm_slli_epi32(ti_SSE,2);                                  \
    /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
    idx[0]      = gmx_mm_extract_epi32(ti_SSE,0);                       \
    ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);                    \
    idx[1]      = gmx_mm_extract_epi32(ti_SSE,1);                       \
    ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);                    \
    idx[2]      = gmx_mm_extract_epi32(ti_SSE,2);                       \
    ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);                    \
    idx[3]      = gmx_mm_extract_epi32(ti_SSE,3);                       \
    ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);                    \
    \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctab0_SSE,ctab1_SSE); \
}

#define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
{                                                                       \
    int    idx[4];                                                      \
    __m128 ctab_SSE[4];                                                 \
    \
    /* Table has 4 entries, left-shift index by 2 */                    \
    ti_SSE = _mm_slli_epi32(ti_SSE,2);                                  \
    /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
    idx[0]      = gmx_mm_extract_epi32(ti_SSE,0);                       \
    ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);                    \
    idx[1]      = gmx_mm_extract_epi32(ti_SSE,1);                       \
    ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);                    \
    idx[2]      = gmx_mm_extract_epi32(ti_SSE,2);                       \
    ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);                    \
    idx[3]      = gmx_mm_extract_epi32(ti_SSE,3);                       \
    ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);                    \
    \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctab0_SSE,ctab1_SSE); \
    /* Shuffle the energy table entries to a convenient order */        \
    GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctabv_SSE); \
}

#endif

#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE)   \
{                                                                       \
    __m128 ctab_SSE[8],ctabt_SSE[4];                                    \
    int    j;                                                           \
    \
    /* Bit shifting would be faster, but AVX doesn't support that */    \
    _mm256_store_si256((__m256i *)ti,ti_SSE);                           \
    for(j=0; j<8; j++)                                                  \
    {                                                                   \
        ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);               \
    }                                                                   \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctabt_SSE[0],ctabt_SSE[2]); \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4],ctab_SSE[5],ctab_SSE[6],ctab_SSE[7],ctabt_SSE[1],ctabt_SSE[3]); \
    \
    GMX_2_MM_TO_M256(ctabt_SSE[0],ctabt_SSE[1],ctab0_SSE);              \
    GMX_2_MM_TO_M256(ctabt_SSE[2],ctabt_SSE[3],ctab1_SSE);              \
}

#define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
{                                                                       \
    __m128 ctab_SSE[8],ctabt_SSE[4],ctabvt_SSE[2];                      \
    int    j;                                                           \
    \
    /* Bit shifting would be faster, but AVX doesn't support that */    \
    _mm256_store_si256((__m256i *)ti,ti_SSE);                           \
    for(j=0; j<8; j++)                                                  \
    {                                                                   \
        ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);               \
    }                                                                   \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctabt_SSE[0],ctabt_SSE[2]); \
    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4],ctab_SSE[5],ctab_SSE[6],ctab_SSE[7],ctabt_SSE[1],ctabt_SSE[3]); \
    \
    GMX_2_MM_TO_M256(ctabt_SSE[0],ctabt_SSE[1],ctab0_SSE);              \
    GMX_2_MM_TO_M256(ctabt_SSE[2],ctabt_SSE[3],ctab1_SSE);              \
    \
    GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0],ctab_SSE[1],ctab_SSE[2],ctab_SSE[3],ctabvt_SSE[0]); \
    GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[4],ctab_SSE[5],ctab_SSE[6],ctab_SSE[7],ctabvt_SSE[1]); \
    \
    GMX_2_MM_TO_M256(ctabvt_SSE[0],ctabvt_SSE[1],ctabv_SSE);            \
}

#endif

#if defined GMX_MM128_HERE && defined GMX_DOUBLE

#define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE)      \
{                                                                       \
    int     idx[2];                                                     \
    __m128d ctab_SSE[2];                                                \
    \
    /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
    idx[0]      = gmx_mm_extract_epi32(ti_SSE,0);                       \
    ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]);                      \
    idx[1]      = gmx_mm_extract_epi32(ti_SSE,1);                       \
    ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]);                      \
    \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0],ctab_SSE[1],ctab0_SSE,ctab1_SSE); \
    /* The second force table entry should contain the difference */    \
    ctab1_SSE = _mm_sub_pd(ctab1_SSE,ctab0_SSE);                        \
}

#define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
{                                                                       \
    int     idx[2];                                                     \
    __m128d ctab_SSE[4];                                                \
    \
    /* Without SSE4.1 the extract macro needs an immediate: unroll */   \
    idx[0]      = gmx_mm_extract_epi32(ti_SSE,0);                       \
    ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]);                      \
    idx[1]      = gmx_mm_extract_epi32(ti_SSE,1);                       \
    ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]);                      \
    \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0],ctab_SSE[1],ctab0_SSE,ctab1_SSE); \
    /* The second force table entry should contain the difference */    \
    ctab1_SSE = _mm_sub_pd(ctab1_SSE,ctab0_SSE);                        \
    \
    ctab_SSE[2] = _mm_loadu_pd(tab_coul_V+idx[0]);                      \
    ctab_SSE[3] = _mm_loadu_pd(tab_coul_V+idx[1]);                      \
    \
    /* Shuffle the energy table entries to a single register */         \
    ctabv_SSE = _mm_shuffle_pd(ctab_SSE[2],ctab_SSE[3],_MM_SHUFFLE2(0,0)); \
}

#endif

#if defined GMX_MM256_HERE && defined GMX_DOUBLE

/* Put two 128-bit 2-double registers into one 256-bit 4-double register */
#define GMX_2_M128D_TO_M256D(in0,in1,out)                               \
{                                                                       \
    out = _mm256_insertf128_pd(_mm256_castpd128_pd256(in0),in1,1);      \
}

#define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE)      \
{                                                                       \
    __m128d ctab_SSE[4],tr_SSE[4];                                      \
    int     j;                                                          \
    \
    _mm_store_si128((__m128i *)ti,ti_SSE);                              \
    for(j=0; j<4; j++)                                                  \
    {                                                                   \
        ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]);                   \
    }                                                                   \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0],ctab_SSE[1],tr_SSE[0],tr_SSE[1]); \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2],ctab_SSE[3],tr_SSE[2],tr_SSE[3]); \
    GMX_2_M128D_TO_M256D(tr_SSE[0],tr_SSE[2],ctab0_SSE);                \
    GMX_2_M128D_TO_M256D(tr_SSE[1],tr_SSE[3],ctab1_SSE);                \
    /* The second force table entry should contain the difference */    \
    ctab1_SSE = _mm256_sub_pd(ctab1_SSE,ctab0_SSE);                     \
}

#define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
{                                                                       \
    __m128d ctab_SSE[8],tr_SSE[4];                                      \
    int     j;                                                          \
    \
    _mm_store_si128((__m128i *)ti,ti_SSE);                              \
    for(j=0; j<4; j++)                                                  \
    {                                                                   \
        ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]);                   \
    }                                                                   \
    /* Shuffle the force table entries to a convenient order */         \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0],ctab_SSE[1],tr_SSE[0],tr_SSE[1]); \
    GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2],ctab_SSE[3],tr_SSE[2],tr_SSE[3]); \
    GMX_2_M128D_TO_M256D(tr_SSE[0],tr_SSE[2],ctab0_SSE);                \
    GMX_2_M128D_TO_M256D(tr_SSE[1],tr_SSE[3],ctab1_SSE);                \
    /* The second force table entry should contain the difference */    \
    ctab1_SSE = _mm256_sub_pd(ctab1_SSE,ctab0_SSE);                     \
    \
    for(j=0; j<4; j++)                                                  \
    {                                                                   \
        ctab_SSE[4+j] = _mm_loadu_pd(tab_coul_V+ti[j]);                 \
    }                                                                   \
    /* Shuffle the energy table entries to a single register */         \
    GMX_2_M128D_TO_M256D(_mm_shuffle_pd(ctab_SSE[4],ctab_SSE[5],_MM_SHUFFLE2(0,0)),_mm_shuffle_pd(ctab_SSE[6],ctab_SSE[7],_MM_SHUFFLE2(0,0)),ctabv_SSE); \
}

#endif

/* Add an energy register to possibly multiple terms in the energy array.
 * This function is the same for SSE/AVX single/double.
 */
static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
{
    int jj;

    /* We need to balance the number of store operations with
     * the rapidly increasing number of combinations of energy groups.
     * We add to a temporary buffer for 1 i-group vs 2 j-groups.
     */
    for(jj=0; jj<(UNROLLJ>>1); jj++)
    {
        gmx_mm_pr v_SSE;

        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
        gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
    }
}

#endif /* _nbnxn_kernel_sse_utils_h_ */