/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
38 #ifndef _gmx_simd_ref_h_
39 #define _gmx_simd_ref_h_
41 /* This file contains a reference plain-C implementation of arbitrary width.
42 * This code is only useful for testing and documentation.
43 * The SIMD width is set by defining GMX_SIMD_REF_WIDTH before including.
47 #ifndef GMX_SIMD_REF_WIDTH
48 #error "GMX_SIMD_REF_WIDTH should be defined before including gmx_simd_ref.h"
53 /* float/double SIMD register type */
55 real r
[GMX_SIMD_REF_WIDTH
];
58 /* boolean SIMD register type */
60 char r
[GMX_SIMD_REF_WIDTH
];
63 /* integer SIMD register type, only for table indexing and exclusion masks */
65 int r
[GMX_SIMD_REF_WIDTH
];
67 #define GMX_SIMD_REF_EPI32_WIDTH GMX_SIMD_REF_WIDTH
69 /* Load GMX_SIMD_REF_WIDTH reals for memory starting at r */
70 static gmx_inline gmx_simd_ref_pr
71 gmx_simd_ref_load_pr(const real
*r
)
76 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
84 /* Set all SIMD register elements to *r */
85 static gmx_inline gmx_simd_ref_pr
86 gmx_simd_ref_load1_pr(const real
*r
)
91 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
99 /* Set all SIMD register elements to r */
100 static gmx_inline gmx_simd_ref_pr
101 gmx_simd_ref_set1_pr(real r
)
106 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
114 /* Set all SIMD register elements to 0 */
115 static gmx_inline gmx_simd_ref_pr
116 gmx_simd_ref_setzero_pr()
121 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
129 static gmx_inline
void
130 gmx_simd_ref_store_pr(real
*dest
, gmx_simd_ref_pr src
)
134 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
140 static gmx_inline gmx_simd_ref_pr
141 gmx_simd_ref_add_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
146 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
148 c
.r
[i
] = a
.r
[i
] + b
.r
[i
];
154 static gmx_inline gmx_simd_ref_pr
155 gmx_simd_ref_sub_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
160 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
162 c
.r
[i
] = a
.r
[i
] - b
.r
[i
];
168 static gmx_inline gmx_simd_ref_pr
169 gmx_simd_ref_mul_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
174 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
176 c
.r
[i
] = a
.r
[i
]*b
.r
[i
];
182 static gmx_inline gmx_simd_ref_pr
183 gmx_simd_ref_madd_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
, gmx_simd_ref_pr c
)
188 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
190 d
.r
[i
] = a
.r
[i
]*b
.r
[i
] + c
.r
[i
];
196 static gmx_inline gmx_simd_ref_pr
197 gmx_simd_ref_nmsub_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
, gmx_simd_ref_pr c
)
202 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
204 d
.r
[i
] = -a
.r
[i
]*b
.r
[i
] + c
.r
[i
];
210 static gmx_inline gmx_simd_ref_pr
211 gmx_simd_ref_max_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
216 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
218 c
.r
[i
] = (a
.r
[i
] >= b
.r
[i
] ? a
.r
[i
] : b
.r
[i
]);
224 static gmx_inline gmx_simd_ref_pr
225 gmx_simd_ref_blendzero_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pb b
)
230 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
232 c
.r
[i
] = (b
.r
[i
] ? a
.r
[i
] : 0.0);
238 /* Note that this reference implementation rounds away from zero,
239 * whereas most SIMD intrinsics will round to nearest even. Since this
240 * function is only used for periodic image calculations, the rounding
241 * of mantissas close to 0.5 is irrelevant, except in testing. This
242 * could be fixed by using rint/rintf, but the bigger problem is that
243 * MSVC does not support full C99, and none of the round or rint
244 * functions are defined. It's much easier to approximately implement
245 * round() than rint(), so we do that and hope we never get bitten in
246 * testing. (Thanks, Microsoft.)
248 static gmx_inline gmx_simd_ref_pr
249 gmx_simd_ref_round_pr(gmx_simd_ref_pr a
)
254 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
257 int temp
= (a
.r
[i
] >= 0.)
260 b
.r
[i
] = (real
) temp
;
261 #elif defined GMX_DOUBLE
262 b
.r
[i
] = round(a
.r
[i
]);
264 b
.r
[i
] = roundf(a
.r
[i
]);
271 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
272 static gmx_inline gmx_simd_ref_pr
273 gmx_simd_ref_floor_pr(gmx_simd_ref_pr a
)
278 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
281 b
.r
[i
] = floor(a
.r
[i
]);
283 b
.r
[i
] = floorf(a
.r
[i
]);
290 /* Not required, only used when blendv is faster than comparison */
291 static gmx_inline gmx_simd_ref_pr
292 gmx_simd_ref_blendv_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
, gmx_simd_ref_pr c
)
297 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
299 d
.r
[i
] = (c
.r
[i
] >= 0) ? a
.r
[i
] : b
.r
[i
];
305 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
306 static gmx_inline gmx_simd_ref_pr
307 gmx_simd_ref_cpsgn_nonneg_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
312 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
314 c
.r
[i
] = (a
.r
[i
] >= 0) ? b
.r
[i
] : -b
.r
[i
];
320 /* Very specific operation required in the non-bonded kernels */
321 static gmx_inline gmx_simd_ref_pr
322 gmx_simd_ref_masknot_add_pr(gmx_simd_ref_pb a
, gmx_simd_ref_pr b
, gmx_simd_ref_pr c
)
327 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
329 d
.r
[i
] = a
.r
[i
] ? b
.r
[i
] : b
.r
[i
] + c
.r
[i
];
336 static gmx_inline gmx_simd_ref_pb
337 gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
342 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
344 c
.r
[i
] = (a
.r
[i
] < b
.r
[i
]);
350 /* Logical AND on SIMD booleans */
351 static gmx_inline gmx_simd_ref_pb
352 gmx_simd_ref_and_pb(gmx_simd_ref_pb a
, gmx_simd_ref_pb b
)
357 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
359 c
.r
[i
] = (a
.r
[i
] && b
.r
[i
]);
365 /* Logical OR on SIMD booleans */
366 static gmx_inline gmx_simd_ref_pb
367 gmx_simd_ref_or_pb(gmx_simd_ref_pb a
, gmx_simd_ref_pb b
)
372 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
374 c
.r
[i
] = (a
.r
[i
] || b
.r
[i
]);
380 /* Returns a single int (0/1) which tells if any of the booleans is True */
381 static gmx_inline
int
382 gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a
)
388 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
399 /* Conversions only used for PME table lookup */
400 static gmx_inline gmx_simd_ref_epi32
401 gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a
)
403 gmx_simd_ref_epi32 b
;
406 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
408 b
.r
[i
] = (int)a
.r
[i
];
414 /* These two function only need to be approximate, Newton-Raphson iteration
415 * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
417 static gmx_inline gmx_simd_ref_pr
418 gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a
)
423 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
426 b
.r
[i
] = 1.0/sqrt(a
.r
[i
]);
428 b
.r
[i
] = 1.0/sqrtf(a
.r
[i
]);
435 static gmx_inline gmx_simd_ref_pr
436 gmx_simd_ref_rcp_pr(gmx_simd_ref_pr a
)
441 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
449 static gmx_inline gmx_simd_ref_pr
450 gmx_simd_ref_exp_pr(gmx_simd_ref_pr a
)
455 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
458 b
.r
[i
] = exp(a
.r
[i
]);
460 b
.r
[i
] = expf(a
.r
[i
]);
467 static gmx_inline gmx_simd_ref_pr
468 gmx_simd_ref_sqrt_pr(gmx_simd_ref_pr a
)
473 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
476 b
.r
[i
] = sqrt(a
.r
[i
]);
478 b
.r
[i
] = sqrtf(a
.r
[i
]);
485 static gmx_inline
int
486 gmx_simd_ref_sincos_pr(gmx_simd_ref_pr a
,
487 gmx_simd_ref_pr
*s
, gmx_simd_ref_pr
*c
)
491 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
493 s
->r
[i
] = sin(a
.r
[i
]);
494 c
->r
[i
] = cos(a
.r
[i
]);
500 static gmx_inline gmx_simd_ref_pr
501 gmx_simd_ref_acos_pr(gmx_simd_ref_pr a
)
506 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
508 b
.r
[i
] = acos(a
.r
[i
]);
514 static gmx_inline gmx_simd_ref_pr
515 gmx_simd_ref_atan2_pr(gmx_simd_ref_pr a
, gmx_simd_ref_pr b
)
520 for (i
= 0; i
< GMX_SIMD_REF_WIDTH
; i
++)
522 c
.r
[i
] = atan2(a
.r
[i
], b
.r
[i
]);
528 #endif /* _gmx_simd_ref_h_ */