Fix compile issues on K computer
[gromacs.git] / src / gromacs / simd / impl_sparc64_hpc_ace / impl_sparc64_hpc_ace.h
blobe3f474c3bfe6bb764665802fa2ada7cae8cee244
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_SPARC64_HPC_ACE_H
37 #define GMX_SIMD_IMPL_SPARC64_HPC_ACE_H
39 #include <math.h>
/* Fujitsu header borrows the name from SSE2, since some instructions have aliases.
 * Environment/compiler version GM-1.2.0-17 seems to be buggy; when -Xg is
 * defined to enable GNUC extensions, this sets _ISOC99_SOURCE, which in
 * turn causes all intrinsics to be declared inline _instead_ of static. This
 * leads to duplicate symbol errors at link time.
 * To work around this we undefine _ISOC99_SOURCE before including the HPC-ACE
 * header, and define it again afterwards.
 */
49 #ifdef _ISOC99_SOURCE
50 # undef _ISOC99_SOURCE
51 # define SAVE_ISOC99_SOURCE
52 #endif
54 #include <emmintrin.h>
56 #ifdef SAVE_ISOC99_SOURCE
57 # define _ISOC99_SOURCE
58 # undef SAVE_ISOC99_SOURCE
59 #endif
/* Sparc64 HPC-ACE SIMD instruction wrappers
 *
 * Please see documentation in gromacs/simd/simd.h for defines.
 */
/* Capability definitions for Sparc64 HPC-ACE */
/* HPC-ACE is actually double-only on the register level, but we also implement
 * a single-precision interface where we only offer single-precision accuracy
 * in math functions - this can save quite a few cycles.
 */
#define GMX_SIMD_HAVE_FLOAT
#define GMX_SIMD_HAVE_DOUBLE
#define GMX_SIMD_HAVE_HARDWARE
/* Unaligned load/store are not advertised as available. */
#undef  GMX_SIMD_HAVE_LOADU
#undef  GMX_SIMD_HAVE_STOREU
#define GMX_SIMD_HAVE_LOGICAL
#define GMX_SIMD_HAVE_FMA
#undef  GMX_SIMD_HAVE_FRACTION
/* Integer type paired with float: extract and logical ops, no arithmetics. */
#define GMX_SIMD_HAVE_FINT32
#define GMX_SIMD_HAVE_FINT32_EXTRACT
#define GMX_SIMD_HAVE_FINT32_LOGICAL
#undef  GMX_SIMD_HAVE_FINT32_ARITHMETICS
/* Integer type paired with double: extract and logical ops, no arithmetics. */
#define GMX_SIMD_HAVE_DINT32
#define GMX_SIMD_HAVE_DINT32_EXTRACT
#define GMX_SIMD_HAVE_DINT32_LOGICAL
#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
/* No SIMD4 in either precision. */
#undef  GMX_SIMD4_HAVE_FLOAT
#undef  GMX_SIMD4_HAVE_DOUBLE

/* Implementation details */
/* All four SIMD datatypes hold two elements. */
#define GMX_SIMD_FLOAT_WIDTH         2
#define GMX_SIMD_DOUBLE_WIDTH        2
#define GMX_SIMD_FINT32_WIDTH        2
#define GMX_SIMD_DINT32_WIDTH        2
/* NOTE(review): presumably the number of accurate bits delivered by the
 * approximate rsqrt/rcp operations - confirm against gromacs/simd/simd.h.
 */
#define GMX_SIMD_RSQRT_BITS         10
#define GMX_SIMD_RCP_BITS            9
/* HPC-ACE is a bit strange; some instructions like
 * shifts only work on _integer_ versions of SIMD
 * registers, but there are no intrinsics to load
 * or convert, or even to cast. The only way to use
 * them is to declare unions with the SIMD integer
 * type. However, this will lead to extra load ops,
 * and the normal real-to-int and int-to-real
 * conversions work purely on the v2r8 fp regs.
 * Since our most common usage is to convert and
 * then extract the result for table lookups, we
 * define the gmx_simd_fint32_t datatype to use
 * the v2r8 rather than v2i8 SIMD type.
 */
113 /****************************************************
114 * SINGLE PRECISION SIMD IMPLEMENTATION *
115 ****************************************************/
/* Single-precision interface. The SIMD type is the same _fjsp_v2r8 type that
 * the double-precision interface uses, so most operations map to the same
 * _fjsp_*_v2r8 intrinsics or to the *_d_sparc64_hpc_ace helpers; only memory
 * load/store of float arrays has dedicated helper functions.
 */
#define gmx_simd_float_t          _fjsp_v2r8
#define gmx_simd_load_f           gmx_simd_load_f_sparc64_hpc_ace
/* The argument is parenthesized so that pointer expressions such as (p + 1)
 * are dereferenced as a whole. Note that m is evaluated twice.
 */
#define gmx_simd_load1_f(m)       _fjsp_set_v2r8((*(m)), (*(m)))
#define gmx_simd_set1_f(a)        _fjsp_set_v2r8(a, a)
#define gmx_simd_store_f          gmx_simd_store_f_sparc64_hpc_ace
/* NOTE(review): GMX_SIMD_HAVE_LOADU is undefined for this architecture, yet
 * loadu is provided here as a plain alias of the normal load - confirm that
 * callers never rely on it for genuinely unaligned memory.
 */
#define gmx_simd_loadu_f          gmx_simd_load_f
/* No unaligned store of gmx_simd_float_t */
#define gmx_simd_setzero_f        _fjsp_setzero_v2r8
#define gmx_simd_add_f            _fjsp_add_v2r8
#define gmx_simd_sub_f            _fjsp_sub_v2r8
#define gmx_simd_mul_f            _fjsp_mul_v2r8
#define gmx_simd_fmadd_f(a, b, c)   _fjsp_madd_v2r8(a, b, c)
#define gmx_simd_fmsub_f(a, b, c)   _fjsp_msub_v2r8(a, b, c)
/* The gmx "fnmadd" name maps to the Fujitsu "nmsub" intrinsic and vice versa. */
#define gmx_simd_fnmadd_f(a, b, c)  _fjsp_nmsub_v2r8(a, b, c)
#define gmx_simd_fnmsub_f(a, b, c)  _fjsp_nmadd_v2r8(a, b, c)
#define gmx_simd_and_f            _fjsp_and_v2r8
#define gmx_simd_andnot_f         _fjsp_andnot1_v2r8
#define gmx_simd_or_f             _fjsp_or_v2r8
#define gmx_simd_xor_f            _fjsp_xor_v2r8
#define gmx_simd_rsqrt_f          _fjsp_rsqrta_v2r8
#define gmx_simd_rcp_f            _fjsp_rcpa_v2r8
#define gmx_simd_fabs_f(x)        _fjsp_abs_v2r8(x)
#define gmx_simd_fneg_f(x)        _fjsp_neg_v2r8(x)
#define gmx_simd_max_f            _fjsp_max_v2r8
#define gmx_simd_min_f            _fjsp_min_v2r8
/* Rounding and truncation reuse the double-precision macros. */
#define gmx_simd_round_f(x)       gmx_simd_round_d(x)
#define gmx_simd_trunc_f(x)       gmx_simd_trunc_d(x)
#define gmx_simd_fraction_f(x)    gmx_simd_sub_f(x, gmx_simd_trunc_f(x))
#define gmx_simd_get_exponent_f   gmx_simd_get_exponent_d_sparc64_hpc_ace
#define gmx_simd_get_mantissa_f   gmx_simd_get_mantissa_d_sparc64_hpc_ace
#define gmx_simd_set_exponent_f   gmx_simd_set_exponent_d_sparc64_hpc_ace
/* integer datatype corresponding to float: gmx_simd_fint32_t */
#define gmx_simd_fint32_t         _fjsp_v2r8
#define gmx_simd_load_fi(m)       gmx_simd_load_di_sparc64_hpc_ace(m)
#define gmx_simd_set1_fi(i)       gmx_simd_set1_di_sparc64_hpc_ace(i)
#define gmx_simd_store_fi(m, x)   gmx_simd_store_di_sparc64_hpc_ace(m, x)
#define gmx_simd_loadu_fi         gmx_simd_load_fi
/* No unaligned store of gmx_simd_fint32_t */
#define gmx_simd_setzero_fi       _fjsp_setzero_v2r8
#define gmx_simd_cvt_f2i          gmx_simd_cvt_d2i
#define gmx_simd_cvtt_f2i         _fjsp_dtox_v2r8
#define gmx_simd_cvt_i2f          _fjsp_xtod_v2r8
#define gmx_simd_extract_fi       gmx_simd_extract_di_sparc64_hpc_ace
/* Integer logical ops on gmx_simd_fint32_t */
/* Shifts are horrible since they require memory re-loads. */
#define gmx_simd_slli_fi          gmx_simd_slli_di_sparc64_hpc_ace
#define gmx_simd_srli_fi          gmx_simd_srli_di_sparc64_hpc_ace
#define gmx_simd_and_fi           _fjsp_and_v2r8
#define gmx_simd_andnot_fi(a, b)  _fjsp_andnot1_v2r8(a, b)
#define gmx_simd_or_fi            _fjsp_or_v2r8
#define gmx_simd_xor_fi           _fjsp_xor_v2r8
/* No integer arithmetic ops on gmx_simd_fint32_t */
/* Boolean & comparison operations on gmx_simd_float_t */
#define gmx_simd_fbool_t          _fjsp_v2r8
#define gmx_simd_cmpeq_f          _fjsp_cmpeq_v2r8
#define gmx_simd_cmplt_f          _fjsp_cmplt_v2r8
#define gmx_simd_cmple_f          _fjsp_cmple_v2r8
#define gmx_simd_and_fb           _fjsp_and_v2r8
#define gmx_simd_or_fb            _fjsp_or_v2r8
#define gmx_simd_anytrue_fb       gmx_simd_anytrue_d_sparc64_hpc_ace
#define gmx_simd_blendzero_f      _fjsp_and_v2r8
/* The blend macros swap argument order relative to the intrinsics they wrap. */
#define gmx_simd_blendnotzero_f(a, sel) _fjsp_andnot1_v2r8(sel, a)
#define gmx_simd_blendv_f(a, b, s) _fjsp_selmov_v2r8(b, a, s)
#define gmx_simd_reduce_f(a)      gmx_simd_reduce_d_sparc64_hpc_ace(a)
/* No boolean & comparison operations on gmx_simd_fint32_t */
/* No conversions between different booleans */
183 /****************************************************
184 * DOUBLE PRECISION SIMD IMPLEMENTATION *
185 ****************************************************/
/* Double-precision interface: thin aliases for the _fjsp_*_v2r8 intrinsics,
 * plus the *_sparc64_hpc_ace helper functions defined further down in this
 * file for operations without a single matching intrinsic.
 */
#define gmx_simd_double_t          _fjsp_v2r8
#define gmx_simd_load_d            _fjsp_load_v2r8
/* The argument is parenthesized so that pointer expressions such as (p + 1)
 * are dereferenced as a whole. Note that m is evaluated twice.
 */
#define gmx_simd_load1_d(m)        _fjsp_set_v2r8((*(m)), (*(m)))
#define gmx_simd_set1_d(a)         _fjsp_set_v2r8(a, a)
#define gmx_simd_store_d           _fjsp_store_v2r8
/* NOTE(review): GMX_SIMD_HAVE_LOADU is undefined for this architecture, yet
 * loadu is provided here as a plain alias of the normal load - confirm that
 * callers never rely on it for genuinely unaligned memory.
 */
#define gmx_simd_loadu_d           gmx_simd_load_d
/* No unaligned store of gmx_simd_double_t */
#define gmx_simd_setzero_d         _fjsp_setzero_v2r8
#define gmx_simd_add_d             _fjsp_add_v2r8
#define gmx_simd_sub_d             _fjsp_sub_v2r8
#define gmx_simd_mul_d             _fjsp_mul_v2r8
#define gmx_simd_fmadd_d(a, b, c)   _fjsp_madd_v2r8(a, b, c)
#define gmx_simd_fmsub_d(a, b, c)   _fjsp_msub_v2r8(a, b, c)
/* The gmx "fnmadd" name maps to the Fujitsu "nmsub" intrinsic and vice versa. */
#define gmx_simd_fnmadd_d(a, b, c)  _fjsp_nmsub_v2r8(a, b, c)
#define gmx_simd_fnmsub_d(a, b, c)  _fjsp_nmadd_v2r8(a, b, c)
#define gmx_simd_and_d             _fjsp_and_v2r8
#define gmx_simd_andnot_d          _fjsp_andnot1_v2r8
#define gmx_simd_or_d              _fjsp_or_v2r8
#define gmx_simd_xor_d             _fjsp_xor_v2r8
#define gmx_simd_rsqrt_d(x)        _fjsp_rsqrta_v2r8(x)
#define gmx_simd_rcp_d(x)          _fjsp_rcpa_v2r8(x)
#define gmx_simd_fabs_d(x)         _fjsp_abs_v2r8(x)
#define gmx_simd_fneg_d(x)         _fjsp_neg_v2r8(x)
#define gmx_simd_max_d             _fjsp_max_v2r8
#define gmx_simd_min_d             _fjsp_min_v2r8
/* round/trunc go through the integer representation and back. */
#define gmx_simd_round_d(x)        gmx_simd_cvt_i2d(gmx_simd_cvt_d2i(x))
#define gmx_simd_trunc_d(x)        gmx_simd_cvt_i2d(gmx_simd_cvtt_d2i(x))
#define gmx_simd_fraction_d(x)     gmx_simd_sub_d(x, gmx_simd_trunc_d(x))
#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_sparc64_hpc_ace
#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_sparc64_hpc_ace
#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_sparc64_hpc_ace
/* integer datatype corresponding to double: gmx_simd_dint32_t */
#define gmx_simd_dint32_t          _fjsp_v2r8
#define gmx_simd_load_di(m)        gmx_simd_load_di_sparc64_hpc_ace(m)
#define gmx_simd_set1_di(i)        gmx_simd_set1_di_sparc64_hpc_ace(i)
#define gmx_simd_store_di(m, x)    gmx_simd_store_di_sparc64_hpc_ace(m, x)
#define gmx_simd_loadu_di          gmx_simd_load_di
/* No unaligned store of gmx_simd_dint32_t */
#define gmx_simd_setzero_di        _fjsp_setzero_v2r8
#define gmx_simd_cvt_d2i           gmx_simd_cvt_d2i_sparc64_hpc_ace
#define gmx_simd_cvtt_d2i          _fjsp_dtox_v2r8
#define gmx_simd_cvt_i2d           _fjsp_xtod_v2r8
#define gmx_simd_extract_di        gmx_simd_extract_di_sparc64_hpc_ace
/* Integer logical ops on gmx_simd_dint32_t */
#define gmx_simd_slli_di           gmx_simd_slli_di_sparc64_hpc_ace
#define gmx_simd_srli_di           gmx_simd_srli_di_sparc64_hpc_ace
#define gmx_simd_and_di            _fjsp_and_v2r8
#define gmx_simd_andnot_di         _fjsp_andnot1_v2r8
#define gmx_simd_or_di             _fjsp_or_v2r8
#define gmx_simd_xor_di            _fjsp_xor_v2r8
/* No integer arithmetic ops on gmx_simd_dint32_t */
/* Boolean & comparison operations on gmx_simd_double_t */
#define gmx_simd_dbool_t           _fjsp_v2r8
#define gmx_simd_cmpeq_d           _fjsp_cmpeq_v2r8
#define gmx_simd_cmplt_d           _fjsp_cmplt_v2r8
#define gmx_simd_cmple_d           _fjsp_cmple_v2r8
#define gmx_simd_and_db            _fjsp_and_v2r8
#define gmx_simd_or_db             _fjsp_or_v2r8
#define gmx_simd_anytrue_db        gmx_simd_anytrue_d_sparc64_hpc_ace
#define gmx_simd_blendzero_d       _fjsp_and_v2r8
/* The blend macros swap argument order relative to the intrinsics they wrap. */
#define gmx_simd_blendnotzero_d(a, sel) _fjsp_andnot1_v2r8(sel, a)
#define gmx_simd_blendv_d(a, b, sel) _fjsp_selmov_v2r8(b, a, sel)
#define gmx_simd_reduce_d(a)       gmx_simd_reduce_d_sparc64_hpc_ace(a)

/* No boolean & comparison operations on gmx_simd_dint32_t */
/* Float/double conversion */
/* Both precisions share the same SIMD type, so conversion is the identity. */
#define gmx_simd_cvt_f2d(f)        (f)
#define gmx_simd_cvt_d2f(d)        (d)
256 /****************************************************
257 * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
258 ****************************************************/
259 static gmx_inline gmx_simd_float_t
260 gmx_simd_load_f_sparc64_hpc_ace(const float *m)
262 /* We are not allowed to cast single-to-double registers, but we can
263 * masquerade the memory location as a variable of type _fjsp_v2r4.
265 const _fjsp_v2r4 *p = (const _fjsp_v2r4 *)m;
266 _fjsp_v2r4 simd;
268 simd = *p;
269 return _fjsp_stod_v2r8(simd);
272 static gmx_inline void
273 gmx_simd_store_f_sparc64_hpc_ace(float *m, gmx_simd_float_t x)
275 /* We are not allowed to cast single-to-double registers, but we can
276 * masquerade the memory location as a variable of type _fjsp_v2r4.
278 _fjsp_v2r4 *p = (_fjsp_v2r4 *)m;
279 *p = _fjsp_dtos_v2r4(x);
282 static gmx_inline gmx_simd_dint32_t
283 gmx_simd_load_di_sparc64_hpc_ace(const int *m)
285 union
287 _fjsp_v2r8 simd;
288 long long int i[2];
290 conv;
292 conv.i[0] = m[0];
293 conv.i[1] = m[1];
295 return _fjsp_load_v2r8( (double *) &(conv.simd) );
298 static gmx_inline void
299 gmx_simd_store_di_sparc64_hpc_ace(int *m, gmx_simd_dint32_t x)
301 union
303 _fjsp_v2r8 simd;
304 long long int i[2];
306 conv;
308 _fjsp_store_v2r8( (double *) &(conv.simd), x );
310 m[0] = conv.i[0];
311 m[1] = conv.i[1];
314 static gmx_inline gmx_simd_dint32_t
315 gmx_simd_set1_di_sparc64_hpc_ace(int i)
317 union
319 _fjsp_v2r8 simd;
320 long long int i[2];
322 conv;
324 conv.i[0] = i;
325 conv.i[1] = i;
327 return _fjsp_load_v2r8( (double *) &(conv.simd) );
330 static gmx_inline int
331 gmx_simd_extract_di_sparc64_hpc_ace(gmx_simd_dint32_t x, int i)
333 long long int res;
334 /* This conditional should be optimized away at compile time */
335 if (i == 0)
337 _fjsp_storel_v2r8((double *)&res, x);
339 else
341 _fjsp_storeh_v2r8((double *)&res, x);
343 return (int)res;
346 static gmx_inline gmx_simd_dint32_t
347 gmx_simd_slli_di_sparc64_hpc_ace(gmx_simd_dint32_t x, int i)
349 _fjsp_v2i8 ix = *((_fjsp_v2i8 *)&x);
350 ix = _fjsp_slli_v2i8(ix, i);
351 x = *((_fjsp_v2r8 *)&ix);
352 return x;
355 static gmx_inline gmx_simd_dint32_t
356 gmx_simd_srli_di_sparc64_hpc_ace(gmx_simd_dint32_t x, int i)
358 _fjsp_v2i8 ix = *((_fjsp_v2i8 *)&x);
359 ix = _fjsp_srli_v2i8(ix, i);
360 x = *((_fjsp_v2r8 *)&ix);
361 return x;
364 static gmx_inline gmx_simd_dint32_t
365 gmx_simd_cvt_d2i_sparc64_hpc_ace(gmx_simd_double_t x)
367 _fjsp_v2r8 signbit = _fjsp_set_v2r8(-0.0, -0.0);
368 _fjsp_v2r8 half = _fjsp_set_v2r8(0.5, 0.5);
370 x = _fjsp_add_v2r8(x, _fjsp_or_v2r8(_fjsp_and_v2r8(signbit, x), half));
371 return _fjsp_dtox_v2r8(x);
374 static gmx_inline int
375 gmx_simd_anytrue_d_sparc64_hpc_ace(gmx_simd_dbool_t x)
377 long long int i;
378 x = _fjsp_or_v2r8(x, _fjsp_unpackhi_v2r8(x, x));
379 _fjsp_storel_v2r8((double *)&i, x);
380 return (i != 0LL);
383 static gmx_inline double
384 gmx_simd_reduce_d_sparc64_hpc_ace(gmx_simd_double_t x)
386 double d;
387 x = _fjsp_add_v2r8(x, _fjsp_unpackhi_v2r8(x, x));
388 _fjsp_storel_v2r8(&d, x);
389 return d;
393 static gmx_inline gmx_simd_double_t
394 gmx_simd_get_exponent_d_sparc64_hpc_ace(gmx_simd_double_t x)
396 /* HPC-ACE cannot cast _fjsp_v2r8 to _fjsp_v4i4, so to perform shifts we
397 * would need to store and reload. Since we are only operating on two
398 * numbers it is likely more efficient to do the operations directly on
399 * normal registers.
401 const gmx_int64_t expmask = 0x7ff0000000000000LL;
402 const gmx_int64_t expbias = 1023LL;
404 union
406 _fjsp_v2r8 simd;
407 long long int i[2];
409 conv;
411 _fjsp_store_v2r8( (double *) &conv.simd, x);
412 conv.i[0] = ((conv.i[0] & expmask) >> 52) - expbias;
413 conv.i[1] = ((conv.i[1] & expmask) >> 52) - expbias;
414 x = _fjsp_load_v2r8( (double *) &conv.simd);
415 return _fjsp_xtod_v2r8(x);
418 static gmx_inline gmx_simd_double_t
419 gmx_simd_get_mantissa_d_sparc64_hpc_ace(gmx_simd_double_t x)
421 gmx_int64_t mantmask[2] = {0x000fffffffffffffLL, 0x000fffffffffffffLL};
422 gmx_simd_double_t one = _fjsp_set_v2r8(1.0, 1.0);
424 x = _fjsp_and_v2r8(x, _fjsp_load_v2r8((double *)mantmask));
425 return _fjsp_or_v2r8(x, one);
428 static gmx_inline gmx_simd_double_t
429 gmx_simd_set_exponent_d_sparc64_hpc_ace(gmx_simd_double_t x)
431 const gmx_int64_t expbias = 1023;
432 union
434 _fjsp_v2r8 simd;
435 long long int i[2];
437 conv;
440 _fjsp_store_v2r8( (double *) &conv.simd, gmx_simd_cvt_d2i_sparc64_hpc_ace(x));
441 conv.i[0] = (conv.i[0] + expbias) << 52;
442 conv.i[1] = (conv.i[1] + expbias) << 52;
444 return _fjsp_load_v2r8( (double *) &conv.simd);
448 /* No SIMD4 support, since both single & double are only 2-wide */
451 #endif /* GMX_SIMD_IMPL_SPARC64_HPC_ACE_H */