Enable AVX2 optimized memset only if -mavx2 works
[glibc.git] / soft-fp / op-2.h
blob4ea2a00d61caade4037eb222fa58a8c066881152
1 /* Software floating-point emulation.
2 Basic two-word fraction declaration and manipulation.
3 Copyright (C) 1997-2014 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Richard Henderson (rth@cygnus.com),
6 Jakub Jelinek (jj@ultra.linux.cz),
7 David S. Miller (davem@redhat.com) and
8 Peter Maydell (pmaydell@chiark.greenend.org.uk).
10 The GNU C Library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
15 In addition to the permissions in the GNU Lesser General Public
16 License, the Free Software Foundation gives you unlimited
17 permission to link the compiled version of this file into
18 combinations with other programs, and to distribute those
19 combinations without any restriction coming from the use of this
20 file. (The Lesser General Public License restrictions do apply in
21 other respects; for example, they cover modification of the file,
22 and distribution when not linked into a combine executable.)
24 The GNU C Library is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 Lesser General Public License for more details.
29 You should have received a copy of the GNU Lesser General Public
30 License along with the GNU C Library; if not, see
31 <http://www.gnu.org/licenses/>. */
33 #define _FP_FRAC_DECL_2(X) _FP_W_TYPE X##_f0, X##_f1
34 #define _FP_FRAC_COPY_2(D, S) (D##_f0 = S##_f0, D##_f1 = S##_f1)
35 #define _FP_FRAC_SET_2(X, I) __FP_FRAC_SET_2 (X, I)
36 #define _FP_FRAC_HIGH_2(X) (X##_f1)
37 #define _FP_FRAC_LOW_2(X) (X##_f0)
38 #define _FP_FRAC_WORD_2(X, w) (X##_f##w)
40 #define _FP_FRAC_SLL_2(X, N) \
41 (void) (((N) < _FP_W_TYPE_SIZE) \
42 ? ({ \
43 if (__builtin_constant_p (N) && (N) == 1) \
44 { \
45 X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
46 X##_f0 += X##_f0; \
47 } \
48 else \
49 { \
50 X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
51 X##_f0 <<= (N); \
52 } \
53 0; \
54 }) \
55 : ({ \
56 X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \
57 X##_f0 = 0; \
58 }))
61 #define _FP_FRAC_SRL_2(X, N) \
62 (void) (((N) < _FP_W_TYPE_SIZE) \
63 ? ({ \
64 X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
65 X##_f1 >>= (N); \
66 }) \
67 : ({ \
68 X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \
69 X##_f1 = 0; \
70 }))
72 /* Right shift with sticky-lsb. */
73 #define _FP_FRAC_SRST_2(X, S, N, sz) \
74 (void) (((N) < _FP_W_TYPE_SIZE) \
75 ? ({ \
76 S = (__builtin_constant_p (N) && (N) == 1 \
77 ? X##_f0 & 1 \
78 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \
79 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
80 X##_f1 >>= (N); \
81 }) \
82 : ({ \
83 S = ((((N) == _FP_W_TYPE_SIZE \
84 ? 0 \
85 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
86 | X##_f0) != 0); \
87 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \
88 X##_f1 = 0; \
89 }))
91 #define _FP_FRAC_SRS_2(X, N, sz) \
92 (void) (((N) < _FP_W_TYPE_SIZE) \
93 ? ({ \
94 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
95 | (__builtin_constant_p (N) && (N) == 1 \
96 ? X##_f0 & 1 \
97 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
98 X##_f1 >>= (N); \
99 }) \
100 : ({ \
101 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \
102 | ((((N) == _FP_W_TYPE_SIZE \
103 ? 0 \
104 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
105 | X##_f0) != 0)); \
106 X##_f1 = 0; \
109 #define _FP_FRAC_ADDI_2(X, I) \
110 __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
112 #define _FP_FRAC_ADD_2(R, X, Y) \
113 __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
115 #define _FP_FRAC_SUB_2(R, X, Y) \
116 __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
118 #define _FP_FRAC_DEC_2(X, Y) \
119 __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
121 #define _FP_FRAC_CLZ_2(R, X) \
122 do \
124 if (X##_f1) \
125 __FP_CLZ (R, X##_f1); \
126 else \
128 __FP_CLZ (R, X##_f0); \
129 R += _FP_W_TYPE_SIZE; \
132 while (0)
134 /* Predicates */
135 #define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE) X##_f1 < 0)
136 #define _FP_FRAC_ZEROP_2(X) ((X##_f1 | X##_f0) == 0)
137 #define _FP_FRAC_OVERP_2(fs, X) (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
138 #define _FP_FRAC_CLEAR_OVERP_2(fs, X) (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)
139 #define _FP_FRAC_HIGHBIT_DW_2(fs, X) \
140 (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
141 #define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
142 #define _FP_FRAC_GT_2(X, Y) \
143 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
144 #define _FP_FRAC_GE_2(X, Y) \
145 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
147 #define _FP_ZEROFRAC_2 0, 0
148 #define _FP_MINFRAC_2 0, 1
149 #define _FP_MAXFRAC_2 (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
152 * Internals
155 #define __FP_FRAC_SET_2(X, I1, I0) (X##_f0 = I0, X##_f1 = I1)
157 #define __FP_CLZ_2(R, xh, xl) \
158 do \
160 if (xh) \
161 __FP_CLZ (R, xh); \
162 else \
164 __FP_CLZ (R, xl); \
165 R += _FP_W_TYPE_SIZE; \
168 while (0)
170 #if 0
172 # ifndef __FP_FRAC_ADDI_2
173 # define __FP_FRAC_ADDI_2(xh, xl, i) \
174 (xh += ((xl += i) < i))
175 # endif
176 # ifndef __FP_FRAC_ADD_2
177 # define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \
178 (rh = xh + yh + ((rl = xl + yl) < xl))
179 # endif
180 # ifndef __FP_FRAC_SUB_2
181 # define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \
182 (rh = xh - yh - ((rl = xl - yl) > xl))
183 # endif
184 # ifndef __FP_FRAC_DEC_2
185 # define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
186 do \
188 UWtype __FP_FRAC_DEC_2_t = xl; \
189 xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t); \
191 while (0)
192 # endif
194 #else
196 # undef __FP_FRAC_ADDI_2
197 # define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa (xh, xl, xh, xl, 0, i)
198 # undef __FP_FRAC_ADD_2
199 # define __FP_FRAC_ADD_2 add_ssaaaa
200 # undef __FP_FRAC_SUB_2
201 # define __FP_FRAC_SUB_2 sub_ddmmss
202 # undef __FP_FRAC_DEC_2
203 # define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
204 sub_ddmmss (xh, xl, xh, xl, yh, yl)
206 #endif
209 * Unpack the raw bits of a native fp value. Do not classify or
210 * normalize the data.
213 #define _FP_UNPACK_RAW_2(fs, X, val) \
214 do \
216 union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo; \
217 _FP_UNPACK_RAW_2_flo.flt = (val); \
219 X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0; \
220 X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1; \
221 X##_e = _FP_UNPACK_RAW_2_flo.bits.exp; \
222 X##_s = _FP_UNPACK_RAW_2_flo.bits.sign; \
224 while (0)
226 #define _FP_UNPACK_RAW_2_P(fs, X, val) \
227 do \
229 union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo \
230 = (union _FP_UNION_##fs *) (val); \
232 X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0; \
233 X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1; \
234 X##_e = _FP_UNPACK_RAW_2_P_flo->bits.exp; \
235 X##_s = _FP_UNPACK_RAW_2_P_flo->bits.sign; \
237 while (0)
241 * Repack the raw bits of a native fp value.
244 #define _FP_PACK_RAW_2(fs, val, X) \
245 do \
247 union _FP_UNION_##fs _FP_PACK_RAW_2_flo; \
249 _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0; \
250 _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1; \
251 _FP_PACK_RAW_2_flo.bits.exp = X##_e; \
252 _FP_PACK_RAW_2_flo.bits.sign = X##_s; \
254 (val) = _FP_PACK_RAW_2_flo.flt; \
256 while (0)
258 #define _FP_PACK_RAW_2_P(fs, val, X) \
259 do \
261 union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo \
262 = (union _FP_UNION_##fs *) (val); \
264 _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0; \
265 _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1; \
266 _FP_PACK_RAW_2_P_flo->bits.exp = X##_e; \
267 _FP_PACK_RAW_2_P_flo->bits.sign = X##_s; \
269 while (0)
273 * Multiplication algorithms:
276 /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */
278 #define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit) \
279 do \
281 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b); \
282 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c); \
284 doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0), \
285 X##_f0, Y##_f0); \
286 doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0, \
287 X##_f0, Y##_f1); \
288 doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0, \
289 X##_f1, Y##_f0); \
290 doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
291 X##_f1, Y##_f1); \
293 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
294 _FP_FRAC_WORD_4 (R, 1), 0, \
295 _FP_MUL_MEAT_DW_2_wide_b_f1, \
296 _FP_MUL_MEAT_DW_2_wide_b_f0, \
297 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
298 _FP_FRAC_WORD_4 (R, 1)); \
299 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
300 _FP_FRAC_WORD_4 (R, 1), 0, \
301 _FP_MUL_MEAT_DW_2_wide_c_f1, \
302 _FP_MUL_MEAT_DW_2_wide_c_f0, \
303 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
304 _FP_FRAC_WORD_4 (R, 1)); \
306 while (0)
308 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \
309 do \
311 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z); \
313 _FP_MUL_MEAT_DW_2_wide (wfracbits, _FP_MUL_MEAT_2_wide_z, \
314 X, Y, doit); \
316 /* Normalize since we know where the msb of the multiplicands \
317 were (bit B), we know that the msb of the of the product is \
318 at either 2B or 2B-1. */ \
319 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, wfracbits-1, 2*wfracbits); \
320 R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0); \
321 R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1); \
323 while (0)
325 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
326 Do only 3 multiplications instead of four. This one is for machines
327 where multiplication is much more expensive than subtraction. */
329 #define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit) \
330 do \
332 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b); \
333 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c); \
334 _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d; \
335 int _FP_MUL_MEAT_DW_2_wide_3mul_c1; \
336 int _FP_MUL_MEAT_DW_2_wide_3mul_c2; \
338 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1; \
339 _FP_MUL_MEAT_DW_2_wide_3mul_c1 \
340 = _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0; \
341 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1; \
342 _FP_MUL_MEAT_DW_2_wide_3mul_c2 \
343 = _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0; \
344 doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0), \
345 X##_f0, Y##_f0); \
346 doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1), \
347 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0, \
348 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1); \
349 doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \
350 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1); \
352 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 \
353 &= -_FP_MUL_MEAT_DW_2_wide_3mul_c2; \
354 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 \
355 &= -_FP_MUL_MEAT_DW_2_wide_3mul_c1; \
356 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
357 _FP_FRAC_WORD_4 (R, 1), \
358 (_FP_MUL_MEAT_DW_2_wide_3mul_c1 \
359 & _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0, \
360 _FP_MUL_MEAT_DW_2_wide_3mul_d, \
361 0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
362 __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
363 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0); \
364 __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
365 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1); \
366 __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
367 _FP_FRAC_WORD_4 (R, 1), \
368 0, _FP_MUL_MEAT_DW_2_wide_3mul_d, \
369 _FP_FRAC_WORD_4 (R, 0)); \
370 __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
371 _FP_FRAC_WORD_4 (R, 1), 0, \
372 _FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \
373 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0); \
374 __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
375 _FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \
376 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, \
377 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2)); \
379 while (0)
381 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \
382 do \
384 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z); \
386 _FP_MUL_MEAT_DW_2_wide_3mul (wfracbits, \
387 _FP_MUL_MEAT_2_wide_3mul_z, \
388 X, Y, doit); \
390 /* Normalize since we know where the msb of the multiplicands \
391 were (bit B), we know that the msb of the of the product is \
392 at either 2B or 2B-1. */ \
393 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z, \
394 wfracbits-1, 2*wfracbits); \
395 R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0); \
396 R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1); \
398 while (0)
400 #define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y) \
401 do \
403 _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2]; \
404 _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2]; \
405 _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0; \
406 _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1; \
407 _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0; \
408 _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1; \
410 mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x, \
411 _FP_MUL_MEAT_DW_2_gmp_y, 2); \
413 while (0)
415 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \
416 do \
418 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z); \
420 _FP_MUL_MEAT_DW_2_gmp (wfracbits, _FP_MUL_MEAT_2_gmp_z, X, Y); \
422 /* Normalize since we know where the msb of the multiplicands \
423 were (bit B), we know that the msb of the of the product is \
424 at either 2B or 2B-1. */ \
425 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, wfracbits-1, 2*wfracbits); \
426 R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0]; \
427 R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1]; \
429 while (0)
431 /* Do at most 120x120=240 bits multiplication using double floating
432 point multiplication. This is useful if floating point
433 multiplication has much bigger throughput than integer multiply.
434 It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
435 between 106 and 120 only.
436 Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
437 SETFETZ is a macro which will disable all FPU exceptions and set rounding
438 towards zero, RESETFE should optionally reset it back. */
440 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
441 do \
443 static const double _const[] = \
445 /* 2^-24 */ 5.9604644775390625e-08, \
446 /* 2^-48 */ 3.5527136788005009e-15, \
447 /* 2^-72 */ 2.1175823681357508e-22, \
448 /* 2^-96 */ 1.2621774483536189e-29, \
449 /* 2^28 */ 2.68435456e+08, \
450 /* 2^4 */ 1.600000e+01, \
451 /* 2^-20 */ 9.5367431640625e-07, \
452 /* 2^-44 */ 5.6843418860808015e-14, \
453 /* 2^-68 */ 3.3881317890172014e-21, \
454 /* 2^-92 */ 2.0194839173657902e-28, \
455 /* 2^-116 */ 1.2037062152420224e-35 \
456 }; \
457 double _a240, _b240, _c240, _d240, _e240, _f240, \
458 _g240, _h240, _i240, _j240, _k240; \
459 union { double d; UDItype i; } _l240, _m240, _n240, _o240, \
460 _p240, _q240, _r240, _s240; \
461 UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \
463 if (wfracbits < 106 || wfracbits > 120) \
464 abort (); \
466 setfetz; \
468 _e240 = (double) (long) (X##_f0 & 0xffffff); \
469 _j240 = (double) (long) (Y##_f0 & 0xffffff); \
470 _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff); \
471 _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff); \
472 _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
473 _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
474 _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff); \
475 _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff); \
476 _a240 = (double) (long) (X##_f1 >> 32); \
477 _f240 = (double) (long) (Y##_f1 >> 32); \
478 _e240 *= _const[3]; \
479 _j240 *= _const[3]; \
480 _d240 *= _const[2]; \
481 _i240 *= _const[2]; \
482 _c240 *= _const[1]; \
483 _h240 *= _const[1]; \
484 _b240 *= _const[0]; \
485 _g240 *= _const[0]; \
486 _s240.d = _e240*_j240; \
487 _r240.d = _d240*_j240 + _e240*_i240; \
488 _q240.d = _c240*_j240 + _d240*_i240 + _e240*_h240; \
489 _p240.d = _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
490 _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
491 _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240; \
492 _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240; \
493 _l240.d = _a240*_g240 + _b240*_f240; \
494 _k240 = _a240*_f240; \
495 _r240.d += _s240.d; \
496 _q240.d += _r240.d; \
497 _p240.d += _q240.d; \
498 _o240.d += _p240.d; \
499 _n240.d += _o240.d; \
500 _m240.d += _n240.d; \
501 _l240.d += _m240.d; \
502 _k240 += _l240.d; \
503 _s240.d -= ((_const[10]+_s240.d)-_const[10]); \
504 _r240.d -= ((_const[9]+_r240.d)-_const[9]); \
505 _q240.d -= ((_const[8]+_q240.d)-_const[8]); \
506 _p240.d -= ((_const[7]+_p240.d)-_const[7]); \
507 _o240.d += _const[7]; \
508 _n240.d += _const[6]; \
509 _m240.d += _const[5]; \
510 _l240.d += _const[4]; \
511 if (_s240.d != 0.0) \
512 _y240 = 1; \
513 if (_r240.d != 0.0) \
514 _y240 = 1; \
515 if (_q240.d != 0.0) \
516 _y240 = 1; \
517 if (_p240.d != 0.0) \
518 _y240 = 1; \
519 _t240 = (DItype) _k240; \
520 _u240 = _l240.i; \
521 _v240 = _m240.i; \
522 _w240 = _n240.i; \
523 _x240 = _o240.i; \
524 R##_f1 = ((_t240 << (128 - (wfracbits - 1))) \
525 | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104))); \
526 R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1))) \
527 | ((_v240 & 0xffffff) << (144 - (wfracbits - 1))) \
528 | ((_w240 & 0xffffff) << (120 - (wfracbits - 1))) \
529 | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96)) \
530 | _y240); \
531 resetfe; \
533 while (0)
536 * Division algorithms:
539 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \
540 do \
542 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2; \
543 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1; \
544 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0; \
545 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1; \
546 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0; \
547 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1; \
548 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0; \
549 if (_FP_FRAC_GE_2 (X, Y)) \
551 _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1; \
552 _FP_DIV_MEAT_2_udiv_n_f1 \
553 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1; \
554 _FP_DIV_MEAT_2_udiv_n_f0 \
555 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \
557 else \
559 R##_e--; \
560 _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1; \
561 _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0; \
562 _FP_DIV_MEAT_2_udiv_n_f0 = 0; \
565 /* Normalize, i.e. make the most significant bit of the \
566 denominator set. */ \
567 _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs); \
569 udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1, \
570 _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1, \
571 Y##_f1); \
572 umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0, \
573 R##_f1, Y##_f0); \
574 _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0; \
575 if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r)) \
577 R##_f1--; \
578 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \
579 _FP_DIV_MEAT_2_udiv_r); \
580 if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y) \
581 && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \
582 _FP_DIV_MEAT_2_udiv_r)) \
584 R##_f1--; \
585 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \
586 _FP_DIV_MEAT_2_udiv_r); \
589 _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m); \
591 if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1) \
593 /* This is a special case, not an optimization \
594 (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype). \
595 As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y, \
596 R##_f0 can be either (UWtype)-1 or (UWtype)-2. But as we \
597 know what kind of bits it is (sticky, guard, round), \
598 we don't care. We also don't care what the reminder is, \
599 because the guard bit will be set anyway. -jj */ \
600 R##_f0 = -1; \
602 else \
604 udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1, \
605 _FP_DIV_MEAT_2_udiv_r_f1, \
606 _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1); \
607 umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, \
608 _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0); \
609 _FP_DIV_MEAT_2_udiv_r_f0 = 0; \
610 if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \
611 _FP_DIV_MEAT_2_udiv_r)) \
613 R##_f0--; \
614 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \
615 _FP_DIV_MEAT_2_udiv_r); \
616 if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y) \
617 && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \
618 _FP_DIV_MEAT_2_udiv_r)) \
620 R##_f0--; \
621 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \
622 _FP_DIV_MEAT_2_udiv_r); \
625 if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r, \
626 _FP_DIV_MEAT_2_udiv_m)) \
627 R##_f0 |= _FP_WORK_STICKY; \
630 while (0)
634 * Square root algorithms:
635 * We have just one right now, maybe Newton approximation
636 * should be added for those machines where division is fast.
639 #define _FP_SQRT_MEAT_2(R, S, T, X, q) \
640 do \
642 while (q) \
644 T##_f1 = S##_f1 + q; \
645 if (T##_f1 <= X##_f1) \
647 S##_f1 = T##_f1 + q; \
648 X##_f1 -= T##_f1; \
649 R##_f1 += q; \
651 _FP_FRAC_SLL_2 (X, 1); \
652 q >>= 1; \
654 q = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1); \
655 while (q != _FP_WORK_ROUND) \
657 T##_f0 = S##_f0 + q; \
658 T##_f1 = S##_f1; \
659 if (T##_f1 < X##_f1 \
660 || (T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \
662 S##_f0 = T##_f0 + q; \
663 S##_f1 += (T##_f0 > S##_f0); \
664 _FP_FRAC_DEC_2 (X, T); \
665 R##_f0 += q; \
667 _FP_FRAC_SLL_2 (X, 1); \
668 q >>= 1; \
670 if (X##_f0 | X##_f1) \
672 if (S##_f1 < X##_f1 \
673 || (S##_f1 == X##_f1 && S##_f0 < X##_f0)) \
674 R##_f0 |= _FP_WORK_ROUND; \
675 R##_f0 |= _FP_WORK_STICKY; \
678 while (0)
682 * Assembly/disassembly for converting to/from integral types.
683 * No shifting or overflow handled here.
686 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \
687 (void) ((rsize <= _FP_W_TYPE_SIZE) \
688 ? ({ r = X##_f0; }) \
689 : ({ \
690 r = X##_f1; \
691 r <<= _FP_W_TYPE_SIZE; \
692 r += X##_f0; \
695 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \
696 do \
698 X##_f0 = r; \
699 X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \
701 while (0)
704 * Convert FP values between word sizes
707 #define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0)
709 #define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0))
711 #define _FP_FRAC_COPY_2_2(D, S) _FP_FRAC_COPY_2 (D, S)