Enable AVX2 optimized memset only if -mavx2 works
[glibc.git] / soft-fp / op-4.h
blob3acf96c48c84ed0a634e9d0bac1559a656dcf574
/* Software floating-point emulation.
   Basic four-word fraction declaration and manipulation.
   Copyright (C) 1997-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Richard Henderson (rth@cygnus.com),
		  Jakub Jelinek (jj@ultra.linux.cz),
		  David S. Miller (davem@redhat.com) and
		  Peter Maydell (pmaydell@chiark.greenend.org.uk).

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* Declare the storage for a four-word fraction named X: an array X_f
   of four _FP_W_TYPE words.  Word 0 is the one returned by
   _FP_FRAC_LOW_4 and word 3 the one returned by _FP_FRAC_HIGH_4.  */
#define _FP_FRAC_DECL_4(X) _FP_W_TYPE X##_f[4]

/* Copy all four fraction words of S into D.  */
#define _FP_FRAC_COPY_4(D, S) \
  (D##_f[0] = S##_f[0], D##_f[1] = S##_f[1], \
   D##_f[2] = S##_f[2], D##_f[3] = S##_f[3])

/* Set the fraction of X from I.  I must expand to four comma-separated
   words, highest word first (e.g. _FP_ZEROFRAC_4), because it is
   forwarded to the five-argument __FP_FRAC_SET_4.  */
#define _FP_FRAC_SET_4(X, I) __FP_FRAC_SET_4 (X, I)

/* Accessors for the highest word, the lowest word and word W of X.
   All three expand to lvalues.  */
#define _FP_FRAC_HIGH_4(X) (X##_f[3])
#define _FP_FRAC_LOW_4(X) (X##_f[0])
#define _FP_FRAC_WORD_4(X, w) (X##_f[w])

/* Shift the four-word fraction X left by N bits in place, filling the
   vacated low-order bits with zeros.  N is split into a whole-word
   count (skip) and a remaining bit count (up).
   NOTE(review): the partial-word branch stores X_f[0] << up into the
   word indexed by the loop counter even when skip > 3, so the macro
   appears to assume N < 4 * _FP_W_TYPE_SIZE -- confirm against
   callers.  */
#define _FP_FRAC_SLL_4(X, N) \
  do \
    { \
      _FP_I_TYPE _FP_FRAC_SLL_4_up, _FP_FRAC_SLL_4_down; \
      _FP_I_TYPE _FP_FRAC_SLL_4_skip, _FP_FRAC_SLL_4_i; \
      _FP_FRAC_SLL_4_skip = (N) / _FP_W_TYPE_SIZE; \
      _FP_FRAC_SLL_4_up = (N) % _FP_W_TYPE_SIZE; \
      _FP_FRAC_SLL_4_down = _FP_W_TYPE_SIZE - _FP_FRAC_SLL_4_up; \
      if (!_FP_FRAC_SLL_4_up) \
        /* Whole-word shift: just move words upwards.  Handled \
           separately because "down" would equal the word size.  */ \
        for (_FP_FRAC_SLL_4_i = 3; \
             _FP_FRAC_SLL_4_i >= _FP_FRAC_SLL_4_skip; \
             --_FP_FRAC_SLL_4_i) \
          X##_f[_FP_FRAC_SLL_4_i] \
            = X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip]; \
      else \
        { \
          /* Each result word combines the low bits of one source \
             word with the high bits of the word below it.  */ \
          for (_FP_FRAC_SLL_4_i = 3; \
               _FP_FRAC_SLL_4_i > _FP_FRAC_SLL_4_skip; \
               --_FP_FRAC_SLL_4_i) \
            X##_f[_FP_FRAC_SLL_4_i] \
              = ((X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip] \
                  << _FP_FRAC_SLL_4_up) \
                 | (X##_f[_FP_FRAC_SLL_4_i-_FP_FRAC_SLL_4_skip-1] \
                    >> _FP_FRAC_SLL_4_down)); \
          /* The lowest surviving word has nothing below it.  */ \
          X##_f[_FP_FRAC_SLL_4_i--] = X##_f[0] << _FP_FRAC_SLL_4_up; \
        } \
      /* Zero the words that were shifted in from below.  */ \
      for (; _FP_FRAC_SLL_4_i >= 0; --_FP_FRAC_SLL_4_i) \
        X##_f[_FP_FRAC_SLL_4_i] = 0; \
    } \
  while (0)

/* Shift the four-word fraction X right by N bits in place, filling the
   vacated high-order bits with zeros.  Bits shifted out at the bottom
   are discarded (see _FP_FRAC_SRST_4 for a variant that reports them).
   N is split into a whole-word count (skip) and a remaining bit count
   (down).  */
#define _FP_FRAC_SRL_4(X, N) \
  do \
    { \
      _FP_I_TYPE _FP_FRAC_SRL_4_up, _FP_FRAC_SRL_4_down; \
      _FP_I_TYPE _FP_FRAC_SRL_4_skip, _FP_FRAC_SRL_4_i; \
      _FP_FRAC_SRL_4_skip = (N) / _FP_W_TYPE_SIZE; \
      _FP_FRAC_SRL_4_down = (N) % _FP_W_TYPE_SIZE; \
      _FP_FRAC_SRL_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRL_4_down; \
      if (!_FP_FRAC_SRL_4_down) \
        /* Whole-word shift: just move words downwards.  Handled \
           separately because "up" would equal the word size.  */ \
        for (_FP_FRAC_SRL_4_i = 0; \
             _FP_FRAC_SRL_4_i <= 3-_FP_FRAC_SRL_4_skip; \
             ++_FP_FRAC_SRL_4_i) \
          X##_f[_FP_FRAC_SRL_4_i] \
            = X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip]; \
      else \
        { \
          /* Each result word combines the high bits of one source \
             word with the low bits of the word above it.  */ \
          for (_FP_FRAC_SRL_4_i = 0; \
               _FP_FRAC_SRL_4_i < 3-_FP_FRAC_SRL_4_skip; \
               ++_FP_FRAC_SRL_4_i) \
            X##_f[_FP_FRAC_SRL_4_i] \
              = ((X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip] \
                  >> _FP_FRAC_SRL_4_down) \
                 | (X##_f[_FP_FRAC_SRL_4_i+_FP_FRAC_SRL_4_skip+1] \
                    << _FP_FRAC_SRL_4_up)); \
          /* The highest surviving word has nothing above it.  */ \
          X##_f[_FP_FRAC_SRL_4_i++] = X##_f[3] >> _FP_FRAC_SRL_4_down; \
        } \
      /* Zero the words that were shifted in from above.  */ \
      for (; _FP_FRAC_SRL_4_i < 4; ++_FP_FRAC_SRL_4_i) \
        X##_f[_FP_FRAC_SRL_4_i] = 0; \
    } \
  while (0)

/* Right shift with sticky-lsb.
   What this actually means is that we do a standard right shift of
   the four-word fraction X by N bits, and additionally report in S
   whether any of the bits that fell off the right hand side were one
   (S is set to 1 if so, to 0 otherwise).  X itself receives only the
   plain shifted value; the caller decides what to do with S.
   SIZE is not used by this implementation.  */
#define _FP_FRAC_SRST_4(X, S, N, size) \
  do \
    { \
      _FP_I_TYPE _FP_FRAC_SRST_4_up, _FP_FRAC_SRST_4_down; \
      _FP_I_TYPE _FP_FRAC_SRST_4_skip, _FP_FRAC_SRST_4_i; \
      _FP_W_TYPE _FP_FRAC_SRST_4_s; \
      _FP_FRAC_SRST_4_skip = (N) / _FP_W_TYPE_SIZE; \
      _FP_FRAC_SRST_4_down = (N) % _FP_W_TYPE_SIZE; \
      _FP_FRAC_SRST_4_up = _FP_W_TYPE_SIZE - _FP_FRAC_SRST_4_down; \
      /* Collect every word that is shifted out completely.  The \
         loop leaves the index equal to skip.  */ \
      for (_FP_FRAC_SRST_4_s = _FP_FRAC_SRST_4_i = 0; \
           _FP_FRAC_SRST_4_i < _FP_FRAC_SRST_4_skip; \
           ++_FP_FRAC_SRST_4_i) \
        _FP_FRAC_SRST_4_s |= X##_f[_FP_FRAC_SRST_4_i]; \
      if (!_FP_FRAC_SRST_4_down) \
        for (_FP_FRAC_SRST_4_i = 0; \
             _FP_FRAC_SRST_4_i <= 3-_FP_FRAC_SRST_4_skip; \
             ++_FP_FRAC_SRST_4_i) \
          X##_f[_FP_FRAC_SRST_4_i] \
            = X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip]; \
      else \
        { \
          /* Add the low "down" bits of the first surviving word \
             (index skip) to the set of lost bits.  */ \
          _FP_FRAC_SRST_4_s \
            |= X##_f[_FP_FRAC_SRST_4_i] << _FP_FRAC_SRST_4_up; \
          for (_FP_FRAC_SRST_4_i = 0; \
               _FP_FRAC_SRST_4_i < 3-_FP_FRAC_SRST_4_skip; \
               ++_FP_FRAC_SRST_4_i) \
            X##_f[_FP_FRAC_SRST_4_i] \
              = ((X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip] \
                  >> _FP_FRAC_SRST_4_down) \
                 | (X##_f[_FP_FRAC_SRST_4_i+_FP_FRAC_SRST_4_skip+1] \
                    << _FP_FRAC_SRST_4_up)); \
          X##_f[_FP_FRAC_SRST_4_i++] \
            = X##_f[3] >> _FP_FRAC_SRST_4_down; \
        } \
      /* Zero the words that were shifted in from above.  */ \
      for (; _FP_FRAC_SRST_4_i < 4; ++_FP_FRAC_SRST_4_i) \
        X##_f[_FP_FRAC_SRST_4_i] = 0; \
      S = (_FP_FRAC_SRST_4_s != 0); \
    } \
  while (0)

/* Right shift the four-word fraction X by N bits with a sticky least
   significant bit: perform a _FP_FRAC_SRST_4 shift and then OR its
   "nonzero bits were lost" flag into X_f[0].  SIZE is passed through
   to _FP_FRAC_SRST_4 unchanged.  */
#define _FP_FRAC_SRS_4(X, N, size) \
  do \
    { \
      int _FP_FRAC_SRS_4_sticky; \
      _FP_FRAC_SRST_4 (X, _FP_FRAC_SRS_4_sticky, N, size); \
      X##_f[0] |= _FP_FRAC_SRS_4_sticky; \
    } \
  while (0)

/* R = X + Y on four-word fractions.  Expands the named fractions into
   their individual words, highest first, for __FP_FRAC_ADD_4.  */
#define _FP_FRAC_ADD_4(R, X, Y) \
  __FP_FRAC_ADD_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0], \
                   X##_f[3], X##_f[2], X##_f[1], X##_f[0], \
                   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])

/* R = X - Y on four-word fractions (see __FP_FRAC_SUB_4).  */
#define _FP_FRAC_SUB_4(R, X, Y) \
  __FP_FRAC_SUB_4 (R##_f[3], R##_f[2], R##_f[1], R##_f[0], \
                   X##_f[3], X##_f[2], X##_f[1], X##_f[0], \
                   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])

/* X -= Y on four-word fractions (see __FP_FRAC_DEC_4).  */
#define _FP_FRAC_DEC_4(X, Y) \
  __FP_FRAC_DEC_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0], \
                   Y##_f[3], Y##_f[2], Y##_f[1], Y##_f[0])

/* X += I, where I is a single word (see __FP_FRAC_ADDI_4).  */
#define _FP_FRAC_ADDI_4(X, I) \
  __FP_FRAC_ADDI_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0], I)

/* Four-word fraction constants, written as comma-separated word lists
   with the highest word first (the form _FP_FRAC_SET_4 expects).  */
#define _FP_ZEROFRAC_4 0, 0, 0, 0
#define _FP_MINFRAC_4 0, 0, 0, 1
#define _FP_MAXFRAC_4 (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)

/* Nonzero iff every word of X is zero.  */
#define _FP_FRAC_ZEROP_4(X) ((X##_f[0] | X##_f[1] | X##_f[2] | X##_f[3]) == 0)
/* Nonzero iff the highest word of X is negative when viewed as the
   signed word type _FP_WS_TYPE.  */
#define _FP_FRAC_NEGP_4(X) ((_FP_WS_TYPE) X##_f[3] < 0)
/* The overflow bit of format FS in X (nonzero if set).  */
#define _FP_FRAC_OVERP_4(fs, X) (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
/* The double-width high bit of format FS in X (nonzero if set).  */
#define _FP_FRAC_HIGHBIT_DW_4(fs, X) \
  (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs)
/* Clear the overflow bit of format FS in X.  */
#define _FP_FRAC_CLEAR_OVERP_4(fs, X) (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)

/* Nonzero iff the four-word fractions X and Y are equal.  */
#define _FP_FRAC_EQ_4(X, Y) \
  (X##_f[0] == Y##_f[0] && X##_f[1] == Y##_f[1] \
   && X##_f[2] == Y##_f[2] && X##_f[3] == Y##_f[3])

/* Nonzero iff X > Y, comparing word by word from the highest word
   (index 3) down to the lowest.  */
#define _FP_FRAC_GT_4(X, Y) \
  (X##_f[3] > Y##_f[3] \
   || (X##_f[3] == Y##_f[3] \
       && (X##_f[2] > Y##_f[2] \
           || (X##_f[2] == Y##_f[2] \
               && (X##_f[1] > Y##_f[1] \
                   || (X##_f[1] == Y##_f[1] \
                       && X##_f[0] > Y##_f[0]))))))

/* Nonzero iff X >= Y; identical to _FP_FRAC_GT_4 except that the
   final, lowest-word comparison also accepts equality.  */
#define _FP_FRAC_GE_4(X, Y) \
  (X##_f[3] > Y##_f[3] \
   || (X##_f[3] == Y##_f[3] \
       && (X##_f[2] > Y##_f[2] \
           || (X##_f[2] == Y##_f[2] \
               && (X##_f[1] > Y##_f[1] \
                   || (X##_f[1] == Y##_f[1] \
                       && X##_f[0] >= Y##_f[0]))))))

/* Store in R the number of leading zero bits of the four-word
   fraction X: the __FP_CLZ count of the highest nonzero word plus one
   word size for every all-zero word above it.  When words 3, 2 and 1
   are all zero, __FP_CLZ is applied to word 0 whether or not it is
   zero.  */
#define _FP_FRAC_CLZ_4(R, X) \
  do \
    { \
      if (X##_f[3]) \
        __FP_CLZ (R, X##_f[3]); \
      else if (X##_f[2]) \
        { \
          __FP_CLZ (R, X##_f[2]); \
          R += _FP_W_TYPE_SIZE; \
        } \
      else if (X##_f[1]) \
        { \
          __FP_CLZ (R, X##_f[1]); \
          R += _FP_W_TYPE_SIZE*2; \
        } \
      else \
        { \
          __FP_CLZ (R, X##_f[0]); \
          R += _FP_W_TYPE_SIZE*3; \
        } \
    } \
  while (0)

/* Split the floating-point value VAL of format FS into the raw fields
   of X: fraction words X_f[0..3], exponent X_e and sign X_s.  VAL is
   copied into a temporary union _FP_UNION_<fs> through its "flt"
   member and read back through its "bits" member.  */
#define _FP_UNPACK_RAW_4(fs, X, val) \
  do \
    { \
      union _FP_UNION_##fs _FP_UNPACK_RAW_4_flo; \
      _FP_UNPACK_RAW_4_flo.flt = (val); \
      X##_f[0] = _FP_UNPACK_RAW_4_flo.bits.frac0; \
      X##_f[1] = _FP_UNPACK_RAW_4_flo.bits.frac1; \
      X##_f[2] = _FP_UNPACK_RAW_4_flo.bits.frac2; \
      X##_f[3] = _FP_UNPACK_RAW_4_flo.bits.frac3; \
      X##_e = _FP_UNPACK_RAW_4_flo.bits.exp; \
      X##_s = _FP_UNPACK_RAW_4_flo.bits.sign; \
    } \
  while (0)

/* As _FP_UNPACK_RAW_4, but VAL is a pointer to the value; the fields
   are read in place through a cast to union _FP_UNION_<fs> *.  */
#define _FP_UNPACK_RAW_4_P(fs, X, val) \
  do \
    { \
      union _FP_UNION_##fs *_FP_UNPACK_RAW_4_P_flo \
        = (union _FP_UNION_##fs *) (val); \
 \
      X##_f[0] = _FP_UNPACK_RAW_4_P_flo->bits.frac0; \
      X##_f[1] = _FP_UNPACK_RAW_4_P_flo->bits.frac1; \
      X##_f[2] = _FP_UNPACK_RAW_4_P_flo->bits.frac2; \
      X##_f[3] = _FP_UNPACK_RAW_4_P_flo->bits.frac3; \
      X##_e = _FP_UNPACK_RAW_4_P_flo->bits.exp; \
      X##_s = _FP_UNPACK_RAW_4_P_flo->bits.sign; \
    } \
  while (0)

/* Assemble the raw fields of X (fraction words, exponent, sign) into
   a value of format FS and assign it to VAL.  The inverse of
   _FP_UNPACK_RAW_4.  */
#define _FP_PACK_RAW_4(fs, val, X) \
  do \
    { \
      union _FP_UNION_##fs _FP_PACK_RAW_4_flo; \
      _FP_PACK_RAW_4_flo.bits.frac0 = X##_f[0]; \
      _FP_PACK_RAW_4_flo.bits.frac1 = X##_f[1]; \
      _FP_PACK_RAW_4_flo.bits.frac2 = X##_f[2]; \
      _FP_PACK_RAW_4_flo.bits.frac3 = X##_f[3]; \
      _FP_PACK_RAW_4_flo.bits.exp = X##_e; \
      _FP_PACK_RAW_4_flo.bits.sign = X##_s; \
      (val) = _FP_PACK_RAW_4_flo.flt; \
    } \
  while (0)

/* As _FP_PACK_RAW_4, but VAL is a pointer to the destination; the
   fields are written in place through a cast to
   union _FP_UNION_<fs> *.  */
#define _FP_PACK_RAW_4_P(fs, val, X) \
  do \
    { \
      union _FP_UNION_##fs *_FP_PACK_RAW_4_P_flo \
        = (union _FP_UNION_##fs *) (val); \
 \
      _FP_PACK_RAW_4_P_flo->bits.frac0 = X##_f[0]; \
      _FP_PACK_RAW_4_P_flo->bits.frac1 = X##_f[1]; \
      _FP_PACK_RAW_4_P_flo->bits.frac2 = X##_f[2]; \
      _FP_PACK_RAW_4_P_flo->bits.frac3 = X##_f[3]; \
      _FP_PACK_RAW_4_P_flo->bits.exp = X##_e; \
      _FP_PACK_RAW_4_P_flo->bits.sign = X##_s; \
    } \
  while (0)

/*
 * Multiplication algorithms:
 */

/* Given a 1W * 1W => 2W primitive, do the extended multiplication.

   Computes the full eight-word product of the four-word fractions X
   and Y into the eight-word fraction R.  DOIT (hi, lo, a, b) must
   store the double-word product a * b into HI:LO.  All sixteen
   word-by-word partial products are formed and accumulated into R
   from the least significant words upwards, using three-word adds so
   that carries reach the next higher word.  WFRACBITS is not used
   here.  */

#define _FP_MUL_MEAT_DW_4_wide(wfracbits, R, X, Y, doit) \
  do \
    { \
      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_b); \
      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_c); \
      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_d); \
      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_e); \
      _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_4_wide_f); \
 \
      doit (_FP_FRAC_WORD_8 (R, 1), _FP_FRAC_WORD_8 (R, 0), \
            X##_f[0], Y##_f[0]); \
      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0, \
            X##_f[0], Y##_f[1]); \
      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0, \
            X##_f[1], Y##_f[0]); \
      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0, \
            X##_f[1], Y##_f[1]); \
      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0, \
            X##_f[0], Y##_f[2]); \
      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0, \
            X##_f[2], Y##_f[0]); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2), \
                       _FP_FRAC_WORD_8 (R, 1), 0, \
                       _FP_MUL_MEAT_DW_4_wide_b_f1, \
                       _FP_MUL_MEAT_DW_4_wide_b_f0, \
                       0, 0, _FP_FRAC_WORD_8 (R, 1)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2), \
                       _FP_FRAC_WORD_8 (R, 1), 0, \
                       _FP_MUL_MEAT_DW_4_wide_c_f1, \
                       _FP_MUL_MEAT_DW_4_wide_c_f0, \
                       _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2), \
                       _FP_FRAC_WORD_8 (R, 1)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3), \
                       _FP_FRAC_WORD_8 (R, 2), 0, \
                       _FP_MUL_MEAT_DW_4_wide_d_f1, \
                       _FP_MUL_MEAT_DW_4_wide_d_f0, \
                       0, _FP_FRAC_WORD_8 (R, 3), _FP_FRAC_WORD_8 (R, 2)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3), \
                       _FP_FRAC_WORD_8 (R, 2), 0, \
                       _FP_MUL_MEAT_DW_4_wide_e_f1, \
                       _FP_MUL_MEAT_DW_4_wide_e_f0, \
                       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3), \
                       _FP_FRAC_WORD_8 (R, 2)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3), \
                       _FP_FRAC_WORD_8 (R, 2), 0, \
                       _FP_MUL_MEAT_DW_4_wide_f_f1, \
                       _FP_MUL_MEAT_DW_4_wide_f_f0, \
                       _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3), \
                       _FP_FRAC_WORD_8 (R, 2)); \
      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, \
            _FP_MUL_MEAT_DW_4_wide_b_f0, X##_f[0], Y##_f[3]); \
      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, \
            _FP_MUL_MEAT_DW_4_wide_c_f0, X##_f[3], Y##_f[0]); \
      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0, \
            X##_f[1], Y##_f[2]); \
      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0, \
            X##_f[2], Y##_f[1]); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3), 0, \
                       _FP_MUL_MEAT_DW_4_wide_b_f1, \
                       _FP_MUL_MEAT_DW_4_wide_b_f0, \
                       0, _FP_FRAC_WORD_8 (R, 4), _FP_FRAC_WORD_8 (R, 3)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3), 0, \
                       _FP_MUL_MEAT_DW_4_wide_c_f1, \
                       _FP_MUL_MEAT_DW_4_wide_c_f0, \
                       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3), 0, \
                       _FP_MUL_MEAT_DW_4_wide_d_f1, \
                       _FP_MUL_MEAT_DW_4_wide_d_f0, \
                       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3), 0, \
                       _FP_MUL_MEAT_DW_4_wide_e_f1, \
                       _FP_MUL_MEAT_DW_4_wide_e_f0, \
                       _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4), \
                       _FP_FRAC_WORD_8 (R, 3)); \
      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0, \
            X##_f[2], Y##_f[2]); \
      doit (_FP_MUL_MEAT_DW_4_wide_c_f1, _FP_MUL_MEAT_DW_4_wide_c_f0, \
            X##_f[1], Y##_f[3]); \
      doit (_FP_MUL_MEAT_DW_4_wide_d_f1, _FP_MUL_MEAT_DW_4_wide_d_f0, \
            X##_f[3], Y##_f[1]); \
      doit (_FP_MUL_MEAT_DW_4_wide_e_f1, _FP_MUL_MEAT_DW_4_wide_e_f0, \
            X##_f[2], Y##_f[3]); \
      doit (_FP_MUL_MEAT_DW_4_wide_f_f1, _FP_MUL_MEAT_DW_4_wide_f_f0, \
            X##_f[3], Y##_f[2]); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5), \
                       _FP_FRAC_WORD_8 (R, 4), 0, \
                       _FP_MUL_MEAT_DW_4_wide_b_f1, \
                       _FP_MUL_MEAT_DW_4_wide_b_f0, \
                       0, _FP_FRAC_WORD_8 (R, 5), _FP_FRAC_WORD_8 (R, 4)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5), \
                       _FP_FRAC_WORD_8 (R, 4), 0, \
                       _FP_MUL_MEAT_DW_4_wide_c_f1, \
                       _FP_MUL_MEAT_DW_4_wide_c_f0, \
                       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5), \
                       _FP_FRAC_WORD_8 (R, 4)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5), \
                       _FP_FRAC_WORD_8 (R, 4), 0, \
                       _FP_MUL_MEAT_DW_4_wide_d_f1, \
                       _FP_MUL_MEAT_DW_4_wide_d_f0, \
                       _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5), \
                       _FP_FRAC_WORD_8 (R, 4)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6), \
                       _FP_FRAC_WORD_8 (R, 5), 0, \
                       _FP_MUL_MEAT_DW_4_wide_e_f1, \
                       _FP_MUL_MEAT_DW_4_wide_e_f0, \
                       0, _FP_FRAC_WORD_8 (R, 6), _FP_FRAC_WORD_8 (R, 5)); \
      __FP_FRAC_ADD_3 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6), \
                       _FP_FRAC_WORD_8 (R, 5), 0, \
                       _FP_MUL_MEAT_DW_4_wide_f_f1, \
                       _FP_MUL_MEAT_DW_4_wide_f_f0, \
                       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6), \
                       _FP_FRAC_WORD_8 (R, 5)); \
      doit (_FP_MUL_MEAT_DW_4_wide_b_f1, _FP_MUL_MEAT_DW_4_wide_b_f0, \
            X##_f[3], Y##_f[3]); \
      __FP_FRAC_ADD_2 (_FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6), \
                       _FP_MUL_MEAT_DW_4_wide_b_f1, \
                       _FP_MUL_MEAT_DW_4_wide_b_f0, \
                       _FP_FRAC_WORD_8 (R, 7), _FP_FRAC_WORD_8 (R, 6)); \
    } \
  while (0)

/* R = X * Y on four-word fractions with WFRACBITS working fraction
   bits, using the 1W * 1W => 2W primitive DOIT.  The full eight-word
   product is formed by _FP_MUL_MEAT_DW_4_wide, shifted right (with a
   sticky low bit) by WFRACBITS - 1, and its four low words are stored
   in R.  */
#define _FP_MUL_MEAT_4_wide(wfracbits, R, X, Y, doit) \
  do \
    { \
      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_wide_z); \
 \
      _FP_MUL_MEAT_DW_4_wide (wfracbits, _FP_MUL_MEAT_4_wide_z, \
                              X, Y, doit); \
 \
      /* Normalize since we know where the msb of the multiplicands \
         were (bit B), we know that the msb of the product is \
         at either 2B or 2B-1.  */ \
      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_wide_z, wfracbits-1, 2*wfracbits); \
      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 3), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 2), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 1), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_wide_z, 0)); \
    } \
  while (0)

/* Compute the full product of the four-word fractions X and Y into
   the eight-word fraction R by calling mpn_mul_n on the word arrays.
   The operands are taken from the X and Y macro arguments; the
   previous text multiplied the hard-coded names _x_f and _y_f and
   ignored its parameters.  WFRACBITS is not used here.  */
#define _FP_MUL_MEAT_DW_4_gmp(wfracbits, R, X, Y) \
  do \
    { \
      mpn_mul_n (R##_f, X##_f, Y##_f, 4); \
    } \
  while (0)

/* R = X * Y on four-word fractions with WFRACBITS working fraction
   bits, using _FP_MUL_MEAT_DW_4_gmp for the full eight-word product.
   The product is shifted right (with a sticky low bit) by
   WFRACBITS - 1 and its four low words are stored in R.  */
#define _FP_MUL_MEAT_4_gmp(wfracbits, R, X, Y) \
  do \
    { \
      _FP_FRAC_DECL_8 (_FP_MUL_MEAT_4_gmp_z); \
 \
      _FP_MUL_MEAT_DW_4_gmp (wfracbits, _FP_MUL_MEAT_4_gmp_z, X, Y); \
 \
      /* Normalize since we know where the msb of the multiplicands \
         were (bit B), we know that the msb of the product is \
         at either 2B or 2B-1.  */ \
      _FP_FRAC_SRS_8 (_FP_MUL_MEAT_4_gmp_z, wfracbits-1, 2*wfracbits); \
      __FP_FRAC_SET_4 (R, _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 3), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 2), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 1), \
                       _FP_FRAC_WORD_8 (_FP_MUL_MEAT_4_gmp_z, 0)); \
    } \
  while (0)

/*
 * Helper utility for _FP_DIV_MEAT_4_udiv:
 * pppp = m * nnn
 *
 * Multiply the single word M by the three-word number N2:N1:N0 and
 * store the four-word product in P3:P2:P1:P0.  Each umul_ppmm yields
 * one double-word partial product; the low half of the next partial
 * product is then added in with __FP_FRAC_ADDI_2, which carries into
 * the word above.
 */
#define umul_ppppmnnn(p3, p2, p1, p0, m, n2, n1, n0) \
  do \
    { \
      UWtype umul_ppppmnnn_t; \
      umul_ppmm (p1, p0, m, n0); \
      umul_ppmm (p2, umul_ppppmnnn_t, m, n1); \
      __FP_FRAC_ADDI_2 (p2, p1, umul_ppppmnnn_t); \
      umul_ppmm (p3, umul_ppppmnnn_t, m, n2); \
      __FP_FRAC_ADDI_2 (p3, p2, umul_ppppmnnn_t); \
    } \
  while (0)

/*
 * Division algorithms:
 */

/* R = X / Y on four-word fractions of format FS, producing one
   quotient word per loop iteration (R_f[3] first, R_f[0] last) with
   udiv_qrnnd.  X and Y are both modified.  If X >= Y on entry, X is
   first halved and the bit shifted out is saved; otherwise R_e is
   decremented.  On the last iteration a nonzero remainder sets
   _FP_WORK_STICKY in R_f[0].

   NOTE(review): in the X_f[3] == Y_f[3] branch the __FP_FRAC_SUB_4
   call passes words of X both as results and as subtrahend inputs.
   The generic __FP_FRAC_SUB_4 in this file assigns each result word
   before reading the next input word, so already-overwritten words
   are read back.  Confirm whether this is intended before relying on
   that branch.  */

#define _FP_DIV_MEAT_4_udiv(fs, R, X, Y) \
  do \
    { \
      int _FP_DIV_MEAT_4_udiv_i; \
      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_n); \
      _FP_FRAC_DECL_4 (_FP_DIV_MEAT_4_udiv_m); \
      _FP_FRAC_SET_4 (_FP_DIV_MEAT_4_udiv_n, _FP_ZEROFRAC_4); \
      if (_FP_FRAC_GE_4 (X, Y)) \
        { \
          _FP_DIV_MEAT_4_udiv_n_f[3] \
            = X##_f[0] << (_FP_W_TYPE_SIZE - 1); \
          _FP_FRAC_SRL_4 (X, 1); \
        } \
      else \
        R##_e--; \
 \
      /* Normalize, i.e. make the most significant bit of the \
         denominator set.  */ \
      _FP_FRAC_SLL_4 (Y, _FP_WFRACXBITS_##fs); \
 \
      for (_FP_DIV_MEAT_4_udiv_i = 3; ; _FP_DIV_MEAT_4_udiv_i--) \
        { \
          if (X##_f[3] == Y##_f[3]) \
            { \
              /* This is a special case, not an optimization \
                 (X##_f[3]/Y##_f[3] would not fit into UWtype). \
                 As X## is guaranteed to be < Y, \
                 R##_f[_FP_DIV_MEAT_4_udiv_i] can be either \
                 (UWtype)-1 or (UWtype)-2.  */ \
              R##_f[_FP_DIV_MEAT_4_udiv_i] = -1; \
              if (!_FP_DIV_MEAT_4_udiv_i) \
                break; \
              __FP_FRAC_SUB_4 (X##_f[3], X##_f[2], X##_f[1], X##_f[0], \
                               Y##_f[2], Y##_f[1], Y##_f[0], 0, \
                               X##_f[2], X##_f[1], X##_f[0], \
                               _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i]); \
              _FP_FRAC_SUB_4 (X, Y, X); \
              if (X##_f[3] > Y##_f[3]) \
                { \
                  R##_f[_FP_DIV_MEAT_4_udiv_i] = -2; \
                  _FP_FRAC_ADD_4 (X, Y, X); \
                } \
            } \
          else \
            { \
              udiv_qrnnd (R##_f[_FP_DIV_MEAT_4_udiv_i], \
                          X##_f[3], X##_f[3], X##_f[2], Y##_f[3]); \
              umul_ppppmnnn (_FP_DIV_MEAT_4_udiv_m_f[3], \
                             _FP_DIV_MEAT_4_udiv_m_f[2], \
                             _FP_DIV_MEAT_4_udiv_m_f[1], \
                             _FP_DIV_MEAT_4_udiv_m_f[0], \
                             R##_f[_FP_DIV_MEAT_4_udiv_i], \
                             Y##_f[2], Y##_f[1], Y##_f[0]); \
              X##_f[2] = X##_f[1]; \
              X##_f[1] = X##_f[0]; \
              X##_f[0] \
                = _FP_DIV_MEAT_4_udiv_n_f[_FP_DIV_MEAT_4_udiv_i]; \
              if (_FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X)) \
                { \
                  R##_f[_FP_DIV_MEAT_4_udiv_i]--; \
                  _FP_FRAC_ADD_4 (X, Y, X); \
                  if (_FP_FRAC_GE_4 (X, Y) \
                      && _FP_FRAC_GT_4 (_FP_DIV_MEAT_4_udiv_m, X)) \
                    { \
                      R##_f[_FP_DIV_MEAT_4_udiv_i]--; \
                      _FP_FRAC_ADD_4 (X, Y, X); \
                    } \
                } \
              _FP_FRAC_DEC_4 (X, _FP_DIV_MEAT_4_udiv_m); \
              if (!_FP_DIV_MEAT_4_udiv_i) \
                { \
                  if (!_FP_FRAC_EQ_4 (X, _FP_DIV_MEAT_4_udiv_m)) \
                    R##_f[0] |= _FP_WORK_STICKY; \
                  break; \
                } \
            } \
        } \
    } \
  while (0)

/*
 * Square root algorithms:
 * We have just one right now, maybe Newton approximation
 * should be added for those machines where division is fast.
 */

/* Bit-by-bit square root on four-word fractions.  R accumulates the
   result, S and T are scratch fractions, X is the operand (modified:
   reduced and shifted left by one bit per step) and Q is the trial
   bit, which the caller sets for the highest word and which this
   macro modifies.  The four loops handle result words 3, 2, 1 and 0
   in turn; the last one stops when Q reaches _FP_WORK_ROUND.  If X is
   still nonzero afterwards, _FP_WORK_STICKY (and, when X > S,
   _FP_WORK_ROUND) is ORed into R_f[0].  */

#define _FP_SQRT_MEAT_4(R, S, T, X, q) \
  do \
    { \
      while (q) \
        { \
          T##_f[3] = S##_f[3] + q; \
          if (T##_f[3] <= X##_f[3]) \
            { \
              S##_f[3] = T##_f[3] + q; \
              X##_f[3] -= T##_f[3]; \
              R##_f[3] += q; \
            } \
          _FP_FRAC_SLL_4 (X, 1); \
          q >>= 1; \
        } \
      q = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1); \
      while (q) \
        { \
          T##_f[2] = S##_f[2] + q; \
          T##_f[3] = S##_f[3]; \
          if (T##_f[3] < X##_f[3] \
              || (T##_f[3] == X##_f[3] && T##_f[2] <= X##_f[2])) \
            { \
              S##_f[2] = T##_f[2] + q; \
              /* Propagate the carry out of word 2, if any.  */ \
              S##_f[3] += (T##_f[2] > S##_f[2]); \
              __FP_FRAC_DEC_2 (X##_f[3], X##_f[2], \
                               T##_f[3], T##_f[2]); \
              R##_f[2] += q; \
            } \
          _FP_FRAC_SLL_4 (X, 1); \
          q >>= 1; \
        } \
      q = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1); \
      while (q) \
        { \
          T##_f[1] = S##_f[1] + q; \
          T##_f[2] = S##_f[2]; \
          T##_f[3] = S##_f[3]; \
          if (T##_f[3] < X##_f[3] \
              || (T##_f[3] == X##_f[3] \
                  && (T##_f[2] < X##_f[2] \
                      || (T##_f[2] == X##_f[2] \
                          && T##_f[1] <= X##_f[1])))) \
            { \
              S##_f[1] = T##_f[1] + q; \
              S##_f[2] += (T##_f[1] > S##_f[1]); \
              S##_f[3] += (T##_f[2] > S##_f[2]); \
              __FP_FRAC_DEC_3 (X##_f[3], X##_f[2], X##_f[1], \
                               T##_f[3], T##_f[2], T##_f[1]); \
              R##_f[1] += q; \
            } \
          _FP_FRAC_SLL_4 (X, 1); \
          q >>= 1; \
        } \
      q = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1); \
      while (q != _FP_WORK_ROUND) \
        { \
          T##_f[0] = S##_f[0] + q; \
          T##_f[1] = S##_f[1]; \
          T##_f[2] = S##_f[2]; \
          T##_f[3] = S##_f[3]; \
          if (_FP_FRAC_GE_4 (X, T)) \
            { \
              S##_f[0] = T##_f[0] + q; \
              S##_f[1] += (T##_f[0] > S##_f[0]); \
              S##_f[2] += (T##_f[1] > S##_f[1]); \
              S##_f[3] += (T##_f[2] > S##_f[2]); \
              _FP_FRAC_DEC_4 (X, T); \
              R##_f[0] += q; \
            } \
          _FP_FRAC_SLL_4 (X, 1); \
          q >>= 1; \
        } \
      if (!_FP_FRAC_ZEROP_4 (X)) \
        { \
          if (_FP_FRAC_GT_4 (X, S)) \
            R##_f[0] |= _FP_WORK_ROUND; \
          R##_f[0] |= _FP_WORK_STICKY; \
        } \
    } \
  while (0)

/*
 * Internals
 */

/* Store the four words I3 (highest) .. I0 (lowest) into the fraction
   X.  Expands to a single comma expression.  */
#define __FP_FRAC_SET_4(X, I3, I2, I1, I0) \
  (X##_f[3] = I3, X##_f[2] = I2, X##_f[1] = I1, X##_f[0] = I0)

/* Generic three-word addition r2:r1:r0 = x2:x1:x0 + y2:y1:y0, used
   unless a definition of __FP_FRAC_ADD_3 already exists.  A carry out
   of a word is detected by the unsigned sum being smaller than an
   addend.  Any carry out of the top word is discarded.  */
#ifndef __FP_FRAC_ADD_3
# define __FP_FRAC_ADD_3(r2, r1, r0, x2, x1, x0, y2, y1, y0) \
  do \
    { \
      _FP_W_TYPE __FP_FRAC_ADD_3_c1, __FP_FRAC_ADD_3_c2; \
      r0 = x0 + y0; \
      __FP_FRAC_ADD_3_c1 = r0 < x0; \
      r1 = x1 + y1; \
      __FP_FRAC_ADD_3_c2 = r1 < x1; \
      r1 += __FP_FRAC_ADD_3_c1; \
      /* Adding the incoming carry may itself wrap r1 to zero.  */ \
      __FP_FRAC_ADD_3_c2 |= r1 < __FP_FRAC_ADD_3_c1; \
      r2 = x2 + y2 + __FP_FRAC_ADD_3_c2; \
    } \
  while (0)
#endif

/* Generic four-word addition r3:r2:r1:r0 = x3:x2:x1:x0 + y3:y2:y1:y0,
   used unless a definition of __FP_FRAC_ADD_4 already exists.  A
   carry out of a word is detected by the unsigned sum being smaller
   than an addend.  Any carry out of the top word is discarded.  */
#ifndef __FP_FRAC_ADD_4
# define __FP_FRAC_ADD_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0) \
  do \
    { \
      _FP_W_TYPE __FP_FRAC_ADD_4_c1, __FP_FRAC_ADD_4_c2; \
      _FP_W_TYPE __FP_FRAC_ADD_4_c3; \
      r0 = x0 + y0; \
      __FP_FRAC_ADD_4_c1 = r0 < x0; \
      r1 = x1 + y1; \
      __FP_FRAC_ADD_4_c2 = r1 < x1; \
      r1 += __FP_FRAC_ADD_4_c1; \
      /* Adding the incoming carry may itself wrap the word.  */ \
      __FP_FRAC_ADD_4_c2 |= r1 < __FP_FRAC_ADD_4_c1; \
      r2 = x2 + y2; \
      __FP_FRAC_ADD_4_c3 = r2 < x2; \
      r2 += __FP_FRAC_ADD_4_c2; \
      __FP_FRAC_ADD_4_c3 |= r2 < __FP_FRAC_ADD_4_c2; \
      r3 = x3 + y3 + __FP_FRAC_ADD_4_c3; \
    } \
  while (0)
#endif

/* Generic three-word subtraction r2:r1:r0 = x2:x1:x0 - y2:y1:y0, used
   unless a definition of __FP_FRAC_SUB_3 already exists.  A borrow
   out of a word is detected by the unsigned difference being larger
   than the minuend.  Any borrow out of the top word is discarded.  */
#ifndef __FP_FRAC_SUB_3
# define __FP_FRAC_SUB_3(r2, r1, r0, x2, x1, x0, y2, y1, y0) \
  do \
    { \
      _FP_W_TYPE __FP_FRAC_SUB_3_c1, __FP_FRAC_SUB_3_c2; \
      r0 = x0 - y0; \
      __FP_FRAC_SUB_3_c1 = r0 > x0; \
      r1 = x1 - y1; \
      __FP_FRAC_SUB_3_c2 = r1 > x1; \
      r1 -= __FP_FRAC_SUB_3_c1; \
      /* The incoming borrow propagates only when x1 - y1 was 0.  */ \
      __FP_FRAC_SUB_3_c2 |= __FP_FRAC_SUB_3_c1 && (y1 == x1); \
      r2 = x2 - y2 - __FP_FRAC_SUB_3_c2; \
    } \
  while (0)
#endif

/* Generic four-word subtraction
   r3:r2:r1:r0 = x3:x2:x1:x0 - y3:y2:y1:y0, used unless a definition
   of __FP_FRAC_SUB_4 already exists.  A borrow out of a word is
   detected by the unsigned difference being larger than the minuend.
   Any borrow out of the top word is discarded.  Each result word is
   assigned before the next higher input words are read, so results
   that alias higher-order inputs change what is read.  */
#ifndef __FP_FRAC_SUB_4
# define __FP_FRAC_SUB_4(r3, r2, r1, r0, x3, x2, x1, x0, y3, y2, y1, y0) \
  do \
    { \
      _FP_W_TYPE __FP_FRAC_SUB_4_c1, __FP_FRAC_SUB_4_c2; \
      _FP_W_TYPE __FP_FRAC_SUB_4_c3; \
      r0 = x0 - y0; \
      __FP_FRAC_SUB_4_c1 = r0 > x0; \
      r1 = x1 - y1; \
      __FP_FRAC_SUB_4_c2 = r1 > x1; \
      r1 -= __FP_FRAC_SUB_4_c1; \
      /* The incoming borrow propagates only when x1 - y1 was 0.  */ \
      __FP_FRAC_SUB_4_c2 |= __FP_FRAC_SUB_4_c1 && (y1 == x1); \
      r2 = x2 - y2; \
      __FP_FRAC_SUB_4_c3 = r2 > x2; \
      r2 -= __FP_FRAC_SUB_4_c2; \
      __FP_FRAC_SUB_4_c3 |= __FP_FRAC_SUB_4_c2 && (y2 == x2); \
      r3 = x3 - y3 - __FP_FRAC_SUB_4_c3; \
    } \
  while (0)
#endif

/* Generic in-place three-word subtraction x2:x1:x0 -= y2:y1:y0, used
   unless a definition of __FP_FRAC_DEC_3 already exists.  The
   minuend is first copied into temporaries so that __FP_FRAC_SUB_3
   never reads a word of X after it has been overwritten.  */
#ifndef __FP_FRAC_DEC_3
# define __FP_FRAC_DEC_3(x2, x1, x0, y2, y1, y0) \
  do \
    { \
      UWtype __FP_FRAC_DEC_3_t0, __FP_FRAC_DEC_3_t1; \
      UWtype __FP_FRAC_DEC_3_t2; \
      __FP_FRAC_DEC_3_t0 = x0; \
      __FP_FRAC_DEC_3_t1 = x1; \
      __FP_FRAC_DEC_3_t2 = x2; \
      __FP_FRAC_SUB_3 (x2, x1, x0, __FP_FRAC_DEC_3_t2, \
                       __FP_FRAC_DEC_3_t1, __FP_FRAC_DEC_3_t0, \
                       y2, y1, y0); \
    } \
  while (0)
#endif

/* Generic in-place four-word subtraction x3:x2:x1:x0 -= y3:y2:y1:y0,
   used unless a definition of __FP_FRAC_DEC_4 already exists.  The
   minuend is first copied into temporaries so that __FP_FRAC_SUB_4
   never reads a word of X after it has been overwritten.  */
#ifndef __FP_FRAC_DEC_4
# define __FP_FRAC_DEC_4(x3, x2, x1, x0, y3, y2, y1, y0) \
  do \
    { \
      UWtype __FP_FRAC_DEC_4_t0, __FP_FRAC_DEC_4_t1; \
      UWtype __FP_FRAC_DEC_4_t2, __FP_FRAC_DEC_4_t3; \
      __FP_FRAC_DEC_4_t0 = x0; \
      __FP_FRAC_DEC_4_t1 = x1; \
      __FP_FRAC_DEC_4_t2 = x2; \
      __FP_FRAC_DEC_4_t3 = x3; \
      __FP_FRAC_SUB_4 (x3, x2, x1, x0, __FP_FRAC_DEC_4_t3, \
                       __FP_FRAC_DEC_4_t2, __FP_FRAC_DEC_4_t1, \
                       __FP_FRAC_DEC_4_t0, y3, y2, y1, y0); \
    } \
  while (0)
#endif

/* Generic in-place addition of the single word I to the four-word
   number x3:x2:x1:x0, used unless a definition of __FP_FRAC_ADDI_4
   already exists.  The carry is rippled upwards one word at a time;
   a carry out of x3 is discarded.  I is evaluated more than once.  */
#ifndef __FP_FRAC_ADDI_4
# define __FP_FRAC_ADDI_4(x3, x2, x1, x0, i) \
  do \
    { \
      UWtype __FP_FRAC_ADDI_4_t; \
      __FP_FRAC_ADDI_4_t = ((x0 += i) < i); \
      x1 += __FP_FRAC_ADDI_4_t; \
      /* x1 wrapped iff it is now smaller than the carry added.  */ \
      __FP_FRAC_ADDI_4_t = (x1 < __FP_FRAC_ADDI_4_t); \
      x2 += __FP_FRAC_ADDI_4_t; \
      __FP_FRAC_ADDI_4_t = (x2 < __FP_FRAC_ADDI_4_t); \
      x3 += __FP_FRAC_ADDI_4_t; \
    } \
  while (0)
#endif

/* Convert FP values between word sizes.  This appears to be more
 * complicated than I'd have expected it to be, so these might be
 * wrong...  These macros are in any case somewhat bogus because they
 * use information about what various FRAC_n variables look like
 * internally [eg, that 2 word vars are X_f0 and X_f1].  But so do
 * the ones in op-2.h and op-1.h.
 */

/* Copy the lowest word of the four-word fraction S into the one-word
   fraction D; the upper three words of S are dropped.  */
#define _FP_FRAC_COPY_1_4(D, S) (D##_f = S##_f[0])

/* Copy the two lowest words of the four-word fraction S into the
   two-word fraction D; the upper two words of S are dropped.  */
#define _FP_FRAC_COPY_2_4(D, S) \
  do \
    { \
      D##_f0 = S##_f[0]; \
      D##_f1 = S##_f[1]; \
    } \
  while (0)

/* Assembly/disassembly for converting to/from integral types.
 * No shifting or overflow handled here.
 */

/* Put the FP value X into r, which is an integer of size rsize.
   Depending on RSIZE, the lowest one, two or all four words of X are
   concatenated into R.  Every shift is wrapped in a
   "rsize <= _FP_W_TYPE_SIZE ? 0 : ..." guard -- presumably so that a
   one-word R is never shifted by its full width in the branches that
   are compiled but not taken; confirm before simplifying.  */
#define _FP_FRAC_ASSEMBLE_4(r, X, rsize) \
  do \
    { \
      if (rsize <= _FP_W_TYPE_SIZE) \
        r = X##_f[0]; \
      else if (rsize <= 2*_FP_W_TYPE_SIZE) \
        { \
          r = X##_f[1]; \
          r = (rsize <= _FP_W_TYPE_SIZE ? 0 : r << _FP_W_TYPE_SIZE); \
          r += X##_f[0]; \
        } \
      else \
        { \
          /* I'm feeling lazy so we deal with int == 3words (implausible)*/ \
          /* and int == 4words as a single case.  */ \
          r = X##_f[3]; \
          r = (rsize <= _FP_W_TYPE_SIZE ? 0 : r << _FP_W_TYPE_SIZE); \
          r += X##_f[2]; \
          r = (rsize <= _FP_W_TYPE_SIZE ? 0 : r << _FP_W_TYPE_SIZE); \
          r += X##_f[1]; \
          r = (rsize <= _FP_W_TYPE_SIZE ? 0 : r << _FP_W_TYPE_SIZE); \
          r += X##_f[0]; \
        } \
    } \
  while (0)

/* "No disassemble Number Five!" */
/* move an integer of size rsize into X's fractional part.  We rely on
 * the _f[] array consisting of words of size _FP_W_TYPE_SIZE to avoid
 * having to mask the values we store into it.
 *
 * Word k of X receives r >> (k * _FP_W_TYPE_SIZE), or 0 when RSIZE
 * is at most k words, so R is never shifted by more than its size.
 */
#define _FP_FRAC_DISASSEMBLE_4(X, r, rsize) \
  do \
    { \
      X##_f[0] = r; \
      X##_f[1] = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \
      X##_f[2] = (rsize <= 2*_FP_W_TYPE_SIZE ? 0 : r >> 2*_FP_W_TYPE_SIZE); \
      X##_f[3] = (rsize <= 3*_FP_W_TYPE_SIZE ? 0 : r >> 3*_FP_W_TYPE_SIZE); \
    } \
  while (0)

/* Widen the one-word fraction S into the four-word fraction D: the
   word goes into D_f[0] and the upper three words are zeroed.  */
#define _FP_FRAC_COPY_4_1(D, S) \
  do \
    { \
      D##_f[0] = S##_f; \
      D##_f[1] = D##_f[2] = D##_f[3] = 0; \
    } \
  while (0)

/* Widen the two-word fraction S into the four-word fraction D: the
   words go into D_f[0] and D_f[1] and the upper two words are
   zeroed.  */
#define _FP_FRAC_COPY_4_2(D, S) \
  do \
    { \
      D##_f[0] = S##_f0; \
      D##_f[1] = S##_f1; \
      D##_f[2] = D##_f[3] = 0; \
    } \
  while (0)

/* Four-word to four-word copy; same as _FP_FRAC_COPY_4.  */
#define _FP_FRAC_COPY_4_4(D, S) _FP_FRAC_COPY_4 (D, S)