1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
4 2004, 2005 Free Software Foundation, Inc.
6 This file is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or (at your
9 option) any later version.
11 This file is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with this file; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 MA 02110-1301, USA. */
21 /* You have to define the following before including this file:
23 UWtype -- An unsigned type, default type for operations (typically a "word")
24 UHWtype -- An unsigned type, at least half the size of UWtype.
25 UDWtype -- An unsigned type, at least twice as large as UWtype
26 W_TYPE_SIZE -- size in bits of UWtype
28 SItype, USItype -- Signed and unsigned 32 bit types.
29 DItype, UDItype -- Signed and unsigned 64 bit types.
31 On a 32 bit machine UWtype should typically be USItype;
32 on a 64 bit machine, UWtype should typically be UDItype. */
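/* Illustrative sketch only, not part of the original header: on a typical
   LP64 host the including project might supply definitions along these
   lines before including this file.  The exact type choices, and whether a
   128-bit UDWtype is available, depend on the compiler and are assumptions
   here.  */
#if 0
typedef int SItype;
typedef unsigned int USItype;
typedef long DItype;
typedef unsigned long UDItype;
#define UWtype UDItype
#define UHWtype USItype
#define UDWtype unsigned __int128   /* assumes a gcc-style 128-bit type */
#define W_TYPE_SIZE 64
#endif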
35 #define __BITS4 (W_TYPE_SIZE / 4)
36 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
37 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
38 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
40 /* This is used to make sure no undesirable sharing between different libraries
41 that use this file takes place. */
42 #ifndef __MPN
43 #define __MPN(x) __##x
44 #endif
46 #ifndef _PROTO
47 #if (__STDC__-0) || defined (__cplusplus)
48 #define _PROTO(x) x
49 #else
50 #define _PROTO(x) ()
51 #endif
52 #endif
54 /* Define auxiliary asm macros.
56 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
57 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two-word
58 UWtype product in HIGH_PROD and LOW_PROD.
60 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
61 UDWtype product. This is just a variant of umul_ppmm.
63 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
64 denominator) divides a UDWtype, composed by the UWtype integers
65 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
66 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
67 than DENOMINATOR for correct operation. If the macro additionally requires
68 the most significant bit of DENOMINATOR to be 1, then the pre-processor symbol
69 UDIV_NEEDS_NORMALIZATION is defined to 1.
71 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
73 is rounded towards 0.
75 5) count_leading_zeros(count, x) counts the number of zero-bits from the
76 msb to the first non-zero bit in the UWtype X. This is the number of
77 steps X needs to be shifted left to set the msb. Undefined for X == 0,
78 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
80 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
81 from the least significant end.
83 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
84 high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
85 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
86 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
87 (i.e. carry out) is not stored anywhere, and is lost.
89 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
90 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
91 composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
92 LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
93 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
94 and is lost.
96 If any of these macros are left undefined for a particular CPU,
97 C macros are used.
100 Notes:
102 For add_ssaaaa the two high and two low addends can both commute, but
103 unfortunately gcc only supports one "%" commutative in each asm block.
104 This has always been so but is only documented in recent versions
105 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
106 compiler error in certain rare circumstances.
108 Apparently it was only the last "%" that was ever actually respected, so
109 the code has been updated to leave just that. Clearly there's a free
110 choice whether high or low should get it, if there's a reason to favour
111 one over the other. Also obviously when the constraints on the two
112 operands are identical there's no benefit to the reloader in any "%" at
113 all. */
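/* Illustrative usage sketch, not part of the original header: a caller
   typically combines these macros along the following lines, e.g. to
   multiply two words and accumulate the double-word product into a running
   two-word total (the helper name is hypothetical).  */
#if 0
static void
example_addmul_1 (UWtype *hi, UWtype *lo, UWtype a, UWtype b)
{
  UWtype ph, pl;
  umul_ppmm (ph, pl, a, b);                  /* {ph,pl} = a * b */
  add_ssaaaa (*hi, *lo, *hi, *lo, ph, pl);   /* {hi,lo} += {ph,pl}; carry out is lost */
}
#endif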
117 /* The CPUs come in alphabetical order below.
119 Please add support for more CPUs here, or improve the current support
120 for the CPUs below! */
123 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
124 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
125 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
126 __builtin_ctzll.
128 These builtins are only used where we have checked what code comes out; on
129 some chips they're merely libgcc calls, in which case we instead want an
130 inline (either asm or generic C).
132 These builtins are better than an asm block of the same insn, since an
133 asm block doesn't give gcc any information about scheduling or resource
134 usage. We keep an asm block for use on prior versions of gcc though.
136 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
137 it's not used (for count_leading_zeros) because it generally gives extra
138 code to ensure the result is 0 when the input is 0, which we don't need
139 or want. */
141 #ifdef _LONG_LONG_LIMB
142 #define count_leading_zeros_gcc_clz(count,x) \
143 do { \
144 ASSERT ((x) != 0); \
145 (count) = __builtin_clzll (x); \
146 } while (0)
147 #else
148 #define count_leading_zeros_gcc_clz(count,x) \
149 do { \
150 ASSERT ((x) != 0); \
151 (count) = __builtin_clzl (x); \
152 } while (0)
153 #endif
155 #ifdef _LONG_LONG_LIMB
156 #define count_trailing_zeros_gcc_ctz(count,x) \
157 do { \
158 ASSERT ((x) != 0); \
159 (count) = __builtin_ctzll (x); \
160 } while (0)
161 #else
162 #define count_trailing_zeros_gcc_ctz(count,x) \
163 do { \
164 ASSERT ((x) != 0); \
165 (count) = __builtin_ctzl (x); \
166 } while (0)
167 #endif
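/* Illustrative sketch, not part of the original header: the relation between
   the clz wrapper above and floor(log2(x)) for nonzero x (the helper name is
   hypothetical).  */
#if 0
static int
example_floor_log2 (UWtype x)
{
  int c;
  count_leading_zeros_gcc_clz (c, x);   /* x must be nonzero, see ASSERT above */
  return W_TYPE_SIZE - 1 - c;
}
#endif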
170 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
171 don't need to be under !NO_ASM */
172 #if ! defined (NO_ASM)
174 #if defined (__alpha) && W_TYPE_SIZE == 64
175 /* Most alpha-based machines, except Cray systems. */
176 #if defined (__GNUC__)
177 #define umul_ppmm(ph, pl, m0, m1) \
178 do { \
179 UDItype __m0 = (m0), __m1 = (m1); \
180 __asm__ ("umulh %r1,%2,%0" \
181 : "=r" (ph) \
182 : "%rJ" (m0), "rI" (m1)); \
183 (pl) = __m0 * __m1; \
184 } while (0)
185 #define UMUL_TIME 18
186 #else /* ! __GNUC__ */
187 #include <machine/builtins.h>
188 #define umul_ppmm(ph, pl, m0, m1) \
189 do { \
190 UDItype __m0 = (m0), __m1 = (m1); \
191 (ph) = __UMULH (m0, m1); \
192 (pl) = __m0 * __m1; \
193 } while (0)
194 #endif
195 #ifndef LONGLONG_STANDALONE
196 #define udiv_qrnnd(q, r, n1, n0, d) \
197 do { UWtype __di; \
198 __di = __MPN(invert_limb) (d); \
199 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
200 } while (0)
201 #define UDIV_PREINV_ALWAYS 1
202 #define UDIV_NEEDS_NORMALIZATION 1
203 #define UDIV_TIME 220
204 #endif /* LONGLONG_STANDALONE */
206 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
207 always goes into libgmp.so, even when not actually used. */
208 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
210 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
211 #define count_leading_zeros(COUNT,X) \
212 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
213 #define count_trailing_zeros(COUNT,X) \
214 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
215 #endif /* clz/ctz using cix */
217 #if ! defined (count_leading_zeros) \
218 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
219 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
220 "$31" is written explicitly in the asm, since an "r" constraint won't
221 select reg 31. There seems no need to worry about "r31" syntax for cray,
222 since gcc itself (pre-release 3.4) emits just $31 in various places. */
223 #define ALPHA_CMPBGE_0(dst, src) \
224 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
225 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
226 them, locating the highest non-zero byte. A second __clz_tab lookup
227 counts the leading zero bits in that byte, giving the result. */
228 #define count_leading_zeros(count, x) \
229 do { \
230 UWtype __clz__b, __clz__c, __clz__x = (x); \
231 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
232 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
233 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
234 __clz__x >>= __clz__b; \
235 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
236 __clz__b = 65 - __clz__b; \
237 (count) = __clz__b - __clz__c; \
238 } while (0)
239 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
240 #endif /* clz using cmpbge */
242 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
243 #if HAVE_ATTRIBUTE_CONST
244 long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
245 #else
246 long __MPN(count_leading_zeros) _PROTO ((UDItype));
247 #endif
248 #define count_leading_zeros(count, x) \
249 ((count) = __MPN(count_leading_zeros) (x))
250 #endif /* clz using mpn */
251 #endif /* __alpha */
253 #if defined (_CRAY) && W_TYPE_SIZE == 64
254 #include <intrinsics.h>
255 #define UDIV_PREINV_ALWAYS 1
256 #define UDIV_NEEDS_NORMALIZATION 1
257 #define UDIV_TIME 220
258 long __MPN(count_leading_zeros) _PROTO ((UDItype));
259 #define count_leading_zeros(count, x) \
260 ((count) = _leadz ((UWtype) (x)))
261 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
262 #define umul_ppmm(ph, pl, m0, m1) \
263 do { \
264 UDItype __m0 = (m0), __m1 = (m1); \
265 (ph) = _int_mult_upper (m0, m1); \
266 (pl) = __m0 * __m1; \
267 } while (0)
268 #ifndef LONGLONG_STANDALONE
269 #define udiv_qrnnd(q, r, n1, n0, d) \
270 do { UWtype __di; \
271 __di = __MPN(invert_limb) (d); \
272 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
273 } while (0)
274 #endif /* LONGLONG_STANDALONE */
275 #endif /* _CRAYIEEE */
276 #endif /* _CRAY */
278 #if defined (__ia64) && W_TYPE_SIZE == 64
279 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
280 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
281 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
282 register, which takes an extra cycle. */
283 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
284 do { \
285 UWtype __x; \
286 __x = (al) - (bl); \
287 if ((al) < (bl)) \
288 (sh) = (ah) - (bh) - 1; \
289 else \
290 (sh) = (ah) - (bh); \
291 (sl) = __x; \
292 } while (0)
293 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
294 /* Do both product parts in assembly, since that gives better code with
295 all gcc versions. Some callers will just use the upper part, and in
296 that situation we waste an instruction, but not any cycles. */
297 #define umul_ppmm(ph, pl, m0, m1) \
298 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
299 : "=&f" (ph), "=f" (pl) \
300 : "f" (m0), "f" (m1))
301 #define UMUL_TIME 14
302 #define count_leading_zeros(count, x) \
303 do { \
304 UWtype _x = (x), _y, _a, _c; \
305 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
306 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
307 _c = (_a - 1) << 3; \
308 _x >>= _c; \
309 if (_x >= 1 << 4) \
310 _x >>= 4, _c += 4; \
311 if (_x >= 1 << 2) \
312 _x >>= 2, _c += 2; \
313 _c += _x >> 1; \
314 (count) = W_TYPE_SIZE - 1 - _c; \
315 } while (0)
316 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
317 based, and we don't need a special case for x==0 here */
318 #define count_trailing_zeros(count, x) \
319 do { \
320 UWtype __ctz_x = (x); \
321 __asm__ ("popcnt %0 = %1" \
322 : "=r" (count) \
323 : "r" ((__ctz_x-1) & ~__ctz_x)); \
324 } while (0)
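/* Worked example (illustrative): for x ending in binary ...1000, the mask
   (x-1) & ~x is ...0111, whose population count is 3, the number of trailing
   zero bits of x.  In general, if the lowest set bit of x is at position k,
   the mask has exactly k one bits, so popcnt returns k directly.  */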
325 #endif
326 #if defined (__INTEL_COMPILER)
327 #include <ia64intrin.h>
328 #define umul_ppmm(ph, pl, m0, m1) \
329 do { \
330 UWtype _m0 = (m0), _m1 = (m1); \
331 ph = _m64_xmahu (_m0, _m1, 0); \
332 pl = _m0 * _m1; \
333 } while (0)
334 #endif
335 #ifndef LONGLONG_STANDALONE
336 #define udiv_qrnnd(q, r, n1, n0, d) \
337 do { UWtype __di; \
338 __di = __MPN(invert_limb) (d); \
339 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
340 } while (0)
341 #define UDIV_PREINV_ALWAYS 1
342 #define UDIV_NEEDS_NORMALIZATION 1
343 #endif
344 #define UDIV_TIME 220
345 #endif
348 #if defined (__GNUC__)
350 /* We sometimes need to clobber "cc" with gcc2, but that would not be
351 understood by gcc1. Use cpp to avoid major code duplication. */
352 #if __GNUC__ < 2
353 #define __CLOBBER_CC
354 #define __AND_CLOBBER_CC
355 #else /* __GNUC__ >= 2 */
356 #define __CLOBBER_CC : "cc"
357 #define __AND_CLOBBER_CC , "cc"
358 #endif /* __GNUC__ < 2 */
360 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
361 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
362 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
363 : "=r" (sh), "=&r" (sl) \
364 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
365 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
366 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
367 : "=r" (sh), "=&r" (sl) \
368 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
369 #define umul_ppmm(xh, xl, m0, m1) \
370 do { \
371 USItype __m0 = (m0), __m1 = (m1); \
372 __asm__ ("multiplu %0,%1,%2" \
373 : "=r" (xl) \
374 : "r" (__m0), "r" (__m1)); \
375 __asm__ ("multmu %0,%1,%2" \
376 : "=r" (xh) \
377 : "r" (__m0), "r" (__m1)); \
378 } while (0)
379 #define udiv_qrnnd(q, r, n1, n0, d) \
380 __asm__ ("dividu %0,%3,%4" \
381 : "=r" (q), "=q" (r) \
382 : "1" (n1), "r" (n0), "r" (d))
383 #define count_leading_zeros(count, x) \
384 __asm__ ("clz %0,%1" \
385 : "=r" (count) \
386 : "r" (x))
387 #define COUNT_LEADING_ZEROS_0 32
388 #endif /* __a29k__ */
390 #if defined (__arc__)
391 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
392 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
393 : "=r" (sh), \
394 "=&r" (sl) \
395 : "r" ((USItype) (ah)), \
396 "rIJ" ((USItype) (bh)), \
397 "%r" ((USItype) (al)), \
398 "rIJ" ((USItype) (bl)))
399 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
400 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
401 : "=r" (sh), \
402 "=&r" (sl) \
403 : "r" ((USItype) (ah)), \
404 "rIJ" ((USItype) (bh)), \
405 "r" ((USItype) (al)), \
406 "rIJ" ((USItype) (bl)))
407 #endif
409 #if defined (__arm__) && W_TYPE_SIZE == 32
410 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
411 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
412 : "=r" (sh), "=&r" (sl) \
413 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
414 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
415 do { \
416 if (__builtin_constant_p (al)) \
417 { \
418 if (__builtin_constant_p (ah)) \
419 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
420 : "=r" (sh), "=&r" (sl) \
421 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
422 else \
423 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
424 : "=r" (sh), "=&r" (sl) \
425 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
426 } \
427 else if (__builtin_constant_p (ah)) \
428 { \
429 if (__builtin_constant_p (bl)) \
430 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
431 : "=r" (sh), "=&r" (sl) \
432 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
433 else \
434 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
435 : "=r" (sh), "=&r" (sl) \
436 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
437 } \
438 else if (__builtin_constant_p (bl)) \
439 { \
440 if (__builtin_constant_p (bh)) \
441 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
442 : "=r" (sh), "=&r" (sl) \
443 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
444 else \
445 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
446 : "=r" (sh), "=&r" (sl) \
447 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
448 } \
449 else /* only bh might be a constant */ \
450 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
451 : "=r" (sh), "=&r" (sl) \
452 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
453 } while (0)
454 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
455 #define umul_ppmm(xh, xl, a, b) \
456 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
457 #define UMUL_TIME 5
458 #define smul_ppmm(xh, xl, a, b) \
459 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
460 #ifndef LONGLONG_STANDALONE
461 #define udiv_qrnnd(q, r, n1, n0, d) \
462 do { UWtype __di; \
463 __di = __MPN(invert_limb) (d); \
464 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
465 } while (0)
466 #define UDIV_PREINV_ALWAYS 1
467 #define UDIV_NEEDS_NORMALIZATION 1
468 #define UDIV_TIME 70
469 #endif /* LONGLONG_STANDALONE */
470 #else
471 #define umul_ppmm(xh, xl, a, b) \
472 __asm__ ("%@ Inlined umul_ppmm\n" \
473 " mov %|r0, %2, lsr #16\n" \
474 " mov %|r2, %3, lsr #16\n" \
475 " bic %|r1, %2, %|r0, lsl #16\n" \
476 " bic %|r2, %3, %|r2, lsl #16\n" \
477 " mul %1, %|r1, %|r2\n" \
478 " mul %|r2, %|r0, %|r2\n" \
479 " mul %|r1, %0, %|r1\n" \
480 " mul %0, %|r0, %0\n" \
481 " adds %|r1, %|r2, %|r1\n" \
482 " addcs %0, %0, #65536\n" \
483 " adds %1, %1, %|r1, lsl #16\n" \
484 " adc %0, %0, %|r1, lsr #16" \
485 : "=&r" (xh), "=r" (xl) \
486 : "r" (a), "r" (b) \
487 : "r0", "r1", "r2")
488 #define UMUL_TIME 20
489 #ifndef LONGLONG_STANDALONE
490 #define udiv_qrnnd(q, r, n1, n0, d) \
491 do { UWtype __r; \
492 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
493 (r) = __r; \
494 } while (0)
495 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
496 #define UDIV_TIME 200
497 #endif /* LONGLONG_STANDALONE */
498 #endif
499 #endif /* __arm__ */
501 #if defined (__clipper__) && W_TYPE_SIZE == 32
502 #define umul_ppmm(w1, w0, u, v) \
503 ({union {UDItype __ll; \
504 struct {USItype __l, __h;} __i; \
505 } __x; \
506 __asm__ ("mulwux %2,%0" \
507 : "=r" (__x.__ll) \
508 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
509 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
510 #define smul_ppmm(w1, w0, u, v) \
511 ({union {DItype __ll; \
512 struct {SItype __l, __h;} __i; \
513 } __x; \
514 __asm__ ("mulwx %2,%0" \
515 : "=r" (__x.__ll) \
516 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
517 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
518 #define __umulsidi3(u, v) \
519 ({UDItype __w; \
520 __asm__ ("mulwux %2,%0" \
521 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
522 __w; })
523 #endif /* __clipper__ */
525 /* Fujitsu vector computers. */
526 #if defined (__uxp__) && W_TYPE_SIZE == 32
527 #define umul_ppmm(ph, pl, u, v) \
528 do { \
529 union {UDItype __ll; \
530 struct {USItype __h, __l;} __i; \
531 } __x; \
532 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
533 (ph) = __x.__i.__h; \
534 (pl) = __x.__i.__l; \
535 } while (0)
536 #define smul_ppmm(ph, pl, u, v) \
537 do { \
538 union {UDItype __ll; \
539 struct {USItype __h, __l;} __i; \
540 } __x; \
541 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
542 (ph) = __x.__i.__h; \
543 (pl) = __x.__i.__l; \
544 } while (0)
545 #endif
547 #if defined (__gmicro__) && W_TYPE_SIZE == 32
548 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
549 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
550 : "=g" (sh), "=&g" (sl) \
551 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
552 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
553 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
554 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
555 : "=g" (sh), "=&g" (sl) \
556 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
557 "1" ((USItype)(al)), "g" ((USItype)(bl)))
558 #define umul_ppmm(ph, pl, m0, m1) \
559 __asm__ ("mulx %3,%0,%1" \
560 : "=g" (ph), "=r" (pl) \
561 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
562 #define udiv_qrnnd(q, r, nh, nl, d) \
563 __asm__ ("divx %4,%0,%1" \
564 : "=g" (q), "=r" (r) \
565 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
566 #define count_leading_zeros(count, x) \
567 __asm__ ("bsch/1 %1,%0" \
568 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
569 #endif
571 #if defined (__hppa) && W_TYPE_SIZE == 32
572 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
573 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
574 : "=r" (sh), "=&r" (sl) \
575 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
578 : "=r" (sh), "=&r" (sl) \
579 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
580 #if defined (_PA_RISC1_1)
581 #define umul_ppmm(wh, wl, u, v) \
582 do { \
583 union {UDItype __ll; \
584 struct {USItype __h, __l;} __i; \
585 } __x; \
586 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
587 (wh) = __x.__i.__h; \
588 (wl) = __x.__i.__l; \
589 } while (0)
590 #define UMUL_TIME 8
591 #define UDIV_TIME 60
592 #else
593 #define UMUL_TIME 40
594 #define UDIV_TIME 80
595 #endif
596 #define count_leading_zeros(count, x) \
597 do { \
598 USItype __tmp; \
599 __asm__ ( \
600 "ldi 1,%0\n" \
601 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
602 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
603 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
604 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
605 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
606 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
607 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
608 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
609 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
610 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
611 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
612 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
613 " extru %1,30,1,%1 ; Extract bit 1.\n" \
614 " sub %0,%1,%0 ; Subtract it.\n" \
615 : "=r" (count), "=r" (__tmp) : "1" (x)); \
616 } while (0)
617 #endif /* hppa */
619 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
620 (3.2) puts long long into two adjacent 32-bit registers. Presumably this
621 is just a case of no direct support for 2.0n but treating it like 1.0. */
622 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
623 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
624 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
625 : "=r" (sh), "=&r" (sl) \
626 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
627 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
628 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
629 : "=r" (sh), "=&r" (sl) \
630 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
631 #endif /* hppa */
633 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
634 #define smul_ppmm(xh, xl, m0, m1) \
635 do { \
636 union {DItype __ll; \
637 struct {USItype __h, __l;} __i; \
638 } __x; \
639 __asm__ ("lr %N0,%1\n\tmr %0,%2" \
640 : "=&r" (__x.__ll) \
641 : "r" (m0), "r" (m1)); \
642 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
643 } while (0)
644 #define sdiv_qrnnd(q, r, n1, n0, d) \
645 do { \
646 union {DItype __ll; \
647 struct {USItype __h, __l;} __i; \
648 } __x; \
649 __x.__i.__h = n1; __x.__i.__l = n0; \
650 __asm__ ("dr %0,%2" \
651 : "=r" (__x.__ll) \
652 : "0" (__x.__ll), "r" (d)); \
653 (q) = __x.__i.__l; (r) = __x.__i.__h; \
654 } while (0)
655 #endif
657 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
658 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
659 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
660 : "=r" (sh), "=&r" (sl) \
661 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
662 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
663 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
664 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
665 : "=r" (sh), "=&r" (sl) \
666 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
667 "1" ((USItype)(al)), "g" ((USItype)(bl)))
668 #define umul_ppmm(w1, w0, u, v) \
669 __asm__ ("mull %3" \
670 : "=a" (w0), "=d" (w1) \
671 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
672 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
673 __asm__ ("divl %4" /* stringification in K&R C */ \
674 : "=a" (q), "=d" (r) \
675 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
677 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
678 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
679 significant 1 bit is, hence the use of the following alternatives. bsfl
680 is slow too, between 18 and 42 depending where the least significant 1
681 bit is, so let the generic count_trailing_zeros below make use of the
682 count_leading_zeros here too. */
684 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
685 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
686 cache miss reading from __clz_tab. For P55 it's favoured over the float
687 below so as to avoid mixing MMX and x87, since the penalty for switching
688 between the two is about 100 cycles.
690 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
691 16, -1 for 8, or 0 otherwise. This could be written equivalently as
692 follows, but as of gcc 2.95.2 it results in conditional jumps.
694 __shift = -(__n < 0x1000000);
695 __shift -= (__n < 0x10000);
696 __shift -= (__n < 0x100);
698 The middle two sbbl and cmpl's pair, and with luck something gcc
699 generates might pair with the first cmpl and the last sbbl. The "32+1"
700 constant could be folded into __clz_tab[], but it doesn't seem worth
701 making a different table just for that. */
703 #define count_leading_zeros(c,n) \
704 do { \
705 USItype __n = (n); \
706 USItype __shift; \
707 __asm__ ("cmpl $0x1000000, %1\n" \
708 "sbbl %0, %0\n" \
709 "cmpl $0x10000, %1\n" \
710 "sbbl $0, %0\n" \
711 "cmpl $0x100, %1\n" \
712 "sbbl $0, %0\n" \
713 : "=&r" (__shift) : "r" (__n)); \
714 __shift = __shift*8 + 24 + 1; \
715 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
716 } while (0)
717 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
718 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
720 #else /* ! pentiummmx || LONGLONG_STANDALONE */
721 /* The following should be a fixed 14 cycles or so. Some scheduling
722 opportunities should be available between the float load/store too. This
723 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
724 apparently suggested by the Intel optimizing manual (don't know exactly
725 where). gcc 2.95 or up will be best for this, so the "double" is
726 correctly aligned on the stack. */
727 #define count_leading_zeros(c,n) \
728 do { \
729 union { \
730 double d; \
731 unsigned a[2]; \
732 } __u; \
733 ASSERT ((n) != 0); \
734 __u.d = (UWtype) (n); \
735 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
736 } while (0)
737 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
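/* Worked example (illustrative, assuming IEEE 754 doubles and the
   little-endian word order the union above relies on): converting a nonzero
   32-bit n to double stores 0x3FF + floor(log2(n)) in the exponent field,
   i.e. in bits 30..20 of __u.a[1].  So (c) = 0x3FF + 31 - (__u.a[1] >> 20)
   = 31 - floor(log2(n)), which is the leading-zero count.  E.g. n = 0x100
   gives an exponent field of 0x3FF + 8 and hence c = 23.  */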
738 #endif /* pentiummmx */
740 #else /* ! pentium */
742 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
743 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
744 #endif /* gcc clz */
746 /* On P6, gcc prior to 3.0 generates a partial register stall for
747 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
748 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
749 cost of one extra instruction. Do this for "i386" too, since that means
750 generic x86. */
751 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
752 && (HAVE_HOST_CPU_i386 \
753 || HAVE_HOST_CPU_i686 \
754 || HAVE_HOST_CPU_pentiumpro \
755 || HAVE_HOST_CPU_pentium2 \
756 || HAVE_HOST_CPU_pentium3)
757 #define count_leading_zeros(count, x) \
758 do { \
759 USItype __cbtmp; \
760 ASSERT ((x) != 0); \
761 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
762 (count) = 31 - __cbtmp; \
763 } while (0)
764 #endif /* gcc<3 asm bsrl */
766 #ifndef count_leading_zeros
767 #define count_leading_zeros(count, x) \
768 do { \
769 USItype __cbtmp; \
770 ASSERT ((x) != 0); \
771 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
772 (count) = __cbtmp ^ 31; \
773 } while (0)
774 #endif /* asm bsrl */
776 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
777 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
778 #endif /* gcc ctz */
780 #ifndef count_trailing_zeros
781 #define count_trailing_zeros(count, x) \
782 do { \
783 ASSERT ((x) != 0); \
784 __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \
785 } while (0)
786 #endif /* asm bsfl */
788 #endif /* ! pentium */
790 #ifndef UMUL_TIME
791 #define UMUL_TIME 10
792 #endif
793 #ifndef UDIV_TIME
794 #define UDIV_TIME 40
795 #endif
796 #endif /* 80x86 */
798 #if defined (__amd64__) && W_TYPE_SIZE == 64
799 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
800 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
801 : "=r" (sh), "=&r" (sl) \
802 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
803 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
804 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
805 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
806 : "=r" (sh), "=&r" (sl) \
807 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
808 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
809 #define umul_ppmm(w1, w0, u, v) \
810 __asm__ ("mulq %3" \
811 : "=a" (w0), "=d" (w1) \
812 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
813 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
814 __asm__ ("divq %4" /* stringification in K&R C */ \
815 : "=a" (q), "=d" (r) \
816 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
817 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
818 #define count_leading_zeros(count, x) \
819 do { \
820 UDItype __cbtmp; \
821 ASSERT ((x) != 0); \
822 __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
823 (count) = __cbtmp ^ 63; \
824 } while (0)
825 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
826 count is only an int. */
827 #define count_trailing_zeros(count, x) \
828 do { \
829 ASSERT ((x) != 0); \
830 __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
831 } while (0)
832 #endif /* x86_64 */
834 #if defined (__i860__) && W_TYPE_SIZE == 32
835 #define rshift_rhlc(r,h,l,c) \
836 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
837 "=r" (r) : "r" (h), "r" (l), "rn" (c))
838 #endif /* i860 */
840 #if defined (__i960__) && W_TYPE_SIZE == 32
841 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
842 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
843 : "=r" (sh), "=&r" (sl) \
844 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
845 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
846 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
847 : "=r" (sh), "=&r" (sl) \
848 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
849 #define umul_ppmm(w1, w0, u, v) \
850 ({union {UDItype __ll; \
851 struct {USItype __l, __h;} __i; \
852 } __x; \
853 __asm__ ("emul %2,%1,%0" \
854 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
855 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
856 #define __umulsidi3(u, v) \
857 ({UDItype __w; \
858 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
859 __w; })
860 #define udiv_qrnnd(q, r, nh, nl, d) \
861 do { \
862 union {UDItype __ll; \
863 struct {USItype __l, __h;} __i; \
864 } __nn, __rq; \
865 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
866 __asm__ ("ediv %d,%n,%0" \
867 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
868 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
869 } while (0)
870 #define count_leading_zeros(count, x) \
871 do { \
872 USItype __cbtmp; \
873 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
874 (count) = __cbtmp ^ 31; \
875 } while (0)
876 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
877 #if defined (__i960mx) /* what is the proper symbol to test??? */
878 #define rshift_rhlc(r,h,l,c) \
879 do { \
880 union {UDItype __ll; \
881 struct {USItype __l, __h;} __i; \
882 } __nn; \
883 __nn.__i.__h = (h); __nn.__i.__l = (l); \
884 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
885 } while (0)
886 #endif /* i960mx */
887 #endif /* i960 */
889 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
890 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
891 || defined (__mc5307__)) && W_TYPE_SIZE == 32
892 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
893 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
894 : "=d" (sh), "=&d" (sl) \
895 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
896 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
897 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
898 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
899 : "=d" (sh), "=&d" (sl) \
900 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
901 "1" ((USItype)(al)), "g" ((USItype)(bl)))
902 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
903 #if defined (__mc68020__) || defined(mc68020) \
904 || defined (__mc68030__) || defined (mc68030) \
905 || defined (__mc68040__) || defined (mc68040) \
906 || defined (__mcpu32__) || defined (mcpu32) \
907 || defined (__NeXT__)
908 #define umul_ppmm(w1, w0, u, v) \
909 __asm__ ("mulu%.l %3,%1:%0" \
910 : "=d" (w0), "=d" (w1) \
911 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
912 #define UMUL_TIME 45
913 #define udiv_qrnnd(q, r, n1, n0, d) \
914 __asm__ ("divu%.l %4,%1:%0" \
915 : "=d" (q), "=d" (r) \
916 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
917 #define UDIV_TIME 90
918 #define sdiv_qrnnd(q, r, n1, n0, d) \
919 __asm__ ("divs%.l %4,%1:%0" \
920 : "=d" (q), "=d" (r) \
921 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
922 #else /* for other 68k family members use 16x16->32 multiplication */
923 #define umul_ppmm(xh, xl, a, b) \
924 do { USItype __umul_tmp1, __umul_tmp2; \
925 __asm__ ("| Inlined umul_ppmm\n" \
926 " move%.l %5,%3\n" \
927 " move%.l %2,%0\n" \
928 " move%.w %3,%1\n" \
929 " swap %3\n" \
930 " swap %0\n" \
931 " mulu%.w %2,%1\n" \
932 " mulu%.w %3,%0\n" \
933 " mulu%.w %2,%3\n" \
934 " swap %2\n" \
935 " mulu%.w %5,%2\n" \
936 " add%.l %3,%2\n" \
937 " jcc 1f\n" \
938 " add%.l %#0x10000,%0\n" \
939 "1: move%.l %2,%3\n" \
940 " clr%.w %2\n" \
941 " swap %2\n" \
942 " swap %3\n" \
943 " clr%.w %3\n" \
944 " add%.l %3,%1\n" \
945 " addx%.l %2,%0\n" \
946 " | End inlined umul_ppmm" \
947 : "=&d" (xh), "=&d" (xl), \
948 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
949 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
950 } while (0)
951 #define UMUL_TIME 100
952 #define UDIV_TIME 400
953 #endif /* not mc68020 */
954 /* The '020, '030, '040 and '060 have bitfield insns.
955 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
956 exclude bfffo on that chip (bitfield insns not available). */
957 #if (defined (__mc68020__) || defined (mc68020) \
958 || defined (__mc68030__) || defined (mc68030) \
959 || defined (__mc68040__) || defined (mc68040) \
960 || defined (__mc68060__) || defined (mc68060) \
961 || defined (__NeXT__)) \
962 && ! defined (__mcpu32__)
963 #define count_leading_zeros(count, x) \
964 __asm__ ("bfffo %1{%b2:%b2},%0" \
965 : "=d" (count) \
966 : "od" ((USItype) (x)), "n" (0))
967 #define COUNT_LEADING_ZEROS_0 32
968 #endif
969 #endif /* mc68000 */
971 #if defined (__m88000__) && W_TYPE_SIZE == 32
972 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
973 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
974 : "=r" (sh), "=&r" (sl) \
975 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
976 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
977 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
978 : "=r" (sh), "=&r" (sl) \
979 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
980 #define count_leading_zeros(count, x) \
981 do { \
982 USItype __cbtmp; \
983 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
984 (count) = __cbtmp ^ 31; \
985 } while (0)
986 #define COUNT_LEADING_ZEROS_0 63 /* sic */
987 #if defined (__m88110__)
988 #define umul_ppmm(wh, wl, u, v) \
989 do { \
990 union {UDItype __ll; \
991 struct {USItype __h, __l;} __i; \
992 } __x; \
993 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
994 (wh) = __x.__i.__h; \
995 (wl) = __x.__i.__l; \
996 } while (0)
997 #define udiv_qrnnd(q, r, n1, n0, d) \
998 ({union {UDItype __ll; \
999 struct {USItype __h, __l;} __i; \
1000 } __x, __q; \
1001 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1002 __asm__ ("divu.d %0,%1,%2" \
1003 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1004 (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1005 #define UMUL_TIME 5
1006 #define UDIV_TIME 25
1007 #else
1008 #define UMUL_TIME 17
1009 #define UDIV_TIME 150
1010 #endif /* __m88110__ */
1011 #endif /* __m88000__ */
1013 #if defined (__mips) && W_TYPE_SIZE == 32
1014 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
1015 #define umul_ppmm(w1, w0, u, v) \
1016 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1017 #else
1018 #define umul_ppmm(w1, w0, u, v) \
1019 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1020 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1021 #endif
1022 #define UMUL_TIME 10
1023 #define UDIV_TIME 100
1024 #endif /* __mips */
1026 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1027 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
1028 #define umul_ppmm(w1, w0, u, v) \
1029 __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1030 #else
1031 #define umul_ppmm(w1, w0, u, v) \
1032 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1033 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1034 #endif
1035 #define UMUL_TIME 20
1036 #define UDIV_TIME 140
1037 #endif /* __mips */
1039 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1040 #define umul_ppmm(w1, w0, u, v) \
1041 ({union {UDItype __ll; \
1042 struct {USItype __l, __h;} __i; \
1043 } __x; \
1044 __asm__ ("meid %2,%0" \
1045 : "=g" (__x.__ll) \
1046 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1047 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1048 #define __umulsidi3(u, v) \
1049 ({UDItype __w; \
1050 __asm__ ("meid %2,%0" \
1051 : "=g" (__w) \
1052 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1053 __w; })
1054 #define udiv_qrnnd(q, r, n1, n0, d) \
1055 ({union {UDItype __ll; \
1056 struct {USItype __l, __h;} __i; \
1057 } __x; \
1058 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1059 __asm__ ("deid %2,%0" \
1060 : "=g" (__x.__ll) \
1061 : "0" (__x.__ll), "g" ((USItype)(d))); \
1062 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1063 #define count_trailing_zeros(count,x) \
1064 do { \
1065 __asm__ ("ffsd %2,%0" \
1066 : "=r" (count) \
1067 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1068 } while (0)
1069 #endif /* __ns32000__ */
1071 /* In the past we had a block of various #defines tested
1072 _ARCH_PPC - AIX
1073 _ARCH_PWR - AIX
1074 __powerpc__ - gcc
1075 __POWERPC__ - BEOS
1076 __ppc__ - Darwin
1077 PPC - old gcc, GNU/Linux, SysV
1078 The plain PPC test was not good for vxWorks, since PPC is defined on all
1079 CPUs there (eg. m68k too), as a constant one is expected to compare
1080 CPU_FAMILY against.
1082 At any rate, this was pretty unattractive and a bit fragile. The use of
1083 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1084 getting the desired effect.
1086 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1087 the system vendor compilers. (Is that vendor compilers with inline asm,
1088 or what?) */
1090 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1091 && W_TYPE_SIZE == 32
1092 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1093 do { \
1094 if (__builtin_constant_p (bh) && (bh) == 0) \
1095 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1096 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1097 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1098 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1099 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1100 else \
1101 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1102 : "=r" (sh), "=&r" (sl) \
1103 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1104 } while (0)
1105 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1106 do { \
1107 if (__builtin_constant_p (ah) && (ah) == 0) \
1108 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1109 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1110 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1111 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1112 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1113 else if (__builtin_constant_p (bh) && (bh) == 0) \
1114 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1115 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1116 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1117 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1118 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1119 else \
1120 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1121 : "=r" (sh), "=&r" (sl) \
1122 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1123 } while (0)
1124 #define count_leading_zeros(count, x) \
1125 __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1126 #define COUNT_LEADING_ZEROS_0 32
1127 #if HAVE_HOST_CPU_FAMILY_powerpc
1128 #define umul_ppmm(ph, pl, m0, m1) \
1129 do { \
1130 USItype __m0 = (m0), __m1 = (m1); \
1131 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1132 (pl) = __m0 * __m1; \
1133 } while (0)
1134 #define UMUL_TIME 15
1135 #define smul_ppmm(ph, pl, m0, m1) \
1136 do { \
1137 SItype __m0 = (m0), __m1 = (m1); \
1138 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1139 (pl) = __m0 * __m1; \
1140 } while (0)
1141 #define SMUL_TIME 14
1142 #define UDIV_TIME 120
1143 #else
1144 #define UMUL_TIME 8
1145 #define smul_ppmm(xh, xl, m0, m1) \
1146 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1147 #define SMUL_TIME 4
1148 #define sdiv_qrnnd(q, r, nh, nl, d) \
1149 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1150 #define UDIV_TIME 100
1151 #endif
1152 #endif /* 32-bit POWER architecture variants. */
1154 /* We should test _IBMR2 here when we add assembly support for the system
1155 vendor compilers. */
1156 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1157 #if !defined (_LONG_LONG_LIMB)
1158 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1159 use adde etc only when not _LONG_LONG_LIMB. */
1160 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1161 do { \
1162 if (__builtin_constant_p (bh) && (bh) == 0) \
1163 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1164 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1165 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1166 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1167 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1168 else \
1169 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1170 : "=r" (sh), "=&r" (sl) \
1171 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1172 } while (0)
1173 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1174 do { \
1175 if (__builtin_constant_p (ah) && (ah) == 0) \
1176 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1177 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1178 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1179 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1180 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1181 else if (__builtin_constant_p (bh) && (bh) == 0) \
1182 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1183 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1184 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1185 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1186 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1187 else \
1188 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1189 : "=r" (sh), "=&r" (sl) \
1190 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1191 } while (0)
1192 #endif /* ! _LONG_LONG_LIMB */
1193 #define count_leading_zeros(count, x) \
1194 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1195 #define COUNT_LEADING_ZEROS_0 64
1196 #define umul_ppmm(ph, pl, m0, m1) \
1197 do { \
1198 UDItype __m0 = (m0), __m1 = (m1); \
1199 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1200 (pl) = __m0 * __m1; \
1201 } while (0)
1202 #define UMUL_TIME 15
1203 #define smul_ppmm(ph, pl, m0, m1) \
1204 do { \
1205 DItype __m0 = (m0), __m1 = (m1); \
1206 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1207 (pl) = __m0 * __m1; \
1208 } while (0)
1209 #define SMUL_TIME 14 /* ??? */
1210 #define UDIV_TIME 120 /* ??? */
1211 #endif /* 64-bit PowerPC. */
1213 #if defined (__pyr__) && W_TYPE_SIZE == 32
1214 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1215 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1216 : "=r" (sh), "=&r" (sl) \
1217 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1218 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1219 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1220 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1221 : "=r" (sh), "=&r" (sl) \
1222 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1223 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1224 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1225 #define umul_ppmm(w1, w0, u, v) \
1226 ({union {UDItype __ll; \
1227 struct {USItype __h, __l;} __i; \
1228 } __x; \
1229 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1230 : "=&r" (__x.__ll) \
1231 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1232 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1233 #endif /* __pyr__ */
1235 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1236 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1237 __asm__ ("a %1,%5\n\tae %0,%3" \
1238 : "=r" (sh), "=&r" (sl) \
1239 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1240 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1241 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1242 __asm__ ("s %1,%5\n\tse %0,%3" \
1243 : "=r" (sh), "=&r" (sl) \
1244 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1245 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1246 #define smul_ppmm(ph, pl, m0, m1) \
1247 __asm__ ( \
1248 "s r2,r2\n" \
1249 " mts r10,%2\n" \
1250 " m r2,%3\n" \
1251 " m r2,%3\n" \
1252 " m r2,%3\n" \
1253 " m r2,%3\n" \
1254 " m r2,%3\n" \
1255 " m r2,%3\n" \
1256 " m r2,%3\n" \
1257 " m r2,%3\n" \
1258 " m r2,%3\n" \
1259 " m r2,%3\n" \
1260 " m r2,%3\n" \
1261 " m r2,%3\n" \
1262 " m r2,%3\n" \
1263 " m r2,%3\n" \
1264 " m r2,%3\n" \
1265 " m r2,%3\n" \
1266 " cas %0,r2,r0\n" \
1267 " mfs r10,%1" \
1268 : "=r" (ph), "=r" (pl) \
1269 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1270 : "r2")
1271 #define UMUL_TIME 20
1272 #define UDIV_TIME 200
1273 #define count_leading_zeros(count, x) \
1274 do { \
1275 if ((x) >= 0x10000) \
1276 __asm__ ("clz %0,%1" \
1277 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1278 else \
1279 { \
1280 __asm__ ("clz %0,%1" \
1281 : "=r" (count) : "r" ((USItype)(x))); \
1282 (count) += 16; \
1283 } \
1284 } while (0)
1285 #endif /* RT/ROMP */
1287 #if defined (__sh2__) && W_TYPE_SIZE == 32
1288 #define umul_ppmm(w1, w0, u, v) \
1289 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1290 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1291 #define UMUL_TIME 5
1292 #endif
1294 #if defined (__sparc__) && W_TYPE_SIZE == 32
1295 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1296 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1297 : "=r" (sh), "=&r" (sl) \
1298 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1299 __CLOBBER_CC)
1300 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1301 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1302 : "=r" (sh), "=&r" (sl) \
1303 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1304 __CLOBBER_CC)
1305 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1306 doesn't define anything to indicate that to us, it only sets __sparcv8. */
1307 #if defined (__sparc_v9__) || defined (__sparcv9)
1308 /* Perhaps we should use floating-point operations here? */
1309 #if 0
1310 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1311 Perhaps we simply need to explicitly zero-extend the inputs? */
1312 #define umul_ppmm(w1, w0, u, v) \
1313 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1314 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1315 #else
1316 /* Use v8 umul until above bug is fixed. */
1317 #define umul_ppmm(w1, w0, u, v) \
1318 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1319 #endif
1320 /* Use a plain v8 divide for v9. */
1321 #define udiv_qrnnd(q, r, n1, n0, d) \
1322 do { \
1323 USItype __q; \
1324 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1325 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1326 (r) = (n0) - __q * (d); \
1327 (q) = __q; \
1328 } while (0)
1329 #else
1330 #if defined (__sparc_v8__) /* gcc normal */ \
1331 || defined (__sparcv8) /* gcc solaris */ \
1332 || HAVE_HOST_CPU_supersparc
1333 /* Don't match immediate range because, 1) it is not often useful,
1334 2) the 'I' flag thinks of the range as a 13 bit signed interval,
1335 while we want to match a 13 bit interval, sign extended to 32 bits,
1336 but INTERPRETED AS UNSIGNED. */
1337 #define umul_ppmm(w1, w0, u, v) \
1338 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1339 #define UMUL_TIME 5
1341 #if HAVE_HOST_CPU_supersparc
1342 #define UDIV_TIME 60 /* SuperSPARC timing */
1343 #else
1344 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1345 dividends and will trap to the kernel for the rest. */
1346 #define udiv_qrnnd(q, r, n1, n0, d) \
1347 do { \
1348 USItype __q; \
1349 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1350 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1351 (r) = (n0) - __q * (d); \
1352 (q) = __q; \
1353 } while (0)
1354 #define UDIV_TIME 25
1355 #endif /* HAVE_HOST_CPU_supersparc */
1357 #else /* ! __sparc_v8__ */
1358 #if defined (__sparclite__)
1359 /* This has hardware multiply but not divide. It also has two additional
1360 instructions scan (ffs from high bit) and divscc. */
1361 #define umul_ppmm(w1, w0, u, v) \
1362 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1363 #define UMUL_TIME 5
1364 #define udiv_qrnnd(q, r, n1, n0, d) \
1365 __asm__ ("! Inlined udiv_qrnnd\n" \
1366 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1367 " tst %%g0\n" \
1368 " divscc %3,%4,%%g1\n" \
1369 " divscc %%g1,%4,%%g1\n" \
1370 " divscc %%g1,%4,%%g1\n" \
1371 " divscc %%g1,%4,%%g1\n" \
1372 " divscc %%g1,%4,%%g1\n" \
1373 " divscc %%g1,%4,%%g1\n" \
1374 " divscc %%g1,%4,%%g1\n" \
1375 " divscc %%g1,%4,%%g1\n" \
1376 " divscc %%g1,%4,%%g1\n" \
1377 " divscc %%g1,%4,%%g1\n" \
1378 " divscc %%g1,%4,%%g1\n" \
1379 " divscc %%g1,%4,%%g1\n" \
1380 " divscc %%g1,%4,%%g1\n" \
1381 " divscc %%g1,%4,%%g1\n" \
1382 " divscc %%g1,%4,%%g1\n" \
1383 " divscc %%g1,%4,%%g1\n" \
1384 " divscc %%g1,%4,%%g1\n" \
1385 " divscc %%g1,%4,%%g1\n" \
1386 " divscc %%g1,%4,%%g1\n" \
1387 " divscc %%g1,%4,%%g1\n" \
1388 " divscc %%g1,%4,%%g1\n" \
1389 " divscc %%g1,%4,%%g1\n" \
1390 " divscc %%g1,%4,%%g1\n" \
1391 " divscc %%g1,%4,%%g1\n" \
1392 " divscc %%g1,%4,%%g1\n" \
1393 " divscc %%g1,%4,%%g1\n" \
1394 " divscc %%g1,%4,%%g1\n" \
1395 " divscc %%g1,%4,%%g1\n" \
1396 " divscc %%g1,%4,%%g1\n" \
1397 " divscc %%g1,%4,%%g1\n" \
1398 " divscc %%g1,%4,%%g1\n" \
1399 " divscc %%g1,%4,%0\n" \
1400 " rd %%y,%1\n" \
1401 " bl,a 1f\n" \
1402 " add %1,%4,%1\n" \
1403 "1: ! End of inline udiv_qrnnd" \
1404 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1405 : "%g1" __AND_CLOBBER_CC)
1406 #define UDIV_TIME 37
1407 #define count_leading_zeros(count, x) \
1408 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1409 /* Early sparclites return 63 for an argument of 0, but they warn that future
1410 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
1411 undefined. */
1412 #endif /* __sparclite__ */
1413 #endif /* __sparc_v8__ */
1414 #endif /* __sparc_v9__ */
1415 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1416 #ifndef umul_ppmm
1417 #define umul_ppmm(w1, w0, u, v) \
1418 __asm__ ("! Inlined umul_ppmm\n" \
1419 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1420 " sra %3,31,%%g2 ! Don't move this insn\n" \
1421 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1422 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1423 " mulscc %%g1,%3,%%g1\n" \
1424 " mulscc %%g1,%3,%%g1\n" \
1425 " mulscc %%g1,%3,%%g1\n" \
1426 " mulscc %%g1,%3,%%g1\n" \
1427 " mulscc %%g1,%3,%%g1\n" \
1428 " mulscc %%g1,%3,%%g1\n" \
1429 " mulscc %%g1,%3,%%g1\n" \
1430 " mulscc %%g1,%3,%%g1\n" \
1431 " mulscc %%g1,%3,%%g1\n" \
1432 " mulscc %%g1,%3,%%g1\n" \
1433 " mulscc %%g1,%3,%%g1\n" \
1434 " mulscc %%g1,%3,%%g1\n" \
1435 " mulscc %%g1,%3,%%g1\n" \
1436 " mulscc %%g1,%3,%%g1\n" \
1437 " mulscc %%g1,%3,%%g1\n" \
1438 " mulscc %%g1,%3,%%g1\n" \
1439 " mulscc %%g1,%3,%%g1\n" \
1440 " mulscc %%g1,%3,%%g1\n" \
1441 " mulscc %%g1,%3,%%g1\n" \
1442 " mulscc %%g1,%3,%%g1\n" \
1443 " mulscc %%g1,%3,%%g1\n" \
1444 " mulscc %%g1,%3,%%g1\n" \
1445 " mulscc %%g1,%3,%%g1\n" \
1446 " mulscc %%g1,%3,%%g1\n" \
1447 " mulscc %%g1,%3,%%g1\n" \
1448 " mulscc %%g1,%3,%%g1\n" \
1449 " mulscc %%g1,%3,%%g1\n" \
1450 " mulscc %%g1,%3,%%g1\n" \
1451 " mulscc %%g1,%3,%%g1\n" \
1452 " mulscc %%g1,%3,%%g1\n" \
1453 " mulscc %%g1,%3,%%g1\n" \
1454 " mulscc %%g1,%3,%%g1\n" \
1455 " mulscc %%g1,0,%%g1\n" \
1456 " add %%g1,%%g2,%0\n" \
1457 " rd %%y,%1" \
1458 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1459 : "%g1", "%g2" __AND_CLOBBER_CC)
1460 #define UMUL_TIME 39 /* 39 instructions */
1461 #endif
1462 #ifndef udiv_qrnnd
1463 #ifndef LONGLONG_STANDALONE
1464 #define udiv_qrnnd(q, r, n1, n0, d) \
1465 do { UWtype __r; \
1466 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1467 (r) = __r; \
1468 } while (0)
1469 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1470 #ifndef UDIV_TIME
1471 #define UDIV_TIME 140
1472 #endif
1473 #endif /* LONGLONG_STANDALONE */
1474 #endif /* udiv_qrnnd */
1475 #endif /* __sparc__ */
1477 #if defined (__sparc__) && W_TYPE_SIZE == 64
1478 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1479 __asm__ ( \
1480 "addcc %r4,%5,%1\n" \
1481 " addccc %r6,%7,%%g0\n" \
1482 " addc %r2,%3,%0" \
1483 : "=r" (sh), "=&r" (sl) \
1484 : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
1485 "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1486 __CLOBBER_CC)
1487 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1488 __asm__ ( \
1489 "subcc %r4,%5,%1\n" \
1490 " subccc %r6,%7,%%g0\n" \
1491 " subc %r2,%3,%0" \
1492 : "=r" (sh), "=&r" (sl) \
1493 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
1494 "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1495 __CLOBBER_CC)
1496 #endif
1498 #if defined (__vax__) && W_TYPE_SIZE == 32
1499 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1500 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1501 : "=g" (sh), "=&g" (sl) \
1502 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1503 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1504 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1505 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1506 : "=g" (sh), "=&g" (sl) \
1507 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1508 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1509 #define smul_ppmm(xh, xl, m0, m1) \
1510 do { \
1511 union {UDItype __ll; \
1512 struct {USItype __l, __h;} __i; \
1513 } __x; \
1514 USItype __m0 = (m0), __m1 = (m1); \
1515 __asm__ ("emul %1,%2,$0,%0" \
1516 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1517 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1518 } while (0)
1519 #define sdiv_qrnnd(q, r, n1, n0, d) \
1520 do { \
1521 union {DItype __ll; \
1522 struct {SItype __l, __h;} __i; \
1523 } __x; \
1524 __x.__i.__h = n1; __x.__i.__l = n0; \
1525 __asm__ ("ediv %3,%2,%0,%1" \
1526 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1527 } while (0)
1528 #if 0
1529 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1530 8800 maybe). */
1531 #define count_trailing_zeros(count,x) \
1532 do { \
1533 __asm__ ("ffs 0, 31, %1, %0" \
1534 : "=g" (count) \
1535 : "g" ((USItype) (x))); \
1536 } while (0)
1537 #endif
1538 #endif /* __vax__ */

#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {long int __ll; \
	   struct {unsigned int __h, __l;} __i; \
	  } __x; \
    unsigned int __m0 = (m0), __m1 = (m1); \
    __asm__ ("mult %S0,%H3" \
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
	     : "%1" (m0), "rQR" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
    (xh) += ((((signed int) __m0 >> 15) & __m1) \
	     + (((signed int) __m1 >> 15) & __m0)); \
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */

#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  }
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif

/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
			    &__udiv_qrnnd__r); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif
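
/* Illustrative sketch (editor's addition, kept under "#if 0" so it is never
   compiled): however udiv_qrnnd ends up being defined -- inline asm,
   __udiv_qrnnd_c, or one of the mpn_udiv_qrnnd wrappers above -- callers
   use it the same way; the "_r" wrapper differs only in taking the
   remainder pointer last.  The concrete numbers assume W_TYPE_SIZE == 32
   and respect the n1 < d precondition.  */
#if 0
static void
example_udiv_qrnnd_call (void)
{
  UWtype q, r;
  /* Divide the two-word value {0x1234, 0x9abcdef0} by 0x87654321;
     q receives the one-word quotient and r the remainder, with
     q * 0x87654321 + r reassembling the original numerator.  */
  udiv_qrnnd (q, r, (UWtype) 0x1234, (UWtype) 0x9abcdef0,
              (UWtype) 0x87654321);
}
#endif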

/* If this machine has no inline assembler, use C macros. */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); \
    (sl) = __x; \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); \
    (sl) = __x; \
  } while (0)
#endif
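
/* Illustrative sketch (editor's addition, never compiled): how the generic
   add_ssaaaa/sub_ddmmss above propagate the carry/borrow.  The comparison
   (__x < (al)) is 1 exactly when the low-word addition wrapped around, and
   ((al) < (bl)) is 1 exactly when the low-word subtraction borrowed.  The
   concrete values assume W_TYPE_SIZE == 32.  */
#if 0
static void
example_add_sub_double_word (void)
{
  UWtype sh, sl, dh, dl;
  /* 0x00000001:0xffffffff + 0x00000000:0x00000001 = 0x00000002:0x00000000,
     i.e. the low words overflow and one carry enters the high word.  */
  add_ssaaaa (sh, sl, (UWtype) 1, (UWtype) 0xffffffff,
              (UWtype) 0, (UWtype) 1);
  /* sh == 2, sl == 0 here; subtracting 1 again borrows from the high word,
     giving back dh == 1, dl == 0xffffffff.  */
  sub_ddmmss (dh, dl, sh, sl, (UWtype) 0, (UWtype) 1);
}
#endif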

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm. */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (i.e. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase. */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
 \
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
 \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
 \
    __x1 += __ll_highpart (__x0);/* this can't give carry */ \
    __x1 += __x2; /* but this indeed can */ \
    if (__x1 < __x2) /* did we get it? */ \
      __x3 += __ll_B; /* yes, add it in the proper pos. */ \
 \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif
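
/* Illustrative sketch (editor's addition, never compiled): the generic
   umul_ppmm above is the schoolbook split into half-words,
     u*v = (uh*vh)*B^2 + (uh*vl + ul*vh)*B + ul*vl,  with B = 2^(W_TYPE_SIZE/2),
   and only has to fold the carry out of the middle terms into the top word.
   The check below assumes W_TYPE_SIZE == 32 and that UDItype can hold the
   full 64-bit product.  */
#if 0
static void
example_umul_ppmm (void)
{
  UWtype hi, lo;
  umul_ppmm (hi, lo, (UWtype) 0x89abcdef, (UWtype) 0x12345678);
  /* ((UDItype) hi << 32 | lo) == (UDItype) 0x89abcdef * 0x12345678 */
}
#endif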

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another). */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
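
/* Why the correction terms in the two conversion macros above work (editor's
   note): reading a W_TYPE_SIZE-bit pattern x as unsigned instead of signed
   adds 2^W_TYPE_SIZE exactly when x is negative, so the high words of the
   unsigned and signed products differ by "+ v if u is negative, + u if v is
   negative" (mod 2^W_TYPE_SIZE), while the low words agree.
   -(x >> (W_TYPE_SIZE - 1)) is an all-ones mask precisely when the sign bit
   of x is set, hence the "mask & other operand" form.  Sketch for
   W_TYPE_SIZE == 32 (never compiled):  */
#if 0
static void
example_smul_umul_relation (void)
{
  UWtype sh, sl, uh, ul;
  UWtype u = (UWtype) -5, v = (UWtype) 7;   /* u has its sign bit set */
  smul_ppmm (sh, sl, u, v);   /* signed product  -35: sh == 0xffffffff  */
  umul_ppmm (uh, ul, u, v);   /* unsigned product (2^32 - 5) * 7        */
  /* ul == sl, and uh == sh + v (mod 2^32), i.e. 0xffffffff + 7 == 6.   */
}
#endif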

/* Define this unconditionally, so it can be used for debugging. */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
 \
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
 \
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
 \
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
 \
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
 \
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)
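
/* Illustrative sketch (editor's addition, never compiled): __udiv_qrnnd_c is
   schoolbook division in base 2^(W_TYPE_SIZE/2).  It first divides the top
   three half-words of {n1,n0} by d to get the high quotient half __q1, then
   the remaining three half-words to get __q0, adjusting each trial quotient
   downward at most twice.  It requires (n1) < (d), and d should be
   normalized (most significant bit set) for the trial quotients to be
   accurate.  Concrete values for W_TYPE_SIZE == 32:  */
#if 0
static void
example_udiv_qrnnd_c (void)
{
  UWtype q, r;
  /* {n1,n0} = 0x00000003:0x00000001, d = 0x80000001 (normalized, n1 < d). */
  __udiv_qrnnd_c (q, r, (UWtype) 3, (UWtype) 1, (UWtype) 0x80000001);
  /* q == 5, r == 0x7ffffffc, since 5 * 0x80000001 + 0x7ffffffc equals
     0x3:0x00000001.  */
}
#endif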

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere). */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
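
/* Illustrative sketch (editor's addition, never compiled): when
   UDIV_NEEDS_NORMALIZATION is 1, callers are expected to left-shift the
   divisor until its most significant bit is set, shift the numerator by the
   same amount, and shift the remainder back afterwards; the quotient is
   unchanged by this scaling.  The hypothetical helper below assumes d != 0,
   n1 < d, and that count_leading_zeros is available.  */
#if 0
static void
example_normalized_divide (UWtype n1, UWtype n0, UWtype d,
                           UWtype *qp, UWtype *rp)
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;   /* undo the normalization on the remainder */
}
#endif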

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
 \
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
 \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
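
/* Illustrative sketch (editor's addition, never compiled): the generic
   count_leading_zeros above first narrows x down to a window of at most
   eight significant bits (by quarter-word comparisons when W_TYPE_SIZE is
   32, by a byte-wise loop otherwise) and then finishes with the small
   __clz_tab lookup table.  Typical use, assuming W_TYPE_SIZE == 32:  */
#if 0
static void
example_count_leading_zeros (void)
{
  int cnt;
  count_leading_zeros (cnt, (UWtype) 0x00010000);
  /* The highest set bit is bit 16, so cnt == 15.  */
}
#endif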

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
# ifdef MPFR_HAVE_GMP_IMPL
extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
# else
extern const unsigned char __clz_tab[128];
# endif
#endif

#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough. */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#endif
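
/* Illustrative sketch (editor's addition, never compiled): x & -x isolates
   the lowest set bit of x, so the leading-zero count of that one-bit value
   locates it from the top, and W_TYPE_SIZE - 1 minus that count is the
   number of trailing zeros.  Assuming W_TYPE_SIZE == 32:  */
#if 0
static void
example_count_trailing_zeros (void)
{
  int cnt;
  /* 0x60 & -0x60 == 0x20, whose leading-zero count is 26, so
     count_trailing_zeros gives 32 - 1 - 26 == 5.  */
  count_trailing_zeros (cnt, (UWtype) 0x60);
}
#endif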

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used. */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME. */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif