1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
4 2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
6 This file is free software; you can redistribute it and/or modify it under the
7 terms of the GNU Lesser General Public License as published by the Free
8 Software Foundation; either version 3 of the License, or (at your option) any
9 later version.
11 This file is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13 PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
14 details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with this file. If not, see http://www.gnu.org/licenses/. */
19 /* You have to define the following before including this file:
21 UWtype -- An unsigned type, default type for operations (typically a "word")
22 UHWtype -- An unsigned type, at least half the size of UWtype
23 UDWtype -- An unsigned type, at least twice as large as UWtype
24 W_TYPE_SIZE -- size in bits of UWtype
26 SItype, USItype -- Signed and unsigned 32 bit types
27 DItype, UDItype -- Signed and unsigned 64 bit types
29 On a 32 bit machine UWtype should typically be USItype;
30 on a 64 bit machine, UWtype should typically be UDItype.
32 Optionally, define:
34 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
35 NO_ASM -- Disable inline asm
38 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
39 need to include gmp.h and gmp-impl.h, or certain things might not work as
40 expected. */
43 #define __BITS4 (W_TYPE_SIZE / 4)
44 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
45 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
46 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
48 /* This is used to make sure no undesirable sharing between different libraries
49 that use this file takes place. */
50 #ifndef __MPN
51 #define __MPN(x) __##x
52 #endif
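/* Explanatory note (not from the original source): with the fallback just
   above, __MPN(invert_limb) expands to __invert_limb.  When gmp.h is
   included first it supplies its own __MPN (mapping names to the __gmpn_
   prefix), so different libraries embedding this file end up with distinct
   symbol names and no undesirable sharing takes place.  */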
54 #ifndef _PROTO
55 #if (__STDC__-0) || defined (__cplusplus)
56 #define _PROTO(x) x
57 #else
58 #define _PROTO(x) ()
59 #endif
60 #endif
62 /* Define auxiliary asm macros.
64 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
65 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
66 word product in HIGH_PROD and LOW_PROD.
68 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
69 UDWtype product. This is just a variant of umul_ppmm.
71 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72 denominator) divides a UDWtype, composed by the UWtype integers
73 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
74 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
75 than DENOMINATOR for correct operation. If, in addition, the macro
76 requires the most significant bit of DENOMINATOR to be set, the pre-processor
77 symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
79 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
80 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
81 is rounded towards 0.
83 5) count_leading_zeros(count, x) counts the number of zero-bits from the
84 msb to the first non-zero bit in the UWtype X. This is the number of
85 steps X needs to be shifted left to set the msb. Undefined for X == 0,
86 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
88 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
89 from the least significant end.
91 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
92 high_addend_2, low_addend_2) adds two UWtype integers, composed by
93 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
94 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
95 (i.e. carry out) is not stored anywhere, and is lost.
97 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
98 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
99 composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
100 LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
101 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
102 and is lost.
104 If any of these macros are left undefined for a particular CPU,
105 C macros are used (an illustrative generic sketch follows these notes).
108 Notes:
110 For add_ssaaaa the two high and two low addends can both commute, but
111 unfortunately gcc only supports one "%" commutative in each asm block.
112 This has always been so but is only documented in recent versions
113 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
114 compiler error in certain rare circumstances.
116 Apparently it was only the last "%" that was ever actually respected, so
117 the code has been updated to leave just that. Clearly there's a free
118 choice whether high or low should get it, if there's a reason to favour
119 one over the other. Also obviously when the constraints on the two
120 operands are identical there's no benefit to the reloader in any "%" at
121 all. */
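/* Illustrative sketch only (not part of the original file): roughly how the
   generic C fallbacks for add_ssaaaa and umul_ppmm work, using the __ll_B,
   __ll_lowpart and __ll_highpart helpers defined above.  The _generic names
   are used here only to avoid clashing with the real fallbacks that
   longlong.h defines after all the CPU-specific sections.  */
#if 0
#define add_ssaaaa_generic(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl);                  /* low words, may wrap */ \
    (sh) = (ah) + (bh) + (__x < (al));  /* add carry out of the low add */ \
    (sl) = __x; \
  } while (0)

#define umul_ppmm_generic(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    __ul = __ll_lowpart (u); \
    __uh = __ll_highpart (u); \
    __vl = __ll_lowpart (v); \
    __vh = __ll_highpart (v); \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
    __x1 += __ll_highpart (__x0);       /* this never carries */ \
    __x1 += __x2;                       /* but this can */ \
    if (__x1 < __x2) \
      __x3 += __ll_B;                   /* propagate that carry */ \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
  } while (0)
#endif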
125 /* The CPUs come in alphabetical order below.
127 Please add support for more CPUs here, or improve the current support
128 for the CPUs below! */
131 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
132 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
133 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
134 __builtin_ctzll.
136 These builtins are only used where we have checked what code comes out; on
137 some chips they're merely libgcc calls, in which case we instead want an
138 inline (either asm or generic C).
140 These builtins are better than an asm block of the same insn, since an
141 asm block doesn't give gcc any information about scheduling or resource
142 usage. We keep an asm block for use on prior versions of gcc though.
144 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
145 it's not used (for count_leading_zeros) because it generally gives extra
146 code to ensure the result is 0 when the input is 0, which we don't need
147 or want. */
149 #ifdef _LONG_LONG_LIMB
150 #define count_leading_zeros_gcc_clz(count,x) \
151 do { \
152 ASSERT ((x) != 0); \
153 (count) = __builtin_clzll (x); \
154 } while (0)
155 #else
156 #define count_leading_zeros_gcc_clz(count,x) \
157 do { \
158 ASSERT ((x) != 0); \
159 (count) = __builtin_clzl (x); \
160 } while (0)
161 #endif
163 #ifdef _LONG_LONG_LIMB
164 #define count_trailing_zeros_gcc_ctz(count,x) \
165 do { \
166 ASSERT ((x) != 0); \
167 (count) = __builtin_ctzll (x); \
168 } while (0)
169 #else
170 #define count_trailing_zeros_gcc_ctz(count,x) \
171 do { \
172 ASSERT ((x) != 0); \
173 (count) = __builtin_ctzl (x); \
174 } while (0)
175 #endif
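/* Illustrative usage sketch (not part of the original file): how a caller
   typically combines count_leading_zeros with udiv_qrnnd when
   UDIV_NEEDS_NORMALIZATION is 1.  The divisor is shifted until its msb is
   set, the two-word numerator is shifted by the same amount, and the
   remainder is shifted back afterwards.  Assumes the quotient fits in one
   word (i.e. n1 < d on entry); the function name is hypothetical.  */
#if 0
static void
example_udiv_normalized (UWtype *qp, UWtype *rp,
                         UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;                                  /* set msb of d */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;                                 /* undo the normalization */
}
#endif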
178 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
179 don't need to be under !NO_ASM */
180 #if ! defined (NO_ASM)
182 #if defined (__alpha) && W_TYPE_SIZE == 64
183 /* Most alpha-based machines, except Cray systems. */
184 #if defined (__GNUC__)
185 #if __GMP_GNUC_PREREQ (3,3)
186 #define umul_ppmm(ph, pl, m0, m1) \
187 do { \
188 UDItype __m0 = (m0), __m1 = (m1); \
189 (ph) = __builtin_alpha_umulh (__m0, __m1); \
190 (pl) = __m0 * __m1; \
191 } while (0)
192 #else
193 #define umul_ppmm(ph, pl, m0, m1) \
194 do { \
195 UDItype __m0 = (m0), __m1 = (m1); \
196 __asm__ ("umulh %r1,%2,%0" \
197 : "=r" (ph) \
198 : "%rJ" (m0), "rI" (m1)); \
199 (pl) = __m0 * __m1; \
200 } while (0)
201 #endif
202 #define UMUL_TIME 18
203 #else /* ! __GNUC__ */
204 #include <machine/builtins.h>
205 #define umul_ppmm(ph, pl, m0, m1) \
206 do { \
207 UDItype __m0 = (m0), __m1 = (m1); \
208 (ph) = __UMULH (m0, m1); \
209 (pl) = __m0 * __m1; \
210 } while (0)
211 #endif
212 #ifndef LONGLONG_STANDALONE
213 #define udiv_qrnnd(q, r, n1, n0, d) \
214 do { UWtype __di; \
215 __di = __MPN(invert_limb) (d); \
216 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
217 } while (0)
218 #define UDIV_PREINV_ALWAYS 1
219 #define UDIV_NEEDS_NORMALIZATION 1
220 #define UDIV_TIME 220
221 #endif /* LONGLONG_STANDALONE */
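/* Background note (not in the original source): __MPN(invert_limb)(d)
   computes the precomputed reciprocal di = floor((B^2 - 1) / d) - B, with
   B = 2^W_TYPE_SIZE and d normalized so its msb is set.  udiv_qrnnd_preinv
   (from gmp-impl.h) then turns each division into a umul_ppmm by di plus a
   few add/adjust steps, avoiding the slow hardware divide; that is why
   UDIV_PREINV_ALWAYS and UDIV_NEEDS_NORMALIZATION are both set here.  */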
223 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
224 always goes into libgmp.so, even when not actually used. */
225 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
227 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
228 #define count_leading_zeros(COUNT,X) \
229 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
230 #define count_trailing_zeros(COUNT,X) \
231 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
232 #endif /* clz/ctz using cix */
234 #if ! defined (count_leading_zeros) \
235 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
236 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
237 "$31" is written explicitly in the asm, since an "r" constraint won't
238 select reg 31. There seems no need to worry about "r31" syntax for cray,
239 since gcc itself (pre-release 3.4) emits just $31 in various places. */
240 #define ALPHA_CMPBGE_0(dst, src) \
241 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
242 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
243 them, locating the highest non-zero byte. A second __clz_tab lookup
244 counts the leading zero bits in that byte, giving the result. */
245 #define count_leading_zeros(count, x) \
246 do { \
247 UWtype __clz__b, __clz__c, __clz__x = (x); \
248 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
249 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
250 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
251 __clz__x >>= __clz__b; \
252 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
253 __clz__b = 65 - __clz__b; \
254 (count) = __clz__b - __clz__c; \
255 } while (0)
256 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
257 #endif /* clz using cmpbge */
259 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
260 #if HAVE_ATTRIBUTE_CONST
261 long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
262 #else
263 long __MPN(count_leading_zeros) _PROTO ((UDItype));
264 #endif
265 #define count_leading_zeros(count, x) \
266 ((count) = __MPN(count_leading_zeros) (x))
267 #endif /* clz using mpn */
268 #endif /* __alpha */
270 #if defined (_CRAY) && W_TYPE_SIZE == 64
271 #include <intrinsics.h>
272 #define UDIV_PREINV_ALWAYS 1
273 #define UDIV_NEEDS_NORMALIZATION 1
274 #define UDIV_TIME 220
275 long __MPN(count_leading_zeros) _PROTO ((UDItype));
276 #define count_leading_zeros(count, x) \
277 ((count) = _leadz ((UWtype) (x)))
278 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
279 #define umul_ppmm(ph, pl, m0, m1) \
280 do { \
281 UDItype __m0 = (m0), __m1 = (m1); \
282 (ph) = _int_mult_upper (m0, m1); \
283 (pl) = __m0 * __m1; \
284 } while (0)
285 #ifndef LONGLONG_STANDALONE
286 #define udiv_qrnnd(q, r, n1, n0, d) \
287 do { UWtype __di; \
288 __di = __MPN(invert_limb) (d); \
289 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
290 } while (0)
291 #endif /* LONGLONG_STANDALONE */
292 #endif /* _CRAYIEEE */
293 #endif /* _CRAY */
295 #if defined (__ia64) && W_TYPE_SIZE == 64
296 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
297 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
298 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
299 register, which takes an extra cycle. */
300 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
301 do { \
302 UWtype __x; \
303 __x = (al) - (bl); \
304 if ((al) < (bl)) \
305 (sh) = (ah) - (bh) - 1; \
306 else \
307 (sh) = (ah) - (bh); \
308 (sl) = __x; \
309 } while (0)
310 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
311 /* Do both product parts in assembly, since that gives better code with
312 all gcc versions. Some callers will just use the upper part, and in
313 that situation we waste an instruction, but not any cycles. */
314 #define umul_ppmm(ph, pl, m0, m1) \
315 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
316 : "=&f" (ph), "=f" (pl) \
317 : "f" (m0), "f" (m1))
318 #define UMUL_TIME 14
319 #define count_leading_zeros(count, x) \
320 do { \
321 UWtype _x = (x), _y, _a, _c; \
322 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
323 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
324 _c = (_a - 1) << 3; \
325 _x >>= _c; \
326 if (_x >= 1 << 4) \
327 _x >>= 4, _c += 4; \
328 if (_x >= 1 << 2) \
329 _x >>= 2, _c += 2; \
330 _c += _x >> 1; \
331 (count) = W_TYPE_SIZE - 1 - _c; \
332 } while (0)
333 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
334 based, and we don't need a special case for x==0 here */
335 #define count_trailing_zeros(count, x) \
336 do { \
337 UWtype __ctz_x = (x); \
338 __asm__ ("popcnt %0 = %1" \
339 : "=r" (count) \
340 : "r" ((__ctz_x-1) & ~__ctz_x)); \
341 } while (0)
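/* Worked example (explanatory note, not in the original source): for
   x = 0b101000, (x-1) = 0b100111 and ~x ends in ...010111, so
   (x-1) & ~x = 0b000111; popcnt of that is 3, the number of trailing zeros
   in x.  The mask (x-1) & ~x isolates exactly the bits below the lowest set
   bit of x, so its population count is the trailing zero count.  */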
342 #endif
343 #if defined (__INTEL_COMPILER)
344 #include <ia64intrin.h>
345 #define umul_ppmm(ph, pl, m0, m1) \
346 do { \
347 UWtype _m0 = (m0), _m1 = (m1); \
348 ph = _m64_xmahu (_m0, _m1, 0); \
349 pl = _m0 * _m1; \
350 } while (0)
351 #endif
352 #ifndef LONGLONG_STANDALONE
353 #define udiv_qrnnd(q, r, n1, n0, d) \
354 do { UWtype __di; \
355 __di = __MPN(invert_limb) (d); \
356 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
357 } while (0)
358 #define UDIV_PREINV_ALWAYS 1
359 #define UDIV_NEEDS_NORMALIZATION 1
360 #endif
361 #define UDIV_TIME 220
362 #endif
365 #if defined (__GNUC__)
367 /* We sometimes need to clobber "cc" with gcc2, but that would not be
368 understood by gcc1. Use cpp to avoid major code duplication. */
369 #if __GNUC__ < 2
370 #define __CLOBBER_CC
371 #define __AND_CLOBBER_CC
372 #else /* __GNUC__ >= 2 */
373 #define __CLOBBER_CC : "cc"
374 #define __AND_CLOBBER_CC , "cc"
375 #endif /* __GNUC__ < 2 */
377 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
379 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
380 : "=r" (sh), "=&r" (sl) \
381 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
382 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
383 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
384 : "=r" (sh), "=&r" (sl) \
385 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
386 #define umul_ppmm(xh, xl, m0, m1) \
387 do { \
388 USItype __m0 = (m0), __m1 = (m1); \
389 __asm__ ("multiplu %0,%1,%2" \
390 : "=r" (xl) \
391 : "r" (__m0), "r" (__m1)); \
392 __asm__ ("multmu %0,%1,%2" \
393 : "=r" (xh) \
394 : "r" (__m0), "r" (__m1)); \
395 } while (0)
396 #define udiv_qrnnd(q, r, n1, n0, d) \
397 __asm__ ("dividu %0,%3,%4" \
398 : "=r" (q), "=q" (r) \
399 : "1" (n1), "r" (n0), "r" (d))
400 #define count_leading_zeros(count, x) \
401 __asm__ ("clz %0,%1" \
402 : "=r" (count) \
403 : "r" (x))
404 #define COUNT_LEADING_ZEROS_0 32
405 #endif /* __a29k__ */
407 #if defined (__arc__)
408 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
409 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
410 : "=r" (sh), \
411 "=&r" (sl) \
412 : "r" ((USItype) (ah)), \
413 "rIJ" ((USItype) (bh)), \
414 "%r" ((USItype) (al)), \
415 "rIJ" ((USItype) (bl)))
416 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
417 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
418 : "=r" (sh), \
419 "=&r" (sl) \
420 : "r" ((USItype) (ah)), \
421 "rIJ" ((USItype) (bh)), \
422 "r" ((USItype) (al)), \
423 "rIJ" ((USItype) (bl)))
424 #endif
426 #if defined (__arm__) && W_TYPE_SIZE == 32
427 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
428 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
429 : "=r" (sh), "=&r" (sl) \
430 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
431 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
432 do { \
433 if (__builtin_constant_p (al)) \
434 { \
435 if (__builtin_constant_p (ah)) \
436 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
437 : "=r" (sh), "=&r" (sl) \
438 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
439 else \
440 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
443 } \
444 else if (__builtin_constant_p (ah)) \
445 { \
446 if (__builtin_constant_p (bl)) \
447 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
448 : "=r" (sh), "=&r" (sl) \
449 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
450 else \
451 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
452 : "=r" (sh), "=&r" (sl) \
453 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
454 } \
455 else if (__builtin_constant_p (bl)) \
456 { \
457 if (__builtin_constant_p (bh)) \
458 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
459 : "=r" (sh), "=&r" (sl) \
460 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
461 else \
462 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
463 : "=r" (sh), "=&r" (sl) \
464 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
465 } \
466 else /* only bh might be a constant */ \
467 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
468 : "=r" (sh), "=&r" (sl) \
469 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
470 } while (0)
471 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
472 #define umul_ppmm(xh, xl, a, b) \
473 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
474 #define UMUL_TIME 5
475 #define smul_ppmm(xh, xl, a, b) \
476 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
477 #ifndef LONGLONG_STANDALONE
478 #define udiv_qrnnd(q, r, n1, n0, d) \
479 do { UWtype __di; \
480 __di = __MPN(invert_limb) (d); \
481 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
482 } while (0)
483 #define UDIV_PREINV_ALWAYS 1
484 #define UDIV_NEEDS_NORMALIZATION 1
485 #define UDIV_TIME 70
486 #endif /* LONGLONG_STANDALONE */
487 #else
488 #define umul_ppmm(xh, xl, a, b) \
489 __asm__ ("%@ Inlined umul_ppmm\n" \
490 " mov %|r0, %2, lsr #16\n" \
491 " mov %|r2, %3, lsr #16\n" \
492 " bic %|r1, %2, %|r0, lsl #16\n" \
493 " bic %|r2, %3, %|r2, lsl #16\n" \
494 " mul %1, %|r1, %|r2\n" \
495 " mul %|r2, %|r0, %|r2\n" \
496 " mul %|r1, %0, %|r1\n" \
497 " mul %0, %|r0, %0\n" \
498 " adds %|r1, %|r2, %|r1\n" \
499 " addcs %0, %0, #65536\n" \
500 " adds %1, %1, %|r1, lsl #16\n" \
501 " adc %0, %0, %|r1, lsr #16" \
502 : "=&r" (xh), "=r" (xl) \
503 : "r" (a), "r" (b) \
504 : "r0", "r1", "r2")
505 #define UMUL_TIME 20
506 #ifndef LONGLONG_STANDALONE
507 #define udiv_qrnnd(q, r, n1, n0, d) \
508 do { UWtype __r; \
509 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
510 (r) = __r; \
511 } while (0)
512 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
513 #define UDIV_TIME 200
514 #endif /* LONGLONG_STANDALONE */
515 #endif
516 #if defined (__ARM_ARCH_5__)
517 /* This actually requires arm 5 */
518 #define count_leading_zeros(count, x) \
519 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
520 #define COUNT_LEADING_ZEROS_0 32
521 #endif
522 #endif /* __arm__ */
524 #if defined (__clipper__) && W_TYPE_SIZE == 32
525 #define umul_ppmm(w1, w0, u, v) \
526 ({union {UDItype __ll; \
527 struct {USItype __l, __h;} __i; \
528 } __x; \
529 __asm__ ("mulwux %2,%0" \
530 : "=r" (__x.__ll) \
531 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
532 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
533 #define smul_ppmm(w1, w0, u, v) \
534 ({union {DItype __ll; \
535 struct {SItype __l, __h;} __i; \
536 } __x; \
537 __asm__ ("mulwx %2,%0" \
538 : "=r" (__x.__ll) \
539 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
540 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
541 #define __umulsidi3(u, v) \
542 ({UDItype __w; \
543 __asm__ ("mulwux %2,%0" \
544 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
545 __w; })
546 #endif /* __clipper__ */
548 /* Fujitsu vector computers. */
549 #if defined (__uxp__) && W_TYPE_SIZE == 32
550 #define umul_ppmm(ph, pl, u, v) \
551 do { \
552 union {UDItype __ll; \
553 struct {USItype __h, __l;} __i; \
554 } __x; \
555 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
556 (ph) = __x.__i.__h; \
557 (pl) = __x.__i.__l; \
558 } while (0)
559 #define smul_ppmm(ph, pl, u, v) \
560 do { \
561 union {UDItype __ll; \
562 struct {USItype __h, __l;} __i; \
563 } __x; \
564 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
565 (ph) = __x.__i.__h; \
566 (pl) = __x.__i.__l; \
567 } while (0)
568 #endif
570 #if defined (__gmicro__) && W_TYPE_SIZE == 32
571 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
572 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
573 : "=g" (sh), "=&g" (sl) \
574 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
575 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
578 : "=g" (sh), "=&g" (sl) \
579 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
580 "1" ((USItype)(al)), "g" ((USItype)(bl)))
581 #define umul_ppmm(ph, pl, m0, m1) \
582 __asm__ ("mulx %3,%0,%1" \
583 : "=g" (ph), "=r" (pl) \
584 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
585 #define udiv_qrnnd(q, r, nh, nl, d) \
586 __asm__ ("divx %4,%0,%1" \
587 : "=g" (q), "=r" (r) \
588 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
589 #define count_leading_zeros(count, x) \
590 __asm__ ("bsch/1 %1,%0" \
591 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
592 #endif
594 #if defined (__hppa) && W_TYPE_SIZE == 32
595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
596 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
597 : "=r" (sh), "=&r" (sl) \
598 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
599 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
600 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
601 : "=r" (sh), "=&r" (sl) \
602 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
603 #if defined (_PA_RISC1_1)
604 #define umul_ppmm(wh, wl, u, v) \
605 do { \
606 union {UDItype __ll; \
607 struct {USItype __h, __l;} __i; \
608 } __x; \
609 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
610 (wh) = __x.__i.__h; \
611 (wl) = __x.__i.__l; \
612 } while (0)
613 #define UMUL_TIME 8
614 #define UDIV_TIME 60
615 #else
616 #define UMUL_TIME 40
617 #define UDIV_TIME 80
618 #endif
619 #define count_leading_zeros(count, x) \
620 do { \
621 USItype __tmp; \
622 __asm__ ( \
623 "ldi 1,%0\n" \
624 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
625 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
626 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
627 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
628 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
629 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
630 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
631 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
632 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
633 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
634 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
635 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
636 " extru %1,30,1,%1 ; Extract bit 1.\n" \
637 " sub %0,%1,%0 ; Subtract it.\n" \
638 : "=r" (count), "=r" (__tmp) : "1" (x)); \
639 } while (0)
640 #endif /* hppa */
642 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
643 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
644 is just a case of no direct support for 2.0n but treating it like 1.0. */
645 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
646 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
647 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
648 : "=r" (sh), "=&r" (sl) \
649 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
650 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
651 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
652 : "=r" (sh), "=&r" (sl) \
653 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
654 #endif /* hppa */
656 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
657 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
658 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
659 do { \
660 /* if (__builtin_constant_p (bl)) \
661 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
662 : "=r" (sh), "=&r" (sl) \
663 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
664 else \
665 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
666 : "=r" (sh), "=&r" (sl) \
667 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
668 } while (0)
669 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
670 do { \
671 /* if (__builtin_constant_p (bl)) \
672 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
673 : "=r" (sh), "=&r" (sl) \
674 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
675 else \
676 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
677 : "=r" (sh), "=&r" (sl) \
678 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
679 } while (0)
680 #if __GMP_GNUC_PREREQ (4,5)
681 #define umul_ppmm(xh, xl, m0, m1) \
682 do { \
683 union {UDItype __ll; \
684 struct {USItype __h, __l;} __i; \
685 } __x; \
686 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
687 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
688 } while (0)
689 #else
690 #if 0
691 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
692 with a new enough processor pretending we have 32-bit registers. */
693 #define umul_ppmm(xh, xl, m0, m1) \
694 do { \
695 union {UDItype __ll; \
696 struct {USItype __h, __l;} __i; \
697 } __x; \
698 __asm__ ("mlr\t%0,%2" \
699 : "=r" (__x.__ll) \
700 : "%0" (m0), "r" (m1)); \
701 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
702 } while (0)
703 #else
704 #define umul_ppmm(xh, xl, m0, m1) \
705 do { \
706 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
707 DImode for the product, since that would be allocated to a single 64-bit
708 register, whereas mlr uses the low 32-bits of an even-odd register pair.
709 */ \
710 register USItype __r0 __asm__ ("0"); \
711 register USItype __r1 __asm__ ("1") = (m0); \
712 __asm__ ("mlr\t%0,%3" \
713 : "=r" (__r0), "=r" (__r1) \
714 : "r" (__r1), "r" (m1)); \
715 (xh) = __r0; (xl) = __r1; \
716 } while (0)
717 #endif /* if 0 */
718 #endif
719 #if 0
720 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
721 with a new enough processor pretending we have 32-bit registers. */
722 #define udiv_qrnnd(q, r, n1, n0, d) \
723 do { \
724 union {UDItype __ll; \
725 struct {USItype __h, __l;} __i; \
726 } __x; \
727 __x.__i.__h = n1; __x.__i.__l = n0; \
728 __asm__ ("dlr\t%0,%2" \
729 : "=r" (__x.__ll) \
730 : "0" (__x.__ll), "r" (d)); \
731 (q) = __x.__i.__l; (r) = __x.__i.__h; \
732 } while (0)
733 #else
734 #define udiv_qrnnd(q, r, n1, n0, d) \
735 do { \
736 register USItype __r0 __asm__ ("0") = (n1); \
737 register USItype __r1 __asm__ ("1") = (n0); \
738 __asm__ ("dlr\t%0,%4" \
739 : "=r" (__r0), "=r" (__r1) \
740 : "r" (__r0), "r" (__r1), "r" (d)); \
741 (q) = __r1; (r) = __r0; \
742 } while (0)
743 #endif /* if 0 */
744 #else /* if __zarch__ */
745 /* FIXME: this fails if gcc knows about the 64-bit registers. */
746 #define smul_ppmm(xh, xl, m0, m1) \
747 do { \
748 union {DItype __ll; \
749 struct {USItype __h, __l;} __i; \
750 } __x; \
751 __asm__ ("mr\t%0,%2" \
752 : "=r" (__x.__ll) \
753 : "%0" (m0), "r" (m1)); \
754 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
755 } while (0)
756 /* FIXME: this fails if gcc knows about the 64-bit registers. */
757 #define sdiv_qrnnd(q, r, n1, n0, d) \
758 do { \
759 union {DItype __ll; \
760 struct {USItype __h, __l;} __i; \
761 } __x; \
762 __x.__i.__h = n1; __x.__i.__l = n0; \
763 __asm__ ("dr\t%0,%2" \
764 : "=r" (__x.__ll) \
765 : "0" (__x.__ll), "r" (d)); \
766 (q) = __x.__i.__l; (r) = __x.__i.__h; \
767 } while (0)
768 #endif /* if __zarch__ */
769 #endif
771 #if defined (__s390x__) && W_TYPE_SIZE == 64
772 /* We need to cast operands with register constraints, otherwise their types
773 will be assumed to be SImode by gcc. For these machines, such operations
774 will insert a value into the low 32 bits, and leave the high 32 bits with
775 garbage. */
776 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
777 do { \
778 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
779 : "=r" (sh), "=&r" (sl) \
780 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
781 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
782 } while (0)
783 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
784 do { \
785 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
786 : "=r" (sh), "=&r" (sl) \
787 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
788 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
789 } while (0)
790 #define umul_ppmm(xh, xl, m0, m1) \
791 do { \
792 union {unsigned int __attribute__ ((mode(TI))) __ll; \
793 struct {UDItype __h, __l;} __i; \
794 } __x; \
795 __asm__ ("mlgr\t%0,%2" \
796 : "=r" (__x.__ll) \
797 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
798 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
799 } while (0)
800 #define udiv_qrnnd(q, r, n1, n0, d) \
801 do { \
802 union {unsigned int __attribute__ ((mode(TI))) __ll; \
803 struct {UDItype __h, __l;} __i; \
804 } __x; \
805 __x.__i.__h = n1; __x.__i.__l = n0; \
806 __asm__ ("dlgr\t%0,%2" \
807 : "=r" (__x.__ll) \
808 : "0" (__x.__ll), "r" ((UDItype)(d))); \
809 (q) = __x.__i.__l; (r) = __x.__i.__h; \
810 } while (0)
811 #if 0 /* FIXME: Enable for z10 (?) */
812 #define count_leading_zeros(cnt, x) \
813 do { \
814 union {unsigned int __attribute__ ((mode(TI))) __ll; \
815 struct {UDItype __h, __l;} __i; \
816 } __clr_cnt; \
817 __asm__ ("flogr\t%0,%1" \
818 : "=r" (__clr_cnt.__ll) \
819 : "r" (x) __CLOBBER_CC); \
820 (cnt) = __clr_cnt.__i.__h; \
821 } while (0)
822 #endif
823 #endif
825 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
826 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
827 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
828 : "=r" (sh), "=&r" (sl) \
829 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
830 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
831 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
832 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
833 : "=r" (sh), "=&r" (sl) \
834 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
835 "1" ((USItype)(al)), "g" ((USItype)(bl)))
836 #define umul_ppmm(w1, w0, u, v) \
837 __asm__ ("mull %3" \
838 : "=a" (w0), "=d" (w1) \
839 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
840 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
841 __asm__ ("divl %4" /* stringification in K&R C */ \
842 : "=a" (q), "=d" (r) \
843 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
845 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
846 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
847 significant 1 bit is, hence the use of the following alternatives. bsfl
848 is slow too, between 18 and 42 depending where the least significant 1
849 bit is, so let the generic count_trailing_zeros below make use of the
850 count_leading_zeros here too. */
852 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
853 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
854 cache miss reading from __clz_tab. For P55 it's favoured over the float
855 below so as to avoid mixing MMX and x87, since the penalty for switching
856 between the two is about 100 cycles.
858 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
859 16, -1 for 8, or 0 otherwise. This could be written equivalently as
860 follows, but as of gcc 2.95.2 it results in conditional jumps.
862 __shift = -(__n < 0x1000000);
863 __shift -= (__n < 0x10000);
864 __shift -= (__n < 0x100);
866 The middle two sbbl and cmpl's pair, and with luck something gcc
867 generates might pair with the first cmpl and the last sbbl. The "32+1"
868 constant could be folded into __clz_tab[], but it doesn't seem worth
869 making a different table just for that. */
871 #define count_leading_zeros(c,n) \
872 do { \
873 USItype __n = (n); \
874 USItype __shift; \
875 __asm__ ("cmpl $0x1000000, %1\n" \
876 "sbbl %0, %0\n" \
877 "cmpl $0x10000, %1\n" \
878 "sbbl $0, %0\n" \
879 "cmpl $0x100, %1\n" \
880 "sbbl $0, %0\n" \
881 : "=&r" (__shift) : "r" (__n)); \
882 __shift = __shift*8 + 24 + 1; \
883 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
884 } while (0)
885 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
886 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
888 #else /* ! pentiummmx || LONGLONG_STANDALONE */
889 /* The following should be a fixed 14 cycles or so. Some scheduling
890 opportunities should be available between the float load/store too. This
891 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
892 apparently suggested by the Intel optimizing manual (don't know exactly
893 where). gcc 2.95 or up will be best for this, so the "double" is
894 correctly aligned on the stack. */
895 #define count_leading_zeros(c,n) \
896 do { \
897 union { \
898 double d; \
899 unsigned a[2]; \
900 } __u; \
901 ASSERT ((n) != 0); \
902 __u.d = (UWtype) (n); \
903 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
904 } while (0)
905 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
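/* Worked example (explanatory note, not in the original source): the union
   trick reads the IEEE double's biased exponent from the high word
   (__u.a[1], little-endian x86).  For n == 1, __u.d is 1.0 with biased
   exponent 0x3FF, giving c = 0x3FF + 31 - 0x3FF = 31; for n == 0x80000000
   the biased exponent is 0x3FF + 31, giving c = 0.  In general the exponent
   encodes floor(log2(n)), and 31 minus that is the leading zero count.  */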
906 #endif /* pentiummx */
908 #else /* ! pentium */
910 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
911 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
912 #endif /* gcc clz */
914 /* On P6, gcc prior to 3.0 generates a partial register stall for
915 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
916 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
917 cost of one extra instruction. Do this for "i386" too, since that means
918 generic x86. */
919 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
920 && (HAVE_HOST_CPU_i386 \
921 || HAVE_HOST_CPU_i686 \
922 || HAVE_HOST_CPU_pentiumpro \
923 || HAVE_HOST_CPU_pentium2 \
924 || HAVE_HOST_CPU_pentium3)
925 #define count_leading_zeros(count, x) \
926 do { \
927 USItype __cbtmp; \
928 ASSERT ((x) != 0); \
929 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
930 (count) = 31 - __cbtmp; \
931 } while (0)
932 #endif /* gcc<3 asm bsrl */
934 #ifndef count_leading_zeros
935 #define count_leading_zeros(count, x) \
936 do { \
937 USItype __cbtmp; \
938 ASSERT ((x) != 0); \
939 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
940 (count) = __cbtmp ^ 31; \
941 } while (0)
942 #endif /* asm bsrl */
944 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
945 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
946 #endif /* gcc ctz */
948 #ifndef count_trailing_zeros
949 #define count_trailing_zeros(count, x) \
950 do { \
951 ASSERT ((x) != 0); \
952 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
953 } while (0)
954 #endif /* asm bsfl */
956 #endif /* ! pentium */
958 #ifndef UMUL_TIME
959 #define UMUL_TIME 10
960 #endif
961 #ifndef UDIV_TIME
962 #define UDIV_TIME 40
963 #endif
964 #endif /* 80x86 */
966 #if defined (__amd64__) && W_TYPE_SIZE == 64
967 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
968 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
969 : "=r" (sh), "=&r" (sl) \
970 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
971 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
972 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
973 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
974 : "=r" (sh), "=&r" (sl) \
975 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
976 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
977 #define umul_ppmm(w1, w0, u, v) \
978 __asm__ ("mulq %3" \
979 : "=a" (w0), "=d" (w1) \
980 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
981 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
982 __asm__ ("divq %4" /* stringification in K&R C */ \
983 : "=a" (q), "=d" (r) \
984 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
985 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
986 #define count_leading_zeros(count, x) \
987 do { \
988 UDItype __cbtmp; \
989 ASSERT ((x) != 0); \
990 __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
991 (count) = __cbtmp ^ 63; \
992 } while (0)
993 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
994 count is only an int. */
995 #define count_trailing_zeros(count, x) \
996 do { \
997 ASSERT ((x) != 0); \
998 __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
999 } while (0)
1000 #endif /* x86_64 */
1002 #if defined (__i860__) && W_TYPE_SIZE == 32
1003 #define rshift_rhlc(r,h,l,c) \
1004 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1005 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1006 #endif /* i860 */
1008 #if defined (__i960__) && W_TYPE_SIZE == 32
1009 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1010 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1011 : "=r" (sh), "=&r" (sl) \
1012 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1013 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1014 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1015 : "=r" (sh), "=&r" (sl) \
1016 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1017 #define umul_ppmm(w1, w0, u, v) \
1018 ({union {UDItype __ll; \
1019 struct {USItype __l, __h;} __i; \
1020 } __x; \
1021 __asm__ ("emul %2,%1,%0" \
1022 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1023 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1024 #define __umulsidi3(u, v) \
1025 ({UDItype __w; \
1026 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1027 __w; })
1028 #define udiv_qrnnd(q, r, nh, nl, d) \
1029 do { \
1030 union {UDItype __ll; \
1031 struct {USItype __l, __h;} __i; \
1032 } __nn, __rq; \
1033 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1034 __asm__ ("ediv %d,%n,%0" \
1035 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1036 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1037 } while (0)
1038 #define count_leading_zeros(count, x) \
1039 do { \
1040 USItype __cbtmp; \
1041 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1042 (count) = __cbtmp ^ 31; \
1043 } while (0)
1044 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1045 #if defined (__i960mx) /* what is the proper symbol to test??? */
1046 #define rshift_rhlc(r,h,l,c) \
1047 do { \
1048 union {UDItype __ll; \
1049 struct {USItype __l, __h;} __i; \
1050 } __nn; \
1051 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1052 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1054 #endif /* i960mx */
1055 #endif /* i960 */
1057 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1058 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1059 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1060 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1061 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1062 : "=d" (sh), "=&d" (sl) \
1063 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1064 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1065 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1066 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1067 : "=d" (sh), "=&d" (sl) \
1068 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1069 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1070 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1071 #if defined (__mc68020__) || defined(mc68020) \
1072 || defined (__mc68030__) || defined (mc68030) \
1073 || defined (__mc68040__) || defined (mc68040) \
1074 || defined (__mcpu32__) || defined (mcpu32) \
1075 || defined (__NeXT__)
1076 #define umul_ppmm(w1, w0, u, v) \
1077 __asm__ ("mulu%.l %3,%1:%0" \
1078 : "=d" (w0), "=d" (w1) \
1079 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1080 #define UMUL_TIME 45
1081 #define udiv_qrnnd(q, r, n1, n0, d) \
1082 __asm__ ("divu%.l %4,%1:%0" \
1083 : "=d" (q), "=d" (r) \
1084 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1085 #define UDIV_TIME 90
1086 #define sdiv_qrnnd(q, r, n1, n0, d) \
1087 __asm__ ("divs%.l %4,%1:%0" \
1088 : "=d" (q), "=d" (r) \
1089 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1090 #else /* for other 68k family members use 16x16->32 multiplication */
1091 #define umul_ppmm(xh, xl, a, b) \
1092 do { USItype __umul_tmp1, __umul_tmp2; \
1093 __asm__ ("| Inlined umul_ppmm\n" \
1094 " move%.l %5,%3\n" \
1095 " move%.l %2,%0\n" \
1096 " move%.w %3,%1\n" \
1097 " swap %3\n" \
1098 " swap %0\n" \
1099 " mulu%.w %2,%1\n" \
1100 " mulu%.w %3,%0\n" \
1101 " mulu%.w %2,%3\n" \
1102 " swap %2\n" \
1103 " mulu%.w %5,%2\n" \
1104 " add%.l %3,%2\n" \
1105 " jcc 1f\n" \
1106 " add%.l %#0x10000,%0\n" \
1107 "1: move%.l %2,%3\n" \
1108 " clr%.w %2\n" \
1109 " swap %2\n" \
1110 " swap %3\n" \
1111 " clr%.w %3\n" \
1112 " add%.l %3,%1\n" \
1113 " addx%.l %2,%0\n" \
1114 " | End inlined umul_ppmm" \
1115 : "=&d" (xh), "=&d" (xl), \
1116 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1117 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1118 } while (0)
1119 #define UMUL_TIME 100
1120 #define UDIV_TIME 400
1121 #endif /* not mc68020 */
1122 /* The '020, '030, '040 and '060 have bitfield insns.
1123 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1124 exclude bfffo on that chip (bitfield insns not available). */
1125 #if (defined (__mc68020__) || defined (mc68020) \
1126 || defined (__mc68030__) || defined (mc68030) \
1127 || defined (__mc68040__) || defined (mc68040) \
1128 || defined (__mc68060__) || defined (mc68060) \
1129 || defined (__NeXT__)) \
1130 && ! defined (__mcpu32__)
1131 #define count_leading_zeros(count, x) \
1132 __asm__ ("bfffo %1{%b2:%b2},%0" \
1133 : "=d" (count) \
1134 : "od" ((USItype) (x)), "n" (0))
1135 #define COUNT_LEADING_ZEROS_0 32
1136 #endif
1137 #endif /* mc68000 */
1139 #if defined (__m88000__) && W_TYPE_SIZE == 32
1140 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1141 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1142 : "=r" (sh), "=&r" (sl) \
1143 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1144 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1145 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1146 : "=r" (sh), "=&r" (sl) \
1147 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1148 #define count_leading_zeros(count, x) \
1149 do { \
1150 USItype __cbtmp; \
1151 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1152 (count) = __cbtmp ^ 31; \
1153 } while (0)
1154 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1155 #if defined (__m88110__)
1156 #define umul_ppmm(wh, wl, u, v) \
1157 do { \
1158 union {UDItype __ll; \
1159 struct {USItype __h, __l;} __i; \
1160 } __x; \
1161 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1162 (wh) = __x.__i.__h; \
1163 (wl) = __x.__i.__l; \
1164 } while (0)
1165 #define udiv_qrnnd(q, r, n1, n0, d) \
1166 ({union {UDItype __ll; \
1167 struct {USItype __h, __l;} __i; \
1168 } __x, __q; \
1169 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1170 __asm__ ("divu.d %0,%1,%2" \
1171 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1172 (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1173 #define UMUL_TIME 5
1174 #define UDIV_TIME 25
1175 #else
1176 #define UMUL_TIME 17
1177 #define UDIV_TIME 150
1178 #endif /* __m88110__ */
1179 #endif /* __m88000__ */
1181 #if defined (__mips) && W_TYPE_SIZE == 32
1182 #if __GMP_GNUC_PREREQ (4,4)
1183 #define umul_ppmm(w1, w0, u, v) \
1184 do { \
1185 UDItype __ll = (UDItype)(u) * (v); \
1186 w1 = __ll >> 32; \
1187 w0 = __ll; \
1188 } while (0)
1189 #endif
1190 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1191 #define umul_ppmm(w1, w0, u, v) \
1192 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1193 #endif
1194 #if !defined (umul_ppmm)
1195 #define umul_ppmm(w1, w0, u, v) \
1196 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1197 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1198 #endif
1199 #define UMUL_TIME 10
1200 #define UDIV_TIME 100
1201 #endif /* __mips */
1203 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1204 #if __GMP_GNUC_PREREQ (4,4)
1205 #define umul_ppmm(w1, w0, u, v) \
1206 do { \
1207 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1208 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1209 w1 = __ll >> 64; \
1210 w0 = __ll; \
1211 } while (0)
1212 #endif
1213 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1214 #define umul_ppmm(w1, w0, u, v) \
1215 __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1216 #endif
1217 #if !defined (umul_ppmm)
1218 #define umul_ppmm(w1, w0, u, v) \
1219 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1220 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1221 #endif
1222 #define UMUL_TIME 20
1223 #define UDIV_TIME 140
1224 #endif /* __mips */
1226 #if defined (__mmix__) && W_TYPE_SIZE == 64
1227 #define umul_ppmm(w1, w0, u, v) \
1228 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1229 #endif
1231 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1232 #define umul_ppmm(w1, w0, u, v) \
1233 ({union {UDItype __ll; \
1234 struct {USItype __l, __h;} __i; \
1235 } __x; \
1236 __asm__ ("meid %2,%0" \
1237 : "=g" (__x.__ll) \
1238 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1239 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1240 #define __umulsidi3(u, v) \
1241 ({UDItype __w; \
1242 __asm__ ("meid %2,%0" \
1243 : "=g" (__w) \
1244 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1245 __w; })
1246 #define udiv_qrnnd(q, r, n1, n0, d) \
1247 ({union {UDItype __ll; \
1248 struct {USItype __l, __h;} __i; \
1249 } __x; \
1250 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1251 __asm__ ("deid %2,%0" \
1252 : "=g" (__x.__ll) \
1253 : "0" (__x.__ll), "g" ((USItype)(d))); \
1254 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1255 #define count_trailing_zeros(count,x) \
1256 do { \
1257 __asm__ ("ffsd %2,%0" \
1258 : "=r" (count) \
1259 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1260 } while (0)
1261 #endif /* __ns32000__ */
1263 /* In the past we had a block of various #defines tested
1264 _ARCH_PPC - AIX
1265 _ARCH_PWR - AIX
1266 __powerpc__ - gcc
1267 __POWERPC__ - BEOS
1268 __ppc__ - Darwin
1269 PPC - old gcc, GNU/Linux, SysV
1270 The plain PPC test was not good for vxWorks, since PPC is defined on all
1271 CPUs there (eg. m68k too), as a constant one is expected to compare
1272 CPU_FAMILY against.
1274 At any rate, this was pretty unattractive and a bit fragile. The use of
1275 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1276 getting the desired effect.
1278 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1279 the system vendor compilers. (Is that vendor compilers with inline asm,
1280 or what?) */
1282 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1283 && W_TYPE_SIZE == 32
1284 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1285 do { \
1286 if (__builtin_constant_p (bh) && (bh) == 0) \
1287 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1288 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1289 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1290 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1291 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1292 else \
1293 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1294 : "=r" (sh), "=&r" (sl) \
1295 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1296 } while (0)
1297 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1298 do { \
1299 if (__builtin_constant_p (ah) && (ah) == 0) \
1300 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1301 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1302 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1303 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1304 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1305 else if (__builtin_constant_p (bh) && (bh) == 0) \
1306 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1307 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1308 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1309 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1310 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1311 else \
1312 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1313 : "=r" (sh), "=&r" (sl) \
1314 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1315 } while (0)
1316 #define count_leading_zeros(count, x) \
1317 __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1318 #define COUNT_LEADING_ZEROS_0 32
1319 #if HAVE_HOST_CPU_FAMILY_powerpc
1320 #if __GMP_GNUC_PREREQ (4,4)
1321 #define umul_ppmm(w1, w0, u, v) \
1322 do { \
1323 UDItype __ll = (UDItype)(u) * (v); \
1324 w1 = __ll >> 32; \
1325 w0 = __ll; \
1326 } while (0)
1327 #endif
1328 #if !defined (umul_ppmm)
1329 #define umul_ppmm(ph, pl, m0, m1) \
1330 do { \
1331 USItype __m0 = (m0), __m1 = (m1); \
1332 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1333 (pl) = __m0 * __m1; \
1334 } while (0)
1335 #endif
1336 #define UMUL_TIME 15
1337 #define smul_ppmm(ph, pl, m0, m1) \
1338 do { \
1339 SItype __m0 = (m0), __m1 = (m1); \
1340 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1341 (pl) = __m0 * __m1; \
1342 } while (0)
1343 #define SMUL_TIME 14
1344 #define UDIV_TIME 120
1345 #else
1346 #define UMUL_TIME 8
1347 #define smul_ppmm(xh, xl, m0, m1) \
1348 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1349 #define SMUL_TIME 4
1350 #define sdiv_qrnnd(q, r, nh, nl, d) \
1351 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1352 #define UDIV_TIME 100
1353 #endif
1354 #endif /* 32-bit POWER architecture variants. */
1356 /* We should test _IBMR2 here when we add assembly support for the system
1357 vendor compilers. */
1358 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1359 #if !defined (_LONG_LONG_LIMB)
1360 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1361 use adde etc only when not _LONG_LONG_LIMB. */
1362 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1363 do { \
1364 if (__builtin_constant_p (bh) && (bh) == 0) \
1365 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1366 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1367 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1368 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1369 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1370 else \
1371 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1372 : "=r" (sh), "=&r" (sl) \
1373 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1374 } while (0)
1375 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1376 This might seem strange, but gcc folds away the dead code late. */
1377 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1378 do { \
1379 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1380 if (__builtin_constant_p (ah) && (ah) == 0) \
1381 __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2" \
1382 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1383 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1384 __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2" \
1385 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1386 else if (__builtin_constant_p (bh) && (bh) == 0) \
1387 __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2" \
1388 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1389 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1390 __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2" \
1391 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1392 else \
1393 __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2" \
1394 : "=r" (sh), "=&r" (sl) \
1395 : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \
1396 } else { \
1397 if (__builtin_constant_p (ah) && (ah) == 0) \
1398 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1399 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1400 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1401 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1402 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1403 else if (__builtin_constant_p (bh) && (bh) == 0) \
1404 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1405 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1406 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1407 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1408 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1409 else \
1410 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1411 : "=r" (sh), "=&r" (sl) \
1412 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1413 } \
1414 } while (0)
1415 #endif /* ! _LONG_LONG_LIMB */
1416 #define count_leading_zeros(count, x) \
1417 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1418 #define COUNT_LEADING_ZEROS_0 64
1419 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1420 #define umul_ppmm(w1, w0, u, v) \
1421 do { \
1422 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1423 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1424 w1 = __ll >> 64; \
1425 w0 = __ll; \
1426 } while (0)
1427 #endif
1428 #if !defined (umul_ppmm)
1429 #define umul_ppmm(ph, pl, m0, m1) \
1430 do { \
1431 UDItype __m0 = (m0), __m1 = (m1); \
1432 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1433 (pl) = __m0 * __m1; \
1434 } while (0)
1435 #endif
1436 #define UMUL_TIME 15
1437 #define smul_ppmm(ph, pl, m0, m1) \
1438 do { \
1439 DItype __m0 = (m0), __m1 = (m1); \
1440 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1441 (pl) = __m0 * __m1; \
1442 } while (0)
1443 #define SMUL_TIME 14 /* ??? */
1444 #define UDIV_TIME 120 /* ??? */
1445 #endif /* 64-bit PowerPC. */
1447 #if defined (__pyr__) && W_TYPE_SIZE == 32
1448 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1449 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1450 : "=r" (sh), "=&r" (sl) \
1451 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1452 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1454 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1455 : "=r" (sh), "=&r" (sl) \
1456 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1457 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1458 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1459 #define umul_ppmm(w1, w0, u, v) \
1460 ({union {UDItype __ll; \
1461 struct {USItype __h, __l;} __i; \
1462 } __x; \
1463 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1464 : "=&r" (__x.__ll) \
1465 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1466 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1467 #endif /* __pyr__ */
1469 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1470 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1471 __asm__ ("a %1,%5\n\tae %0,%3" \
1472 : "=r" (sh), "=&r" (sl) \
1473 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1474 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1475 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1476 __asm__ ("s %1,%5\n\tse %0,%3" \
1477 : "=r" (sh), "=&r" (sl) \
1478 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1479 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1480 #define smul_ppmm(ph, pl, m0, m1) \
1481 __asm__ ( \
1482 "s r2,r2\n" \
1483 " mts r10,%2\n" \
1484 " m r2,%3\n" \
1485 " m r2,%3\n" \
1486 " m r2,%3\n" \
1487 " m r2,%3\n" \
1488 " m r2,%3\n" \
1489 " m r2,%3\n" \
1490 " m r2,%3\n" \
1491 " m r2,%3\n" \
1492 " m r2,%3\n" \
1493 " m r2,%3\n" \
1494 " m r2,%3\n" \
1495 " m r2,%3\n" \
1496 " m r2,%3\n" \
1497 " m r2,%3\n" \
1498 " m r2,%3\n" \
1499 " m r2,%3\n" \
1500 " cas %0,r2,r0\n" \
1501 " mfs r10,%1" \
1502 : "=r" (ph), "=r" (pl) \
1503 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1504 : "r2")
1505 #define UMUL_TIME 20
1506 #define UDIV_TIME 200
1507 #define count_leading_zeros(count, x) \
1508 do { \
1509 if ((x) >= 0x10000) \
1510 __asm__ ("clz %0,%1" \
1511 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1512 else \
1513 { \
1514 __asm__ ("clz %0,%1" \
1515 : "=r" (count) : "r" ((USItype)(x))); \
1516 (count) += 16; \
1517 } \
1518 } while (0)
1519 #endif /* RT/ROMP */
1521 #if defined (__sh2__) && W_TYPE_SIZE == 32
1522 #define umul_ppmm(w1, w0, u, v) \
1523 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1524 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1525 #define UMUL_TIME 5
1526 #endif
1528 #if defined (__sparc__) && W_TYPE_SIZE == 32
1529 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1530 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1531 : "=r" (sh), "=&r" (sl) \
1532 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1533 __CLOBBER_CC)
1534 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1535 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1536 : "=r" (sh), "=&r" (sl) \
1537 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1538 __CLOBBER_CC)
1539 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1540 doesn't define anything to indicate that to us, it only sets __sparcv8. */
1541 #if defined (__sparc_v9__) || defined (__sparcv9)
1542 /* Perhaps we should use floating-point operations here? */
1543 #if 0
1544 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1545 Perhaps we simply need to explicitly zero-extend the inputs? */
1546 #define umul_ppmm(w1, w0, u, v) \
1547 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1548 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1549 #else
1550 /* Use the v8 umul until the above bug is fixed. */
1551 #define umul_ppmm(w1, w0, u, v) \
1552 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1553 #endif
1554 /* Use a plain v8 divide for v9. */
1555 #define udiv_qrnnd(q, r, n1, n0, d) \
1556 do { \
1557 USItype __q; \
1558 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1559 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1560 (r) = (n0) - __q * (d); \
1561 (q) = __q; \
1562 } while (0)
1563 #else
1564 #if defined (__sparc_v8__) /* gcc normal */ \
1565 || defined (__sparcv8) /* gcc solaris */ \
1566 || HAVE_HOST_CPU_supersparc
1567 /* Don't match an immediate range because: 1) it is not often useful, and
1568 2) the 'I' constraint treats the range as a 13-bit signed interval,
1569 whereas we want to match a 13-bit interval sign-extended to 32 bits
1570 but INTERPRETED AS UNSIGNED. */
1571 #define umul_ppmm(w1, w0, u, v) \
1572 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1573 #define UMUL_TIME 5
1575 #if HAVE_HOST_CPU_supersparc
1576 #define UDIV_TIME 60 /* SuperSPARC timing */
1577 #else
1578 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1579 dividends and will trap to the kernel for the rest. */
1580 #define udiv_qrnnd(q, r, n1, n0, d) \
1581 do { \
1582 USItype __q; \
1583 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1584 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1585 (r) = (n0) - __q * (d); \
1586 (q) = __q; \
1587 } while (0)
1588 #define UDIV_TIME 25
1589 #endif /* HAVE_HOST_CPU_supersparc */
1591 #else /* ! __sparc_v8__ */
1592 #if defined (__sparclite__)
1593 /* This has hardware multiply but not divide. It also has two additional
1594 instructions, scan (ffs from the high bit) and divscc. */
1595 #define umul_ppmm(w1, w0, u, v) \
1596 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1597 #define UMUL_TIME 5
1598 #define udiv_qrnnd(q, r, n1, n0, d) \
1599 __asm__ ("! Inlined udiv_qrnnd\n" \
1600 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1601 " tst %%g0\n" \
1602 " divscc %3,%4,%%g1\n" \
1603 " divscc %%g1,%4,%%g1\n" \
1604 " divscc %%g1,%4,%%g1\n" \
1605 " divscc %%g1,%4,%%g1\n" \
1606 " divscc %%g1,%4,%%g1\n" \
1607 " divscc %%g1,%4,%%g1\n" \
1608 " divscc %%g1,%4,%%g1\n" \
1609 " divscc %%g1,%4,%%g1\n" \
1610 " divscc %%g1,%4,%%g1\n" \
1611 " divscc %%g1,%4,%%g1\n" \
1612 " divscc %%g1,%4,%%g1\n" \
1613 " divscc %%g1,%4,%%g1\n" \
1614 " divscc %%g1,%4,%%g1\n" \
1615 " divscc %%g1,%4,%%g1\n" \
1616 " divscc %%g1,%4,%%g1\n" \
1617 " divscc %%g1,%4,%%g1\n" \
1618 " divscc %%g1,%4,%%g1\n" \
1619 " divscc %%g1,%4,%%g1\n" \
1620 " divscc %%g1,%4,%%g1\n" \
1621 " divscc %%g1,%4,%%g1\n" \
1622 " divscc %%g1,%4,%%g1\n" \
1623 " divscc %%g1,%4,%%g1\n" \
1624 " divscc %%g1,%4,%%g1\n" \
1625 " divscc %%g1,%4,%%g1\n" \
1626 " divscc %%g1,%4,%%g1\n" \
1627 " divscc %%g1,%4,%%g1\n" \
1628 " divscc %%g1,%4,%%g1\n" \
1629 " divscc %%g1,%4,%%g1\n" \
1630 " divscc %%g1,%4,%%g1\n" \
1631 " divscc %%g1,%4,%%g1\n" \
1632 " divscc %%g1,%4,%%g1\n" \
1633 " divscc %%g1,%4,%0\n" \
1634 " rd %%y,%1\n" \
1635 " bl,a 1f\n" \
1636 " add %1,%4,%1\n" \
1637 "1: ! End of inline udiv_qrnnd" \
1638 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1639 : "%g1" __AND_CLOBBER_CC)
1640 #define UDIV_TIME 37
1641 #define count_leading_zeros(count, x) \
1642 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1643 /* Early sparclites return 63 for an argument of 0, but they warn that future
1644 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
1645 undefined. */
1646 #endif /* __sparclite__ */
1647 #endif /* __sparc_v8__ */
1648 #endif /* __sparc_v9__ */
1649 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1650 #ifndef umul_ppmm
1651 #define umul_ppmm(w1, w0, u, v) \
1652 __asm__ ("! Inlined umul_ppmm\n" \
1653 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1654 " sra %3,31,%%g2 ! Don't move this insn\n" \
1655 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1656 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1657 " mulscc %%g1,%3,%%g1\n" \
1658 " mulscc %%g1,%3,%%g1\n" \
1659 " mulscc %%g1,%3,%%g1\n" \
1660 " mulscc %%g1,%3,%%g1\n" \
1661 " mulscc %%g1,%3,%%g1\n" \
1662 " mulscc %%g1,%3,%%g1\n" \
1663 " mulscc %%g1,%3,%%g1\n" \
1664 " mulscc %%g1,%3,%%g1\n" \
1665 " mulscc %%g1,%3,%%g1\n" \
1666 " mulscc %%g1,%3,%%g1\n" \
1667 " mulscc %%g1,%3,%%g1\n" \
1668 " mulscc %%g1,%3,%%g1\n" \
1669 " mulscc %%g1,%3,%%g1\n" \
1670 " mulscc %%g1,%3,%%g1\n" \
1671 " mulscc %%g1,%3,%%g1\n" \
1672 " mulscc %%g1,%3,%%g1\n" \
1673 " mulscc %%g1,%3,%%g1\n" \
1674 " mulscc %%g1,%3,%%g1\n" \
1675 " mulscc %%g1,%3,%%g1\n" \
1676 " mulscc %%g1,%3,%%g1\n" \
1677 " mulscc %%g1,%3,%%g1\n" \
1678 " mulscc %%g1,%3,%%g1\n" \
1679 " mulscc %%g1,%3,%%g1\n" \
1680 " mulscc %%g1,%3,%%g1\n" \
1681 " mulscc %%g1,%3,%%g1\n" \
1682 " mulscc %%g1,%3,%%g1\n" \
1683 " mulscc %%g1,%3,%%g1\n" \
1684 " mulscc %%g1,%3,%%g1\n" \
1685 " mulscc %%g1,%3,%%g1\n" \
1686 " mulscc %%g1,%3,%%g1\n" \
1687 " mulscc %%g1,%3,%%g1\n" \
1688 " mulscc %%g1,%3,%%g1\n" \
1689 " mulscc %%g1,0,%%g1\n" \
1690 " add %%g1,%%g2,%0\n" \
1691 " rd %%y,%1" \
1692 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1693 : "%g1", "%g2" __AND_CLOBBER_CC)
1694 #define UMUL_TIME 39 /* 39 instructions */
1695 #endif
1696 #ifndef udiv_qrnnd
1697 #ifndef LONGLONG_STANDALONE
1698 #define udiv_qrnnd(q, r, n1, n0, d) \
1699 do { UWtype __r; \
1700 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1701 (r) = __r; \
1702 } while (0)
1703 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1704 #ifndef UDIV_TIME
1705 #define UDIV_TIME 140
1706 #endif
1707 #endif /* LONGLONG_STANDALONE */
1708 #endif /* udiv_qrnnd */
1709 #endif /* __sparc__ */
1711 #if defined (__sparc__) && W_TYPE_SIZE == 64
1712 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1713 __asm__ ( \
1714 "addcc %r4,%5,%1\n" \
1715 " addccc %r6,%7,%%g0\n" \
1716 " addc %r2,%3,%0" \
1717 : "=r" (sh), "=&r" (sl) \
1718 : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
1719 "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1720 __CLOBBER_CC)
1721 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1722 __asm__ ( \
1723 "subcc %r4,%5,%1\n" \
1724 " subccc %r6,%7,%%g0\n" \
1725 " subc %r2,%3,%0" \
1726 : "=r" (sh), "=&r" (sl) \
1727 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
1728 "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1729 __CLOBBER_CC)
1730 #endif
1732 #if defined (__vax__) && W_TYPE_SIZE == 32
1733 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1734 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1735 : "=g" (sh), "=&g" (sl) \
1736 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1737 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1738 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1739 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1740 : "=g" (sh), "=&g" (sl) \
1741 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1742 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1743 #define smul_ppmm(xh, xl, m0, m1) \
1744 do { \
1745 union {UDItype __ll; \
1746 struct {USItype __l, __h;} __i; \
1747 } __x; \
1748 USItype __m0 = (m0), __m1 = (m1); \
1749 __asm__ ("emul %1,%2,$0,%0" \
1750 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1751 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1752 } while (0)
1753 #define sdiv_qrnnd(q, r, n1, n0, d) \
1754 do { \
1755 union {DItype __ll; \
1756 struct {SItype __l, __h;} __i; \
1757 } __x; \
1758 __x.__i.__h = n1; __x.__i.__l = n0; \
1759 __asm__ ("ediv %3,%2,%0,%1" \
1760 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1761 } while (0)
1762 #if 0
1763 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1764 8800 maybe). */
1765 #define count_trailing_zeros(count,x) \
1766 do { \
1767 __asm__ ("ffs 0, 31, %1, %0" \
1768 : "=g" (count) \
1769 : "g" ((USItype) (x))); \
1770 } while (0)
1771 #endif
1772 #endif /* __vax__ */
1774 #if defined (__z8000__) && W_TYPE_SIZE == 16
1775 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1776 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1777 : "=r" (sh), "=&r" (sl) \
1778 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1779 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1780 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1781 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1782 : "=r" (sh), "=&r" (sl) \
1783 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1784 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1785 #define umul_ppmm(xh, xl, m0, m1) \
1786 do { \
1787 union {long int __ll; \
1788 struct {unsigned int __h, __l;} __i; \
1789 } __x; \
1790 unsigned int __m0 = (m0), __m1 = (m1); \
1791 __asm__ ("mult %S0,%H3" \
1792 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1793 : "%1" (m0), "rQR" (m1)); \
1794 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1795 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1796 + (((signed int) __m1 >> 15) & __m0)); \
1797 } while (0)
1798 #endif /* __z8000__ */
1800 #endif /* __GNUC__ */
1802 #endif /* NO_ASM */
1805 #if !defined (umul_ppmm) && defined (__umulsidi3)
1806 #define umul_ppmm(ph, pl, m0, m1) \
1807 { \
1808 UDWtype __ll = __umulsidi3 (m0, m1); \
1809 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1810 pl = (UWtype) __ll; \
1811 }
1812 #endif
1814 #if !defined (__umulsidi3)
1815 #define __umulsidi3(u, v) \
1816 ({UWtype __hi, __lo; \
1817 umul_ppmm (__hi, __lo, u, v); \
1818 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1819 #endif
1822 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1823 forms have "reversed" arguments, meaning the pointer is last, which
1824 sometimes allows better parameter passing, in particular on 64-bit
1825 hppa. */
1827 #define mpn_umul_ppmm __MPN(umul_ppmm)
1828 extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
1830 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
1831 && ! defined (LONGLONG_STANDALONE)
1832 #define umul_ppmm(wh, wl, u, v) \
1833 do { \
1834 UWtype __umul_ppmm__p0; \
1835 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
1836 (wl) = __umul_ppmm__p0; \
1837 } while (0)
1838 #endif
1840 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
1841 extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
1843 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
1844 && ! defined (LONGLONG_STANDALONE)
1845 #define umul_ppmm(wh, wl, u, v) \
1846 do { \
1847 UWtype __umul_ppmm__p0; \
1848 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
1849 (wl) = __umul_ppmm__p0; \
1850 } while (0)
1851 #endif
1853 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
1854 extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1856 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
1857 && ! defined (LONGLONG_STANDALONE)
1858 #define udiv_qrnnd(q, r, n1, n0, d) \
1859 do { \
1860 UWtype __udiv_qrnnd__r; \
1861 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
1862 (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
1863 (r) = __udiv_qrnnd__r; \
1864 } while (0)
1865 #endif
1867 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
1868 extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
1870 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
1871 && ! defined (LONGLONG_STANDALONE)
1872 #define udiv_qrnnd(q, r, n1, n0, d) \
1873 do { \
1874 UWtype __udiv_qrnnd__r; \
1875 (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
1876 &__udiv_qrnnd__r); \
1877 (r) = __udiv_qrnnd__r; \
1878 } while (0)
1879 #endif
1882 /* If this machine has no inline assembler, use C macros. */
1884 #if !defined (add_ssaaaa)
1885 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1886 do { \
1887 UWtype __x; \
1888 __x = (al) + (bl); \
1889 (sh) = (ah) + (bh) + (__x < (al)); \
1890 (sl) = __x; \
1891 } while (0)
1892 #endif
1894 #if !defined (sub_ddmmss)
1895 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1896 do { \
1897 UWtype __x; \
1898 __x = (al) - (bl); \
1899 (sh) = (ah) - (bh) - ((al) < (bl)); \
1900 (sl) = __x; \
1901 } while (0)
1902 #endif
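/* Editorial example, not part of GMP: a minimal sketch of how the generic
   add_ssaaaa/sub_ddmmss above chain the low-word carry/borrow into the high
   word, assuming a 32-bit UWtype and the ASSERT macro from gmp-impl.h.
   Guarded out so it never takes part in compilation.  */
#if 0
static void
example_generic_add_sub (void)
{
  UWtype sh, sl;
  /* 0x00000001:0xFFFFFFFF + 0x00000000:0x00000001: the low words overflow,
     so (__x < (al)) is 1 and a carry reaches the high word.  */
  add_ssaaaa (sh, sl, (UWtype) 1, (UWtype) 0xFFFFFFFF, (UWtype) 0, (UWtype) 1);
  ASSERT (sh == 2 && sl == 0);
  /* 0x00000002:0x00000000 - 0x00000000:0x00000001: the low words underflow,
     so ((al) < (bl)) is 1 and a borrow is taken from the high word.  */
  sub_ddmmss (sh, sl, (UWtype) 2, (UWtype) 0, (UWtype) 0, (UWtype) 1);
  ASSERT (sh == 1 && sl == 0xFFFFFFFF);
}
#endif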
1904 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1905 smul_ppmm. */
1906 #if !defined (umul_ppmm) && defined (smul_ppmm)
1907 #define umul_ppmm(w1, w0, u, v) \
1908 do { \
1909 UWtype __w1; \
1910 UWtype __xm0 = (u), __xm1 = (v); \
1911 smul_ppmm (__w1, w0, __xm0, __xm1); \
1912 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
1913 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
1914 } while (0)
1915 #endif
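/* Editorial note on why the correction above is right (not part of GMP):
   write s(x) for x reinterpreted as signed and W for W_TYPE_SIZE, so that
   x = s(x) + 2^W whenever the top bit of x is set.  Working mod 2^(2W),

     u * v = s(u)*s(v) + 2^W * ((top bit of u ? v : 0)
                                + (top bit of v ? u : 0))

   The mask -(x >> (W-1)) is all ones exactly when the top bit of x is set,
   which is how the two conditional terms are folded into the high word.
   The smul_ppmm fallback further below uses the same identity with the
   corrections subtracted instead of added.  */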
1917 /* If we still don't have umul_ppmm, define it using plain C.
1919 For reference, when this code is used for squaring (i.e. u and v are identical
1920 expressions), gcc recognises that __x1 and __x2 are the same and generates 3
1921 multiplies, not 4. The subsequent additions could be optimized a bit,
1922 but the only place GMP currently uses such a square is mpn_sqr_basecase,
1923 and chips obliged to use this generic C umul will have plenty of worse
1924 performance problems than a couple of extra instructions on the diagonal
1925 of sqr_basecase. */
1927 #if !defined (umul_ppmm)
1928 #define umul_ppmm(w1, w0, u, v) \
1929 do { \
1930 UWtype __x0, __x1, __x2, __x3; \
1931 UHWtype __ul, __vl, __uh, __vh; \
1932 UWtype __u = (u), __v = (v); \
1934 __ul = __ll_lowpart (__u); \
1935 __uh = __ll_highpart (__u); \
1936 __vl = __ll_lowpart (__v); \
1937 __vh = __ll_highpart (__v); \
1939 __x0 = (UWtype) __ul * __vl; \
1940 __x1 = (UWtype) __ul * __vh; \
1941 __x2 = (UWtype) __uh * __vl; \
1942 __x3 = (UWtype) __uh * __vh; \
1944 __x1 += __ll_highpart (__x0); /* this cannot give a carry */ \
1945 __x1 += __x2; /* but this one can */ \
1946 if (__x1 < __x2) /* did we get a carry? */ \
1947 __x3 += __ll_B; /* yes, add it in at the proper position */ \
1949 (w1) = __x3 + __ll_highpart (__x1); \
1950 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
1951 } while (0)
1952 #endif
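/* Editorial example, not part of GMP: the macro above is the schoolbook
   method on half-words.  The same structure written out with fixed 32-bit
   types, for reference only (assumes <stdint.h>).  */
#if 0
static void
example_umul_ppmm_32 (uint32_t u, uint32_t v, uint32_t *w1, uint32_t *w0)
{
  uint32_t ul = u & 0xFFFF, uh = u >> 16;
  uint32_t vl = v & 0xFFFF, vh = v >> 16;
  uint32_t x0 = ul * vl, x1 = ul * vh, x2 = uh * vl, x3 = uh * vh;

  x1 += x0 >> 16;              /* x1 <= (2^16-1)^2, so this cannot overflow */
  x1 += x2;                    /* but this addition can ... */
  if (x1 < x2)
    x3 += (uint32_t) 1 << 16;  /* ... the lost carry is worth 2^16 * 2^32 */

  *w1 = x3 + (x1 >> 16);             /* high word, in units of 2^32 */
  *w0 = (x1 << 16) + (x0 & 0xFFFF);  /* low word */
}
#endif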
1954 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
1955 exist in one form or another). */
1956 #if !defined (smul_ppmm)
1957 #define smul_ppmm(w1, w0, u, v) \
1958 do { \
1959 UWtype __w1; \
1960 UWtype __xm0 = (u), __xm1 = (v); \
1961 umul_ppmm (__w1, w0, __xm0, __xm1); \
1962 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
1963 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
1964 } while (0)
1965 #endif
1967 /* Define this unconditionally, so it can be used for debugging. */
1968 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
1969 do { \
1970 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
1972 ASSERT ((d) != 0); \
1973 ASSERT ((n1) < (d)); \
1975 __d1 = __ll_highpart (d); \
1976 __d0 = __ll_lowpart (d); \
1978 __q1 = (n1) / __d1; \
1979 __r1 = (n1) - __q1 * __d1; \
1980 __m = __q1 * __d0; \
1981 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
1982 if (__r1 < __m) \
1983 { \
1984 __q1--, __r1 += (d); \
1985 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
1986 if (__r1 < __m) \
1987 __q1--, __r1 += (d); \
1988 } \
1989 __r1 -= __m; \
1991 __q0 = __r1 / __d1; \
1992 __r0 = __r1 - __q0 * __d1; \
1993 __m = __q0 * __d0; \
1994 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
1995 if (__r0 < __m) \
1996 { \
1997 __q0--, __r0 += (d); \
1998 if (__r0 >= (d)) \
1999 if (__r0 < __m) \
2000 __q0--, __r0 += (d); \
2001 } \
2002 __r0 -= __m; \
2004 (q) = __q1 * __ll_B | __q0; \
2005 (r) = __r0; \
2006 } while (0)
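/* Editorial example, not part of GMP: how a caller uses __udiv_qrnnd_c (or
   udiv_qrnnd).  The high numerator word must be less than the divisor, and
   when UDIV_NEEDS_NORMALIZATION is 1 the divisor must have its most
   significant bit set (callers shift n and d left and shift the remainder
   back afterwards).  Hypothetical values, assuming a 32-bit UWtype and the
   ASSERT macro from gmp-impl.h.  */
#if 0
static void
example_udiv_qrnnd_c (void)
{
  UWtype q, r;
  /* Divide the 64-bit value 0x00000001:00000000 (n1 = 1, n0 = 0) by
     0x80000003; n1 < d and d is already normalized, so no pre-shift.  */
  __udiv_qrnnd_c (q, r, (UWtype) 1, (UWtype) 0, (UWtype) 0x80000003);
  /* 1 * 2^32 = 1 * 0x80000003 + 0x7FFFFFFD.  */
  ASSERT (q == 1 && r == 0x7FFFFFFD);
}
#endif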
2008 /* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
2009 __udiv_w_sdiv (defined in libgcc or elsewhere). */
2010 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2011 #define udiv_qrnnd(q, r, nh, nl, d) \
2012 do { \
2013 UWtype __r; \
2014 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2015 (r) = __r; \
2016 } while (0)
2017 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2018 #endif
2020 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2021 #if !defined (udiv_qrnnd)
2022 #define UDIV_NEEDS_NORMALIZATION 1
2023 #define udiv_qrnnd __udiv_qrnnd_c
2024 #endif
2026 #if !defined (count_leading_zeros)
2027 #define count_leading_zeros(count, x) \
2028 do { \
2029 UWtype __xr = (x); \
2030 UWtype __a; \
2032 if (W_TYPE_SIZE == 32) \
2033 { \
2034 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2035 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2036 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2037 : 3*__BITS4 + 1); \
2038 } \
2039 else \
2040 { \
2041 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2042 if (((__xr >> __a) & 0xff) != 0) \
2043 break; \
2044 ++__a; \
2045 } \
2047 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2048 } while (0)
2049 /* This version gives a well-defined value for zero. */
2050 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2051 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2052 #endif
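/* Editorial example, not part of GMP: what count_leading_zeros computes,
   assuming a 32-bit UWtype and the ASSERT macro from gmp-impl.h.  */
#if 0
static void
example_count_leading_zeros (void)
{
  int c;
  /* 0x00012345: the highest set bit is bit 16, so there are 15 leading
     zero bits.  */
  count_leading_zeros (c, (UWtype) 0x12345);
  ASSERT (c == 15);
}
#endif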
2054 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2055 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2056 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2057 #endif
2059 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2060 extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
2061 #endif
2063 #if !defined (count_trailing_zeros)
2064 /* Define count_trailing_zeros using count_leading_zeros. The latter might be
2065 defined in asm, but if it is not, the C version above is good enough. */
2066 #define count_trailing_zeros(count, x) \
2067 do { \
2068 UWtype __ctz_x = (x); \
2069 UWtype __ctz_c; \
2070 ASSERT (__ctz_x != 0); \
2071 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2072 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2073 } while (0)
2074 #endif
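/* Editorial example, not part of GMP: x & -x isolates the lowest set bit,
   so its position (the trailing-zero count) is W_TYPE_SIZE - 1 minus its
   leading-zero count, which is what the macro above computes.  Assumes a
   32-bit UWtype and the ASSERT macro from gmp-impl.h.  */
#if 0
static void
example_count_trailing_zeros (void)
{
  int c;
  /* 0x28 = 0b101000: the lowest set bit is bit 3.  */
  count_trailing_zeros (c, (UWtype) 0x28);
  ASSERT (c == 3);
}
#endif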
2076 #ifndef UDIV_NEEDS_NORMALIZATION
2077 #define UDIV_NEEDS_NORMALIZATION 0
2078 #endif
2080 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
2081 which case the latter should always be used. */
2082 #ifndef UDIV_PREINV_ALWAYS
2083 #define UDIV_PREINV_ALWAYS 0
2084 #endif
2086 /* Give defaults for UMUL_TIME and UDIV_TIME. */
2087 #ifndef UMUL_TIME
2088 #define UMUL_TIME 1
2089 #endif
2091 #ifndef UDIV_TIME
2092 #define UDIV_TIME UMUL_TIME
2093 #endif