source/libs/mpfr/mpfr-3.1.2/src/mpfr-longlong.h

   1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
   2
   3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
   4 2004, 2005, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   5
   6 This file is free software; you can redistribute it and/or modify it under the
   7 terms of the GNU Lesser General Public License as published by the Free
   8 Software Foundation; either version 3 of the License, or (at your option) any
   9 later version.
  10
  11 This file is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  13 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
  14 details.
  15
  16 You should have received a copy of the GNU Lesser General Public License
  17 along with this file.  If not, see http://www.gnu.org/licenses/.  */
  18
  19 /* You have to define the following before including this file:
  20
  21    UWtype -- An unsigned type, default type for operations (typically a "word")
  22    UHWtype -- An unsigned type, at least half the size of UWtype.
   23    UDWtype -- An unsigned type, at least twice as large as UWtype
  24    W_TYPE_SIZE -- size in bits of UWtype
  25
  26    SItype, USItype -- Signed and unsigned 32 bit types.
  27    DItype, UDItype -- Signed and unsigned 64 bit types.
  28
  29    On a 32 bit machine UWtype should typically be USItype;
  30    on a 64 bit machine, UWtype should typically be UDItype.
  31
  32    CAUTION!  Using this file outside of GMP is not safe.  You need to include
  33    gmp.h and gmp-impl.h, or certain things might not work as expected.
  34 */
  35
  36 #define __BITS4 (W_TYPE_SIZE / 4)  /* a quarter of the UWtype width, in bits */
  37 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))  /* 2^(W_TYPE_SIZE/2), the half-word base */
  38 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))  /* low half-word of t */
  39 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))  /* high half-word of t */
  40
  41 /* This is used to make sure no undesirable sharing between different libraries
  42    that use this file takes place.  Unless the includer has already defined
        its own __MPN, __MPN(x) simply pastes a "__" prefix onto x.  */
  43 #ifndef __MPN
  44 #define __MPN(x) __##x
  45 #endif
  46
  47 #ifndef _PROTO
     /* _PROTO(x) wraps a prototype's parameter list: it expands to x itself when
        __STDC__ is non-zero or when compiling as C++, and to an empty "()"
        otherwise.  An includer's own _PROTO takes precedence.  */
  48 #if (__STDC__-0) || defined (__cplusplus)
  49 #define _PROTO(x) x
  50 #else
  51 #define _PROTO(x) ()
  52 #endif
  53 #endif
  54
  55 /* Define auxiliary asm macros.
  56
  57    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
  58    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
  59    word product in HIGH_PROD and LOW_PROD.
  60
  61    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
  62    UDWtype product.  This is just a variant of umul_ppmm.
  63
  64    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  65    denominator) divides a UDWtype, composed by the UWtype integers
  66    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
  67    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
  68    than DENOMINATOR for correct operation.  If, in addition, the most
  69    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
  70    UDIV_NEEDS_NORMALIZATION is defined to 1.
  71
  72    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  73    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
  74    is rounded toward 0.
  75
  76    5) count_leading_zeros(count, x) counts the number of zero-bits from the
  77    msb to the first non-zero bit in the UWtype X.  This is the number of
  78    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
  79    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
  80
  81    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
  82    from the least significant end.
  83
  84    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
  85    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
  86    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
  87    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
  88    (i.e. carry out) is not stored anywhere, and is lost.
  89
  90    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
  91    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
  92    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
  93    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
  94    and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
  95    and is lost.
  96
  97    If any of these macros are left undefined for a particular CPU,
  98    C macros are used.
  99
 100
 101    Notes:
 102
 103    For add_ssaaaa the two high and two low addends can both commute, but
 104    unfortunately gcc only supports one "%" commutative in each asm block.
 105    This has always been so but is only documented in recent versions
 106    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
 107    compiler error in certain rare circumstances.
 108
 109    Apparently it was only the last "%" that was ever actually respected, so
 110    the code has been updated to leave just that.  Clearly there's a free
 111    choice whether high or low should get it, if there's a reason to favour
 112    one over the other.  Also obviously when the constraints on the two
 113    operands are identical there's no benefit to the reloader in any "%" at
 114    all.
 115
 116    */
 117
 118 /* The CPUs come in alphabetical order below.
 119
 120    Please add support for more CPUs here, or improve the current support
 121    for the CPUs below!  */
 122
 123
 124 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
 125    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
 126    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
 127    __builtin_ctzll.
 128
 129    These builtins are only used once we have checked what code comes out;
 130    on some chips they're merely libgcc calls, and in that case we want an
 131    inline version instead (either asm or generic C).
 132
 133    These builtins are better than an asm block of the same insn, since an
 134    asm block doesn't give gcc any information about scheduling or resource
 135    usage.  We keep an asm block for use on prior versions of gcc though.
 136
 137    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
 138    it's not used (for count_leading_zeros) because it generally gives extra
 139    code to ensure the result is 0 when the input is 0, which we don't need
 140    or want.  */
 141
 142 #ifdef _LONG_LONG_LIMB
     /* count_leading_zeros_gcc_clz(count, x): stores the gcc builtin's leading
        zero count of x into count.  The "ll" builtin is selected when
        _LONG_LONG_LIMB is defined, the "l" builtin otherwise.  ASSERT is not
        defined in this file; it is handed the condition x != 0.  Note that x
        appears twice in the expansion.  */
 143 #define count_leading_zeros_gcc_clz(count,x)    \
 144   do {                                          \
 145     ASSERT ((x) != 0);                          \
 146     (count) = __builtin_clzll (x);              \
 147   } while (0)
 148 #else
 149 #define count_leading_zeros_gcc_clz(count,x)    \
 150   do {                                          \
 151     ASSERT ((x) != 0);                          \
 152     (count) = __builtin_clzl (x);               \
 153   } while (0)
 154 #endif
 155
 156 #ifdef _LONG_LONG_LIMB
     /* count_trailing_zeros_gcc_ctz(count, x): stores the gcc builtin's trailing
        zero count of x into count.  The "ll" builtin is selected when
        _LONG_LONG_LIMB is defined, the "l" builtin otherwise.  ASSERT is not
        defined in this file; it is handed the condition x != 0.  Note that x
        appears twice in the expansion.  */
 157 #define count_trailing_zeros_gcc_ctz(count,x)   \
 158   do {                                          \
 159     ASSERT ((x) != 0);                          \
 160     (count) = __builtin_ctzll (x);              \
 161   } while (0)
 162 #else
 163 #define count_trailing_zeros_gcc_ctz(count,x)   \
 164   do {                                          \
 165     ASSERT ((x) != 0);                          \
 166     (count) = __builtin_ctzl (x);               \
 167   } while (0)
 168 #endif
 169
 170 /* Note: the following FIXME comes from GMP, thus it does not make sense to
 171    try to resolve it in MPFR. */
 172 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
 173    don't need to be under !NO_ASM */
 174 #if ! defined (NO_ASM)
 175
 176 #if defined (__alpha) && W_TYPE_SIZE == 64
 177 /* Most alpha-based machines, except Cray systems. */
 178 #if defined (__GNUC__)
     /* umul_ppmm(ph, pl, m0, m1) for alpha: the high word of the product goes to
        ph (via the umulh builtin, a umulh asm, or __UMULH, depending on the
        compiler) and the low word, an ordinary C multiplication, goes to pl.
        Every variant copies m0 and m1 into __m0/__m1 first and uses only those
        copies afterwards, so each macro argument is evaluated exactly once.  */
 179 #if __GMP_GNUC_PREREQ (3,3)
 180 #define umul_ppmm(ph, pl, m0, m1) \
 181   do {                                                                  \
 182     UDItype __m0 = (m0), __m1 = (m1);                                   \
 183     (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
 184     (pl) = __m0 * __m1;                                                 \
 185   } while (0)
 186 #else
 187 #define umul_ppmm(ph, pl, m0, m1) \
 188   do {                                                                  \
 189     UDItype __m0 = (m0), __m1 = (m1);                                   \
 190     __asm__ ("umulh %r1,%2,%0"                                          \
 191              : "=r" (ph)                                                \
 192              : "%rJ" (__m0), "rI" (__m1));                              \
 193     (pl) = __m0 * __m1;                                                 \
 194   } while (0)
 195 #endif
 196 #define UMUL_TIME 18
 197 #else /* ! __GNUC__ */
 198 #include <machine/builtins.h>
 199 #define umul_ppmm(ph, pl, m0, m1) \
 200   do {                                                                  \
 201     UDItype __m0 = (m0), __m1 = (m1);                                   \
 202     (ph) = __UMULH (__m0, __m1);                                        \
 203     (pl) = __m0 * __m1;                                                 \
 204   } while (0)
 205 #endif
 206 #ifndef LONGLONG_STANDALONE
     /* udiv_qrnnd for alpha: obtains a value __di from __MPN(invert_limb) (d) and
        hands the division to udiv_qrnnd_preinv.  Neither helper is defined in
        this part of the file, and both are unavailable to standalone users,
        hence the LONGLONG_STANDALONE guard.  UDIV_NEEDS_NORMALIZATION is set to
        1 here; see the description of udiv_qrnnd at the top of the file.  */
 207 #define udiv_qrnnd(q, r, n1, n0, d) \
 208   do { UWtype __di;                                                     \
 209     __di = __MPN(invert_limb) (d);                                      \
 210     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 211   } while (0)
 212 #define UDIV_PREINV_ALWAYS  1
 213 #define UDIV_NEEDS_NORMALIZATION 1
 214 #define UDIV_TIME 220
 215 #endif /* LONGLONG_STANDALONE */
 216
 217 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
 218    always goes into libgmp.so, even when not actually used.  */
 219 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 220
 221 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
     /* With gcc on a CPU flagged HAVE_HOST_CPU_alpha_CIX, both counts are a
        single instruction: ctlz for leading zeros, cttz for trailing zeros.  */
 222 #define count_leading_zeros(COUNT,X) \
 223   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
 224 #define count_trailing_zeros(COUNT,X) \
 225   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
 226 #endif /* clz/ctz using cix */
 227
 228 #if ! defined (count_leading_zeros)                             \
 229   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
 230 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
 231    "$31" is written explicitly in the asm, since an "r" constraint won't
 232    select reg 31.  There seems no need to worry about "r31" syntax for cray,
 233    since gcc itself (pre-release 3.4) emits just $31 in various places.
        The __asm__ spelling is used, as everywhere else in this file, so that
        the macro still compiles when gcc runs in a strict ISO mode where the
        plain "asm" keyword is not recognised.  */
 234 #define ALPHA_CMPBGE_0(dst, src)                                        \
 235   do { __asm__ ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
 236 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
 237    them, locating the highest non-zero byte.  A second __clz_tab lookup
 238    counts the leading zero bits in that byte, giving the result.  */
 239 #define count_leading_zeros(count, x)                                   \
 240   do {                                                                  \
 241     UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
 242     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
 243     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
 244     __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
 245     __clz__x >>= __clz__b;                                              \
 246     __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
 247     __clz__b = 65 - __clz__b;                                           \
 248     (count) = __clz__b - __clz__c;                                      \
 249   } while (0)
 250 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 251 #endif /* clz using cmpbge */
 252
 253 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
     /* Last resort on alpha: count_leading_zeros becomes a call to the external
        routine __MPN(count_leading_zeros), declared here.  The declaration
        carries the "const" attribute when HAVE_ATTRIBUTE_CONST is set.  */
 254 #if HAVE_ATTRIBUTE_CONST
 255 long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
 256 #else
 257 long __MPN(count_leading_zeros) _PROTO ((UDItype));
 258 #endif
 259 #define count_leading_zeros(count, x) \
 260   ((count) = __MPN(count_leading_zeros) (x))
 261 #endif /* clz using mpn */
 262 #endif /* __alpha */
 263
 264 #if defined (_CRAY) && W_TYPE_SIZE == 64
     /* Cray systems with 64-bit words.  count_leading_zeros maps onto the _leadz
        intrinsic.  NOTE(review): the __MPN(count_leading_zeros) prototype below
        is not referenced by the macros in this section.  */
 265 #include <intrinsics.h>
 266 #define UDIV_PREINV_ALWAYS  1
 267 #define UDIV_NEEDS_NORMALIZATION 1
 268 #define UDIV_TIME 220
 269 long __MPN(count_leading_zeros) _PROTO ((UDItype));
 270 #define count_leading_zeros(count, x) \
 271   ((count) = _leadz ((UWtype) (x)))
 272 #if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
     /* umul_ppmm: the high word comes from _int_mult_upper, the low word from a
        C multiplication.  Both use the local copies __m0/__m1, so each macro
        argument is evaluated exactly once.  */
 273 #define umul_ppmm(ph, pl, m0, m1) \
 274   do {                                                                  \
 275     UDItype __m0 = (m0), __m1 = (m1);                                   \
 276     (ph) = _int_mult_upper (__m0, __m1);                                \
 277     (pl) = __m0 * __m1;                                                 \
 278   } while (0)
 279 #ifndef LONGLONG_STANDALONE
     /* udiv_qrnnd: division through udiv_qrnnd_preinv, using the value returned
        by __MPN(invert_limb) (d); both are defined outside this section.  */
 280 #define udiv_qrnnd(q, r, n1, n0, d) \
 281   do { UWtype __di;                                                     \
 282     __di = __MPN(invert_limb) (d);                                      \
 283     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 284   } while (0)
 285 #endif /* LONGLONG_STANDALONE */
 286 #endif /* _CRAYIEEE */
 287 #endif /* _CRAY */
 288
 289 #if defined (__ia64) && W_TYPE_SIZE == 64
 290 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
 291    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
 292    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
 293    register, which takes an extra cycle.
        The low words are copied into locals so that AL and BL are each
        evaluated only once, and the low difference is held in __x until the
        end so that SL may name the same object as AL or BL.  */
 294 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
 295   do {                                          \
 296     UWtype __al = (al), __bl = (bl);            \
 297     UWtype __x = __al - __bl;                   \
 298     if (__al < __bl)                            \
 299       (sh) = (ah) - (bh) - 1;                   \
 300     else                                        \
 301       (sh) = (ah) - (bh);                       \
 302     (sl) = __x;                                 \
 303   } while (0)
 304 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
 305 /* Do both product parts in assembly, since that gives better code with
 306    all gcc versions.  Some callers will just use the upper part, and in
 307    that situation we waste an instruction, but not any cycles.
        All four operands use the "f" constraint.  ph is marked early-clobber
        ("&") because the first instruction writes it while the second still
        reads both inputs.  */
 308 #define umul_ppmm(ph, pl, m0, m1) \
 309     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
 310              : "=&f" (ph), "=f" (pl)                                    \
 311              : "f" (m0), "f" (m1))
 312 #define UMUL_TIME 14
     /* count_leading_zeros: two asm steps (mux1 @rev, then czx1.l on -_y | _y)
        produce _a, from which a byte-granular shift _c = (_a - 1) * 8 is formed.
        The remaining bits of _x are then narrowed with 4-, 2- and 1-bit steps in
        C, and the result is W_TYPE_SIZE - 1 - _c.
        NOTE(review): the exact semantics of mux1/czx1.l are not visible here;
        confirm against the Itanium instruction reference before changing.  */
 313 #define count_leading_zeros(count, x) \
 314   do {                                                                  \
 315     UWtype _x = (x), _y, _a, _c;                                        \
 316     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
 317     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
 318     _c = (_a - 1) << 3;                                                 \
 319     _x >>= _c;                                                          \
 320     if (_x >= 1 << 4)                                                   \
 321       _x >>= 4, _c += 4;                                                \
 322     if (_x >= 1 << 2)                                                   \
 323       _x >>= 2, _c += 2;                                                \
 324     _c += _x >> 1;                                                      \
 325     (count) =  W_TYPE_SIZE - 1 - _c;                                    \
 326   } while (0)
 327 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
 328    based, and we don't need a special case for x==0 here.
        (x-1) & ~x keeps exactly the bits below the lowest set bit of x, so the
        popcnt of that mask is the number of trailing zeros.  */
 329 #define count_trailing_zeros(count, x)                                  \
 330   do {                                                                  \
 331     UWtype __ctz_x = (x);                                               \
 332     __asm__ ("popcnt %0 = %1"                                           \
 333              : "=r" (count)                                             \
 334              : "r" ((__ctz_x-1) & ~__ctz_x));                           \
 335   } while (0)
 336 #endif
 337 #if defined (__INTEL_COMPILER)
     /* umul_ppmm for the Intel compiler: the high word comes from the
        _m64_xmahu intrinsic (third argument 0), the low word from a C
        multiplication.  The inputs are copied to locals so they are evaluated
        once, and the output arguments are parenthesised so that any lvalue
        expression can be passed for ph and pl.  */
 338 #include <ia64intrin.h>
 339 #define umul_ppmm(ph, pl, m0, m1)                                       \
 340   do {                                                                  \
 341     UWtype _m0 = (m0), _m1 = (m1);                                      \
 342     (ph) = _m64_xmahu (_m0, _m1, 0);                                    \
 343     (pl) = _m0 * _m1;                                                   \
 344   } while (0)
 345 #endif
 346 #ifndef LONGLONG_STANDALONE
     /* udiv_qrnnd for ia64: division through udiv_qrnnd_preinv, using the value
        returned by __MPN(invert_limb) (d).  Both helpers are defined outside
        this section, so the macro is unavailable to standalone users.  */
 347 #define udiv_qrnnd(q, r, n1, n0, d) \
 348   do { UWtype __di;                                                     \
 349     __di = __MPN(invert_limb) (d);                                      \
 350     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 351   } while (0)
 352 #define UDIV_PREINV_ALWAYS  1
 353 #define UDIV_NEEDS_NORMALIZATION 1
 354 #endif
 355 #define UDIV_TIME 220
 356 #endif
 357
 358
 359 #if defined (__GNUC__)
 360
 361 /* We sometimes need to clobber "cc" with gcc2, but that would not be
 362    understood by gcc1.  Use cpp to avoid major code duplication.
        __CLOBBER_CC starts a clobber list holding just "cc" (it supplies the
        leading colon), while __AND_CLOBBER_CC appends "cc" to a clobber list
        that already exists (it supplies the leading comma).  Both expand to
        nothing when __GNUC__ < 2.  */
 363 #if __GNUC__ < 2
 364 #define __CLOBBER_CC
 365 #define __AND_CLOBBER_CC
 366 #else /* __GNUC__ >= 2 */
 367 #define __CLOBBER_CC : "cc"
 368 #define __AND_CLOBBER_CC , "cc"
 369 #endif /* __GNUC__ < 2 */
 370
 371 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
     /* AMD 29000 family, 32-bit words.  In add_ssaaaa and sub_ddmmss the low
        words are combined first (into %1 = sl) and the high words second (into
        %0 = sh) with the carry-using instruction.  sl is early-clobber ("&")
        because it is written before the high inputs have been read.  */
 372 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 373   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
 374            : "=r" (sh), "=&r" (sl)                                      \
 375            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
 376 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 377   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
 378            : "=r" (sh), "=&r" (sl)                                      \
 379            : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
     /* umul_ppmm: two separate asm statements on the same local copies of the
        inputs; "multiplu" writes the low word xl, "multmu" the high word xh.  */
 380 #define umul_ppmm(xh, xl, m0, m1) \
 381   do {                                                                  \
 382     USItype __m0 = (m0), __m1 = (m1);                                   \
 383     __asm__ ("multiplu %0,%1,%2"                                        \
 384              : "=r" (xl)                                                \
 385              : "r" (__m0), "r" (__m1));                                 \
 386     __asm__ ("multmu %0,%1,%2"                                          \
 387              : "=r" (xh)                                                \
 388              : "r" (__m0), "r" (__m1));                                 \
 389   } while (0)
     /* udiv_qrnnd: n1 is tied to the same location as the remainder output
        (matching constraint "1", register class "q").  */
 390 #define udiv_qrnnd(q, r, n1, n0, d) \
 391   __asm__ ("dividu %0,%3,%4"                                            \
 392            : "=r" (q), "=q" (r)                                         \
 393            : "1" (n1), "r" (n0), "r" (d))
 394 #define count_leading_zeros(count, x) \
 395     __asm__ ("clz %0,%1"                                                \
 396              : "=r" (count)                                             \
 397              : "r" (x))
     /* Declares count_leading_zeros to be defined for x == 0, yielding 32.  */
 398 #define COUNT_LEADING_ZEROS_0 32
 399 #endif /* __a29k__ */
 400
 401 #if defined (__arc__)
     /* ARC.  The low words are combined first with the ".f" form of the
        instruction, writing %1 = sl, and the high words second with adc/sbc,
        writing %0 = sh.  NOTE(review): ".f" presumably sets the carry flag
        consumed by adc/sbc; confirm against the ARC ISA manual.  sl is
        early-clobber ("&") because it is written before the high inputs are
        read.  All inputs are cast to USItype.  */
 402 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 403   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
 404            : "=r" (sh),                                                 \
 405              "=&r" (sl)                                                 \
 406            : "r"  ((USItype) (ah)),                                     \
 407              "rIJ" ((USItype) (bh)),                                    \
 408              "%r" ((USItype) (al)),                                     \
 409              "rIJ" ((USItype) (bl)))
 410 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 411   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
 412            : "=r" (sh),                                                 \
 413              "=&r" (sl)                                                 \
 414            : "r" ((USItype) (ah)),                                      \
 415              "rIJ" ((USItype) (bh)),                                    \
 416              "r" ((USItype) (al)),                                      \
 417              "rIJ" ((USItype) (bl)))
 418 #endif
 419
 420 #if defined (__arm__) && W_TYPE_SIZE == 32
     /* ARM, 32-bit words.  add_ssaaaa adds the low words with "adds" into
        %1 = sl and then the high words with "adc" into %0 = sh.  The asm is
        declared to clobber "cc" via __CLOBBER_CC.  */
 421 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 422   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
 423            : "=r" (sh), "=&r" (sl)                                      \
 424            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
     /* sub_ddmmss selects among instruction pairs according to which arguments
        are compile-time constants (__builtin_constant_p).  In each variant the
        constant operand is given the "rI" constraint and the instruction is
        chosen to suit: subs/sbc subtract in the written operand order, while
        rsbs/rsc are used with the operands swapped in the template.  All
        variants compute the same two-word difference.  */
 425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 426   do {                                                                  \
 427     if (__builtin_constant_p (al))                                      \
 428       {                                                                 \
 429         if (__builtin_constant_p (ah))                                  \
 430           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 431                    : "=r" (sh), "=&r" (sl)                              \
 432                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 433         else                                                            \
 434           __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
 435                    : "=r" (sh), "=&r" (sl)                              \
 436                    : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 437       }                                                                 \
 438     else if (__builtin_constant_p (ah))                                 \
 439       {                                                                 \
 440         if (__builtin_constant_p (bl))                                  \
 441           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 442                    : "=r" (sh), "=&r" (sl)                              \
 443                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 444         else                                                            \
 445           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 446                    : "=r" (sh), "=&r" (sl)                              \
 447                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 448       }                                                                 \
 449     else if (__builtin_constant_p (bl))                                 \
 450       {                                                                 \
 451         if (__builtin_constant_p (bh))                                  \
 452           __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
 453                    : "=r" (sh), "=&r" (sl)                              \
 454                    : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 455         else                                                            \
 456           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 457                    : "=r" (sh), "=&r" (sl)                              \
 458                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 459       }                                                                 \
 460     else /* only bh might be a constant */                              \
 461       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
 462                : "=r" (sh), "=&r" (sl)                                  \
 463                : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
 464     } while (0)
     /* NOTE: the leading "1 ||" makes the condition below always true, so the
        umull/smull variants are the ones compiled and the #else branch further
        down is never used.  */
 465 #if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
 466 #define umul_ppmm(xh, xl, a, b) \
 467   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
 468 #define UMUL_TIME 5
 469 #define smul_ppmm(xh, xl, a, b) \
 470   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
 471 #ifndef LONGLONG_STANDALONE
     /* udiv_qrnnd: division through udiv_qrnnd_preinv, using the value returned
        by __MPN(invert_limb) (d); both are defined outside this section.  */
 472 #define udiv_qrnnd(q, r, n1, n0, d) \
 473   do { UWtype __di;                                                     \
 474     __di = __MPN(invert_limb) (d);                                      \
 475     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 476   } while (0)
 477 #define UDIV_PREINV_ALWAYS  1
 478 #define UDIV_NEEDS_NORMALIZATION 1
 479 #define UDIV_TIME 70
 480 #endif /* LONGLONG_STANDALONE */
 481 #else
     /* Multiply built from four "mul" instructions on 16-bit halves, with r0,
        r1 and r2 used as scratch registers and listed as clobbered.  */
 482 #define umul_ppmm(xh, xl, a, b) \
 483   __asm__ ("%@ Inlined umul_ppmm\n"                                     \
 484 "       mov     %|r0, %2, lsr #16\n"                                    \
 485 "       mov     %|r2, %3, lsr #16\n"                                    \
 486 "       bic     %|r1, %2, %|r0, lsl #16\n"                              \
 487 "       bic     %|r2, %3, %|r2, lsl #16\n"                              \
 488 "       mul     %1, %|r1, %|r2\n"                                       \
 489 "       mul     %|r2, %|r0, %|r2\n"                                     \
 490 "       mul     %|r1, %0, %|r1\n"                                       \
 491 "       mul     %0, %|r0, %0\n"                                         \
 492 "       adds    %|r1, %|r2, %|r1\n"                                     \
 493 "       addcs   %0, %0, #65536\n"                                       \
 494 "       adds    %1, %1, %|r1, lsl #16\n"                                \
 495 "       adc     %0, %0, %|r1, lsr #16"                                  \
 496            : "=&r" (xh), "=r" (xl)                                      \
 497            : "r" (a), "r" (b)                                           \
 498            : "r0", "r1", "r2")
 499 #define UMUL_TIME 20
 500 #ifndef LONGLONG_STANDALONE
     /* udiv_qrnnd: calls the external routine __MPN(udiv_qrnnd), which returns
        the quotient and stores the remainder through its first argument.  */
 501 #define udiv_qrnnd(q, r, n1, n0, d) \
 502   do { UWtype __r;                                                      \
 503     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
 504     (r) = __r;                                                          \
 505   } while (0)
 506 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
 507 #define UDIV_TIME 200
 508 #endif /* LONGLONG_STANDALONE */
 509 #endif
 510 #endif /* __arm__ */
 511
 512 #if defined (__clipper__) && W_TYPE_SIZE == 32
     /* Clipper, 32-bit words.  Each multiply produces a double-word result in
        one asm and, for the *_ppmm forms, splits it through a union whose
        struct lists the low word (__l) before the high word (__h).  These are
        GNU statement expressions ({ ... }).  The first input is tied to the
        output by the matching constraint "%0".  */
 513 #define umul_ppmm(w1, w0, u, v) \
 514   ({union {UDItype __ll;                                                \
 515            struct {USItype __l, __h;} __i;                              \
 516           } __x;                                                        \
 517   __asm__ ("mulwux %2,%0"                                               \
 518            : "=r" (__x.__ll)                                            \
 519            : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
 520   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
 521 #define smul_ppmm(w1, w0, u, v) \
 522   ({union {DItype __ll;                                                 \
 523            struct {SItype __l, __h;} __i;                               \
 524           } __x;                                                        \
 525   __asm__ ("mulwx %2,%0"                                                \
 526            : "=r" (__x.__ll)                                            \
 527            : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
 528   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
     /* __umulsidi3: same instruction as umul_ppmm, but the whole double-word
        product is the value of the expression.  */
 529 #define __umulsidi3(u, v) \
 530   ({UDItype __w;                                                        \
 531     __asm__ ("mulwux %2,%0"                                             \
 532              : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
 533     __w; })
 534 #endif /* __clipper__ */
 535
 536 /* Fujitsu vector computers.  Both multiplies write a double-word result
        into a union and split it through a struct that lists the high word
        (__h) before the low word (__l).
        NOTE(review): smul_ppmm uses the same unsigned union types as
        umul_ppmm; only the instruction differs (mult.l versus mult.lu).  */
 537 #if defined (__uxp__) && W_TYPE_SIZE == 32
 538 #define umul_ppmm(ph, pl, u, v) \
 539   do {                                                                  \
 540     union {UDItype __ll;                                                \
 541            struct {USItype __h, __l;} __i;                              \
 542           } __x;                                                        \
 543     __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
 544     (ph) = __x.__i.__h;                                                 \
 545     (pl) = __x.__i.__l;                                                 \
 546   } while (0)
 547 #define smul_ppmm(ph, pl, u, v) \
 548   do {                                                                  \
 549     union {UDItype __ll;                                                \
 550            struct {USItype __h, __l;} __i;                              \
 551           } __x;                                                        \
 552     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
 553     (ph) = __x.__i.__h;                                                 \
 554     (pl) = __x.__i.__l;                                                 \
 555   } while (0)
 556 #endif
 557
 558 #if defined (__gmicro__) && W_TYPE_SIZE == 32
     /* Gmicro, 32-bit words.  These are two-operand forms: the matching
        constraints "0" and "1" place ah and al in the same locations as the
        outputs sh and sl, and the instructions then update those locations in
        place.  All inputs are cast to USItype.  */
 559 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 560   __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
 561            : "=g" (sh), "=&g" (sl)                                      \
 562            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 563              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
 564 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 565   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
 566            : "=g" (sh), "=&g" (sl)                                      \
 567            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 568              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* umul_ppmm: m0 shares its location with the high output ph ("%0").  */
 569 #define umul_ppmm(ph, pl, m0, m1) \
 570   __asm__ ("mulx %3,%0,%1"                                              \
 571            : "=g" (ph), "=r" (pl)                                       \
 572            : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
     /* udiv_qrnnd: nl shares its location with q ("0") and nh with r ("1").  */
 573 #define udiv_qrnnd(q, r, nh, nl, d) \
 574   __asm__ ("divx %4,%0,%1"                                              \
 575            : "=g" (q), "=r" (r)                                         \
 576            : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
     /* count_leading_zeros: the output location is preloaded with 0 through
        the matching "0" input before the bsch/1 instruction runs.  */
 577 #define count_leading_zeros(count, x) \
 578   __asm__ ("bsch/1 %1,%0"                                               \
 579            : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
 580 #endif
 581
 582 #if defined (__hppa) && W_TYPE_SIZE == 32
 583 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 584   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
 585            : "=r" (sh), "=&r" (sl)                                      \
 586            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
     /* In add_ssaaaa above and sub_ddmmss below, the low words are combined
        first into %1 = sl and the high words second, with addc/subb, into
        %0 = sh.  sl is early-clobber ("&") because it is written before the
        high inputs are read.  In add_ssaaaa the operand allowed to be an "I"
        immediate is bl; in sub_ddmmss it is al.  */
 587 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 588   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
 589            : "=r" (sh), "=&r" (sl)                                      \
 590            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 591 #if defined (_PA_RISC1_1)
     /* umul_ppmm exists only when _PA_RISC1_1 is defined.  xmpyu writes a
        double-word result into a union, which is split through a struct that
        lists the high word (__h) before the low word (__l).  Without
        _PA_RISC1_1 only the larger UMUL_TIME/UDIV_TIME values are defined.  */
 592 #define umul_ppmm(wh, wl, u, v) \
 593   do {                                                                  \
 594     union {UDItype __ll;                                                \
 595            struct {USItype __h, __l;} __i;                              \
 596           } __x;                                                        \
 597     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
 598     (wh) = __x.__i.__h;                                                 \
 599     (wl) = __x.__i.__l;                                                 \
 600   } while (0)
 601 #define UMUL_TIME 8
 602 #define UDIV_TIME 60
 603 #else
 604 #define UMUL_TIME 40
 605 #define UDIV_TIME 80
 606 #endif
 607 #define count_leading_zeros(count, x) \
 608   do {                                                                  \
 609     USItype __tmp;                                                      \
 610     __asm__ (                                                           \
 611        "ldi             1,%0\n"                                         \
 612 "       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
 613 "       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
 614 "       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
 615 "       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
 616 "       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
 617 "       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
 618 "       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
 619 "       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
 620 "       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
 621 "       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
 622 "       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
 623 "       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
 624 "       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
 625 "       sub             %0,%1,%0        ; Subtract it.\n"               \
 626         : "=r" (count), "=r" (__tmp) : "1" (x));                        \
 627   } while (0)
 628 #endif /* hppa */
 629
 630 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
 631    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
 632    is just a case of no direct support for 2.0n but treating it like 1.0. */
 633 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
     /* Two-word add and subtract for 64-bit words.  The second instruction
        of each pair is "add,dc" / "sub,db" and works on the high words; the
        operand constraints are the same as in the 32-bit hppa macros.  */
 634 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 635   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
 636            : "=r" (sh), "=&r" (sl)                                      \
 637            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
 638 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 639   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
 640            : "=r" (sh), "=&r" (sl)                                      \
 641            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 642 #endif /* hppa */
 643
 644 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
     /* Signed multiply and divide for i370/s390/mvs with 32-bit words.  Both
        macros overlay a DItype with the struct {__h, __l} to move between
        the two-word value and its halves.  */
     /* smul_ppmm: "lr" copies m0 into %N0, then "mr" multiplies operand 0 by
        m1.  NOTE(review): %N0 is presumably the second register of the pair
        holding operand 0 -- confirm against the GCC s390 port.  xh is read
        from __h and xl from __l.  */
 645 #define smul_ppmm(xh, xl, m0, m1) \
 646   do {                                                                  \
 647     union {DItype __ll;                                                 \
 648            struct {USItype __h, __l;} __i;                              \
 649           } __x;                                                        \
 650     __asm__ ("lr %N0,%1\n\tmr %0,%2"                                    \
 651              : "=&r" (__x.__ll)                                         \
 652              : "r" (m0), "r" (m1));                                     \
 653     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 654   } while (0)
     /* sdiv_qrnnd: loads n1:n0 into the union, divides it in place by d with
        "dr", then reads q from __l and r from __h.  Note that n1 and n0 are
        used without parentheses.  */
 655 #define sdiv_qrnnd(q, r, n1, n0, d) \
 656   do {                                                                  \
 657     union {DItype __ll;                                                 \
 658            struct {USItype __h, __l;} __i;                              \
 659           } __x;                                                        \
 660     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 661     __asm__ ("dr %0,%2"                                                 \
 662              : "=r" (__x.__ll)                                          \
 663              : "0" (__x.__ll), "r" (d));                                \
 664     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 665   } while (0)
 666 #endif
 667
 668 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
     /* Two-word add: "addl" on the low words, then "adcl" on the high words.
        sh/sl start out holding ah/al (matching constraints "0"/"1"), sl is
        early-clobber, and "%" lets the compiler commute al with bl.  */
 669 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 670   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
 671            : "=r" (sh), "=&r" (sl)                                      \
 672            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 673              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* Two-word subtract: "subl" then "sbbl"; operands are not commutable.  */
 674 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 675   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
 676            : "=r" (sh), "=&r" (sl)                                      \
 677            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 678              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* "mull": u enters in the "a" register; w0 is read back from "a" and w1
        from "d".  v may be a register or memory operand.  */
 679 #define umul_ppmm(w1, w0, u, v) \
 680   __asm__ ("mull %3"                                                    \
 681            : "=a" (w0), "=d" (w1)                                       \
 682            : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
     /* "divl": n0 enters in "a" and n1 in "d"; q is read back from "a" and r
        from "d".  */
 683 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
 684   __asm__ ("divl %4"                 /* stringification in K&R C */     \
 685            : "=a" (q), "=d" (r)                                         \
 686            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
 687
 688 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
 689 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
 690    significant 1 bit is, hence the use of the following alternatives.  bsfl
 691    is slow too, between 18 and 42 depending where the least significant 1
 692    bit is, so let the generic count_trailing_zeros below make use of the
 693    count_leading_zeros here too.  */
 694
 695 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
 696 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
 697    cache miss reading from __clz_tab.  For P55 it's favoured over the float
 698    below so as to avoid mixing MMX and x87, since the penalty for switching
 699    between the two is about 100 cycles.
 700
 701    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
 702    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
 703    follows, but as of gcc 2.95.2 it results in conditional jumps.
 704
 705        __shift = -(__n < 0x1000000);
 706        __shift -= (__n < 0x10000);
 707        __shift -= (__n < 0x100);
 708
 709    The middle two sbbl and cmpl's pair, and with luck something gcc
 710    generates might pair with the first cmpl and the last sbbl.  The "32+1"
 711    constant could be folded into __clz_tab[], but it doesn't seem worth
 712    making a different table just for that.  */
 713
 714 #define count_leading_zeros(c,n)                                        \
 715   do {                                                                  \
         /* __n is a single-evaluation copy of n.  */                        \
 716     USItype  __n = (n);                                                 \
 717     USItype  __shift;                                                   \
         /* Three compare/subtract-with-borrow steps; __shift is marked     */ \
         /* early-clobber because it is written before __n is last read.    */ \
 718     __asm__ ("cmpl  $0x1000000, %1\n"                                   \
 719              "sbbl  %0, %0\n"                                           \
 720              "cmpl  $0x10000, %1\n"                                     \
 721              "sbbl  $0, %0\n"                                           \
 722              "cmpl  $0x100, %1\n"                                       \
 723              "sbbl  $0, %0\n"                                           \
 724              : "=&r" (__shift) : "r"  (__n));                           \
         /* Maps -3, -2, -1, 0 to a right-shift count of 1, 9, 17, 25.  */   \
 725     __shift = __shift*8 + 24 + 1;                                       \
         /* The remaining high bits index __clz_tab.  */                     \
 726     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
 727   } while (0)
 728 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 729 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
 730
 731 #else /* ! pentiummmx || LONGLONG_STANDALONE */
 732 /* The following should be a fixed 14 cycles or so.  Some scheduling
 733    opportunities should be available between the float load/store too.  This
 734    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
 735    apparently suggested by the Intel optimizing manual (don't know exactly
 736    where).  gcc 2.95 or up will be best for this, so the "double" is
 737    correctly aligned on the stack.  */
 738 #define count_leading_zeros(c,n)                                        \
 739   do {                                                                  \
         /* Overlay a double with two unsigned words so the exponent field  */ \
         /* can be read as an integer.                                      */ \
 740     union {                                                             \
 741       double    d;                                                      \
 742       unsigned  a[2];                                                   \
 743     } __u;                                                              \
 744     ASSERT ((n) != 0);                                                  \
         /* Convert n to double; a[1] >> 20 is then read as the biased      */ \
         /* exponent (this relies on a[1] holding the high half of the      */ \
         /* double, and on the sign bit being clear).                       */ \
 745     __u.d = (UWtype) (n);                                               \
 746     (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
 747   } while (0)
 748 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
 749 #endif /* pentiummmx */
 750
 751 #else /* ! pentium */
 752
 753 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
     /* count_leading_zeros_gcc_clz is defined outside this part of the
        file.  */
 754 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
 755 #endif /* gcc clz */
 756
 757 /* On P6, gcc prior to 3.0 generates a partial register stall for
 758    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
 759    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
 760    cost of one extra instruction.  Do this for "i386" too, since that means
 761    generic x86.  */
 762 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
 763   && (HAVE_HOST_CPU_i386                                                \
 764       || HAVE_HOST_CPU_i686                                             \
 765       || HAVE_HOST_CPU_pentiumpro                                       \
 766       || HAVE_HOST_CPU_pentium2                                         \
 767       || HAVE_HOST_CPU_pentium3)
     /* "bsrl" variant for the CPUs listed above: __cbtmp receives the bsrl
        result and count is 31 - __cbtmp.  x must be nonzero (ASSERT).  */
 768 #define count_leading_zeros(count, x)                                   \
 769   do {                                                                  \
 770     USItype __cbtmp;                                                    \
 771     ASSERT ((x) != 0);                                                  \
 772     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 773     (count) = 31 - __cbtmp;                                             \
 774   } while (0)
 775 #endif /* gcc<3 asm bsrl */
 776
 777 #ifndef count_leading_zeros
     /* Fallback when neither definition above applied: same "bsrl", with
        count computed as __cbtmp ^ 31 (equal to 31 - __cbtmp for values in
        0..31).  */
 778 #define count_leading_zeros(count, x)                                   \
 779   do {                                                                  \
 780     USItype __cbtmp;                                                    \
 781     ASSERT ((x) != 0);                                                  \
 782     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 783     (count) = __cbtmp ^ 31;                                             \
 784   } while (0)
 785 #endif /* asm bsrl */
 786
 787 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
     /* count_trailing_zeros_gcc_ctz is defined outside this part of the
        file.  */
 788 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
 789 #endif /* gcc ctz */
 790
 791 #ifndef count_trailing_zeros
     /* Fallback: one "bsfl" that writes its result directly into count.
        x must be nonzero (ASSERT).  */
 792 #define count_trailing_zeros(count, x)                                  \
 793   do {                                                                  \
 794     ASSERT ((x) != 0);                                                  \
 795     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
 796   } while (0)
 797 #endif /* asm bsfl */
 798
 799 #endif /* ! pentium */
 800
 801 #ifndef UMUL_TIME
 802 #define UMUL_TIME 10
 803 #endif
 804 #ifndef UDIV_TIME
 805 #define UDIV_TIME 40
 806 #endif
 807 #endif /* 80x86 */
 808
 809 #if defined (__amd64__) && W_TYPE_SIZE == 64
     /* 64-bit counterparts of the i386 macros: q-suffixed instructions and
        UDItype casts.  */
     /* "addq" on the low words, then "adcq" on the high words.  bh and bl
        use the "rme" constraint; sl is early-clobber.  */
 810 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 811   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
 812            : "=r" (sh), "=&r" (sl)                                      \
 813            : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
 814              "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* "subq" then "sbbq"; operands are not commutable.  */
 815 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 816   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
 817            : "=r" (sh), "=&r" (sl)                                      \
 818            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
 819              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* "mulq": u enters in "a"; w0 is read back from "a" and w1 from "d".  */
 820 #define umul_ppmm(w1, w0, u, v) \
 821   __asm__ ("mulq %3"                                                    \
 822            : "=a" (w0), "=d" (w1)                                       \
 823            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
     /* "divq": n0 enters in "a" and n1 in "d"; q is read back from "a" and r
        from "d".  */
 824 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
 825   __asm__ ("divq %4"                 /* stringification in K&R C */     \
 826            : "=a" (q), "=d" (r)                                         \
 827            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
 828 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
     /* count is __cbtmp ^ 63; x must be nonzero (ASSERT).  */
 829 #define count_leading_zeros(count, x)                                   \
 830   do {                                                                  \
 831     UDItype __cbtmp;                                                    \
 832     ASSERT ((x) != 0);                                                  \
 833     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
 834     (count) = __cbtmp ^ 63;                                             \
 835   } while (0)
 836 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
 837    count is only an int. */
 838 #define count_trailing_zeros(count, x)                                  \
 839   do {                                                                  \
 840     ASSERT ((x) != 0);                                                  \
 841     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
 842   } while (0)
 843 #endif /* x86_64 */
 844
 845 #if defined (__i860__) && W_TYPE_SIZE == 32
     /* rshift_rhlc (r, h, l, c): issues "shr" with c as first operand and r0
        as both source and destination, then "shrd" on h and l with the
        result written to r.
        NOTE(review): presumably r receives the low word of (h:l) >> c --
        confirm against the i860 manual.
        The ':' in front of "=r" is required: without it the constraint
        string is concatenated onto the instruction template and the asm
        statement has no output section.  */
 846 #define rshift_rhlc(r,h,l,c) \
 847   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
 848            : "=r" (r) : "r" (h), "r" (l), "rn" (c))
 849 #endif /* i860 */
 850
 851 #if defined (__i960__) && W_TYPE_SIZE == 32
     /* NOTE(review): the asm templates below separate instructions with
        "\;", which is not a standard C escape sequence -- confirm that the
        compiler in use accepts it.  */
     /* add_ssaaaa: "cmpo 1,0", then "addc" on the low words and "addc" on
        the high words.  NOTE(review): the cmpo presumably initialises the
        carry consumed by the first addc -- confirm.  sl is early-clobber.  */
 852 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 853   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
 854            : "=r" (sh), "=&r" (sl)                                      \
 855            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
     /* sub_ddmmss: "cmpo 0,0", then "subc" on the low and high words.  */
 856 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 857   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
 858            : "=r" (sh), "=&r" (sl)                                      \
 859            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
     /* umul_ppmm: "emul" writes a UDItype; the union (low half __l declared
        first) splits it into w1 (from __h) and w0 (from __l).  Uses a GCC
        statement expression.  */
 860 #define umul_ppmm(w1, w0, u, v) \
 861   ({union {UDItype __ll;                                                \
 862            struct {USItype __l, __h;} __i;                              \
 863           } __x;                                                        \
 864   __asm__ ("emul %2,%1,%0"                                              \
 865            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
 866   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
     /* __umulsidi3: the same "emul", yielding the whole UDItype result as
        the value of the statement expression.  */
 867 #define __umulsidi3(u, v) \
 868   ({UDItype __w;                                                        \
 869     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
 870     __w; })
 871 #define udiv_qrnnd(q, r, nh, nl, d) \
 872   do {                                                                  \
         /* __nn carries the two-word dividend nh:nl into the asm; __rq     */ \
         /* receives the two-word result.  __rq was previously used without */ \
         /* being declared.                                                 */ \
 873     union {UDItype __ll;                                                \
 874            struct {USItype __l, __h;} __i;                              \
 875           } __nn, __rq;                                                 \
 876     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
         /* Operands are referenced by number (the former "%d,%n" are not   */ \
         /* valid operand references): %2 is the divisor d, %1 the dividend,*/ \
         /* %0 the result pair, in the same order as the "emul" templates.  */ \
 877     __asm__ ("ediv %2,%1,%0"                                            \
 878            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
         /* The remainder is read from the low half, the quotient from the  */ \
         /* high half.                                                      */ \
 879     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
 880   } while (0)
 881 #define count_leading_zeros(count, x) \
 882   do {                                                                  \
         /* __cbtmp receives the "scanbit" result; count is __cbtmp ^ 31.   */ \
 883     USItype __cbtmp;                                                    \
 884     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
 885     (count) = __cbtmp ^ 31;                                             \
 886   } while (0)
 887 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
 888 #if defined (__i960mx)          /* what is the proper symbol to test??? */
 889 #define rshift_rhlc(r,h,l,c) \
 890   do {                                                                  \
 891     union {UDItype __ll;                                                \
 892            struct {USItype __l, __h;} __i;                              \
 893           } __nn;                                                       \
 894     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
 895     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
 896   }
 897 #endif /* i960mx */
 898 #endif /* i960 */
 899
 900 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
 901      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
 902      || defined (__mc5307__)) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "add.l" on the low words, then "addx.l" on the high
        words.  Outputs and bh use the "d" constraint; sh/sl start out
        holding ah/al through the "0"/"1" matching constraints.  */
 903 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 904   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
 905            : "=d" (sh), "=&d" (sl)                                      \
 906            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
 907              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss: same shape with "sub.l"/"subx.l"; operands are not
        commutable.  */
 908 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 909   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
 910            : "=d" (sh), "=&d" (sl)                                      \
 911            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
 912              "1" ((USItype)(al)), "g" ((USItype)(bl)))
 913 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
 914 #if defined (__mc68020__) || defined(mc68020) \
 915      || defined (__mc68030__) || defined (mc68030) \
 916      || defined (__mc68040__) || defined (mc68040) \
 917      || defined (__mcpu32__) || defined (mcpu32) \
 918      || defined (__NeXT__)
     /* umul_ppmm: one "mulu.l" writing the register pair %1:%0; u enters in
        the register that returns w0.  */
 919 #define umul_ppmm(w1, w0, u, v) \
 920   __asm__ ("mulu%.l %3,%1:%0"                                           \
 921            : "=d" (w0), "=d" (w1)                                       \
 922            : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
 923 #define UMUL_TIME 45
     /* udiv_qrnnd: one "divu.l" on the pair %1:%0; n0 enters in the register
        that returns q and n1 in the register that returns r.  */
 924 #define udiv_qrnnd(q, r, n1, n0, d) \
 925   __asm__ ("divu%.l %4,%1:%0"                                           \
 926            : "=d" (q), "=d" (r)                                         \
 927            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
 928 #define UDIV_TIME 90
     /* sdiv_qrnnd: same operands as udiv_qrnnd, using "divs.l".  */
 929 #define sdiv_qrnnd(q, r, n1, n0, d) \
 930   __asm__ ("divs%.l %4,%1:%0"                                           \
 931            : "=d" (q), "=d" (r)                                         \
 932            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
 933 #else /* for other 68k family members use 16x16->32 multiplication */
     /* umul_ppmm: combines four "mulu.w" partial products into xh:xl.
        __umul_tmp1 and __umul_tmp2 are scratch registers (operands 2 and
        3); a enters in operand 2 and b is read from operand 5.  */
 934 #define umul_ppmm(xh, xl, a, b) \
 935   do { USItype __umul_tmp1, __umul_tmp2;                                \
 936         __asm__ ("| Inlined umul_ppmm\n"                                \
 937 "       move%.l %5,%3\n"                                                \
 938 "       move%.l %2,%0\n"                                                \
 939 "       move%.w %3,%1\n"                                                \
 940 "       swap    %3\n"                                                   \
 941 "       swap    %0\n"                                                   \
 942 "       mulu%.w %2,%1\n"                                                \
 943 "       mulu%.w %3,%0\n"                                                \
 944 "       mulu%.w %2,%3\n"                                                \
 945 "       swap    %2\n"                                                   \
 946 "       mulu%.w %5,%2\n"                                                \
 947 "       add%.l  %3,%2\n"                                                \
 948 "       jcc     1f\n"                                                   \
 949 "       add%.l  %#0x10000,%0\n"                                         \
 950 "1:     move%.l %2,%3\n"                                                \
 951 "       clr%.w  %2\n"                                                   \
 952 "       swap    %2\n"                                                   \
 953 "       swap    %3\n"                                                   \
 954 "       clr%.w  %3\n"                                                   \
 955 "       add%.l  %3,%1\n"                                                \
 956 "       addx%.l %2,%0\n"                                                \
 957 "       | End inlined umul_ppmm"                                        \
 958               : "=&d" (xh), "=&d" (xl),                                 \
 959                 "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
 960               : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
 961   } while (0)
 962 #define UMUL_TIME 100
 963 #define UDIV_TIME 400
 964 #endif /* not mc68020 */
 965 /* The '020, '030, '040 and '060 have bitfield insns.
 966    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
 967    exclude bfffo on that chip (bitfield insns not available).  */
 968 #if (defined (__mc68020__) || defined (mc68020)    \
 969      || defined (__mc68030__) || defined (mc68030) \
 970      || defined (__mc68040__) || defined (mc68040) \
 971      || defined (__mc68060__) || defined (mc68060) \
 972      || defined (__NeXT__))                        \
 973   && ! defined (__mcpu32__)
     /* count_leading_zeros: one "bfffo" over x; both bitfield operands in
        the template come from the constant 0 passed as operand 2.  */
 974 #define count_leading_zeros(count, x) \
 975   __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
 976            : "=d" (count)                                               \
 977            : "od" ((USItype) (x)), "n" (0))
 978 #define COUNT_LEADING_ZEROS_0 32
 979 #endif
 980 #endif /* mc68000 */
 981
 982 #if defined (__m88000__) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "addu.co" on the low words, then "addu.ci" on the high
        words; sl is early-clobber.  */
 983 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 984   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
 985            : "=r" (sh), "=&r" (sl)                                      \
 986            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
     /* sub_ddmmss: "subu.co" then "subu.ci"; operands are not commutable.  */
 987 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 988   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
 989            : "=r" (sh), "=&r" (sl)                                      \
 990            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
     /* count_leading_zeros: __cbtmp receives the "ff1" result; count is
        __cbtmp ^ 31.  */
 991 #define count_leading_zeros(count, x) \
 992   do {                                                                  \
 993     USItype __cbtmp;                                                    \
 994     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
 995     (count) = __cbtmp ^ 31;                                             \
 996   } while (0)
 997 #define COUNT_LEADING_ZEROS_0 63 /* sic */
 998 #if defined (__m88110__)
     /* umul_ppmm, m88110 only: "mulu.d" writes a UDItype; the union (high
        half __h declared first) splits it into wh and wl.  */
 999 #define umul_ppmm(wh, wl, u, v) \
1000   do {                                                                  \
1001     union {UDItype __ll;                                                \
1002            struct {USItype __h, __l;} __i;                              \
1003           } __x;                                                        \
1004     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1005     (wh) = __x.__i.__h;                                                 \
1006     (wl) = __x.__i.__l;                                                 \
1007   } while (0)
1008 #define udiv_qrnnd(q, r, n1, n0, d) \
       /* __x carries the two-word dividend n1:n0 into "divu.d"; __q        */ \
       /* receives its result.  The halves of the union live in the nested  */ \
       /* struct __i, so the low word of the quotient is __q.__i.__l (the   */ \
       /* former __q.__l names no member).  r is then computed as           */ \
       /* n0 - q * d in USItype arithmetic.                                 */ \
1009   ({union {UDItype __ll;                                                \
1010            struct {USItype __h, __l;} __i;                              \
1011           } __x, __q;                                                   \
1012   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1013   __asm__ ("divu.d %0,%1,%2"                                            \
1014            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1015   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1016 #define UMUL_TIME 5
1017 #define UDIV_TIME 25
1018 #else
1019 #define UMUL_TIME 17
1020 #define UDIV_TIME 150
1021 #endif /* __m88110__ */
1022 #endif /* __m88000__ */
1023
1024 #if defined (__mips) && W_TYPE_SIZE == 32
     /* umul_ppmm for 32-bit words.  Three definitions are tried in order and
        the first whose condition holds is used:
          1. __GMP_GNUC_PREREQ (4,4): plain C, a UDItype product split with
             a shift and a truncating assignment;
          2. __GMP_GNUC_PREREQ (2,7): "multu" with the results taken through
             the "l" and "h" constraints;
          3. otherwise: "multu" followed by explicit "mflo"/"mfhi".
        NOTE(review): in variant 1 the w1 and w0 arguments are used without
        parentheses.  */
1025 #if __GMP_GNUC_PREREQ (4,4)
1026 #define umul_ppmm(w1, w0, u, v) \
1027   do {                                                                  \
1028     UDItype __ll = (UDItype)(u) * (v);                                  \
1029     w1 = __ll >> 32;                                                    \
1030     w0 = __ll;                                                          \
1031   } while (0)
1032 #endif
1033 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1034 #define umul_ppmm(w1, w0, u, v) \
1035   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1036 #endif
1037 #if !defined (umul_ppmm)
1038 #define umul_ppmm(w1, w0, u, v) \
1039   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1040            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1041 #endif
1042 #define UMUL_TIME 10
1043 #define UDIV_TIME 100
1044 #endif /* __mips */
1045
1046 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
     /* umul_ppmm for 64-bit words, with the same three-way selection as the
        32-bit MIPS block:
          1. __GMP_GNUC_PREREQ (4,4): plain C using a local mode(TI)
             unsigned type for the double-word product;
          2. __GMP_GNUC_PREREQ (2,7): "dmultu" with "l"/"h" outputs;
          3. otherwise: "dmultu" followed by explicit "mflo"/"mfhi".
        NOTE(review): in variant 1 the w1 and w0 arguments are used without
        parentheses.  */
1047 #if __GMP_GNUC_PREREQ (4,4)
1048 #define umul_ppmm(w1, w0, u, v) \
1049   do {                                                                  \
1050     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1051     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1052     w1 = __ll >> 64;                                                    \
1053     w0 = __ll;                                                          \
1054   } while (0)
1055 #endif
1056 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1057 #define umul_ppmm(w1, w0, u, v) \
1058   __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1059 #endif
1060 #if !defined (umul_ppmm)
1061 #define umul_ppmm(w1, w0, u, v) \
1062   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1063            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1064 #endif
1065 #define UMUL_TIME 20
1066 #define UDIV_TIME 140
1067 #endif /* __mips */
1068
1069 #if defined (__mmix__) && W_TYPE_SIZE == 64
     /* umul_ppmm: one "MULU"; w0 is an ordinary register output and w1 is
        taken through the "z" constraint.  NOTE(review): "z" presumably names
        the register where MULU leaves the high half -- confirm against the
        GCC mmix port.  */
1070 #define umul_ppmm(w1, w0, u, v) \
1071   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1072 #endif
1073
1074 #if defined (__ns32000__) && W_TYPE_SIZE == 32
     /* umul_ppmm: "meid" multiplies in place; u enters in the UDItype
        operand (matching constraint "0") and the union (low half __l
        declared first) splits the result into w1 (from __h) and w0 (from
        __l).  Uses a GCC statement expression.  */
1075 #define umul_ppmm(w1, w0, u, v) \
1076   ({union {UDItype __ll;                                                \
1077            struct {USItype __l, __h;} __i;                              \
1078           } __x;                                                        \
1079   __asm__ ("meid %2,%0"                                                 \
1080            : "=g" (__x.__ll)                                            \
1081            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1082   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
     /* __umulsidi3: the same "meid", yielding the whole UDItype result.  */
1083 #define __umulsidi3(u, v) \
1084   ({UDItype __w;                                                        \
1085     __asm__ ("meid %2,%0"                                               \
1086              : "=g" (__w)                                               \
1087              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1088     __w; })
     /* udiv_qrnnd: loads n1:n0 into the union, divides it in place by d with
        "deid", then reads r from __l and q from __h.  */
1089 #define udiv_qrnnd(q, r, n1, n0, d) \
1090   ({union {UDItype __ll;                                                \
1091            struct {USItype __l, __h;} __i;                              \
1092           } __x;                                                        \
1093   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1094   __asm__ ("deid %2,%0"                                                 \
1095            : "=g" (__x.__ll)                                            \
1096            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1097   (r) = __x.__i.__l; (q) = __x.__i.__h; })
     /* count_trailing_zeros: one "ffsd"; count is preloaded with 0 through
        the "0" matching constraint.  */
1098 #define count_trailing_zeros(count,x) \
1099   do {                                                                  \
1100     __asm__ ("ffsd      %2,%0"                                          \
1101              : "=r" (count)                                             \
1102              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1103   } while (0)
1104 #endif /* __ns32000__ */
1105
1106 /* In the past we had a block of various #defines tested
1107        _ARCH_PPC    - AIX
1108        _ARCH_PWR    - AIX
1109        __powerpc__  - gcc
1110        __POWERPC__  - BEOS
1111        __ppc__      - Darwin
1112        PPC          - old gcc, GNU/Linux, SysV
1113    The plain PPC test was not good for vxWorks, since PPC is defined on all
1114    CPUs there (eg. m68k too), as a constant one is expected to compare
1115    CPU_FAMILY against.
1116
1117    At any rate, this was pretty unattractive and a bit fragile.  The use of
1118    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1119    getting the desired effect.
1120
1121    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1122    the system vendor compilers.  (Is that vendor compilers with inline asm,
1123    or what?)  */
1124
1125 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1126   && W_TYPE_SIZE == 32
     /* add_ssaaaa: two-word add with three code shapes, chosen by what the
        compiler knows about bh:
          - bh is the constant 0:        low-word add, then aze/addze on ah;
          - bh is the constant ~0:       low-word add, then ame/addme on ah;
          - otherwise:                   low-word add, then ae/adde ah,bh.
        bh is not passed to the asm in the first two cases.  bl may be an
        immediate ("rI").  NOTE(review): the "{old|new}" groups presumably
        select between POWER and PowerPC mnemonics -- confirm against the
        GCC rs6000 port.  */
1127 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1128   do {                                                                  \
1129     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1130       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1131              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1132     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1133       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1134              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1135     else                                                                \
1136       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1137              : "=r" (sh), "=&r" (sl)                                    \
1138              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1139   } while (0)
/* sub_ddmmss (32-bit POWER/PowerPC): two-word subtract,
   (sh,sl) = (ah,al) - (bh,bl).
   The low words always use a carry-setting subtract-from (AL may be an
   immediate, selected through %I<n>).  The high-word instruction is chosen
   at compile time from whichever of AH or BH is a known constant:
     AH constant 0   -> {sfze|subfze} on BH
     AH constant ~0  -> {sfme|subfme} on BH
     BH constant 0   -> {ame|addme}   on AH
     BH constant ~0  -> {aze|addze}   on AH
     otherwise       -> {sfe|subfe}   with both AH and BH in registers.
   Note that operand %2 is BH in the first two branches and AH in the next
   two.  */
1140 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1141   do {                                                                  \
1142     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1143       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
1144                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1145     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1146       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
1147                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1148     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1149       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
1150                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1151     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1152       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
1153                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1154     else                                                                \
1155       __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
1156                : "=r" (sh), "=&r" (sl)                                  \
1157                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1158   } while (0)
/* count_leading_zeros (32-bit POWER/PowerPC): one {cntlz|cntlzw} instruction
   on X.  COUNT_LEADING_ZEROS_0 records the result the macro yields for
   X == 0, which is 32 here.  */
1159 #define count_leading_zeros(count, x) \
1160   __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1161 #define COUNT_LEADING_ZEROS_0 32
1162 #if HAVE_HOST_CPU_FAMILY_powerpc
1163 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm (32-bit PowerPC, GCC >= 4.4): no inline asm; U is widened to
   UDItype, multiplied by V, and the 64-bit product is split into its high
   (W1) and low (W0) 32-bit halves.
   NOTE(review): W1 and W0 are used unparenthesized, so callers should pass
   plain lvalues.  */
1164 #define umul_ppmm(w1, w0, u, v) \
1165   do {                                                                  \
1166     UDItype __ll = (UDItype)(u) * (v);                                  \
1167     w1 = __ll >> 32;                                                    \
1168     w0 = __ll;                                                          \
1169   } while (0)
1170 #endif
1171 #if !defined (umul_ppmm)
/* umul_ppmm (32-bit PowerPC, fallback when no earlier definition exists):
   PH receives the high 32 bits of M0 * M1 from `mulhwu`, PL the low 32 bits
   from an ordinary C multiply.
   M0 and M1 are copied once into the USItype temporaries __m0/__m1 and only
   those temporaries are used afterwards, so each argument is evaluated
   exactly once and the asm always sees 32-bit operands.  */
1172 #define umul_ppmm(ph, pl, m0, m1) \
1173   do {                                                                  \
1174     USItype __m0 = (m0), __m1 = (m1);                                   \
1175     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1176     (pl) = __m0 * __m1;                                                 \
1177   } while (0)
1178 #endif
1179 #define UMUL_TIME 15
/* smul_ppmm (32-bit PowerPC): signed 32x32 -> 64 multiply.  PH receives the
   high word from `mulhw`, PL the low word.
   M0 and M1 are copied once into the SItype temporaries __m0/__m1 and only
   those are used afterwards, so each argument is evaluated exactly once.
   The low word is computed with unsigned arithmetic: it has the same bit
   pattern as the signed product but cannot trigger signed-overflow
   undefined behaviour.  */
1180 #define smul_ppmm(ph, pl, m0, m1) \
1181   do {                                                                  \
1182     SItype __m0 = (m0), __m1 = (m1);                                    \
1183     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1184     (pl) = (USItype) __m0 * (USItype) __m1;                             \
1185   } while (0)
1186 #define SMUL_TIME 14
1187 #define UDIV_TIME 120
1188 #else
1189 #define UMUL_TIME 8
/* smul_ppmm (POWER, i.e. the branch where HAVE_HOST_CPU_FAMILY_powerpc is
   not set): one `mul` instruction.  XH is the instruction's register result;
   XL is taken from the register class named by the "q" constraint
   (presumably the POWER MQ register -- confirm against the GCC rs6000
   constraint list).  */
1190 #define smul_ppmm(xh, xl, m0, m1) \
1191   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1192 #define SMUL_TIME 4
/* sdiv_qrnnd (POWER): one `div` instruction.  NH is passed in a register and
   NL is tied (constraint "1") to the "q"-class operand that also returns R;
   Q is the instruction's register result.  */
1193 #define sdiv_qrnnd(q, r, nh, nl, d) \
1194   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1195 #define UDIV_TIME 100
1196 #endif
1197 #endif /* 32-bit POWER architecture variants.  */
1198
1199 /* We should test _IBMR2 here when we add assembly support for the system
1200    vendor compilers.  */
1201 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1202 #if !defined (_LONG_LONG_LIMB)
1203 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1204    use adde etc only when not _LONG_LONG_LIMB.  */
/* add_ssaaaa (64-bit PowerPC, only when _LONG_LONG_LIMB is not defined):
   two-word add, (sh,sl) = (ah,al) + (bh,bl).  Same structure as the 32-bit
   variant, with the constant tests done against UDItype: the high-word
   instruction is addze when BH is the constant 0, addme when BH is the
   constant ~0, and adde otherwise.  BL may be an immediate ("rI"), in which
   case %I<n> selects the immediate form of the carry-setting add.  */
1205 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1206   do {                                                                  \
1207     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1208       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1209              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1210     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1211       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1212              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1213     else                                                                \
1214       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1215              : "=r" (sh), "=&r" (sl)                                    \
1216              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1217   } while (0)
/* sub_ddmmss (64-bit PowerPC, only when _LONG_LONG_LIMB is not defined):
   two-word subtract, (sh,sl) = (ah,al) - (bh,bl).

   Two families of code are generated:
   - BL is a small non-zero compile-time constant: the low word is computed
     as AL + (-BL) with `addic`, whose carry-out equals the "no borrow"
     condition of AL - BL.
   - otherwise: the low word uses the carry-setting subtract-from.
   In both families the high-word instruction is chosen from whichever of
   AH/BH is a known 0 or ~0 constant, falling back to subfe.

   BL == 0 must NOT take the addic path: "addic sl,al,0" never produces a
   carry, whereas AL - 0 never borrows, so the following extended
   instruction would compute a high word that is one too small.  In the
   addic path AL is constrained to a register ("r") because addic has no
   immediate form for its first source operand.  */
1218 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1219    This might seem strange, but gcc folds away the dead code late.  */
1220 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1221   do {                                                                        \
1222     if (__builtin_constant_p (bl) && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
1223         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1224           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"               \
1225                    : "=r" (sh), "=&r" (sl) : "r" (bh), "r" (al), "*rI" (-(bl))); \
1226         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1227           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"               \
1228                    : "=r" (sh), "=&r" (sl) : "r" (bh), "r" (al), "*rI" (-(bl))); \
1229         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1230           __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"                 \
1231                    : "=r" (sh), "=&r" (sl) : "r" (ah), "r" (al), "*rI" (-(bl))); \
1232         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1233           __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"                 \
1234                    : "=r" (sh), "=&r" (sl) : "r" (ah), "r" (al), "*rI" (-(bl))); \
1235         else                                                                  \
1236           __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"              \
1237                    : "=r" (sh), "=&r" (sl)                                    \
1238                    : "r" (ah), "r" (bh), "r" (al), "*rI" (-(bl)));            \
1239       } else {                                                                \
1240         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1241           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"         \
1242                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1243         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1244           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"         \
1245                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1246         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1247           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"           \
1248                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1249         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1250           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"           \
1251                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1252         else                                                                  \
1253           __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"        \
1254                    : "=r" (sh), "=&r" (sl)                                    \
1255                    : "r" (ah), "r" (bh), "rI" (al), "r" (bl));                \
1256       }                                                                       \
1257   } while (0)
1258 #endif /* ! _LONG_LONG_LIMB */
/* count_leading_zeros (64-bit PowerPC): one `cntlzd` instruction on X.
   COUNT_LEADING_ZEROS_0 records the result the macro yields for X == 0,
   which is 64 here.  */
1259 #define count_leading_zeros(count, x) \
1260   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1261 #define COUNT_LEADING_ZEROS_0 64
1262 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm (64-bit PowerPC, GCC >= 4.4): no inline asm; U is widened to a
   locally typedef'd 128-bit unsigned type (mode(TI)), multiplied by V, and
   the product is split into its high (W1) and low (W0) 64-bit halves.
   NOTE(review): W1 and W0 are used unparenthesized, so callers should pass
   plain lvalues.  */
1263 #define umul_ppmm(w1, w0, u, v) \
1264   do {                                                                  \
1265     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1266     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1267     w1 = __ll >> 64;                                                    \
1268     w0 = __ll;                                                          \
1269   } while (0)
1270 #endif
1271 #if !defined (umul_ppmm)
/* umul_ppmm (64-bit PowerPC, fallback when no earlier definition exists):
   PH receives the high 64 bits of M0 * M1 from `mulhdu`, PL the low 64 bits
   from an ordinary C multiply.
   M0 and M1 are copied once into the UDItype temporaries __m0/__m1 and only
   those temporaries are used afterwards, so each argument is evaluated
   exactly once and the asm always sees 64-bit operands.  */
1272 #define umul_ppmm(ph, pl, m0, m1) \
1273   do {                                                                  \
1274     UDItype __m0 = (m0), __m1 = (m1);                                   \
1275     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1276     (pl) = __m0 * __m1;                                                 \
1277   } while (0)
1278 #endif
1279 #define UMUL_TIME 15
/* smul_ppmm (64-bit PowerPC): signed 64x64 -> 128 multiply.  PH receives the
   high word from `mulhd`, PL the low word.
   M0 and M1 are copied once into the DItype temporaries __m0/__m1 and only
   those are used afterwards, so each argument is evaluated exactly once.
   The low word is computed with unsigned arithmetic: it has the same bit
   pattern as the signed product but cannot trigger signed-overflow
   undefined behaviour.  */
1280 #define smul_ppmm(ph, pl, m0, m1) \
1281   do {                                                                  \
1282     DItype __m0 = (m0), __m1 = (m1);                                    \
1283     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1284     (pl) = (UDItype) __m0 * (UDItype) __m1;                             \
1285   } while (0)
1286 #define SMUL_TIME 14  /* ??? */
1287 #define UDIV_TIME 120 /* ??? */
1288 #endif /* 64-bit PowerPC.  */
1289
1290 #if defined (__pyr__) && W_TYPE_SIZE == 32
/* add_ssaaaa (Pyramid): two-operand `addw` on the low words followed by
   `addwc` on the high words.  AH and AL are tied to the outputs SH and SL
   (constraints "0" and "%1"), so the instructions update them in place;
   BH and BL may be any general operand ("g").  */
1291 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1292   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1293            : "=r" (sh), "=&r" (sl)                                      \
1294            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1295              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss (Pyramid): two-operand `subw` on the low words followed by
   `subwb` on the high words.  AH and AL are tied to the outputs SH and SL
   (constraints "0" and "1"); unlike the add variant, AL is not marked
   commutative.  */
1296 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1297   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1298            : "=r" (sh), "=&r" (sl)                                      \
1299            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1300              "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* umul_ppmm (Pyramid): the product is built in a 64-bit union __x.  U is
   first moved into the %R0 half of the register pair holding __x.__ll, then
   `uemul` multiplies by V in place.  The halves are read back through the
   struct view: __h -> W1 (high word), __l -> W0 (low word).  Uses a GCC
   statement expression.  */
1301 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1302 #define umul_ppmm(w1, w0, u, v) \
1303   ({union {UDItype __ll;                                                \
1304            struct {USItype __h, __l;} __i;                              \
1305           } __x;                                                        \
1306   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1307            : "=&r" (__x.__ll)                                           \
1308            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1309   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1310 #endif /* __pyr__ */
1311
1312 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* add_ssaaaa (IBM RT/ROMP): `a` on the low words followed by `ae` on the
   high words.  AH and AL are tied to the outputs SH and SL ("0", "%1");
   BH and BL must be in registers.  */
1313 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1314   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1315            : "=r" (sh), "=&r" (sl)                                      \
1316            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1317              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
/* sub_ddmmss (IBM RT/ROMP): `s` on the low words followed by `se` on the
   high words.  AH and AL are tied to the outputs SH and SL ("0", "1");
   BH and BL must be in registers.  */
1318 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1319   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1320            : "=r" (sh), "=&r" (sl)                                      \
1321            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1322              "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* smul_ppmm (IBM RT/ROMP): multiply built from multiply-step instructions.
   Sequence: clear r2, load M0 into special register r10 (`mts`), issue 16
   `m r2,%3` steps against M1, then copy r2 to PH (`cas`) and read r10 back
   into PL (`mfs`).  r2 is used as scratch and is listed as clobbered.
   UMUL_TIME and UDIV_TIME are cost estimates consumed elsewhere.  */
1323 #define smul_ppmm(ph, pl, m0, m1) \
1324   __asm__ (                                                             \
1325        "s       r2,r2\n"                                                \
1326 "       mts r10,%2\n"                                                   \
1327 "       m       r2,%3\n"                                                \
1328 "       m       r2,%3\n"                                                \
1329 "       m       r2,%3\n"                                                \
1330 "       m       r2,%3\n"                                                \
1331 "       m       r2,%3\n"                                                \
1332 "       m       r2,%3\n"                                                \
1333 "       m       r2,%3\n"                                                \
1334 "       m       r2,%3\n"                                                \
1335 "       m       r2,%3\n"                                                \
1336 "       m       r2,%3\n"                                                \
1337 "       m       r2,%3\n"                                                \
1338 "       m       r2,%3\n"                                                \
1339 "       m       r2,%3\n"                                                \
1340 "       m       r2,%3\n"                                                \
1341 "       m       r2,%3\n"                                                \
1342 "       m       r2,%3\n"                                                \
1343 "       cas     %0,r2,r0\n"                                             \
1344 "       mfs     r10,%1"                                                 \
1345            : "=r" (ph), "=r" (pl)                                       \
1346            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1347            : "r2")
1348 #define UMUL_TIME 20
1349 #define UDIV_TIME 200
/* count_leading_zeros (IBM RT/ROMP): built from `clz` applied to one 16-bit
   half of X.  If X >= 0x10000 the upper half (X >> 16) is passed and the
   result used as is; otherwise X itself is passed and 16 is added for the
   all-zero upper half.  X is evaluated more than once.  */
1350 #define count_leading_zeros(count, x) \
1351   do {                                                                  \
1352     if ((x) >= 0x10000)                                                 \
1353       __asm__ ("clz     %0,%1"                                          \
1354                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1355     else                                                                \
1356       {                                                                 \
1357         __asm__ ("clz   %0,%1"                                          \
1358                  : "=r" (count) : "r" ((USItype)(x)));                  \
1359         (count) += 16;                                                  \
1360       }                                                                 \
1361   } while (0)
1362 #endif /* RT/ROMP */
1363
1364 #if defined (__sh2__) && W_TYPE_SIZE == 32
/* umul_ppmm (SH2): `dmulu.l` multiplies U by V; the product is then copied
   out of the MACL (low word -> W0) and MACH (high word -> W1) registers with
   `sts`.  Both are listed as clobbered.  UMUL_TIME is a cost estimate
   consumed elsewhere.  */
1365 #define umul_ppmm(w1, w0, u, v) \
1366   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1367            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1368 #define UMUL_TIME 5
1369 #endif
1370
1371 #if defined (__sparc__) && W_TYPE_SIZE == 32
/* add_ssaaaa (32-bit SPARC): `addcc` on the low words (sets the carry),
   then `addx` on the high words.  AH/AL accept a register or the "J"
   constant and are printed with the %r modifier; BH/BL accept a register or
   an "I" immediate.  __CLOBBER_CC (defined elsewhere in this file) supplies
   the condition-code clobber.  */
1372 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1373   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1374            : "=r" (sh), "=&r" (sl)                                      \
1375            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1376            __CLOBBER_CC)
/* sub_ddmmss (32-bit SPARC): `subcc` on the low words (sets the borrow),
   then `subx` on the high words.  Operand constraints mirror add_ssaaaa
   except that AL is not marked commutative.  */
1377 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1378   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1379            : "=r" (sh), "=&r" (sl)                                      \
1380            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1381            __CLOBBER_CC)
1382 /* Note: the following FIXME comes from GMP, thus it does not make sense to
1383    try to resolve it in MPFR. */
1384 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1385    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1386 #if defined (__sparc_v9__) || defined (__sparcv9)
1387 /* Perhaps we should use floating-point operations here?  */
1388 #if 0
/* Disabled (inside #if 0) SPARC v9 umul_ppmm: a 64-bit `mulx` into %g1,
   then `srl`/`srlx` to split the product into W0 (low 32 bits) and W1
   (high 32 bits).  %g1 is listed as clobbered.  */
1389 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1390    Perhaps we simply need to explicitly zero-extend the inputs?  */
1391 #define umul_ppmm(w1, w0, u, v) \
1392   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1393            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1394 #else
/* umul_ppmm actually used on SPARC v9 with 32-bit limbs: `umul` leaves the
   low word in W0 and the high word is read from the %y register into W1.  */
1395 /* Use v8 umul until above bug is fixed.  */
1396 #define umul_ppmm(w1, w0, u, v) \
1397   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1398 #endif
/* udiv_qrnnd (SPARC v9, 32-bit limbs): N1 is written to %y, three nops
   follow before `udiv` divides by D using N0 as the low dividend word.  The
   quotient goes to the temporary __q; the remainder is reconstructed in C
   as N0 - __q * D.  N0 and D are therefore evaluated twice.  */
1399 /* Use a plain v8 divide for v9.  */
1400 #define udiv_qrnnd(q, r, n1, n0, d) \
1401   do {                                                                  \
1402     USItype __q;                                                        \
1403     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1404              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1405     (r) = (n0) - __q * (d);                                             \
1406     (q) = __q;                                                          \
1407   } while (0)
1408 #else
1409 #if defined (__sparc_v8__)   /* gcc normal */                           \
1410   || defined (__sparcv8)     /* gcc solaris */                          \
1411   || HAVE_HOST_CPU_supersparc
/* umul_ppmm (SPARC v8 / SuperSPARC): `umul` leaves the low word in W0 and
   the high word is read from %y into W1.  Both inputs are restricted to
   registers for the reason given below.  UMUL_TIME is a cost estimate
   consumed elsewhere.  */
1412 /* Don't match immediate range because, 1) it is not often useful,
1413    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1414    while we want to match a 13 bit interval, sign extended to 32 bits,
1415    but INTERPRETED AS UNSIGNED.  */
1416 #define umul_ppmm(w1, w0, u, v) \
1417   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1418 #define UMUL_TIME 5
1419
1420 #if HAVE_HOST_CPU_supersparc
1421 #define UDIV_TIME 60            /* SuperSPARC timing */
1422 #else
/* udiv_qrnnd (SPARC v8, not SuperSPARC): N1 is written to %y, three nops
   follow before `udiv` divides by D using N0 as the low dividend word.  The
   quotient goes to the temporary __q; the remainder is reconstructed in C
   as N0 - __q * D.  N0 and D are therefore evaluated twice.  UDIV_TIME is a
   cost estimate consumed elsewhere.  */
1423 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1424    dividends and will trap to the kernel for the rest. */
1425 #define udiv_qrnnd(q, r, n1, n0, d) \
1426   do {                                                                  \
1427     USItype __q;                                                        \
1428     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1429              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1430     (r) = (n0) - __q * (d);                                             \
1431     (q) = __q;                                                          \
1432   } while (0)
1433 #define UDIV_TIME 25
1434 #endif /* HAVE_HOST_CPU_supersparc */
1435
1436 #else /* ! __sparc_v8__ */
1437 #if defined (__sparclite__)
/* umul_ppmm (SPARClite): same `umul` + `rd %y` sequence as the v8 variant;
   W0 receives the low word and W1 the high word read from %y.  */
1438 /* This has hardware multiply but not divide.  It also has two additional
1439    instructions scan (ffs from high bit) and divscc.  */
1440 #define umul_ppmm(w1, w0, u, v) \
1441   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1442 #define UMUL_TIME 5
/* udiv_qrnnd (SPARClite): division built from 32 `divscc` divide steps.
   Sequence: N1 is written to %y, `tst %g0` initialises the condition codes,
   the first step starts from N0, the next 30 steps iterate on %g1, and the
   last step writes the quotient to Q.  The remainder is then read from %y
   into R, and the annulled branch `bl,a 1f` adds D back to R only when the
   branch is taken.  %g1 and the condition codes are clobbered.  UDIV_TIME is
   a cost estimate consumed elsewhere.  */
1443 #define udiv_qrnnd(q, r, n1, n0, d) \
1444   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1445 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1446 "       tst     %%g0\n"                                                 \
1447 "       divscc  %3,%4,%%g1\n"                                           \
1448 "       divscc  %%g1,%4,%%g1\n"                                         \
1449 "       divscc  %%g1,%4,%%g1\n"                                         \
1450 "       divscc  %%g1,%4,%%g1\n"                                         \
1451 "       divscc  %%g1,%4,%%g1\n"                                         \
1452 "       divscc  %%g1,%4,%%g1\n"                                         \
1453 "       divscc  %%g1,%4,%%g1\n"                                         \
1454 "       divscc  %%g1,%4,%%g1\n"                                         \
1455 "       divscc  %%g1,%4,%%g1\n"                                         \
1456 "       divscc  %%g1,%4,%%g1\n"                                         \
1457 "       divscc  %%g1,%4,%%g1\n"                                         \
1458 "       divscc  %%g1,%4,%%g1\n"                                         \
1459 "       divscc  %%g1,%4,%%g1\n"                                         \
1460 "       divscc  %%g1,%4,%%g1\n"                                         \
1461 "       divscc  %%g1,%4,%%g1\n"                                         \
1462 "       divscc  %%g1,%4,%%g1\n"                                         \
1463 "       divscc  %%g1,%4,%%g1\n"                                         \
1464 "       divscc  %%g1,%4,%%g1\n"                                         \
1465 "       divscc  %%g1,%4,%%g1\n"                                         \
1466 "       divscc  %%g1,%4,%%g1\n"                                         \
1467 "       divscc  %%g1,%4,%%g1\n"                                         \
1468 "       divscc  %%g1,%4,%%g1\n"                                         \
1469 "       divscc  %%g1,%4,%%g1\n"                                         \
1470 "       divscc  %%g1,%4,%%g1\n"                                         \
1471 "       divscc  %%g1,%4,%%g1\n"                                         \
1472 "       divscc  %%g1,%4,%%g1\n"                                         \
1473 "       divscc  %%g1,%4,%%g1\n"                                         \
1474 "       divscc  %%g1,%4,%%g1\n"                                         \
1475 "       divscc  %%g1,%4,%%g1\n"                                         \
1476 "       divscc  %%g1,%4,%%g1\n"                                         \
1477 "       divscc  %%g1,%4,%%g1\n"                                         \
1478 "       divscc  %%g1,%4,%0\n"                                           \
1479 "       rd      %%y,%1\n"                                               \
1480 "       bl,a 1f\n"                                                      \
1481 "       add     %1,%4,%1\n"                                             \
1482 "1:     ! End of inline udiv_qrnnd"                                     \
1483            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1484            : "%g1" __AND_CLOBBER_CC)
1485 #define UDIV_TIME 37
/* count_leading_zeros (SPARClite): one `scan` instruction with X as first
   operand and the constant 1 as second.  No COUNT_LEADING_ZEROS_0 is
   provided, for the reason given below.  */
1486 #define count_leading_zeros(count, x) \
1487   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1488 /* Early sparclites return 63 for an argument of 0, but they warn that future
1489    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1490    undefined.  */
1491 #endif /* __sparclite__ */
1492 #endif /* __sparc_v8__ */
1493 #endif /* __sparc_v9__ */
1494 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1495 #ifndef umul_ppmm
/* umul_ppmm (SPARC v7 default, used when no variant above defined it):
   multiply built from `mulscc` multiply steps.
   Sequence: U is written to %y; %g2 is set to U masked by the sign of V
   (sra V,31 then and); `andcc` clears %g1 and the condition codes; 32
   `mulscc %g1,V,%g1` steps plus one final step with operand 0 follow.  The
   high word is %g1 + %g2 (written to W1) and the low word is read from %y
   into W0.  %g1, %g2 and the condition codes are clobbered.  The first
   instructions are order-sensitive, as their inline comments state.
   UMUL_TIME is a cost estimate consumed elsewhere.  */
1496 #define umul_ppmm(w1, w0, u, v) \
1497   __asm__ ("! Inlined umul_ppmm\n"                                      \
1498 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1499 "       sra     %3,31,%%g2      ! Don't move this insn\n"               \
1500 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
1501 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1502 "       mulscc  %%g1,%3,%%g1\n"                                         \
1503 "       mulscc  %%g1,%3,%%g1\n"                                         \
1504 "       mulscc  %%g1,%3,%%g1\n"                                         \
1505 "       mulscc  %%g1,%3,%%g1\n"                                         \
1506 "       mulscc  %%g1,%3,%%g1\n"                                         \
1507 "       mulscc  %%g1,%3,%%g1\n"                                         \
1508 "       mulscc  %%g1,%3,%%g1\n"                                         \
1509 "       mulscc  %%g1,%3,%%g1\n"                                         \
1510 "       mulscc  %%g1,%3,%%g1\n"                                         \
1511 "       mulscc  %%g1,%3,%%g1\n"                                         \
1512 "       mulscc  %%g1,%3,%%g1\n"                                         \
1513 "       mulscc  %%g1,%3,%%g1\n"                                         \
1514 "       mulscc  %%g1,%3,%%g1\n"                                         \
1515 "       mulscc  %%g1,%3,%%g1\n"                                         \
1516 "       mulscc  %%g1,%3,%%g1\n"                                         \
1517 "       mulscc  %%g1,%3,%%g1\n"                                         \
1518 "       mulscc  %%g1,%3,%%g1\n"                                         \
1519 "       mulscc  %%g1,%3,%%g1\n"                                         \
1520 "       mulscc  %%g1,%3,%%g1\n"                                         \
1521 "       mulscc  %%g1,%3,%%g1\n"                                         \
1522 "       mulscc  %%g1,%3,%%g1\n"                                         \
1523 "       mulscc  %%g1,%3,%%g1\n"                                         \
1524 "       mulscc  %%g1,%3,%%g1\n"                                         \
1525 "       mulscc  %%g1,%3,%%g1\n"                                         \
1526 "       mulscc  %%g1,%3,%%g1\n"                                         \
1527 "       mulscc  %%g1,%3,%%g1\n"                                         \
1528 "       mulscc  %%g1,%3,%%g1\n"                                         \
1529 "       mulscc  %%g1,%3,%%g1\n"                                         \
1530 "       mulscc  %%g1,%3,%%g1\n"                                         \
1531 "       mulscc  %%g1,%3,%%g1\n"                                         \
1532 "       mulscc  %%g1,%3,%%g1\n"                                         \
1533 "       mulscc  %%g1,%3,%%g1\n"                                         \
1534 "       mulscc  %%g1,0,%%g1\n"                                          \
1535 "       add     %%g1,%%g2,%0\n"                                         \
1536 "       rd      %%y,%1"                                                 \
1537            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1538            : "%g1", "%g2" __AND_CLOBBER_CC)
1539 #define UMUL_TIME 39            /* 39 instructions */
1540 #endif
1541 #ifndef udiv_qrnnd
1542 #ifndef LONGLONG_STANDALONE
/* udiv_qrnnd (32-bit SPARC fallback, only when not LONGLONG_STANDALONE):
   delegates to the out-of-line function __MPN(udiv_qrnnd), declared just
   below.  The function returns the quotient and stores the remainder
   through its first (pointer) argument; the macro copies it from the local
   __r into R.  */
1543 #define udiv_qrnnd(q, r, n1, n0, d) \
1544   do { UWtype __r;                                                      \
1545     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
1546     (r) = __r;                                                          \
1547   } while (0)
1548 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1549 #ifndef UDIV_TIME
1550 #define UDIV_TIME 140
1551 #endif
1552 #endif /* LONGLONG_STANDALONE */
1553 #endif /* udiv_qrnnd */
1554 #endif /* __sparc__ */
1555
1556 #if defined (__sparc__) && W_TYPE_SIZE == 64
/* add_ssaaaa (SPARC with 64-bit limbs): three instructions.
   1. `addcc`  AL + BL -> SL.
   2. `addccc` (AL >> 32) + (BL >> 32) + carry, result discarded into %g0;
      it is executed only for its carry-out.
   3. `addc`   AH + BH + carry -> SH.
   The upper halves of AL and BL are passed as two extra operands (%6, %7),
   so AL and BL are each evaluated twice.  */
1557 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1558   __asm__ (                                                             \
1559        "addcc   %r4,%5,%1\n"                                            \
1560       " addccc  %r6,%7,%%g0\n"                                          \
1561       " addc    %r2,%3,%0"                                              \
1562           : "=r" (sh), "=&r" (sl)                                       \
1563           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
1564             "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
1565            __CLOBBER_CC)
/* sub_ddmmss (SPARC with 64-bit limbs): three instructions.
   1. `subcc`  AL - BL -> SL.
   2. `subccc` (AL >> 32) - (BL >> 32) - borrow, result discarded into %g0;
      it is executed only for its borrow-out.
   3. `subc`   AH - BH - borrow -> SH.
   The upper halves of AL and BL are passed as two extra operands (%6, %7),
   so AL and BL are each evaluated twice.  */
1566 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1567   __asm__ (                                                             \
1568        "subcc   %r4,%5,%1\n"                                            \
1569       " subccc  %r6,%7,%%g0\n"                                          \
1570       " subc    %r2,%3,%0"                                              \
1571           : "=r" (sh), "=&r" (sl)                                       \
1572           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),         \
1573             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
1574            __CLOBBER_CC)
1575 #endif
1576
1577 #if defined (__vax__) && W_TYPE_SIZE == 32
/* add_ssaaaa (VAX): `addl2` on the low words followed by `adwc` (add with
   carry) on the high words.  AH and AL are tied to the outputs SH and SL
   ("0", "%1"); every operand may be any general operand ("g").  */
1578 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1579   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
1580            : "=g" (sh), "=&g" (sl)                                      \
1581            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1582              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss (VAX): `subl2` on the low words followed by `sbwc` (subtract
   with carry) on the high words.  AH and AL are tied to the outputs SH and
   SL ("0", "1").  */
1583 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1584   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
1585            : "=g" (sh), "=&g" (sl)                                      \
1586            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1587              "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* smul_ppmm (VAX): `emul` multiplies the copies __m0 and __m1 (addend
   operand $0) into the 64-bit member of the union __x.  The halves are read
   back through the struct view, whose layout here is low word first:
   __h -> XH, __l -> XL.  M0 and M1 are each evaluated once.  */
1588 #define smul_ppmm(xh, xl, m0, m1) \
1589   do {                                                                  \
1590     union {UDItype __ll;                                                \
1591            struct {USItype __l, __h;} __i;                              \
1592           } __x;                                                        \
1593     USItype __m0 = (m0), __m1 = (m1);                                   \
1594     __asm__ ("emul %1,%2,$0,%0"                                         \
1595              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1596     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1597   } while (0)
/* sdiv_qrnnd (VAX): the two dividend words N1 (high) and N0 (low) are
   assembled into a signed 64-bit value through the union __x (low word
   first), then `ediv` divides it by D, writing the quotient to Q and the
   remainder to R.
   NOTE(review): N1 and N0 are used unparenthesized in the assignments.  */
1598 #define sdiv_qrnnd(q, r, n1, n0, d) \
1599   do {                                                                  \
1600     union {DItype __ll;                                                 \
1601            struct {SItype __l, __h;} __i;                               \
1602           } __x;                                                        \
1603     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
1604     __asm__ ("ediv %3,%2,%0,%1"                                         \
1605              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1606   } while (0)
1607 #if 0
1608 /* Note: the following FIXME comes from GMP, thus it does not make sense to
1609    try to resolve it in MPFR. */
1610 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1611    8800 maybe). */
/* Disabled (inside #if 0) VAX count_trailing_zeros: one `ffs` instruction
   with the literal operands 0 and 31, applied to X converted to USItype,
   with the result written to COUNT.  */
1612 #define count_trailing_zeros(count,x)                                   \
1613   do {                                                                  \
1614     __asm__ ("ffs 0, 31, %1, %0"                                        \
1615              : "=g" (count)                                             \
1616              : "g" ((USItype) (x)));                                    \
1617   } while (0)
1618 #endif
1619 #endif /* __vax__ */
1620
1621 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* add_ssaaaa (Z8000, 16-bit limbs): `add` on the low words followed by
   `adc` on the high words, all operands printed with the %H modifier.  AH
   and AL are tied to the outputs SH and SL ("0", "%1"); BL may also match
   the "Q" or "R" constraints.  Operands are converted to unsigned int.  */
1622 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1623   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
1624            : "=r" (sh), "=&r" (sl)                                      \
1625            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
1626              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* sub_ddmmss (Z8000, 16-bit limbs): `sub` on the low words followed by
   `sbc` on the high words.  AH and AL are tied to the outputs SH and SL
   ("0", "1"); BL may also match the "Q" or "R" constraints.  */
1627 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1628   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
1629            : "=r" (sh), "=&r" (sl)                                      \
1630            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
1631              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* (xh,xl) = m0 * m1 as a double-word product.

   The raw product comes from the "mult" instruction.  Afterwards XH is
   increased by M1 when the top bit of M0 is set and by M0 when the top bit
   of M1 is set (presumably because "mult" is a signed multiply -- confirm
   against the Z8000 manual).

   M0 and M1 are copied into __m0/__m1 and only those copies are used, both
   as asm inputs and in the correction, so each argument is evaluated
   exactly once.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {long int __ll;                                               \
           struct {unsigned int __h, __l;} __i;                         \
          } __x;                                                        \
    unsigned int __m0 = (m0), __m1 = (m1);                              \
    __asm__ ("mult      %S0,%H3"                                        \
             : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
             : "%1" (__m0), "rQR" (__m1));                              \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
    (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
             + (((signed int) __m1 >> 15) & __m0));                     \
  } while (0)
1645 #endif /* __z8000__ */
1646
1647 #endif /* __GNUC__ */
1648
1649 #endif /* NO_ASM */
1650
1651
#if !defined (umul_ppmm) && defined (__umulsidi3)
/* (ph,pl) = m0 * m1, obtained by splitting the UDWtype result of
   __umulsidi3 into its high and low UWtype words.

   Wrapped in do { } while (0) so that "umul_ppmm (...);" is a single
   statement (safe in an unbraced if/else), like every other umul_ppmm
   variant in this file.  The output arguments are parenthesized.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDWtype __ll = __umulsidi3 (m0, m1);                                \
    (ph) = (UWtype) (__ll >> W_TYPE_SIZE);                              \
    (pl) = (UWtype) __ll;                                               \
  } while (0)
#endif
1660
#if !defined (__umulsidi3)
/* Product of U and V as one UDWtype value: umul_ppmm delivers the two
   halves, which are then glued together with the high word shifted up by
   W_TYPE_SIZE bits.  Uses a GNU statement expression so that the macro can
   appear where a value is expected.  */
#define __umulsidi3(u, v) \
  ({ UWtype __umsd_hi, __umsd_lo;                                       \
     umul_ppmm (__umsd_hi, __umsd_lo, u, v);                            \
     ((UDWtype) __umsd_hi << W_TYPE_SIZE) | __umsd_lo; })
#endif
1667
1668
1669 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1670    forms have "reversed" arguments, meaning the pointer is last, which
1671    sometimes allows better parameter passing, in particular on 64-bit
1672    hppa. */
1673
1674 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1675 extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
1676
1677 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1678   && ! defined (LONGLONG_STANDALONE)
1679 #define umul_ppmm(wh, wl, u, v)                                               \
1680   do {                                                                        \
1681     UWtype __umul_ppmm__p0;                                                   \
1682     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
1683     (wl) = __umul_ppmm__p0;                                                   \
1684   } while (0)
1685 #endif
1686
1687 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1688 extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
1689
1690 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1691   && ! defined (LONGLONG_STANDALONE)
1692 #define umul_ppmm(wh, wl, u, v)                                               \
1693   do {                                                                        \
1694     UWtype __umul_ppmm__p0;                                                   \
1695     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
1696     (wl) = __umul_ppmm__p0;                                                   \
1697   } while (0)
1698 #endif
1699
1700 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1701 extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1702
1703 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1704   && ! defined (LONGLONG_STANDALONE)
1705 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1706   do {                                                                  \
1707     UWtype __udiv_qrnnd__r;                                             \
1708     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
1709                           (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
1710     (r) = __udiv_qrnnd__r;                                              \
1711   } while (0)
1712 #endif
1713
1714 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1715 extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
1716
1717 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1718   && ! defined (LONGLONG_STANDALONE)
1719 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1720   do {                                                                  \
1721     UWtype __udiv_qrnnd__r;                                             \
1722     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
1723                             &__udiv_qrnnd__r);                          \
1724     (r) = __udiv_qrnnd__r;                                              \
1725   } while (0)
1726 #endif
1727
1728
1729 /* If this machine has no inline assembler, use C macros.  */
1730
#if !defined (add_ssaaaa)
/* (sh,sl) = (ah,al) + (bh,bl) in plain C.  The low sum is formed first in
   a UWtype temporary; it is smaller than AL exactly when the addition
   wrapped, and that 0/1 result is added to the high-word sum.  SL is
   written last.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __lo_sum = (al) + (bl);                                      \
    (sh) = (ah) + (bh) + (__lo_sum < (al));                             \
    (sl) = __lo_sum;                                                    \
  } while (0)
#endif
1740
#if !defined (sub_ddmmss)
/* (sh,sl) = (ah,al) - (bh,bl) in plain C.  The low difference is kept in a
   UWtype temporary; a borrow is needed exactly when AL < BL, and that 0/1
   result is subtracted from the high-word difference.  SL is written
   last.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __lo_diff = (al) - (bl);                                     \
    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
    (sl) = __lo_diff;                                                   \
  } while (0)
#endif
1750
/* No umul_ppmm yet, but smul_ppmm exists: build umul_ppmm on top of it.
   smul_ppmm supplies the low word directly and a high word that is then
   adjusted by adding V when the top bit of U is set and U when the top bit
   of V is set.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __sp_hi;                                                     \
    UWtype __sp_u = (u), __sp_v = (v);                                  \
    smul_ppmm (__sp_hi, w0, __sp_u, __sp_v);                            \
    (w1) = __sp_hi + (-(__sp_u >> (W_TYPE_SIZE - 1)) & __sp_v)          \
                   + (-(__sp_v >> (W_TYPE_SIZE - 1)) & __sp_u);         \
  } while (0)
#endif
1763
/* If we still don't have umul_ppmm, define it using plain C.

   U and V are split into half-words and the four partial products are
   combined, propagating the one carry that can occur in the middle sum.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __p_lh and __p_hl are the same and generates
   3 multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __mul_u = (u), __mul_v = (v);                                \
    UHWtype __u_lo = __ll_lowpart (__mul_u);                            \
    UHWtype __u_hi = __ll_highpart (__mul_u);                           \
    UHWtype __v_lo = __ll_lowpart (__mul_v);                            \
    UHWtype __v_hi = __ll_highpart (__mul_v);                           \
                                                                        \
    UWtype __p_ll = (UWtype) __u_lo * __v_lo;   /* low  x low  */       \
    UWtype __p_lh = (UWtype) __u_lo * __v_hi;   /* low  x high */       \
    UWtype __p_hl = (UWtype) __u_hi * __v_lo;   /* high x low  */       \
    UWtype __p_hh = (UWtype) __u_hi * __v_hi;   /* high x high */       \
                                                                        \
    __p_lh += __ll_highpart (__p_ll);   /* cannot produce a carry */    \
    __p_lh += __p_hl;                   /* this one can wrap */         \
    if (__p_lh < __p_hl)                /* wrapped: carry into the */   \
      __p_hh += __ll_B;                 /* upper half of the high word */\
                                                                        \
    (w1) = __p_hh + __ll_highpart (__p_lh);                             \
    (w0) = (__p_lh << W_TYPE_SIZE/2) + __ll_lowpart (__p_ll);           \
  } while (0)
#endif
1800
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  umul_ppmm supplies the low word directly
   and a high word that is then adjusted by subtracting V when the top bit
   of U is set and U when the top bit of V is set.  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __up_hi;                                                     \
    UWtype __up_u = (u), __up_v = (v);                                  \
    umul_ppmm (__up_hi, w0, __up_u, __up_v);                            \
    (w1) = __up_hi - (-(__up_u >> (W_TYPE_SIZE - 1)) & __up_v)          \
                   - (-(__up_v >> (W_TYPE_SIZE - 1)) & __up_u);         \
  } while (0)
#endif
1813
/* Define this unconditionally, so it can be used for debugging.

   __udiv_qrnnd_c (q, r, n1, n0, d) divides the two-word value (n1,n0) by d
   in plain C, working on half-words: it first produces the upper quotient
   half __q1 and then the lower half __q0.  Each half is estimated by
   dividing by the high half of d and is then decremented at most twice.
   The quotient is stored in q and the remainder in r.

   Preconditions (checked with ASSERT): d != 0 and n1 < d.  The code also
   divides by the high half of d, so that half must be nonzero.

   n1, n0 and d are copied into locals first, so each is evaluated exactly
   once; q and r are only written at the very end.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __udc_n1 = (n1), __udc_n0 = (n0), __udc_d = (d);             \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
                                                                        \
    ASSERT (__udc_d != 0);                                              \
    ASSERT (__udc_n1 < __udc_d);                                        \
                                                                        \
    __d1 = __ll_highpart (__udc_d);                                     \
    __d0 = __ll_lowpart (__udc_d);                                      \
                                                                        \
    /* Upper quotient half.  */                                         \
    __q1 = __udc_n1 / __d1;                                             \
    __r1 = __udc_n1 - __q1 * __d1;                                      \
    __m = __q1 * __d0;                                                  \
    __r1 = __r1 * __ll_B | __ll_highpart (__udc_n0);                    \
    if (__r1 < __m)                                                     \
      {                                                                 \
        __q1--, __r1 += __udc_d;                                        \
        if (__r1 >= __udc_d) /* i.e. we didn't get carry when adding to __r1 */\
          if (__r1 < __m)                                               \
            __q1--, __r1 += __udc_d;                                    \
      }                                                                 \
    __r1 -= __m;                                                        \
                                                                        \
    /* Lower quotient half, same scheme on the running remainder.  */   \
    __q0 = __r1 / __d1;                                                 \
    __r0 = __r1  - __q0 * __d1;                                         \
    __m = __q0 * __d0;                                                  \
    __r0 = __r0 * __ll_B | __ll_lowpart (__udc_n0);                     \
    if (__r0 < __m)                                                     \
      {                                                                 \
        __q0--, __r0 += __udc_d;                                        \
        if (__r0 >= __udc_d)                                            \
          if (__r0 < __m)                                               \
            __q0--, __r0 += __udc_d;                                    \
      }                                                                 \
    __r0 -= __m;                                                        \
                                                                        \
    (q) = __q1 * __ll_B | __q0;                                         \
    (r) = __r0;                                                         \
  } while (0)
1854
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  The helper is reached
   here as __MPN(udiv_w_sdiv): its return value becomes Q and the word it
   stores through its first (pointer) argument becomes R.  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __uws_rem;                                                   \
    (q) = __MPN(udiv_w_sdiv) (&__uws_rem, nh, nl, d);                   \
    (r) = __uws_rem;                                                    \
  } while (0)
#endif
1865
/* Last resort: nothing above supplied udiv_qrnnd, so alias it to the plain
   C __udiv_qrnnd_c and flag that the divisor must be normalized.  */
#ifndef udiv_qrnnd
#define udiv_qrnnd __udiv_qrnnd_c
#define UDIV_NEEDS_NORMALIZATION 1
#endif
1871
#if !defined (count_leading_zeros)
/* count_leading_zeros (count, x) in plain C.

   A shift amount is chosen first.  For W_TYPE_SIZE == 32 it is picked by
   comparing X against powers of two that are multiples of __BITS4;
   otherwise X is scanned a byte at a time from the top until a nonzero
   byte is found.  The bits that remain after shifting index __clz_tab,
   and COUNT is derived from the shift and the table entry.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __clz_x = (x);                                               \
    UWtype __clz_sh;                                                    \
                                                                        \
    if (W_TYPE_SIZE != 32)                                              \
      {                                                                 \
        __clz_sh = W_TYPE_SIZE - 8;                                     \
        while (__clz_sh > 0 && ((__clz_x >> __clz_sh) & 0xff) == 0)     \
          __clz_sh -= 8;                                                \
        ++__clz_sh;                                                     \
      }                                                                 \
    else if (__clz_x < ((UWtype) 1 << __BITS4))                         \
      __clz_sh = 1;                                                     \
    else if (__clz_x < ((UWtype) 1 << 2*__BITS4))                       \
      __clz_sh = __BITS4 + 1;                                           \
    else if (__clz_x < ((UWtype) 1 << 3*__BITS4))                       \
      __clz_sh = 2*__BITS4 + 1;                                         \
    else                                                                \
      __clz_sh = 3*__BITS4 + 1;                                         \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __clz_sh                                \
              - __clz_tab[__clz_x >> __clz_sh];                         \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
1899
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

/* Declare the 128-entry lookup table whenever something above asked for
   it.  With MPFR_HAVE_GMP_IMPL the declaration carries __GMP_DECLSPEC,
   otherwise it is a plain extern.  */
#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
# ifndef MPFR_HAVE_GMP_IMPL
    extern const unsigned char __clz_tab[128];
# else
    extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
# endif
#endif
1912
#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.

   X & -X keeps only the lowest set bit of X; with L leading zeros in that
   value, the result stored in COUNT is W_TYPE_SIZE - 1 - L.  X must be
   nonzero (checked with ASSERT).  */
#define count_trailing_zeros(count, x) \
  do {                                                                  \
    UWtype __ctz_val = (x);                                             \
    UWtype __ctz_low;                                                   \
    UWtype __ctz_lz;                                                    \
    ASSERT (__ctz_val != 0);                                            \
    __ctz_low = __ctz_val & -__ctz_val;                                 \
    count_leading_zeros (__ctz_lz, __ctz_low);                          \
    (count) = W_TYPE_SIZE - 1 - __ctz_lz;                               \
  } while (0)
#endif
1925
/* Unless set to 1 earlier, udiv_qrnnd is taken not to need a normalized
   divisor.  */
#if !defined (UDIV_NEEDS_NORMALIZATION)
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used.  */
#if !defined (UDIV_PREINV_ALWAYS)
#define UDIV_PREINV_ALWAYS 0
#endif
1935
/* Give defaults for UMUL_TIME and UDIV_TIME: UMUL_TIME falls back to 1 and
   UDIV_TIME falls back to whatever UMUL_TIME is.  */
#if !defined (UMUL_TIME)
#define UMUL_TIME 1
#endif

#if !defined (UDIV_TIME)
#define UDIV_TIME UMUL_TIME
#endif