source/libs/gmp/gmp-src/longlong.h
1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2015 Free Software
4 Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
21 or both in parallel, as here.
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
32 /* You have to define the following before including this file:
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
36 UDWtype -- An unsigned type, at least twice as large as UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
45 Optionally, define:
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.  */
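/* Illustrative sketch only (not part of GMP): within GMP itself these types
   and macros are supplied by gmp.h and gmp-impl.h.  One plausible set of
   definitions for a 64-bit GCC target with a full-word limb could be:  */
#if 0
typedef int		SItype	__attribute__ ((mode (SI)));
typedef unsigned int	USItype	__attribute__ ((mode (SI)));
typedef int		DItype	__attribute__ ((mode (DI)));
typedef unsigned int	UDItype	__attribute__ ((mode (DI)));
#define W_TYPE_SIZE 64
#define UWtype	UDItype
#define UHWtype	USItype
#define UDWtype	unsigned __int128	/* at least twice the width of UWtype */
#endif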
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
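/* Aside (illustrative only): the helpers above exist so that a full
   word-by-word product can be assembled from four half-word products when no
   machine-specific umul_ppmm is available.  The sketch below shows the
   scheme; GMP's generic C fallback, further down in this file, works along
   these lines.  */
#if 0
#define umul_ppmm_sketch(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    __ul = __ll_lowpart (u); \
    __uh = __ll_highpart (u); \
    __vl = __ll_lowpart (v); \
    __vh = __ll_highpart (v); \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
    __x1 += __ll_highpart (__x0);	/* this cannot carry */ \
    __x1 += __x2;			/* but this can */ \
    if (__x1 < __x2)			/* propagate the carry */ \
      __x3 += __ll_B; \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif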
61 /* This is used to make sure no undesirable sharing between different libraries
62 that use this file takes place. */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
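/* For example, with the fallback above __MPN(invert_limb) expands to
   __invert_limb; when gmp.h is included first, its own __MPN prefixes the
   name with __gmpn_ instead, keeping each library's support routines
   distinct.  */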
67 /* Define auxiliary asm macros.
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
80 than DENOMINATOR for correct operation. If, in addition, the macro
81 requires DENOMINATOR to have its most significant bit set (normalized),
82 the pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97 high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104 composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
105 LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
106 and LOW_DIFFERENCE. Overflow (i.e. borrow out) is not stored anywhere,
107 and is lost.
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
113 Notes:
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.  */
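/* Usage sketch (illustrative, not part of GMP): how the macros documented
   above combine single limbs into double-word arithmetic.  */
#if 0
static void
example_double_word_ops (UWtype ah, UWtype al, UWtype bh, UWtype bl)
{
  UWtype sh, sl, dh, dl, ph, pl, q, r, cnt;

  add_ssaaaa (sh, sl, ah, al, bh, bl);	/* (sh,sl) = (ah,al) + (bh,bl) */
  sub_ddmmss (dh, dl, ah, al, bh, bl);	/* (dh,dl) = (ah,al) - (bh,bl) */
  umul_ppmm (ph, pl, al, bl);		/* (ph,pl) = al * bl, double-word */

  /* Divide the double word (ph,pl) by bl; ph must be less than bl, and bl
     must have its most significant bit set when UDIV_NEEDS_NORMALIZATION
     is 1.  */
  udiv_qrnnd (q, r, ph, pl, bl);

  count_leading_zeros (cnt, al);	/* undefined for al == 0 */
  count_trailing_zeros (cnt, al);	/* likewise undefined for al == 0 */
}
#endif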
130 /* The CPUs come in alphabetical order below.
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
141 These builtins are used only where we have checked what code comes out;
142 on some chips they are merely libgcc calls, in which case we instead
143 want an inline version (either asm or generic C).
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166 #endif
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180 #endif
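/* Where neither an asm version nor the gcc builtin is usable, a fallback can
   be written in portable C.  The binary-search sketch below is illustrative
   only; GMP's actual generic count_leading_zeros, near the end of this file,
   is table-driven via __clz_tab.  As documented above, the result is
   undefined for x == 0.  */
#if 0
#define count_leading_zeros_sketch(count, x) \
  do { \
    UWtype __clz_x = (x); \
    int __clz_c = 0, __clz_s; \
    for (__clz_s = W_TYPE_SIZE / 2; __clz_s != 0; __clz_s /= 2) \
      if ((__clz_x >> (W_TYPE_SIZE - __clz_s)) == 0) \
	{ \
	  __clz_x <<= __clz_s; \
	  __clz_c += __clz_s; \
	} \
    (count) = __clz_c; \
  } while (0)
#endif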
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206 #endif
207 #define UMUL_TIME 18
208 #else /* ! __GNUC__ */
209 #include <machine/builtins.h>
210 #define umul_ppmm(ph, pl, m0, m1) \
211 do { \
212 UDItype __m0 = (m0), __m1 = (m1); \
213 (ph) = __UMULH (__m0, __m1); \
214 (pl) = __m0 * __m1; \
215 } while (0)
216 #endif
217 #ifndef LONGLONG_STANDALONE
218 #define udiv_qrnnd(q, r, n1, n0, d) \
219 do { UWtype __di; \
220 __di = __MPN(invert_limb) (d); \
221 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
222 } while (0)
223 #define UDIV_PREINV_ALWAYS 1
224 #define UDIV_NEEDS_NORMALIZATION 1
225 #define UDIV_TIME 220
226 #endif /* LONGLONG_STANDALONE */
228 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
229 always goes into libgmp.so, even when not actually used. */
230 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
232 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
233 #define count_leading_zeros(COUNT,X) \
234 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
235 #define count_trailing_zeros(COUNT,X) \
236 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
237 #endif /* clz/ctz using cix */
239 #if ! defined (count_leading_zeros) \
240 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
241 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
242 "$31" is written explicitly in the asm, since an "r" constraint won't
243 select reg 31. There seems no need to worry about "r31" syntax for cray,
244 since gcc itself (pre-release 3.4) emits just $31 in various places. */
245 #define ALPHA_CMPBGE_0(dst, src) \
246 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
247 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
248 them, locating the highest non-zero byte. A second __clz_tab lookup
249 counts the leading zero bits in that byte, giving the result. */
250 #define count_leading_zeros(count, x) \
251 do { \
252 UWtype __clz__b, __clz__c, __clz__x = (x); \
253 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
254 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
255 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
256 __clz__x >>= __clz__b; \
257 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
258 __clz__b = 65 - __clz__b; \
259 (count) = __clz__b - __clz__c; \
260 } while (0)
261 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
262 #endif /* clz using cmpbge */
264 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
265 #if HAVE_ATTRIBUTE_CONST
266 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
267 #else
268 long __MPN(count_leading_zeros) (UDItype);
269 #endif
270 #define count_leading_zeros(count, x) \
271 ((count) = __MPN(count_leading_zeros) (x))
272 #endif /* clz using mpn */
273 #endif /* __alpha */
275 #if defined (__AVR) && W_TYPE_SIZE == 8
276 #define umul_ppmm(ph, pl, m0, m1) \
277 do { \
278 unsigned short __p = (unsigned short) (m0) * (m1); \
279 (ph) = __p >> 8; \
280 (pl) = __p; \
281 } while (0)
282 #endif /* AVR */
284 #if defined (_CRAY) && W_TYPE_SIZE == 64
285 #include <intrinsics.h>
286 #define UDIV_PREINV_ALWAYS 1
287 #define UDIV_NEEDS_NORMALIZATION 1
288 #define UDIV_TIME 220
289 long __MPN(count_leading_zeros) (UDItype);
290 #define count_leading_zeros(count, x) \
291 ((count) = _leadz ((UWtype) (x)))
292 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
293 #define umul_ppmm(ph, pl, m0, m1) \
294 do { \
295 UDItype __m0 = (m0), __m1 = (m1); \
296 (ph) = _int_mult_upper (__m0, __m1); \
297 (pl) = __m0 * __m1; \
298 } while (0)
299 #ifndef LONGLONG_STANDALONE
300 #define udiv_qrnnd(q, r, n1, n0, d) \
301 do { UWtype __di; \
302 __di = __MPN(invert_limb) (d); \
303 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
304 } while (0)
305 #endif /* LONGLONG_STANDALONE */
306 #endif /* _CRAYIEEE */
307 #endif /* _CRAY */
309 #if defined (__ia64) && W_TYPE_SIZE == 64
310 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
311 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
312 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
313 register, which takes an extra cycle. */
314 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
315 do { \
316 UWtype __x; \
317 __x = (al) - (bl); \
318 if ((al) < (bl)) \
319 (sh) = (ah) - (bh) - 1; \
320 else \
321 (sh) = (ah) - (bh); \
322 (sl) = __x; \
323 } while (0)
324 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
325 /* Do both product parts in assembly, since that gives better code with
326 all gcc versions. Some callers will just use the upper part, and in
327 that situation we waste an instruction, but not any cycles. */
328 #define umul_ppmm(ph, pl, m0, m1) \
329 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
330 : "=&f" (ph), "=f" (pl) \
331 : "f" (m0), "f" (m1))
332 #define UMUL_TIME 14
333 #define count_leading_zeros(count, x) \
334 do { \
335 UWtype _x = (x), _y, _a, _c; \
336 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
337 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
338 _c = (_a - 1) << 3; \
339 _x >>= _c; \
340 if (_x >= 1 << 4) \
341 _x >>= 4, _c += 4; \
342 if (_x >= 1 << 2) \
343 _x >>= 2, _c += 2; \
344 _c += _x >> 1; \
345 (count) = W_TYPE_SIZE - 1 - _c; \
346 } while (0)
347 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
348 based, and we don't need a special case for x==0 here */
349 #define count_trailing_zeros(count, x) \
350 do { \
351 UWtype __ctz_x = (x); \
352 __asm__ ("popcnt %0 = %1" \
353 : "=r" (count) \
354 : "r" ((__ctz_x-1) & ~__ctz_x)); \
355 } while (0)
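/* The operand (__ctz_x-1) & ~__ctz_x isolates exactly the bits below the
   least significant set bit, so its population count equals the number of
   trailing zeros; e.g. x = 0b10100 gives (x-1) & ~x = 0b00011, popcnt 2.  */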
356 #endif
357 #if defined (__INTEL_COMPILER)
358 #include <ia64intrin.h>
359 #define umul_ppmm(ph, pl, m0, m1) \
360 do { \
361 UWtype __m0 = (m0), __m1 = (m1); \
362 ph = _m64_xmahu (__m0, __m1, 0); \
363 pl = __m0 * __m1; \
364 } while (0)
365 #endif
366 #ifndef LONGLONG_STANDALONE
367 #define udiv_qrnnd(q, r, n1, n0, d) \
368 do { UWtype __di; \
369 __di = __MPN(invert_limb) (d); \
370 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
371 } while (0)
372 #define UDIV_PREINV_ALWAYS 1
373 #define UDIV_NEEDS_NORMALIZATION 1
374 #endif
375 #define UDIV_TIME 220
376 #endif
379 #if defined (__GNUC__)
381 /* We sometimes need to clobber "cc" with gcc2, but that would not be
382 understood by gcc1. Use cpp to avoid major code duplication. */
383 #if __GNUC__ < 2
384 #define __CLOBBER_CC
385 #define __AND_CLOBBER_CC
386 #else /* __GNUC__ >= 2 */
387 #define __CLOBBER_CC : "cc"
388 #define __AND_CLOBBER_CC , "cc"
389 #endif /* __GNUC__ < 2 */
391 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
393 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
394 : "=r" (sh), "=&r" (sl) \
395 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
396 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
397 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
398 : "=r" (sh), "=&r" (sl) \
399 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
400 #define umul_ppmm(xh, xl, m0, m1) \
401 do { \
402 USItype __m0 = (m0), __m1 = (m1); \
403 __asm__ ("multiplu %0,%1,%2" \
404 : "=r" (xl) \
405 : "r" (__m0), "r" (__m1)); \
406 __asm__ ("multmu %0,%1,%2" \
407 : "=r" (xh) \
408 : "r" (__m0), "r" (__m1)); \
409 } while (0)
410 #define udiv_qrnnd(q, r, n1, n0, d) \
411 __asm__ ("dividu %0,%3,%4" \
412 : "=r" (q), "=q" (r) \
413 : "1" (n1), "r" (n0), "r" (d))
414 #define count_leading_zeros(count, x) \
415 __asm__ ("clz %0,%1" \
416 : "=r" (count) \
417 : "r" (x))
418 #define COUNT_LEADING_ZEROS_0 32
419 #endif /* __a29k__ */
421 #if defined (__arc__)
422 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
423 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
424 : "=r" (sh), \
425 "=&r" (sl) \
426 : "r" ((USItype) (ah)), \
427 "rIJ" ((USItype) (bh)), \
428 "%r" ((USItype) (al)), \
429 "rIJ" ((USItype) (bl)))
430 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
431 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
432 : "=r" (sh), \
433 "=&r" (sl) \
434 : "r" ((USItype) (ah)), \
435 "rIJ" ((USItype) (bh)), \
436 "r" ((USItype) (al)), \
437 "rIJ" ((USItype) (bl)))
438 #endif
440 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
441 && W_TYPE_SIZE == 32
442 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
443 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
444 : "=r" (sh), "=&r" (sl) \
445 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
446 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
447 do { \
448 if (__builtin_constant_p (al)) \
449 { \
450 if (__builtin_constant_p (ah)) \
451 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
452 : "=r" (sh), "=&r" (sl) \
453 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
454 else \
455 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
456 : "=r" (sh), "=&r" (sl) \
457 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
459 else if (__builtin_constant_p (ah)) \
461 if (__builtin_constant_p (bl)) \
462 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
463 : "=r" (sh), "=&r" (sl) \
464 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
465 else \
466 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
467 : "=r" (sh), "=&r" (sl) \
468 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
470 else if (__builtin_constant_p (bl)) \
472 if (__builtin_constant_p (bh)) \
473 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
474 : "=r" (sh), "=&r" (sl) \
475 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
476 else \
477 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
478 : "=r" (sh), "=&r" (sl) \
479 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
481 else /* only bh might be a constant */ \
482 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
483 : "=r" (sh), "=&r" (sl) \
484 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
485 } while (0)
486 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
487 || defined (__ARM_ARCH_3__)
488 #define umul_ppmm(xh, xl, a, b) \
489 do { \
490 register USItype __t0, __t1, __t2; \
491 __asm__ ("%@ Inlined umul_ppmm\n" \
492 " mov %2, %5, lsr #16\n" \
493 " mov %0, %6, lsr #16\n" \
494 " bic %3, %5, %2, lsl #16\n" \
495 " bic %4, %6, %0, lsl #16\n" \
496 " mul %1, %3, %4\n" \
497 " mul %4, %2, %4\n" \
498 " mul %3, %0, %3\n" \
499 " mul %0, %2, %0\n" \
500 " adds %3, %4, %3\n" \
501 " addcs %0, %0, #65536\n" \
502 " adds %1, %1, %3, lsl #16\n" \
503 " adc %0, %0, %3, lsr #16" \
504 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
505 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
506 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
507 } while (0)
508 #define UMUL_TIME 20
509 #define udiv_qrnnd(q, r, n1, n0, d) \
510 do { UWtype __r; \
511 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
512 (r) = __r; \
513 } while (0)
514 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
515 #define UDIV_TIME 200
516 #else /* ARMv4 or newer */
517 #define umul_ppmm(xh, xl, a, b) \
518 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
519 #define UMUL_TIME 5
520 #define smul_ppmm(xh, xl, a, b) \
521 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
522 #ifndef LONGLONG_STANDALONE
523 #define udiv_qrnnd(q, r, n1, n0, d) \
524 do { UWtype __di; \
525 __di = __MPN(invert_limb) (d); \
526 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
527 } while (0)
528 #define UDIV_PREINV_ALWAYS 1
529 #define UDIV_NEEDS_NORMALIZATION 1
530 #define UDIV_TIME 70
531 #endif /* LONGLONG_STANDALONE */
532 #endif /* defined(__ARM_ARCH_2__) ... */
533 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
534 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
535 #define COUNT_LEADING_ZEROS_0 32
536 #endif /* __arm__ */
538 #if defined (__aarch64__) && W_TYPE_SIZE == 64
539 /* FIXME: Extend the immediate range for the low word by using both
540 ADDS and SUBS, since they set carry in the same way. */
541 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
542 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
543 : "=r" (sh), "=&r" (sl) \
544 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
545 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
546 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
547 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
548 : "=r,r" (sh), "=&r,&r" (sl) \
549 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
550 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC)
551 #define umul_ppmm(ph, pl, m0, m1) \
552 do { \
553 UDItype __m0 = (m0), __m1 = (m1); \
554 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
555 (pl) = __m0 * __m1; \
556 } while (0)
557 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
558 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
559 #define COUNT_LEADING_ZEROS_0 64
560 #endif /* __aarch64__ */
562 #if defined (__clipper__) && W_TYPE_SIZE == 32
563 #define umul_ppmm(w1, w0, u, v) \
564 ({union {UDItype __ll; \
565 struct {USItype __l, __h;} __i; \
566 } __x; \
567 __asm__ ("mulwux %2,%0" \
568 : "=r" (__x.__ll) \
569 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
570 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
571 #define smul_ppmm(w1, w0, u, v) \
572 ({union {DItype __ll; \
573 struct {SItype __l, __h;} __i; \
574 } __x; \
575 __asm__ ("mulwx %2,%0" \
576 : "=r" (__x.__ll) \
577 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
578 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
579 #define __umulsidi3(u, v) \
580 ({UDItype __w; \
581 __asm__ ("mulwux %2,%0" \
582 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
583 __w; })
584 #endif /* __clipper__ */
586 /* Fujitsu vector computers. */
587 #if defined (__uxp__) && W_TYPE_SIZE == 32
588 #define umul_ppmm(ph, pl, u, v) \
589 do { \
590 union {UDItype __ll; \
591 struct {USItype __h, __l;} __i; \
592 } __x; \
593 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
594 (ph) = __x.__i.__h; \
595 (pl) = __x.__i.__l; \
596 } while (0)
597 #define smul_ppmm(ph, pl, u, v) \
598 do { \
599 union {UDItype __ll; \
600 struct {USItype __h, __l;} __i; \
601 } __x; \
602 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
603 (ph) = __x.__i.__h; \
604 (pl) = __x.__i.__l; \
605 } while (0)
606 #endif
608 #if defined (__gmicro__) && W_TYPE_SIZE == 32
609 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
610 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
611 : "=g" (sh), "=&g" (sl) \
612 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
613 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
614 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
615 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
616 : "=g" (sh), "=&g" (sl) \
617 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
618 "1" ((USItype)(al)), "g" ((USItype)(bl)))
619 #define umul_ppmm(ph, pl, m0, m1) \
620 __asm__ ("mulx %3,%0,%1" \
621 : "=g" (ph), "=r" (pl) \
622 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
623 #define udiv_qrnnd(q, r, nh, nl, d) \
624 __asm__ ("divx %4,%0,%1" \
625 : "=g" (q), "=r" (r) \
626 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
627 #define count_leading_zeros(count, x) \
628 __asm__ ("bsch/1 %1,%0" \
629 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
630 #endif
632 #if defined (__hppa) && W_TYPE_SIZE == 32
633 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
634 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
635 : "=r" (sh), "=&r" (sl) \
636 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
637 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
638 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
639 : "=r" (sh), "=&r" (sl) \
640 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
641 #if defined (_PA_RISC1_1)
642 #define umul_ppmm(wh, wl, u, v) \
643 do { \
644 union {UDItype __ll; \
645 struct {USItype __h, __l;} __i; \
646 } __x; \
647 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
648 (wh) = __x.__i.__h; \
649 (wl) = __x.__i.__l; \
650 } while (0)
651 #define UMUL_TIME 8
652 #define UDIV_TIME 60
653 #else
654 #define UMUL_TIME 40
655 #define UDIV_TIME 80
656 #endif
657 #define count_leading_zeros(count, x) \
658 do { \
659 USItype __tmp; \
660 __asm__ ( \
661 "ldi 1,%0\n" \
662 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
663 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
664 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
665 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
666 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
667 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
668 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
669 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
670 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
671 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
672 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
673 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
674 " extru %1,30,1,%1 ; Extract bit 1.\n" \
675 " sub %0,%1,%0 ; Subtract it.\n" \
676 : "=r" (count), "=r" (__tmp) : "1" (x)); \
677 } while (0)
678 #endif /* hppa */
680 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
681 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
682 is just a case of no direct support for 2.0n but treating it like 1.0. */
683 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
684 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
685 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
686 : "=r" (sh), "=&r" (sl) \
687 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
688 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
689 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
690 : "=r" (sh), "=&r" (sl) \
691 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
692 #endif /* hppa */
694 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
695 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
696 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
697 do { \
698 /* if (__builtin_constant_p (bl)) \
699 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
700 : "=r" (sh), "=&r" (sl) \
701 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
702 else \
703 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
704 : "=r" (sh), "=&r" (sl) \
705 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
706 } while (0)
707 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
708 do { \
709 /* if (__builtin_constant_p (bl)) \
710 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
711 : "=r" (sh), "=&r" (sl) \
712 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
713 else \
714 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
715 : "=r" (sh), "=&r" (sl) \
716 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
717 } while (0)
718 #if __GMP_GNUC_PREREQ (4,5)
719 #define umul_ppmm(xh, xl, m0, m1) \
720 do { \
721 union {UDItype __ll; \
722 struct {USItype __h, __l;} __i; \
723 } __x; \
724 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
725 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
726 } while (0)
727 #else
728 #if 0
729 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
730 with a new enough processor pretending we have 32-bit registers. */
731 #define umul_ppmm(xh, xl, m0, m1) \
732 do { \
733 union {UDItype __ll; \
734 struct {USItype __h, __l;} __i; \
735 } __x; \
736 __asm__ ("mlr\t%0,%2" \
737 : "=r" (__x.__ll) \
738 : "%0" (m0), "r" (m1)); \
739 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
740 } while (0)
741 #else
742 #define umul_ppmm(xh, xl, m0, m1) \
743 do { \
744 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
745 DImode for the product, since that would be allocated to a single 64-bit
746 register, whereas mlr uses the low 32-bits of an even-odd register pair.
747 */ \
748 register USItype __r0 __asm__ ("0"); \
749 register USItype __r1 __asm__ ("1") = (m0); \
750 __asm__ ("mlr\t%0,%3" \
751 : "=r" (__r0), "=r" (__r1) \
752 : "r" (__r1), "r" (m1)); \
753 (xh) = __r0; (xl) = __r1; \
754 } while (0)
755 #endif /* if 0 */
756 #endif
757 #if 0
758 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
759 with a new enough processor pretending we have 32-bit registers. */
760 #define udiv_qrnnd(q, r, n1, n0, d) \
761 do { \
762 union {UDItype __ll; \
763 struct {USItype __h, __l;} __i; \
764 } __x; \
765 __x.__i.__h = n1; __x.__i.__l = n0; \
766 __asm__ ("dlr\t%0,%2" \
767 : "=r" (__x.__ll) \
768 : "0" (__x.__ll), "r" (d)); \
769 (q) = __x.__i.__l; (r) = __x.__i.__h; \
770 } while (0)
771 #else
772 #define udiv_qrnnd(q, r, n1, n0, d) \
773 do { \
774 register USItype __r0 __asm__ ("0") = (n1); \
775 register USItype __r1 __asm__ ("1") = (n0); \
776 __asm__ ("dlr\t%0,%4" \
777 : "=r" (__r0), "=r" (__r1) \
778 : "r" (__r0), "r" (__r1), "r" (d)); \
779 (q) = __r1; (r) = __r0; \
780 } while (0)
781 #endif /* if 0 */
782 #else /* if __zarch__ */
783 /* FIXME: this fails if gcc knows about the 64-bit registers. */
784 #define smul_ppmm(xh, xl, m0, m1) \
785 do { \
786 union {DItype __ll; \
787 struct {USItype __h, __l;} __i; \
788 } __x; \
789 __asm__ ("mr\t%0,%2" \
790 : "=r" (__x.__ll) \
791 : "%0" (m0), "r" (m1)); \
792 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
793 } while (0)
794 /* FIXME: this fails if gcc knows about the 64-bit registers. */
795 #define sdiv_qrnnd(q, r, n1, n0, d) \
796 do { \
797 union {DItype __ll; \
798 struct {USItype __h, __l;} __i; \
799 } __x; \
800 __x.__i.__h = n1; __x.__i.__l = n0; \
801 __asm__ ("dr\t%0,%2" \
802 : "=r" (__x.__ll) \
803 : "0" (__x.__ll), "r" (d)); \
804 (q) = __x.__i.__l; (r) = __x.__i.__h; \
805 } while (0)
806 #endif /* if __zarch__ */
807 #endif
809 #if defined (__s390x__) && W_TYPE_SIZE == 64
810 /* We need to cast operands with register constraints, otherwise their types
811 will be assumed to be SImode by gcc. For these machines, such operations
812 will insert a value into the low 32 bits, and leave the high 32 bits with
813 garbage. */
814 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
815 do { \
816 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
817 : "=r" (sh), "=&r" (sl) \
818 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
819 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
820 } while (0)
821 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
822 do { \
823 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
824 : "=r" (sh), "=&r" (sl) \
825 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
826 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
827 } while (0)
828 #define umul_ppmm(xh, xl, m0, m1) \
829 do { \
830 union {unsigned int __attribute__ ((mode(TI))) __ll; \
831 struct {UDItype __h, __l;} __i; \
832 } __x; \
833 __asm__ ("mlgr\t%0,%2" \
834 : "=r" (__x.__ll) \
835 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
836 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
837 } while (0)
838 #define udiv_qrnnd(q, r, n1, n0, d) \
839 do { \
840 union {unsigned int __attribute__ ((mode(TI))) __ll; \
841 struct {UDItype __h, __l;} __i; \
842 } __x; \
843 __x.__i.__h = n1; __x.__i.__l = n0; \
844 __asm__ ("dlgr\t%0,%2" \
845 : "=r" (__x.__ll) \
846 : "0" (__x.__ll), "r" ((UDItype)(d))); \
847 (q) = __x.__i.__l; (r) = __x.__i.__h; \
848 } while (0)
849 #if 0 /* FIXME: Enable for z10 (?) */
850 #define count_leading_zeros(cnt, x) \
851 do { \
852 union {unsigned int __attribute__ ((mode(TI))) __ll; \
853 struct {UDItype __h, __l;} __i; \
854 } __clr_cnt; \
855 __asm__ ("flogr\t%0,%1" \
856 : "=r" (__clr_cnt.__ll) \
857 : "r" (x) __CLOBBER_CC); \
858 (cnt) = __clr_cnt.__i.__h; \
859 } while (0)
860 #endif
861 #endif
863 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
864 so we don't need __CLOBBER_CC. */
865 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
866 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
867 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
868 : "=r" (sh), "=&r" (sl) \
869 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
870 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
871 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
872 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
873 : "=r" (sh), "=&r" (sl) \
874 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
875 "1" ((USItype)(al)), "g" ((USItype)(bl)))
876 #define umul_ppmm(w1, w0, u, v) \
877 __asm__ ("mull %3" \
878 : "=a" (w0), "=d" (w1) \
879 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
880 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
881 __asm__ ("divl %4" /* stringification in K&R C */ \
882 : "=a" (q), "=d" (r) \
883 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
885 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
886 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
887 significant 1 bit is, hence the use of the following alternatives. bsfl
888 is slow too, between 18 and 42 depending where the least significant 1
889 bit is, so let the generic count_trailing_zeros below make use of the
890 count_leading_zeros here too. */
892 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
893 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
894 cache miss reading from __clz_tab. For P55 it's favoured over the float
895 below so as to avoid mixing MMX and x87, since the penalty for switching
896 between the two is about 100 cycles.
898 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
899 16, -1 for 8, or 0 otherwise. This could be written equivalently as
900 follows, but as of gcc 2.95.2 it results in conditional jumps.
902 __shift = -(__n < 0x1000000);
903 __shift -= (__n < 0x10000);
904 __shift -= (__n < 0x100);
906 The middle two sbbl and cmpl's pair, and with luck something gcc
907 generates might pair with the first cmpl and the last sbbl. The "32+1"
908 constant could be folded into __clz_tab[], but it doesn't seem worth
909 making a different table just for that. */
911 #define count_leading_zeros(c,n) \
912 do { \
913 USItype __n = (n); \
914 USItype __shift; \
915 __asm__ ("cmpl $0x1000000, %1\n" \
916 "sbbl %0, %0\n" \
917 "cmpl $0x10000, %1\n" \
918 "sbbl $0, %0\n" \
919 "cmpl $0x100, %1\n" \
920 "sbbl $0, %0\n" \
921 : "=&r" (__shift) : "r" (__n)); \
922 __shift = __shift*8 + 24 + 1; \
923 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
924 } while (0)
925 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
926 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
928 #else /* ! pentiummmx || LONGLONG_STANDALONE */
929 /* The following should be a fixed 14 cycles or so. Some scheduling
930 opportunities should be available between the float load/store too. This
931 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
932 apparently suggested by the Intel optimizing manual (don't know exactly
933 where). gcc 2.95 or up will be best for this, so the "double" is
934 correctly aligned on the stack. */
935 #define count_leading_zeros(c,n) \
936 do { \
937 union { \
938 double d; \
939 unsigned a[2]; \
940 } __u; \
941 ASSERT ((n) != 0); \
942 __u.d = (UWtype) (n); \
943 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
944 } while (0)
945 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
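/* Why the float trick works: converting a nonzero 32-bit n to double stores
   the biased exponent 0x3FF + (31 - count_leading_zeros (n)) in bits 30..20
   of the high word (__u.a[1] on little-endian x86), so 0x3FF + 31 minus that
   field recovers the count; e.g. n == 1 gives exponent 0x3FF and count 31.
   For n == 0 the exponent field is 0, hence COUNT_LEADING_ZEROS_0 above is
   0x3FF + 31.  */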
946 #endif /* pentiummx */
948 #else /* ! pentium */
950 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
951 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
952 #endif /* gcc clz */
954 /* On P6, gcc prior to 3.0 generates a partial register stall for
955 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
956 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
957 cost of one extra instruction. Do this for "i386" too, since that means
958 generic x86. */
959 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
960 && (HAVE_HOST_CPU_i386 \
961 || HAVE_HOST_CPU_i686 \
962 || HAVE_HOST_CPU_pentiumpro \
963 || HAVE_HOST_CPU_pentium2 \
964 || HAVE_HOST_CPU_pentium3)
965 #define count_leading_zeros(count, x) \
966 do { \
967 USItype __cbtmp; \
968 ASSERT ((x) != 0); \
969 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
970 (count) = 31 - __cbtmp; \
971 } while (0)
972 #endif /* gcc<3 asm bsrl */
974 #ifndef count_leading_zeros
975 #define count_leading_zeros(count, x) \
976 do { \
977 USItype __cbtmp; \
978 ASSERT ((x) != 0); \
979 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
980 (count) = __cbtmp ^ 31; \
981 } while (0)
982 #endif /* asm bsrl */
984 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
985 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
986 #endif /* gcc ctz */
988 #ifndef count_trailing_zeros
989 #define count_trailing_zeros(count, x) \
990 do { \
991 ASSERT ((x) != 0); \
992 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
993 } while (0)
994 #endif /* asm bsfl */
996 #endif /* ! pentium */
998 #ifndef UMUL_TIME
999 #define UMUL_TIME 10
1000 #endif
1001 #ifndef UDIV_TIME
1002 #define UDIV_TIME 40
1003 #endif
1004 #endif /* 80x86 */
1006 #if defined (__amd64__) && W_TYPE_SIZE == 64
1007 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1008 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1009 : "=r" (sh), "=&r" (sl) \
1010 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1011 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1012 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1013 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1014 : "=r" (sh), "=&r" (sl) \
1015 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1016 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1017 #define umul_ppmm(w1, w0, u, v) \
1018 __asm__ ("mulq %3" \
1019 : "=a" (w0), "=d" (w1) \
1020 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1021 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1022 __asm__ ("divq %4" /* stringification in K&R C */ \
1023 : "=a" (q), "=d" (r) \
1024 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1025 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1026 #define count_leading_zeros(count, x) \
1027 do { \
1028 UDItype __cbtmp; \
1029 ASSERT ((x) != 0); \
1030 __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1031 (count) = __cbtmp ^ 63; \
1032 } while (0)
1033 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1034 count is only an int. */
1035 #define count_trailing_zeros(count, x) \
1036 do { \
1037 ASSERT ((x) != 0); \
1038 __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1039 } while (0)
1040 #endif /* __amd64__ */
1042 #if defined (__i860__) && W_TYPE_SIZE == 32
1043 #define rshift_rhlc(r,h,l,c) \
1044 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1045 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1046 #endif /* i860 */
1048 #if defined (__i960__) && W_TYPE_SIZE == 32
1049 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1050 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1051 : "=r" (sh), "=&r" (sl) \
1052 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1053 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1054 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1055 : "=r" (sh), "=&r" (sl) \
1056 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1057 #define umul_ppmm(w1, w0, u, v) \
1058 ({union {UDItype __ll; \
1059 struct {USItype __l, __h;} __i; \
1060 } __x; \
1061 __asm__ ("emul %2,%1,%0" \
1062 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1063 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1064 #define __umulsidi3(u, v) \
1065 ({UDItype __w; \
1066 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1067 __w; })
1068 #define udiv_qrnnd(q, r, nh, nl, d) \
1069 do { \
1070 union {UDItype __ll; \
1071 struct {USItype __l, __h;} __i; \
1072 } __nn, __rq; \
1073 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1074 __asm__ ("ediv %d,%n,%0" \
1075 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1076 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1077 } while (0)
1078 #define count_leading_zeros(count, x) \
1079 do { \
1080 USItype __cbtmp; \
1081 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1082 (count) = __cbtmp ^ 31; \
1083 } while (0)
1084 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1085 #if defined (__i960mx) /* what is the proper symbol to test??? */
1086 #define rshift_rhlc(r,h,l,c) \
1087 do { \
1088 union {UDItype __ll; \
1089 struct {USItype __l, __h;} __i; \
1090 } __nn; \
1091 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1092 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1094 #endif /* i960mx */
1095 #endif /* i960 */
1097 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1098 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1099 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1100 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1101 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1102 : "=d" (sh), "=&d" (sl) \
1103 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1104 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1105 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1106 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1107 : "=d" (sh), "=&d" (sl) \
1108 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1109 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1110 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1111 #if defined (__mc68020__) || defined(mc68020) \
1112 || defined (__mc68030__) || defined (mc68030) \
1113 || defined (__mc68040__) || defined (mc68040) \
1114 || defined (__mcpu32__) || defined (mcpu32) \
1115 || defined (__NeXT__)
1116 #define umul_ppmm(w1, w0, u, v) \
1117 __asm__ ("mulu%.l %3,%1:%0" \
1118 : "=d" (w0), "=d" (w1) \
1119 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1120 #define UMUL_TIME 45
1121 #define udiv_qrnnd(q, r, n1, n0, d) \
1122 __asm__ ("divu%.l %4,%1:%0" \
1123 : "=d" (q), "=d" (r) \
1124 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1125 #define UDIV_TIME 90
1126 #define sdiv_qrnnd(q, r, n1, n0, d) \
1127 __asm__ ("divs%.l %4,%1:%0" \
1128 : "=d" (q), "=d" (r) \
1129 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1130 #else /* for other 68k family members use 16x16->32 multiplication */
1131 #define umul_ppmm(xh, xl, a, b) \
1132 do { USItype __umul_tmp1, __umul_tmp2; \
1133 __asm__ ("| Inlined umul_ppmm\n" \
1134 " move%.l %5,%3\n" \
1135 " move%.l %2,%0\n" \
1136 " move%.w %3,%1\n" \
1137 " swap %3\n" \
1138 " swap %0\n" \
1139 " mulu%.w %2,%1\n" \
1140 " mulu%.w %3,%0\n" \
1141 " mulu%.w %2,%3\n" \
1142 " swap %2\n" \
1143 " mulu%.w %5,%2\n" \
1144 " add%.l %3,%2\n" \
1145 " jcc 1f\n" \
1146 " add%.l %#0x10000,%0\n" \
1147 "1: move%.l %2,%3\n" \
1148 " clr%.w %2\n" \
1149 " swap %2\n" \
1150 " swap %3\n" \
1151 " clr%.w %3\n" \
1152 " add%.l %3,%1\n" \
1153 " addx%.l %2,%0\n" \
1154 " | End inlined umul_ppmm" \
1155 : "=&d" (xh), "=&d" (xl), \
1156 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1157 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1158 } while (0)
1159 #define UMUL_TIME 100
1160 #define UDIV_TIME 400
1161 #endif /* not mc68020 */
1162 /* The '020, '030, '040 and '060 have bitfield insns.
1163 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1164 exclude bfffo on that chip (bitfield insns not available). */
1165 #if (defined (__mc68020__) || defined (mc68020) \
1166 || defined (__mc68030__) || defined (mc68030) \
1167 || defined (__mc68040__) || defined (mc68040) \
1168 || defined (__mc68060__) || defined (mc68060) \
1169 || defined (__NeXT__)) \
1170 && ! defined (__mcpu32__)
1171 #define count_leading_zeros(count, x) \
1172 __asm__ ("bfffo %1{%b2:%b2},%0" \
1173 : "=d" (count) \
1174 : "od" ((USItype) (x)), "n" (0))
1175 #define COUNT_LEADING_ZEROS_0 32
1176 #endif
1177 #endif /* mc68000 */
1179 #if defined (__m88000__) && W_TYPE_SIZE == 32
1180 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1181 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1182 : "=r" (sh), "=&r" (sl) \
1183 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1184 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1185 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1186 : "=r" (sh), "=&r" (sl) \
1187 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1188 #define count_leading_zeros(count, x) \
1189 do { \
1190 USItype __cbtmp; \
1191 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1192 (count) = __cbtmp ^ 31; \
1193 } while (0)
1194 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1195 #if defined (__m88110__)
1196 #define umul_ppmm(wh, wl, u, v) \
1197 do { \
1198 union {UDItype __ll; \
1199 struct {USItype __h, __l;} __i; \
1200 } __x; \
1201 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1202 (wh) = __x.__i.__h; \
1203 (wl) = __x.__i.__l; \
1204 } while (0)
1205 #define udiv_qrnnd(q, r, n1, n0, d) \
1206 ({union {UDItype __ll; \
1207 struct {USItype __h, __l;} __i; \
1208 } __x, __q; \
1209 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1210 __asm__ ("divu.d %0,%1,%2" \
1211 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1212 (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1213 #define UMUL_TIME 5
1214 #define UDIV_TIME 25
1215 #else
1216 #define UMUL_TIME 17
1217 #define UDIV_TIME 150
1218 #endif /* __m88110__ */
1219 #endif /* __m88000__ */
1221 #if defined (__mips) && W_TYPE_SIZE == 32
1222 #if __GMP_GNUC_PREREQ (4,4)
1223 #define umul_ppmm(w1, w0, u, v) \
1224 do { \
1225 UDItype __ll = (UDItype)(u) * (v); \
1226 w1 = __ll >> 32; \
1227 w0 = __ll; \
1228 } while (0)
1229 #endif
1230 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1231 #define umul_ppmm(w1, w0, u, v) \
1232 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1233 #endif
1234 #if !defined (umul_ppmm)
1235 #define umul_ppmm(w1, w0, u, v) \
1236 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1237 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1238 #endif
1239 #define UMUL_TIME 10
1240 #define UDIV_TIME 100
1241 #endif /* __mips */
1243 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1244 #if __GMP_GNUC_PREREQ (4,4)
1245 #define umul_ppmm(w1, w0, u, v) \
1246 do { \
1247 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1248 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1249 w1 = __ll >> 64; \
1250 w0 = __ll; \
1251 } while (0)
1252 #endif
1253 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1254 #define umul_ppmm(w1, w0, u, v) \
1255 __asm__ ("dmultu %2,%3" \
1256 : "=l" (w0), "=h" (w1) \
1257 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1258 #endif
1259 #if !defined (umul_ppmm)
1260 #define umul_ppmm(w1, w0, u, v) \
1261 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1262 : "=d" (w0), "=d" (w1) \
1263 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1264 #endif
1265 #define UMUL_TIME 20
1266 #define UDIV_TIME 140
1267 #endif /* __mips */
1269 #if defined (__mmix__) && W_TYPE_SIZE == 64
1270 #define umul_ppmm(w1, w0, u, v) \
1271 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1272 #endif
1274 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1275 #define umul_ppmm(w1, w0, u, v) \
1276 ({union {UDItype __ll; \
1277 struct {USItype __l, __h;} __i; \
1278 } __x; \
1279 __asm__ ("meid %2,%0" \
1280 : "=g" (__x.__ll) \
1281 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1282 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1283 #define __umulsidi3(u, v) \
1284 ({UDItype __w; \
1285 __asm__ ("meid %2,%0" \
1286 : "=g" (__w) \
1287 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1288 __w; })
1289 #define udiv_qrnnd(q, r, n1, n0, d) \
1290 ({union {UDItype __ll; \
1291 struct {USItype __l, __h;} __i; \
1292 } __x; \
1293 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1294 __asm__ ("deid %2,%0" \
1295 : "=g" (__x.__ll) \
1296 : "0" (__x.__ll), "g" ((USItype)(d))); \
1297 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1298 #define count_trailing_zeros(count,x) \
1299 do { \
1300 __asm__ ("ffsd %2,%0" \
1301 : "=r" (count) \
1302 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1303 } while (0)
1304 #endif /* __ns32000__ */
1306 /* In the past we had a block of various #defines tested
1307 _ARCH_PPC - AIX
1308 _ARCH_PWR - AIX
1309 __powerpc__ - gcc
1310 __POWERPC__ - BEOS
1311 __ppc__ - Darwin
1312 PPC - old gcc, GNU/Linux, SysV
1313 The plain PPC test was not good for vxWorks, since PPC is defined on all
1314 CPUs there (eg. m68k too), as a constant one is expected to compare
1315 CPU_FAMILY against.
1317 At any rate, this was pretty unattractive and a bit fragile. The use of
1318 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1319 getting the desired effect.
1321 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1322 the system vendor compilers. (Is that vendor compilers with inline asm,
1323 or what?) */
1325 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1326 && W_TYPE_SIZE == 32
1327 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1328 do { \
1329 if (__builtin_constant_p (bh) && (bh) == 0) \
1330 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1331 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1332 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1333 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1334 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1335 else \
1336 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1337 : "=r" (sh), "=&r" (sl) \
1338 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1339 } while (0)
1340 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1341 do { \
1342 if (__builtin_constant_p (ah) && (ah) == 0) \
1343 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1344 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1345 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1346 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1347 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1348 else if (__builtin_constant_p (bh) && (bh) == 0) \
1349 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1350 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1351 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1352 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1353 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1354 else \
1355 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1356 : "=r" (sh), "=&r" (sl) \
1357 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1358 } while (0)
1359 #define count_leading_zeros(count, x) \
1360 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1361 #define COUNT_LEADING_ZEROS_0 32
1362 #if HAVE_HOST_CPU_FAMILY_powerpc
1363 #if __GMP_GNUC_PREREQ (4,4)
1364 #define umul_ppmm(w1, w0, u, v) \
1365 do { \
1366 UDItype __ll = (UDItype)(u) * (v); \
1367 w1 = __ll >> 32; \
1368 w0 = __ll; \
1369 } while (0)
1370 #endif
1371 #if !defined (umul_ppmm)
1372 #define umul_ppmm(ph, pl, m0, m1) \
1373 do { \
1374 USItype __m0 = (m0), __m1 = (m1); \
1375 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1376 (pl) = __m0 * __m1; \
1377 } while (0)
1378 #endif
1379 #define UMUL_TIME 15
1380 #define smul_ppmm(ph, pl, m0, m1) \
1381 do { \
1382 SItype __m0 = (m0), __m1 = (m1); \
1383 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1384 (pl) = __m0 * __m1; \
1385 } while (0)
1386 #define SMUL_TIME 14
1387 #define UDIV_TIME 120
1388 #else
1389 #define UMUL_TIME 8
1390 #define smul_ppmm(xh, xl, m0, m1) \
1391 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1392 #define SMUL_TIME 4
1393 #define sdiv_qrnnd(q, r, nh, nl, d) \
1394 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1395 #define UDIV_TIME 100
1396 #endif
1397 #endif /* 32-bit POWER architecture variants. */
1399 /* We should test _IBMR2 here when we add assembly support for the system
1400 vendor compilers. */
1401 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1402 #if !defined (_LONG_LONG_LIMB)
1403 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1404 use adde etc only when not _LONG_LONG_LIMB. */
1405 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1406 do { \
1407 if (__builtin_constant_p (bh) && (bh) == 0) \
1408 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1409 : "=r" (sh), "=&r" (sl) \
1410 : "r" ((UDItype)(ah)), \
1411 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
1412 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1413 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1414 : "=r" (sh), "=&r" (sl) \
1415 : "r" ((UDItype)(ah)), \
1416 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
1417 else \
1418 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1419 : "=r" (sh), "=&r" (sl) \
1420 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1421 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))); \
1422 } while (0)
1423 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1424 This might seem strange, but gcc folds away the dead code late. */
1425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1426 do { \
1427 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1428 if (__builtin_constant_p (ah) && (ah) == 0) \
1429 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1430 : "=r" (sh), "=&r" (sl) \
1431 : "r" ((UDItype)(bh)), \
1432 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
1433 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1434 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1435 : "=r" (sh), "=&r" (sl) \
1436 : "r" ((UDItype)(bh)), \
1437 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
1438 else if (__builtin_constant_p (bh) && (bh) == 0) \
1439 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1440 : "=r" (sh), "=&r" (sl) \
1441 : "r" ((UDItype)(ah)), \
1442 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
1443 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1444 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1445 : "=r" (sh), "=&r" (sl) \
1446 : "r" ((UDItype)(ah)), \
1447 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
1448 else \
1449 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1450 : "=r" (sh), "=&r" (sl) \
1451 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1452 "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))); \
1453 } else { \
1454 if (__builtin_constant_p (ah) && (ah) == 0) \
1455 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1456 : "=r" (sh), "=&r" (sl) \
1457 : "r" ((UDItype)(bh)), \
1458 "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
1459 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1460 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1461 : "=r" (sh), "=&r" (sl) \
1462 : "r" ((UDItype)(bh)), \
1463 "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
1464 else if (__builtin_constant_p (bh) && (bh) == 0) \
1465 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1466 : "=r" (sh), "=&r" (sl) \
1467 : "r" ((UDItype)(ah)), \
1468 "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
1469 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1470 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1471 : "=r" (sh), "=&r" (sl) \
1472 : "r" ((UDItype)(ah)), \
1473 "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
1474 else \
1475 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1476 : "=r" (sh), "=&r" (sl) \
1477 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1478 "rI" ((UDItype)(al)), "r" ((UDItype)(bl))); \
1479 } \
1480 } while (0)
1481 #endif /* ! _LONG_LONG_LIMB */
1482 #define count_leading_zeros(count, x) \
1483 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1484 #define COUNT_LEADING_ZEROS_0 64
1485 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1486 #define umul_ppmm(w1, w0, u, v) \
1487 do { \
1488 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1489 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1490 w1 = __ll >> 64; \
1491 w0 = __ll; \
1492 } while (0)
1493 #endif
1494 #if !defined (umul_ppmm)
1495 #define umul_ppmm(ph, pl, m0, m1) \
1496 do { \
1497 UDItype __m0 = (m0), __m1 = (m1); \
1498 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1499 (pl) = __m0 * __m1; \
1500 } while (0)
1501 #endif
1502 #define UMUL_TIME 15
1503 #define smul_ppmm(ph, pl, m0, m1) \
1504 do { \
1505 DItype __m0 = (m0), __m1 = (m1); \
1506 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1507 (pl) = __m0 * __m1; \
1508 } while (0)
1509 #define SMUL_TIME 14 /* ??? */
1510 #define UDIV_TIME 120 /* ??? */
1511 #endif /* 64-bit PowerPC. */
1513 #if defined (__pyr__) && W_TYPE_SIZE == 32
1514 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1515 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1516 : "=r" (sh), "=&r" (sl) \
1517 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1518 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1519 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1520 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1521 : "=r" (sh), "=&r" (sl) \
1522 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1523 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1524 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1525 #define umul_ppmm(w1, w0, u, v) \
1526 ({union {UDItype __ll; \
1527 struct {USItype __h, __l;} __i; \
1528 } __x; \
1529 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1530 : "=&r" (__x.__ll) \
1531 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1532 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1533 #endif /* __pyr__ */
1535 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1536 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1537 __asm__ ("a %1,%5\n\tae %0,%3" \
1538 : "=r" (sh), "=&r" (sl) \
1539 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1540 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1541 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1542 __asm__ ("s %1,%5\n\tse %0,%3" \
1543 : "=r" (sh), "=&r" (sl) \
1544 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1545 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1546 #define smul_ppmm(ph, pl, m0, m1) \
1547 __asm__ ( \
1548 "s r2,r2\n" \
1549 " mts r10,%2\n" \
1550 " m r2,%3\n" \
1551 " m r2,%3\n" \
1552 " m r2,%3\n" \
1553 " m r2,%3\n" \
1554 " m r2,%3\n" \
1555 " m r2,%3\n" \
1556 " m r2,%3\n" \
1557 " m r2,%3\n" \
1558 " m r2,%3\n" \
1559 " m r2,%3\n" \
1560 " m r2,%3\n" \
1561 " m r2,%3\n" \
1562 " m r2,%3\n" \
1563 " m r2,%3\n" \
1564 " m r2,%3\n" \
1565 " m r2,%3\n" \
1566 " cas %0,r2,r0\n" \
1567 " mfs r10,%1" \
1568 : "=r" (ph), "=r" (pl) \
1569 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1570 : "r2")
1571 #define UMUL_TIME 20
1572 #define UDIV_TIME 200
1573 #define count_leading_zeros(count, x) \
1574 do { \
1575 if ((x) >= 0x10000) \
1576 __asm__ ("clz %0,%1" \
1577 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1578     else								\
1579       {								\
1580 	__asm__ ("clz	%0,%1"						\
1581 		 : "=r" (count) : "r" ((USItype)(x)));			\
1582 	(count) += 16;							\
1583       }								\
1584   } while (0)
1585 #endif /* RT/ROMP */
1587 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1588 #define umul_ppmm(w1, w0, u, v) \
1589 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1590 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1591 #define UMUL_TIME 5
1592 #endif
1594 #if defined (__sparc__) && W_TYPE_SIZE == 32
1595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1596 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1597 : "=r" (sh), "=&r" (sl) \
1598 	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
1599 __CLOBBER_CC)
1600 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1601 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1602 : "=r" (sh), "=&r" (sl) \
1603 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1604 __CLOBBER_CC)
1605 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1606    doesn't define anything to indicate that to us; it only sets __sparcv8. */
1607 #if defined (__sparc_v9__) || defined (__sparcv9)
1608 /* Perhaps we should use floating-point operations here? */
1609 #if 0
1610 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1611    Perhaps we simply need to explicitly zero-extend the inputs? */
1612 #define umul_ppmm(w1, w0, u, v) \
1613 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1614 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1615 #else
1616 /* Use v8 umul until the above bug is fixed. */
1617 #define umul_ppmm(w1, w0, u, v) \
1618 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1619 #endif
1620 /* Use a plain v8 divide for v9. */
1621 #define udiv_qrnnd(q, r, n1, n0, d) \
1622 do { \
1623 USItype __q; \
1624 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1625 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1626 (r) = (n0) - __q * (d); \
1627 (q) = __q; \
1628 } while (0)
1629 #else
1630 #if defined (__sparc_v8__) /* gcc normal */ \
1631 || defined (__sparcv8) /* gcc solaris */ \
1632 || HAVE_HOST_CPU_supersparc
1633 /* Don't match the immediate range because: 1) it is not often useful, and
1634    2) the 'I' flag thinks of the range as a 13-bit signed interval, while we
1635    want to match a 13-bit interval, sign-extended to 32 bits, but
1636    INTERPRETED AS UNSIGNED. */
1637 #define umul_ppmm(w1, w0, u, v) \
1638 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1639 #define UMUL_TIME 5
1641 #if HAVE_HOST_CPU_supersparc
1642 #define UDIV_TIME 60 /* SuperSPARC timing */
1643 #else
1644 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1645 dividends and will trap to the kernel for the rest. */
1646 #define udiv_qrnnd(q, r, n1, n0, d) \
1647 do { \
1648 USItype __q; \
1649 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1650 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1651 (r) = (n0) - __q * (d); \
1652 (q) = __q; \
1653 } while (0)
1654 #define UDIV_TIME 25
1655 #endif /* HAVE_HOST_CPU_supersparc */
1657 #else /* ! __sparc_v8__ */
1658 #if defined (__sparclite__)
1659 /* This has hardware multiply but not divide.  It also has two additional
1660    instructions: scan (ffs from the high bit) and divscc. */
1661 #define umul_ppmm(w1, w0, u, v) \
1662 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1663 #define UMUL_TIME 5
1664 #define udiv_qrnnd(q, r, n1, n0, d) \
1665 __asm__ ("! Inlined udiv_qrnnd\n" \
1666 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1667 " tst %%g0\n" \
1668 " divscc %3,%4,%%g1\n" \
1669 " divscc %%g1,%4,%%g1\n" \
1670 " divscc %%g1,%4,%%g1\n" \
1671 " divscc %%g1,%4,%%g1\n" \
1672 " divscc %%g1,%4,%%g1\n" \
1673 " divscc %%g1,%4,%%g1\n" \
1674 " divscc %%g1,%4,%%g1\n" \
1675 " divscc %%g1,%4,%%g1\n" \
1676 " divscc %%g1,%4,%%g1\n" \
1677 " divscc %%g1,%4,%%g1\n" \
1678 " divscc %%g1,%4,%%g1\n" \
1679 " divscc %%g1,%4,%%g1\n" \
1680 " divscc %%g1,%4,%%g1\n" \
1681 " divscc %%g1,%4,%%g1\n" \
1682 " divscc %%g1,%4,%%g1\n" \
1683 " divscc %%g1,%4,%%g1\n" \
1684 " divscc %%g1,%4,%%g1\n" \
1685 " divscc %%g1,%4,%%g1\n" \
1686 " divscc %%g1,%4,%%g1\n" \
1687 " divscc %%g1,%4,%%g1\n" \
1688 " divscc %%g1,%4,%%g1\n" \
1689 " divscc %%g1,%4,%%g1\n" \
1690 " divscc %%g1,%4,%%g1\n" \
1691 " divscc %%g1,%4,%%g1\n" \
1692 " divscc %%g1,%4,%%g1\n" \
1693 " divscc %%g1,%4,%%g1\n" \
1694 " divscc %%g1,%4,%%g1\n" \
1695 " divscc %%g1,%4,%%g1\n" \
1696 " divscc %%g1,%4,%%g1\n" \
1697 " divscc %%g1,%4,%%g1\n" \
1698 " divscc %%g1,%4,%%g1\n" \
1699 " divscc %%g1,%4,%0\n" \
1700 " rd %%y,%1\n" \
1701 " bl,a 1f\n" \
1702 " add %1,%4,%1\n" \
1703 "1: ! End of inline udiv_qrnnd" \
1704 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1705 : "%g1" __AND_CLOBBER_CC)
1706 #define UDIV_TIME 37
1707 #define count_leading_zeros(count, x) \
1708 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1709 /* Early sparclites return 63 for an argument of 0, but they warn that future
1710 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
1711 undefined. */
1712 #endif /* __sparclite__ */
1713 #endif /* __sparc_v8__ */
1714 #endif /* __sparc_v9__ */
1715 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1716 #ifndef umul_ppmm
1717 #define umul_ppmm(w1, w0, u, v) \
1718 __asm__ ("! Inlined umul_ppmm\n" \
1719 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1720 " sra %3,31,%%g2 ! Don't move this insn\n" \
1721 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1722 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1723 " mulscc %%g1,%3,%%g1\n" \
1724 " mulscc %%g1,%3,%%g1\n" \
1725 " mulscc %%g1,%3,%%g1\n" \
1726 " mulscc %%g1,%3,%%g1\n" \
1727 " mulscc %%g1,%3,%%g1\n" \
1728 " mulscc %%g1,%3,%%g1\n" \
1729 " mulscc %%g1,%3,%%g1\n" \
1730 " mulscc %%g1,%3,%%g1\n" \
1731 " mulscc %%g1,%3,%%g1\n" \
1732 " mulscc %%g1,%3,%%g1\n" \
1733 " mulscc %%g1,%3,%%g1\n" \
1734 " mulscc %%g1,%3,%%g1\n" \
1735 " mulscc %%g1,%3,%%g1\n" \
1736 " mulscc %%g1,%3,%%g1\n" \
1737 " mulscc %%g1,%3,%%g1\n" \
1738 " mulscc %%g1,%3,%%g1\n" \
1739 " mulscc %%g1,%3,%%g1\n" \
1740 " mulscc %%g1,%3,%%g1\n" \
1741 " mulscc %%g1,%3,%%g1\n" \
1742 " mulscc %%g1,%3,%%g1\n" \
1743 " mulscc %%g1,%3,%%g1\n" \
1744 " mulscc %%g1,%3,%%g1\n" \
1745 " mulscc %%g1,%3,%%g1\n" \
1746 " mulscc %%g1,%3,%%g1\n" \
1747 " mulscc %%g1,%3,%%g1\n" \
1748 " mulscc %%g1,%3,%%g1\n" \
1749 " mulscc %%g1,%3,%%g1\n" \
1750 " mulscc %%g1,%3,%%g1\n" \
1751 " mulscc %%g1,%3,%%g1\n" \
1752 " mulscc %%g1,%3,%%g1\n" \
1753 " mulscc %%g1,%3,%%g1\n" \
1754 " mulscc %%g1,%3,%%g1\n" \
1755 " mulscc %%g1,0,%%g1\n" \
1756 " add %%g1,%%g2,%0\n" \
1757 " rd %%y,%1" \
1758 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1759 : "%g1", "%g2" __AND_CLOBBER_CC)
1760 #define UMUL_TIME 39 /* 39 instructions */
1761 #endif
1762 #ifndef udiv_qrnnd
1763 #ifndef LONGLONG_STANDALONE
1764 #define udiv_qrnnd(q, r, n1, n0, d) \
1765 do { UWtype __r; \
1766 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1767 (r) = __r; \
1768 } while (0)
1769 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1770 #ifndef UDIV_TIME
1771 #define UDIV_TIME 140
1772 #endif
1773 #endif /* LONGLONG_STANDALONE */
1774 #endif /* udiv_qrnnd */
1775 #endif /* __sparc__ */
1777 #if defined (__sparc__) && W_TYPE_SIZE == 64
1778 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1779 __asm__ ( \
1780 "addcc %r4,%5,%1\n" \
1781 " addccc %r6,%7,%%g0\n" \
1782 " addc %r2,%3,%0" \
1783 : "=r" (sh), "=&r" (sl) \
1784 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1785 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1786 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1787 __CLOBBER_CC)
1788 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1789 __asm__ ( \
1790 "subcc %r4,%5,%1\n" \
1791 " subccc %r6,%7,%%g0\n" \
1792 " subc %r2,%3,%0" \
1793 : "=r" (sh), "=&r" (sl) \
1794 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1795 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1796 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1797 __CLOBBER_CC)
1798 #if __VIS__ >= 0x300
1799 #undef add_ssaaaa
1800 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1801 __asm__ ( \
1802 "addcc %r4, %5, %1\n" \
1803 " addxc %r2, %r3, %0" \
1804 : "=r" (sh), "=&r" (sl) \
1805 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
1806 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1807 #define umul_ppmm(ph, pl, m0, m1) \
1808 do { \
1809 UDItype __m0 = (m0), __m1 = (m1); \
1810 (pl) = __m0 * __m1; \
1811 __asm__ ("umulxhi\t%2, %1, %0" \
1812 : "=r" (ph) \
1813 : "%r" (__m0), "r" (__m1)); \
1814 } while (0)
1815 #define count_leading_zeros(count, x) \
1816 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1817 /* Needed by count_leading_zeros_32 in sparc64.h. */
1818 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1819 #endif
1820 #endif
1822 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1823 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1824 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1825 : "=g" (sh), "=&g" (sl) \
1826 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1827 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1828 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1829 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1830 : "=g" (sh), "=&g" (sl) \
1831 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1832 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1833 #define smul_ppmm(xh, xl, m0, m1) \
1834 do { \
1835 union {UDItype __ll; \
1836 struct {USItype __l, __h;} __i; \
1837 } __x; \
1838 USItype __m0 = (m0), __m1 = (m1); \
1839 __asm__ ("emul %1,%2,$0,%0" \
1840 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1841 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1842 } while (0)
1843 #define sdiv_qrnnd(q, r, n1, n0, d) \
1844 do { \
1845 union {DItype __ll; \
1846 struct {SItype __l, __h;} __i; \
1847 } __x; \
1848 __x.__i.__h = n1; __x.__i.__l = n0; \
1849 __asm__ ("ediv %3,%2,%0,%1" \
1850 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1851 } while (0)
1852 #if 0
1853 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1854 8800 maybe). */
1855 #define count_trailing_zeros(count,x) \
1856 do { \
1857 __asm__ ("ffs 0, 31, %1, %0" \
1858 : "=g" (count) \
1859 : "g" ((USItype) (x))); \
1860 } while (0)
1861 #endif
1862 #endif /* vax */
1864 #if defined (__z8000__) && W_TYPE_SIZE == 16
1865 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1866 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1867 : "=r" (sh), "=&r" (sl) \
1868 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1869 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1870 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1871 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1872 : "=r" (sh), "=&r" (sl) \
1873 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1874 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1875 #define umul_ppmm(xh, xl, m0, m1) \
1876 do { \
1877 union {long int __ll; \
1878 struct {unsigned int __h, __l;} __i; \
1879 } __x; \
1880 unsigned int __m0 = (m0), __m1 = (m1); \
1881 __asm__ ("mult %S0,%H3" \
1882 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1883 : "%1" (m0), "rQR" (m1)); \
1884 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1885 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1886 + (((signed int) __m1 >> 15) & __m0)); \
1887 } while (0)
1888 #endif /* __z8000__ */
1890 #endif /* __GNUC__ */
1892 #endif /* NO_ASM */
1895 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */
1896 #if !defined (umul_ppmm) && defined (__umulsidi3)
1897 #define umul_ppmm(ph, pl, m0, m1) \
1898   do {									\
1899     UDWtype __ll = __umulsidi3 (m0, m1);				\
1900     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1901     pl = (UWtype) __ll;						\
1902   } while (0)
1903 #endif
1905 #if !defined (__umulsidi3)
1906 #define __umulsidi3(u, v) \
1907 ({UWtype __hi, __lo; \
1908 umul_ppmm (__hi, __lo, u, v); \
1909 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1910 #endif
1913 #if defined (__cplusplus)
1914 #define __longlong_h_C "C"
1915 #else
1916 #define __longlong_h_C
1917 #endif
1919 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1920 forms have "reversed" arguments, meaning the pointer is last, which
1921 sometimes allows better parameter passing, in particular on 64-bit
1922 hppa. */
1924 #define mpn_umul_ppmm __MPN(umul_ppmm)
1925 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1927 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
1928 && ! defined (LONGLONG_STANDALONE)
1929 #define umul_ppmm(wh, wl, u, v) \
1930 do { \
1931 UWtype __umul_ppmm__p0; \
1932 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
1933 (wl) = __umul_ppmm__p0; \
1934 } while (0)
1935 #endif
1937 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
1938 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1940 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
1941 && ! defined (LONGLONG_STANDALONE)
1942 #define umul_ppmm(wh, wl, u, v) \
1943 do { \
1944 UWtype __umul_p0; \
1945 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
1946 (wl) = __umul_p0; \
1947 } while (0)
1948 #endif
1950 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
1951 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1953 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
1954 && ! defined (LONGLONG_STANDALONE)
1955 #define udiv_qrnnd(q, r, n1, n0, d) \
1956 do { \
1957 UWtype __udiv_qrnnd_r; \
1958 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
1959 (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
1960 (r) = __udiv_qrnnd_r; \
1961 } while (0)
1962 #endif
1964 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
1965 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1967 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
1968 && ! defined (LONGLONG_STANDALONE)
1969 #define udiv_qrnnd(q, r, n1, n0, d) \
1970 do { \
1971 UWtype __udiv_qrnnd_r; \
1972 (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
1973 &__udiv_qrnnd_r); \
1974 (r) = __udiv_qrnnd_r; \
1975 } while (0)
1976 #endif
1979 /* If this machine has no inline assembler, use C macros. */
1981 #if !defined (add_ssaaaa)
1982 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1983 do { \
1984 UWtype __x; \
1985 __x = (al) + (bl); \
1986 (sh) = (ah) + (bh) + (__x < (al)); \
1987 (sl) = __x; \
1988 } while (0)
1989 #endif
1991 #if !defined (sub_ddmmss)
1992 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1993 do { \
1994 UWtype __x; \
1995 __x = (al) - (bl); \
1996 (sh) = (ah) - (bh) - ((al) < (bl)); \
1997 (sl) = __x; \
1998 } while (0)
1999 #endif
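/* Illustrative sketch (not part of GMP): how the C fallbacks above propagate a
   carry or borrow from the low limb into the high limb.  Assumes the including
   code has set up UWtype as a 32-bit limb per the rules stated in this file;
   the function name example_add_sub is hypothetical. */
#if 0
static void
example_add_sub (void)
{
  UWtype sh, sl;
  /* 0x00000000FFFFFFFF + 0x0000000000000001 = 0x0000000100000000:
     the low-limb sum wraps to 0, so a carry of 1 reaches the high limb. */
  add_ssaaaa (sh, sl, (UWtype) 0, (UWtype) 0xFFFFFFFF, (UWtype) 0, (UWtype) 1);
  /* now sh == 1, sl == 0 */
  /* 0x0000000100000000 - 0x0000000000000001 = 0x00000000FFFFFFFF:
     the low limb borrows from the high limb. */
  sub_ddmmss (sh, sl, (UWtype) 1, (UWtype) 0, (UWtype) 0, (UWtype) 1);
  /* now sh == 0, sl == 0xFFFFFFFF */
}
#endif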
2001 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2002 smul_ppmm. */
2003 #if !defined (umul_ppmm) && defined (smul_ppmm)
2004 #define umul_ppmm(w1, w0, u, v) \
2005 do { \
2006 UWtype __w1; \
2007 UWtype __xm0 = (u), __xm1 = (v); \
2008 smul_ppmm (__w1, w0, __xm0, __xm1); \
2009 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2010 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2011 } while (0)
2012 #endif
2014 /* If we still don't have umul_ppmm, define it using plain C.
2016 For reference, when this code is used for squaring (ie. u and v identical
2017 expressions), gcc recognises __x1 and __x2 are the same and generates 3
2018 multiplies, not 4. The subsequent additions could be optimized a bit,
2019 but the only place GMP currently uses such a square is mpn_sqr_basecase,
2020 and chips obliged to use this generic C umul will have plenty of worse
2021 performance problems than a couple of extra instructions on the diagonal
2022 of sqr_basecase. */
2024 #if !defined (umul_ppmm)
2025 #define umul_ppmm(w1, w0, u, v) \
2026 do { \
2027 UWtype __x0, __x1, __x2, __x3; \
2028 UHWtype __ul, __vl, __uh, __vh; \
2029 UWtype __u = (u), __v = (v); \
2031 __ul = __ll_lowpart (__u); \
2032 __uh = __ll_highpart (__u); \
2033 __vl = __ll_lowpart (__v); \
2034 __vh = __ll_highpart (__v); \
2036 __x0 = (UWtype) __ul * __vl; \
2037 __x1 = (UWtype) __ul * __vh; \
2038 __x2 = (UWtype) __uh * __vl; \
2039 __x3 = (UWtype) __uh * __vh; \
2041 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2042 __x1 += __x2; /* but this indeed can */ \
2043 if (__x1 < __x2) /* did we get it? */ \
2044 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2046 (w1) = __x3 + __ll_highpart (__x1); \
2047 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2048 } while (0)
2049 #endif
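/* Illustrative sketch (not part of GMP): a worked instance of the schoolbook
   half-word splitting performed by the generic umul_ppmm above.  Assumes
   W_TYPE_SIZE is 32 and UWtype is a 32-bit limb; example_umul is a
   hypothetical name. */
#if 0
static void
example_umul (void)
{
  UWtype hi, lo;
  /* (2^32-1)^2 = 2^64 - 2^33 + 1 = 0xFFFFFFFE00000001. */
  umul_ppmm (hi, lo, (UWtype) 0xFFFFFFFF, (UWtype) 0xFFFFFFFF);
  /* now hi == 0xFFFFFFFE and lo == 0x00000001 */
}
#endif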
2051 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2052    exist in one form or another). */
2053 #if !defined (smul_ppmm)
2054 #define smul_ppmm(w1, w0, u, v) \
2055 do { \
2056 UWtype __w1; \
2057 UWtype __xm0 = (u), __xm1 = (v); \
2058 umul_ppmm (__w1, w0, __xm0, __xm1); \
2059 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2060 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2061 } while (0)
2062 #endif
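/* Illustrative sketch (not part of GMP): the sign correction used just above.
   Interpreting W-bit words as signed, the high word of the signed product is
   the high word of the unsigned product minus (u < 0 ? v : 0) minus
   (v < 0 ? u : 0); the low word is identical.  Assumes W_TYPE_SIZE is 32;
   example_smul is a hypothetical name. */
#if 0
static void
example_smul (void)
{
  UWtype hi, lo;
  /* -1 * 2 = -2, i.e. 0xFFFFFFFF:FFFFFFFE as a signed double word.  The
     unsigned product 0xFFFFFFFF * 2 has high word 1; subtracting v (= 2)
     because u is negative gives 1 - 2 = 0xFFFFFFFF, as expected. */
  smul_ppmm (hi, lo, (UWtype) 0xFFFFFFFF, (UWtype) 2);
  /* now hi == 0xFFFFFFFF and lo == 0xFFFFFFFE */
}
#endif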
2064 /* Define this unconditionally, so it can be used for debugging. */
2065 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2066 do { \
2067 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2069 ASSERT ((d) != 0); \
2070 ASSERT ((n1) < (d)); \
2072 __d1 = __ll_highpart (d); \
2073 __d0 = __ll_lowpart (d); \
2075 __q1 = (n1) / __d1; \
2076 __r1 = (n1) - __q1 * __d1; \
2077 __m = __q1 * __d0; \
2078 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2079     if (__r1 < __m)							\
2080       {								\
2081 	__q1--, __r1 += (d);						\
2082 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2083 	  if (__r1 < __m)						\
2084 	    __q1--, __r1 += (d);					\
2085       }								\
2086     __r1 -= __m;							\
2088 __q0 = __r1 / __d1; \
2089 __r0 = __r1 - __q0 * __d1; \
2090 __m = __q0 * __d0; \
2091 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2092     if (__r0 < __m)							\
2093       {								\
2094 	__q0--, __r0 += (d);						\
2095 	if (__r0 >= (d))						\
2096 	  if (__r0 < __m)						\
2097 	    __q0--, __r0 += (d);					\
2098       }								\
2099     __r0 -= __m;							\
2101 (q) = __q1 * __ll_B | __q0; \
2102 (r) = __r0; \
2103 } while (0)
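/* Illustrative sketch (not part of GMP): a worked division with the generic
   __udiv_qrnnd_c above.  The divisor must be normalized (most significant bit
   set) and n1 < d must hold; assumes W_TYPE_SIZE is 32 and that the including
   code provides ASSERT.  example_udiv is a hypothetical name. */
#if 0
static void
example_udiv (void)
{
  UWtype q, r;
  /* Divide the two-word value 0x7FFFFFFF:FFFFFFFF by 0x80000000 (normalized):
     0x7FFFFFFFFFFFFFFF = 0xFFFFFFFF * 0x80000000 + 0x7FFFFFFF. */
  __udiv_qrnnd_c (q, r, (UWtype) 0x7FFFFFFF, (UWtype) 0xFFFFFFFF,
		  (UWtype) 0x80000000);
  /* now q == 0xFFFFFFFF and r == 0x7FFFFFFF */
}
#endif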
2105 /* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
2106 __udiv_w_sdiv (defined in libgcc or elsewhere). */
2107 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2108 #define udiv_qrnnd(q, r, nh, nl, d) \
2109 do { \
2110 UWtype __r; \
2111 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2112 (r) = __r; \
2113 } while (0)
2114 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2115 #endif
2117 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2118 #if !defined (udiv_qrnnd)
2119 #define UDIV_NEEDS_NORMALIZATION 1
2120 #define udiv_qrnnd __udiv_qrnnd_c
2121 #endif
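/* Illustrative sketch (not part of GMP): when UDIV_NEEDS_NORMALIZATION is 1, a
   caller is expected to left-shift the divisor until its most significant bit
   is set, shift the numerator by the same amount, and shift the remainder back
   afterwards.  Uses count_leading_zeros and udiv_qrnnd as defined elsewhere in
   this file; example_udiv_norm is a hypothetical name, and nh < d must hold. */
#if 0
static void
example_udiv_norm (UWtype *qp, UWtype *rp, UWtype nh, UWtype nl, UWtype d)
{
  UWtype q, r;
  int cnt = 0;
#if UDIV_NEEDS_NORMALIZATION
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;
      nh = (nh << cnt) | (nl >> (W_TYPE_SIZE - cnt));
      nl <<= cnt;
    }
  udiv_qrnnd (q, r, nh, nl, d);
  r >>= cnt;			/* undo the normalization on the remainder */
#else
  udiv_qrnnd (q, r, nh, nl, d);
#endif
  *qp = q;
  *rp = r;
}
#endif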
2123 #if !defined (count_leading_zeros)
2124 #define count_leading_zeros(count, x) \
2125 do { \
2126 UWtype __xr = (x); \
2127 UWtype __a; \
2129     if (W_TYPE_SIZE == 32)						\
2130       {								\
2131 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2132 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2133 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2134 	  : 3*__BITS4 + 1);						\
2135       }								\
2136     else								\
2137       {								\
2138 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2139 	  if (((__xr >> __a) & 0xff) != 0)				\
2140 	    break;							\
2141 	++__a;								\
2142       }								\
2144 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2145 } while (0)
2146 /* This version gives a well-defined value for zero. */
2147 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2148 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2149 #define COUNT_LEADING_ZEROS_SLOW
2150 #endif
2152 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2153 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2154 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2155 #endif
2157 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2158 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2159 #endif
2161 #if !defined (count_trailing_zeros)
2162 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2163 /* Define count_trailing_zeros using an asm count_leading_zeros. */
2164 #define count_trailing_zeros(count, x) \
2165 do { \
2166 UWtype __ctz_x = (x); \
2167 UWtype __ctz_c; \
2168 ASSERT (__ctz_x != 0); \
2169 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2170 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2171 } while (0)
2172 #else
2173 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2174 We use clz_tab without ado, since the C count_leading_zeros above will have
2175 pulled it in. */
2176 #define count_trailing_zeros(count, x) \
2177 do { \
2178 UWtype __ctz_x = (x); \
2179 int __ctz_c; \
2181 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2182 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2183     else								\
2184       {								\
2185 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2186 	  {								\
2187 	    __ctz_x >>= 8;						\
2188 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2189 	      break;							\
2190 	  }								\
2192 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2193       }								\
2194   } while (0)
2195 #endif
2196 #endif
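/* Illustrative sketch (not part of GMP): count_trailing_zeros via the isolated
   lowest set bit, matching the definitions above.  Assumes W_TYPE_SIZE is 32;
   example_ctz is a hypothetical name, and the argument must be nonzero. */
#if 0
static void
example_ctz (void)
{
  int c;
  /* x = 0x28 = 0b101000: the lowest set bit is bit 3.  x & -x isolates that
     bit (0x8); its leading-zero count is 28, and 32 - 1 - 28 = 3. */
  count_trailing_zeros (c, (UWtype) 0x28);
  /* now c == 3 */
}
#endif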
2198 #ifndef UDIV_NEEDS_NORMALIZATION
2199 #define UDIV_NEEDS_NORMALIZATION 0
2200 #endif
2202 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2203    hence that the latter should always be used. */
2204 #ifndef UDIV_PREINV_ALWAYS
2205 #define UDIV_PREINV_ALWAYS 0
2206 #endif
2208 /* Give defaults for UMUL_TIME and UDIV_TIME. */
2209 #ifndef UMUL_TIME
2210 #define UMUL_TIME 1
2211 #endif
2213 #ifndef UDIV_TIME
2214 #define UDIV_TIME UMUL_TIME
2215 #endif