src/longlong.h

   1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
   2
   3 Copyright 1991-2016 Free Software Foundation, Inc.
   4
   5 This file is free software; you can redistribute it and/or modify it under the
   6 terms of the GNU Lesser General Public License as published by the Free
   7 Software Foundation; either version 3 of the License, or (at your option) any
   8 later version.
   9
  10 This file is distributed in the hope that it will be useful, but WITHOUT ANY
  11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  12 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
  13 details.
  14
  15 You should have received a copy of the GNU Lesser General Public License
  16 along with this file.  If not, see http://www.gnu.org/licenses/.  */
  17
  18 /* You have to define the following before including this file:
  19
  20    UWtype -- An unsigned type, default type for operations (typically a "word")
  21    UHWtype -- An unsigned type, at least half the size of UWtype
   22    UDWtype -- An unsigned type, at least twice as large as UWtype
  23    W_TYPE_SIZE -- size in bits of UWtype
  24
  25    SItype, USItype -- Signed and unsigned 32 bit types
  26    DItype, UDItype -- Signed and unsigned 64 bit types
  27
  28    On a 32 bit machine UWtype should typically be USItype;
  29    on a 64 bit machine, UWtype should typically be UDItype.
  30
  31    Optionally, define:
  32
  33    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
  34    NO_ASM -- Disable inline asm
  35
  36
  37    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
  38    need to include gmp.h and gmp-impl.h, or certain things might not work as
  39    expected.
  40 */
  41
  42 #define __BITS4 (W_TYPE_SIZE / 4)
  43 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
  44 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
  45 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
  46
  47 /* This is used to make sure no undesirable sharing between different libraries
  48    that use this file takes place.  */
  49 #ifndef __MPN
  50 #define __MPN(x) __##x
  51 #endif
  52
  53 /* Define auxiliary asm macros.
  54
  55    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
  56    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
  57    word product in HIGH_PROD and LOW_PROD.
  58
  59    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
  60    UDWtype product.  This is just a variant of umul_ppmm.
  61
  62    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  63    denominator) divides a UDWtype, composed by the UWtype integers
  64    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
  65    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
  66    than DENOMINATOR for correct operation.  If, in addition, the most
  67    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
  68    UDIV_NEEDS_NORMALIZATION is defined to 1.
  69
  70    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  71    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
  72    is rounded towards 0.
  73
  74    5) count_leading_zeros(count, x) counts the number of zero-bits from the
  75    msb to the first non-zero bit in the UWtype X.  This is the number of
  76    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
  77    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
  78
  79    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
  80    from the least significant end.
  81
   82    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   83    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   84    by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   85    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   86    (i.e. carry out) is not stored anywhere, and is lost.
  87
   88    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   89    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   90    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   91    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   92    and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   93    and is lost.
  94
  95    If any of these macros are left undefined for a particular CPU,
  96    C macros are used.
  97
  98
  99    Notes:
 100
 101    For add_ssaaaa the two high and two low addends can both commute, but
 102    unfortunately gcc only supports one "%" commutative in each asm block.
 103    This has always been so but is only documented in recent versions
 104    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
 105    compiler error in certain rare circumstances.
 106
 107    Apparently it was only the last "%" that was ever actually respected, so
 108    the code has been updated to leave just that.  Clearly there's a free
 109    choice whether high or low should get it, if there's a reason to favour
 110    one over the other.  Also obviously when the constraints on the two
 111    operands are identical there's no benefit to the reloader in any "%" at
 112    all.
 113
 114    */
 115
 116 /* The CPUs come in alphabetical order below.
 117
 118    Please add support for more CPUs here, or improve the current support
 119    for the CPUs below!  */
 120
 121
 122 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
 123    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
 124    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
 125    __builtin_ctzll.
 126
  127    These builtins are only used after we have checked what code comes out;
  128    on some chips they're merely libgcc calls, in which case we will instead
  129    want an inline (either asm or generic C).
 130
 131    These builtins are better than an asm block of the same insn, since an
 132    asm block doesn't give gcc any information about scheduling or resource
 133    usage.  We keep an asm block for use on prior versions of gcc though.
 134
 135    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
 136    it's not used (for count_leading_zeros) because it generally gives extra
 137    code to ensure the result is 0 when the input is 0, which we don't need
 138    or want.  */
 139
 140 #ifdef _LONG_LONG_LIMB
 141 #define count_leading_zeros_gcc_clz(count,x)    \
 142   do {                                          \
 143     ASSERT ((x) != 0);                          \
 144     (count) = __builtin_clzll (x);              \
 145   } while (0)
 146 #else
 147 #define count_leading_zeros_gcc_clz(count,x)    \
 148   do {                                          \
 149     ASSERT ((x) != 0);                          \
 150     (count) = __builtin_clzl (x);               \
 151   } while (0)
 152 #endif
 153
 154 #ifdef _LONG_LONG_LIMB
 155 #define count_trailing_zeros_gcc_ctz(count,x)   \
 156   do {                                          \
 157     ASSERT ((x) != 0);                          \
 158     (count) = __builtin_ctzll (x);              \
 159   } while (0)
 160 #else
 161 #define count_trailing_zeros_gcc_ctz(count,x)   \
 162   do {                                          \
 163     ASSERT ((x) != 0);                          \
 164     (count) = __builtin_ctzl (x);               \
 165   } while (0)
 166 #endif
 167
 168
 169 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
 170    don't need to be under !NO_ASM */
 171 #if ! defined (NO_ASM)
 172
  173 #if defined (__alpha) && W_TYPE_SIZE == 64
  174 /* Most alpha-based machines, except Cray systems. */
      /* umul_ppmm (ph, pl, m0, m1) for Alpha.  M0 and M1 are first copied into
         UDItype temporaries, so each argument is evaluated once.  PH is then
         obtained from umulh and PL from the plain C product __m0 * __m1.
         Three spellings of the umulh step are provided:
           - gcc 3.3 or later (per __GMP_GNUC_PREREQ): __builtin_alpha_umulh;
           - older gcc: an asm block containing the umulh instruction;
           - non-gcc compilers: __UMULH from <machine/builtins.h>.  */
  175 #if defined (__GNUC__)
  176 #if __GMP_GNUC_PREREQ (3,3)
  177 #define umul_ppmm(ph, pl, m0, m1) \
  178   do {                                                                  \
  179     UDItype __m0 = (m0), __m1 = (m1);                                   \
  180     (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
  181     (pl) = __m0 * __m1;                                                 \
  182   } while (0)
  183 #else
  184 #define umul_ppmm(ph, pl, m0, m1) \
  185   do {                                                                  \
  186     UDItype __m0 = (m0), __m1 = (m1);                                   \
  187     __asm__ ("umulh %r1,%2,%0"                                          \
  188              : "=r" (ph)                                                \
  189              : "%rJ" (__m0), "rI" (__m1));                              \
  190     (pl) = __m0 * __m1;                                                 \
  191   } while (0)
  192 #endif
  193 #define UMUL_TIME 18
  194 #else /* ! __GNUC__ */
  195 #include <machine/builtins.h>
  196 #define umul_ppmm(ph, pl, m0, m1) \
  197   do {                                                                  \
  198     UDItype __m0 = (m0), __m1 = (m1);                                   \
  199     (ph) = __UMULH (__m0, __m1);                                        \
  200     (pl) = __m0 * __m1;                                                 \
  201   } while (0)
  202 #endif
  203 #ifndef LONGLONG_STANDALONE
      /* udiv_qrnnd (q, r, n1, n0, d): obtain __di from __MPN(invert_limb) (d)
         and pass it, together with all five arguments, to udiv_qrnnd_preinv.
         Neither helper is defined in this part of the file; presumably that
         is why the macro is left out under LONGLONG_STANDALONE.  D appears
         twice in the expansion.  UDIV_NEEDS_NORMALIZATION is set to 1, i.e.
         the most significant bit of D must be set.  */
  204 #define udiv_qrnnd(q, r, n1, n0, d) \
  205   do { UWtype __di;                                                     \
  206     __di = __MPN(invert_limb) (d);                                      \
  207     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  208   } while (0)
  209 #define UDIV_PREINV_ALWAYS  1
  210 #define UDIV_NEEDS_NORMALIZATION 1
  211 #define UDIV_TIME 220
  212 #endif /* LONGLONG_STANDALONE */
 213
  214 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
  215    always goes into libgmp.so, even when not actually used.  */
  216 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
  217
  218 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
      /* With gcc and HAVE_HOST_CPU_alpha_CIX set, count_leading_zeros and
         count_trailing_zeros are each a single asm statement (ctlz / cttz)
         taking X in a register and writing COUNT.  */
  219 #define count_leading_zeros(COUNT,X) \
  220   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
  221 #define count_trailing_zeros(COUNT,X) \
  222   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
  223 #endif /* clz/ctz using cix */
 224
 225 #if ! defined (count_leading_zeros)                             \
 226   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
 227 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
 228    "$31" is written explicitly in the asm, since an "r" constraint won't
 229    select reg 31.  There seems no need to worry about "r31" syntax for cray,
 230    since gcc itself (pre-release 3.4) emits just $31 in various places.  */
 231 #define ALPHA_CMPBGE_0(dst, src)                                        \
 232   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
 233 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
 234    them, locating the highest non-zero byte.  A second __clz_tab lookup
 235    counts the leading zero bits in that byte, giving the result.  */
 236 #define count_leading_zeros(count, x)                                   \
 237   do {                                                                  \
 238     UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
 239     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
 240     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
 241     __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
 242     __clz__x >>= __clz__b;                                              \
 243     __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
 244     __clz__b = 65 - __clz__b;                                           \
 245     (count) = __clz__b - __clz__c;                                      \
 246   } while (0)
 247 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 248 #endif /* clz using cmpbge */
 249
  250 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
      /* Fallback used when count_leading_zeros is still undefined at this
         point: COUNT is assigned the return value of the out-of-line routine
         __MPN(count_leading_zeros), which is declared here (with the const
         attribute when HAVE_ATTRIBUTE_CONST is set) but defined elsewhere.  */
  251 #if HAVE_ATTRIBUTE_CONST
  252 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
  253 #else
  254 long __MPN(count_leading_zeros) (UDItype);
  255 #endif
  256 #define count_leading_zeros(count, x) \
  257   ((count) = __MPN(count_leading_zeros) (x))
  258 #endif /* clz using mpn */
  259 #endif /* __alpha */
 260
 261 #if defined (__AVR) && W_TYPE_SIZE == 8
 262 #define umul_ppmm(ph, pl, m0, m1) \
 263   do {                                                                  \
 264     unsigned short __p = (unsigned short) (m0) * (m1);                  \
 265     (ph) = __p >> 8;                                                    \
 266     (pl) = __p;                                                         \
 267   } while (0)
 268 #endif /* AVR */
 269
  270 #if defined (_CRAY) && W_TYPE_SIZE == 64
      /* Cray section, built on <intrinsics.h>:
           - count_leading_zeros assigns COUNT the result of _leadz on X
             converted to UWtype;
           - on _CRAYIEEE systems umul_ppmm copies M0 and M1 into UDItype
             temporaries, takes PH from _int_mult_upper and PL from the C
             product;
           - on _CRAYIEEE systems, unless LONGLONG_STANDALONE is defined,
             udiv_qrnnd gets __di from __MPN(invert_limb) (d) and forwards to
             udiv_qrnnd_preinv.
         NOTE(review): __MPN(count_leading_zeros) is declared below, but the
         count_leading_zeros macro that follows does not call it.  */
  271 #include <intrinsics.h>
  272 #define UDIV_PREINV_ALWAYS  1
  273 #define UDIV_NEEDS_NORMALIZATION 1
  274 #define UDIV_TIME 220
  275 long __MPN(count_leading_zeros) (UDItype);
  276 #define count_leading_zeros(count, x) \
  277   ((count) = _leadz ((UWtype) (x)))
  278 #if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
  279 #define umul_ppmm(ph, pl, m0, m1) \
  280   do {                                                                  \
  281     UDItype __m0 = (m0), __m1 = (m1);                                   \
  282     (ph) = _int_mult_upper (__m0, __m1);                                \
  283     (pl) = __m0 * __m1;                                                 \
  284   } while (0)
  285 #ifndef LONGLONG_STANDALONE
  286 #define udiv_qrnnd(q, r, n1, n0, d) \
  287   do { UWtype __di;                                                     \
  288     __di = __MPN(invert_limb) (d);                                      \
  289     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  290   } while (0)
  291 #endif /* LONGLONG_STANDALONE */
  292 #endif /* _CRAYIEEE */
  293 #endif /* _CRAY */
 294
  295 #if defined (__ia64) && W_TYPE_SIZE == 64
  296 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
  297    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
  298    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
  299    register, which takes an extra cycle.  */
      /* sub_ddmmss (sh, sl, ah, al, bh, bl): SH gets AH - BH, less one more
         when AL < BL (a borrow out of the low word); SL gets AL - BL.  The low
         difference is held in __x and stored to SL only as the last step,
         after AL and BL have been compared and SH has been written.  AL and BL
         are each evaluated twice.  */
  300 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  301   do {                                          \
  302     UWtype __x;                                 \
  303     __x = (al) - (bl);                          \
  304     if ((al) < (bl))                            \
  305       (sh) = (ah) - (bh) - 1;                   \
  306     else                                        \
  307       (sh) = (ah) - (bh);                       \
  308     (sl) = __x;                                 \
  309   } while (0)
  310 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
  311 /* Do both product parts in assembly, since that gives better code with
  312    all gcc versions.  Some callers will just use the upper part, and in
  313    that situation we waste an instruction, but not any cycles.  */
      /* umul_ppmm (ph, pl, m0, m1): one asm statement holding xma.hu (result
         to PH) and xma.l (result to PL).  All four operands use the "f"
         constraint, and PH is marked early-clobber ("=&f") since it is written
         before the second instruction reads M0 and M1.  */
  314 #define umul_ppmm(ph, pl, m0, m1) \
  315     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
  316              : "=&f" (ph), "=f" (pl)                                    \
  317              : "f" (m0), "f" (m1))
  318 #define UMUL_TIME 14
      /* count_leading_zeros (count, x): two asm steps (mux1 @rev, then czx1.l
         applied to -_y | _y) produce _a, from which the shift (_a - 1) * 8 is
         formed -- presumably the bit offset of the highest non-zero byte.
         After shifting _x right by that amount, the 4/2/1 steps add the
         position of the top set bit within what remains, and COUNT is
         W_TYPE_SIZE - 1 - _c.  X is evaluated once.  */
  319 #define count_leading_zeros(count, x) \
  320   do {                                                                  \
  321     UWtype _x = (x), _y, _a, _c;                                        \
  322     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
  323     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
  324     _c = (_a - 1) << 3;                                                 \
  325     _x >>= _c;                                                          \
  326     if (_x >= 1 << 4)                                                   \
  327       _x >>= 4, _c += 4;                                                \
  328     if (_x >= 1 << 2)                                                   \
  329       _x >>= 2, _c += 2;                                                \
  330     _c += _x >> 1;                                                      \
  331     (count) =  W_TYPE_SIZE - 1 - _c;                                    \
  332   } while (0)
  333 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
  334    based, and we don't need a special case for x==0 here */
      /* (__ctz_x - 1) & ~__ctz_x has exactly the bits below the lowest set bit
         of X set, so the popcnt of that value is the trailing zero count.
         X is evaluated once.  */
  335 #define count_trailing_zeros(count, x)                                  \
  336   do {                                                                  \
  337     UWtype __ctz_x = (x);                                               \
  338     __asm__ ("popcnt %0 = %1"                                           \
  339              : "=r" (count)                                             \
  340              : "r" ((__ctz_x-1) & ~__ctz_x));                           \
  341   } while (0)
  342 #endif
 343 #if defined (__INTEL_COMPILER)
 344 #include <ia64intrin.h>
 345 #define umul_ppmm(ph, pl, m0, m1)                                       \
 346   do {                                                                  \
 347     UWtype __m0 = (m0), __m1 = (m1);                                    \
 348     ph = _m64_xmahu (__m0, __m1, 0);                                    \
 349     pl = __m0 * __m1;                                                   \
 350   } while (0)
 351 #endif
  352 #ifndef LONGLONG_STANDALONE
      /* udiv_qrnnd (q, r, n1, n0, d): obtain __di from __MPN(invert_limb) (d)
         and pass it, together with all five arguments, to udiv_qrnnd_preinv.
         Neither helper is defined in this part of the file; presumably that
         is why the macro is left out under LONGLONG_STANDALONE.  D appears
         twice in the expansion.  UDIV_NEEDS_NORMALIZATION is set to 1, i.e.
         the most significant bit of D must be set.  UDIV_TIME, unlike the
         other two symbols, is defined whether or not LONGLONG_STANDALONE is
         set.  */
  353 #define udiv_qrnnd(q, r, n1, n0, d) \
  354   do { UWtype __di;                                                     \
  355     __di = __MPN(invert_limb) (d);                                      \
  356     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  357   } while (0)
  358 #define UDIV_PREINV_ALWAYS  1
  359 #define UDIV_NEEDS_NORMALIZATION 1
  360 #endif
  361 #define UDIV_TIME 220
  362 #endif
 363
 364
 365 #if defined (__GNUC__)
 366
 367 /* We sometimes need to clobber "cc" with gcc2, but that would not be
 368    understood by gcc1.  Use cpp to avoid major code duplication.  */
 369 #if __GNUC__ < 2
 370 #define __CLOBBER_CC
 371 #define __AND_CLOBBER_CC
 372 #else /* __GNUC__ >= 2 */
 373 #define __CLOBBER_CC : "cc"
 374 #define __AND_CLOBBER_CC , "cc"
 375 #endif /* __GNUC__ < 2 */
 376
  377 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
      /* Am29000 section.
           - add_ssaaaa / sub_ddmmss: an add/addc resp. sub/subc pair; the low
             result SL is early-clobber ("=&r") because it is written before
             the high-word operands are read.
           - umul_ppmm: inputs are copied into USItype temporaries, then
             multiplu produces XL and multmu produces XH in two separate asm
             statements.
           - udiv_qrnnd: a single dividu; R lives in the "q" register class
             and is tied (constraint "1") to the N1 input.
           - count_leading_zeros: the clz instruction; COUNT_LEADING_ZEROS_0
             is 32, presumably the value produced for X == 0.  */
  378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  379   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
  380            : "=r" (sh), "=&r" (sl)                                      \
  381            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
  382 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  383   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
  384            : "=r" (sh), "=&r" (sl)                                      \
  385            : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
  386 #define umul_ppmm(xh, xl, m0, m1) \
  387   do {                                                                  \
  388     USItype __m0 = (m0), __m1 = (m1);                                   \
  389     __asm__ ("multiplu %0,%1,%2"                                        \
  390              : "=r" (xl)                                                \
  391              : "r" (__m0), "r" (__m1));                                 \
  392     __asm__ ("multmu %0,%1,%2"                                          \
  393              : "=r" (xh)                                                \
  394              : "r" (__m0), "r" (__m1));                                 \
  395   } while (0)
  396 #define udiv_qrnnd(q, r, n1, n0, d) \
  397   __asm__ ("dividu %0,%3,%4"                                            \
  398            : "=r" (q), "=q" (r)                                         \
  399            : "1" (n1), "r" (n0), "r" (d))
  400 #define count_leading_zeros(count, x) \
  401     __asm__ ("clz %0,%1"                                                \
  402              : "=r" (count)                                             \
  403              : "r" (x))
  404 #define COUNT_LEADING_ZEROS_0 32
  405 #endif /* __a29k__ */
 406
  407 #if defined (__arc__)
      /* ARC section.  add_ssaaaa is an add.f/adc pair and sub_ddmmss a
         sub.f/sbc pair.  All inputs are cast to USItype.  The low result SL
         is early-clobber ("=&r") because it is written before the high-word
         operands are read.  Only the low addends of add_ssaaaa carry the "%"
         commutative marker.  No "cc" clobber is listed on either asm.  */
  408 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  409   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
  410            : "=r" (sh),                                                 \
  411              "=&r" (sl)                                                 \
  412            : "r"  ((USItype) (ah)),                                     \
  413              "rICal" ((USItype) (bh)),                                  \
  414              "%r" ((USItype) (al)),                                     \
  415              "rICal" ((USItype) (bl)))
  416 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  417   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
  418            : "=r" (sh),                                                 \
  419              "=&r" (sl)                                                 \
  420            : "r" ((USItype) (ah)),                                      \
  421              "rICal" ((USItype) (bh)),                                  \
  422              "r" ((USItype) (al)),                                      \
  423              "rICal" ((USItype) (bl)))
  424 #endif
 425
  426 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
  427     && W_TYPE_SIZE == 32
      /* 32-bit ARM (ARM or Thumb-2 state, not Thumb-1).
         add_ssaaaa is an adds/adc pair with a "cc" clobber.
         sub_ddmmss uses __builtin_constant_p on AL, AH, BL and BH to choose
         between subs and rsbs for the low word and between sbc and rsc for
         the high word.  The reversed forms swap the operand order inside the
         asm template, which lets an operand known to be constant sit in the
         slot constrained "rI" while the other one stays in a register.  The
         final else is the all-variable case, where only BH and BL are given
         the "rI" constraint.  */
  428 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  429   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
  430            : "=r" (sh), "=&r" (sl)                                      \
  431            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
  432 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  433   do {                                                                  \
  434     if (__builtin_constant_p (al))                                      \
  435       {                                                                 \
  436         if (__builtin_constant_p (ah))                                  \
  437           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
  438                    : "=r" (sh), "=&r" (sl)                              \
  439                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
  440         else                                                            \
  441           __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
  442                    : "=r" (sh), "=&r" (sl)                              \
  443                    : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
  444       }                                                                 \
  445     else if (__builtin_constant_p (ah))                                 \
  446       {                                                                 \
  447         if (__builtin_constant_p (bl))                                  \
  448           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
  449                    : "=r" (sh), "=&r" (sl)                              \
  450                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
  451         else                                                            \
  452           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
  453                    : "=r" (sh), "=&r" (sl)                              \
  454                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
  455       }                                                                 \
  456     else if (__builtin_constant_p (bl))                                 \
  457       {                                                                 \
  458         if (__builtin_constant_p (bh))                                  \
  459           __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
  460                    : "=r" (sh), "=&r" (sl)                              \
  461                    : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
  462         else                                                            \
  463           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
  464                    : "=r" (sh), "=&r" (sl)                              \
  465                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
  466       }                                                                 \
  467     else /* only bh might be a constant */                              \
  468       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
  469                : "=r" (sh), "=&r" (sl)                                  \
  470                : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  471     } while (0)
  472 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
  473     || defined (__ARM_ARCH_3__)
      /* ARM architecture 2, 2A and 3.
         umul_ppmm splits A and B into 16-bit halves (lsr #16 / bic), forms the
         four partial products with mul, and folds them into XH:XL with
         adds/addcs/adc; __t0..__t2 are scratch outputs and "cc" is clobbered.
         udiv_qrnnd calls the out-of-line __MPN(udiv_qrnnd), which returns the
         quotient and stores the remainder through its first argument.
         The #else branch below ("ARMv4 or newer") instead uses single
         umull / smull instructions for umul_ppmm / smul_ppmm and, unless
         LONGLONG_STANDALONE is defined, routes udiv_qrnnd through
         __MPN(invert_limb) and udiv_qrnnd_preinv.  */
  474 #define umul_ppmm(xh, xl, a, b)                                         \
  475   do {                                                                  \
  476     register USItype __t0, __t1, __t2;                                  \
  477     __asm__ ("%@ Inlined umul_ppmm\n"                                   \
  478            "    mov     %2, %5, lsr #16\n"                              \
  479            "    mov     %0, %6, lsr #16\n"                              \
  480            "    bic     %3, %5, %2, lsl #16\n"                          \
  481            "    bic     %4, %6, %0, lsl #16\n"                          \
  482            "    mul     %1, %3, %4\n"                                   \
  483            "    mul     %4, %2, %4\n"                                   \
  484            "    mul     %3, %0, %3\n"                                   \
  485            "    mul     %0, %2, %0\n"                                   \
  486            "    adds    %3, %4, %3\n"                                   \
  487            "    addcs   %0, %0, #65536\n"                               \
  488            "    adds    %1, %1, %3, lsl #16\n"                          \
  489            "    adc     %0, %0, %3, lsr #16"                            \
  490            : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),             \
  491              "=&r" (__t0), "=&r" (__t1), "=r" (__t2)                    \
  492            : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);    \
  493   } while (0)
  494 #define UMUL_TIME 20
  495 #define udiv_qrnnd(q, r, n1, n0, d) \
  496   do { UWtype __r;                                                      \
  497     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
  498     (r) = __r;                                                          \
  499   } while (0)
  500 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
  501 #define UDIV_TIME 200
  502 #else /* ARMv4 or newer */
  503 #define umul_ppmm(xh, xl, a, b) \
  504   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
  505 #define UMUL_TIME 5
  506 #define smul_ppmm(xh, xl, a, b) \
  507   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
  508 #ifndef LONGLONG_STANDALONE
  509 #define udiv_qrnnd(q, r, n1, n0, d) \
  510   do { UWtype __di;                                                     \
  511     __di = __MPN(invert_limb) (d);                                      \
  512     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  513   } while (0)
  514 #define UDIV_PREINV_ALWAYS  1
  515 #define UDIV_NEEDS_NORMALIZATION 1
  516 #define UDIV_TIME 70
  517 #endif /* LONGLONG_STANDALONE */
  518 #endif /* defined(__ARM_ARCH_2__) ... */
      /* Bit counts on ARM go through the gcc builtin wrappers
         count_leading_zeros_gcc_clz / count_trailing_zeros_gcc_ctz, which
         ASSERT that X is non-zero.  COUNT_LEADING_ZEROS_0 is 32, presumably
         the value produced for X == 0.  */
  519 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
  520 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
  521 #define COUNT_LEADING_ZEROS_0 32
  522 #endif /* __arm__ */
 523
  524 #if defined (__aarch64__) && W_TYPE_SIZE == 64
      /* AArch64 section.  All add/sub inputs are cast to UDItype and both asm
         statements clobber "cc".
           - add_ssaaaa: an adds/adc pair; only BL may be an "I" immediate.
           - sub_ddmmss: a subs/sbc pair with two constraint alternatives:
             AL in a register with BL "rI", or AL matching "Z" with BL in a
             register.
           - umul_ppmm: inputs are copied into UDItype temporaries; PH comes
             from the umulh instruction and PL from the C product.
           - count_leading_zeros / count_trailing_zeros: the gcc builtin
             wrappers, which ASSERT that X is non-zero.
             COUNT_LEADING_ZEROS_0 is 64, presumably the value produced for
             X == 0.  */
  525 /* FIXME: Extend the immediate range for the low word by using both
  526    ADDS and SUBS, since they set carry in the same way.  */
  527 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  528   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"                     \
  529            : "=r" (sh), "=&r" (sl)                                      \
  530            : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),                \
  531              "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
  532 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  533   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"                     \
  534            : "=r,r" (sh), "=&r,&r" (sl)                                 \
  535            : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),          \
  536              "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC)
  537 #define umul_ppmm(ph, pl, m0, m1) \
  538   do {                                                                  \
  539     UDItype __m0 = (m0), __m1 = (m1);                                   \
  540     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
  541     (pl) = __m0 * __m1;                                                 \
  542   } while (0)
  543 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
  544 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
  545 #define COUNT_LEADING_ZEROS_0 64
  546 #endif /* __aarch64__ */
 547
  548 #if defined (__clipper__) && W_TYPE_SIZE == 32
      /* Clipper section.  The macros are gcc statement expressions.
           - umul_ppmm / smul_ppmm: mulwux / mulwx writes a 64-bit result into
             the __ll member of a union; W1 and W0 are then read back from the
             __h and __l halves of the overlaid struct (declared with __l
             first).  U is tied to the output operand ("%0") and both inputs
             are cast to USItype (SItype for the signed variant).
           - __umulsidi3: the same mulwux, returning the whole UDItype
             product as the value of the statement expression.  */
  549 #define umul_ppmm(w1, w0, u, v) \
  550   ({union {UDItype __ll;                                                \
  551            struct {USItype __l, __h;} __i;                              \
  552           } __x;                                                        \
  553   __asm__ ("mulwux %2,%0"                                               \
  554            : "=r" (__x.__ll)                                            \
  555            : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
  556   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
  557 #define smul_ppmm(w1, w0, u, v) \
  558   ({union {DItype __ll;                                                 \
  559            struct {SItype __l, __h;} __i;                               \
  560           } __x;                                                        \
  561   __asm__ ("mulwx %2,%0"                                                \
  562            : "=r" (__x.__ll)                                            \
  563            : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
  564   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
  565 #define __umulsidi3(u, v) \
  566   ({UDItype __w;                                                        \
  567     __asm__ ("mulwux %2,%0"                                             \
  568              : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
  569     __w; })
  570 #endif /* __clipper__ */
 571
  572 /* Fujitsu vector computers.  */
  573 #if defined (__uxp__) && W_TYPE_SIZE == 32
      /* umul_ppmm / smul_ppmm: mult.lu / mult.l writes a 64-bit result into the
         __ll member of a union; PH and PL are then read back from the __h and
         __l halves of the overlaid struct (declared with __h first).  U and V
         are passed to the asm uncast, and V may also match the "K"
         constraint.
         NOTE(review): smul_ppmm declares the same unsigned union members
         (UDItype / USItype) as umul_ppmm -- confirm that is intended.  */
  574 #define umul_ppmm(ph, pl, u, v) \
  575   do {                                                                  \
  576     union {UDItype __ll;                                                \
  577            struct {USItype __h, __l;} __i;                              \
  578           } __x;                                                        \
  579     __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
  580     (ph) = __x.__i.__h;                                                 \
  581     (pl) = __x.__i.__l;                                                 \
  582   } while (0)
  583 #define smul_ppmm(ph, pl, u, v) \
  584   do {                                                                  \
  585     union {UDItype __ll;                                                \
  586            struct {USItype __h, __l;} __i;                              \
  587           } __x;                                                        \
  588     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
  589     (ph) = __x.__i.__h;                                                 \
  590     (pl) = __x.__i.__l;                                                 \
  591   } while (0)
  592 #endif
 593
 594 #if defined (__gmicro__) && W_TYPE_SIZE == 32
 /* Double-word add.  "add.w" adds bl into sl (preloaded with al), then
    "addx" adds bh into sh (preloaded with ah); addx is presumably the
    add-with-carry form.  sl is earlyclobber because it is written before
    the high-word inputs are consumed.  */
 595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 596   __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
 597            : "=g" (sh), "=&g" (sl)                                      \
 598            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 599              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
 /* Double-word subtract.  "sub.w" subtracts bl from sl (preloaded with al),
    then "subx" subtracts bh from sh (preloaded with ah); subx is presumably
    the subtract-with-borrow form.  Unlike the add, al is not marked
    commutative.  */
 600 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 601   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
 602            : "=g" (sh), "=&g" (sl)                                      \
 603            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 604              "1" ((USItype)(al)), "g" ((USItype)(bl)))
 /* Multiply m0 by m1 with "mulx".  m0 is tied to the ph operand ("%0"), pl
    must be a register; both are outputs of the single instruction.  */
 605 #define umul_ppmm(ph, pl, m0, m1) \
 606   __asm__ ("mulx %3,%0,%1"                                              \
 607            : "=g" (ph), "=r" (pl)                                       \
 608            : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
 /* Divide the double word (nh,nl) by d with "divx".  nl is preloaded into
    the q operand and nh into the r operand; the instruction overwrites
    both with its results.  */
 609 #define udiv_qrnnd(q, r, nh, nl, d) \
 610   __asm__ ("divx %4,%0,%1"                                              \
 611            : "=g" (q), "=r" (r)                                         \
 612            : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
 /* Bit search with "bsch/1" on x; the count operand is preloaded with 0
    through the matching "0" constraint before the instruction runs.  */
 613 #define count_leading_zeros(count, x) \
 614   __asm__ ("bsch/1 %1,%0"                                               \
 615            : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
 616 #endif
 617
 618 #if defined (__hppa) && W_TYPE_SIZE == 32
 /* Double-word add: the first instruction (add, or its immediate form chosen
    by %I5) computes sl from al and bl, then "addc" computes sh from ah and
    bh.  The %r modifier and the "M" constraint let a constant operand be
    emitted in place of a register (see the GCC pa constraint docs).  */
 619 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 620   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
 621            : "=r" (sh), "=&r" (sl)                                      \
 622            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
 /* Double-word subtract: the first instruction (sub, or its immediate form
    chosen by %I4) computes sl from al and bl, then "subb" computes sh from
    ah and bh.  Here it is al, not bl, that may be an immediate ("rI").  */
 623 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 624   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
 625            : "=r" (sh), "=&r" (sl)                                      \
 626            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 627 #if defined (_PA_RISC1_1)
 /* With _PA_RISC1_1 defined, multiply with xmpyu.  All three operands use
    the "f" register class, and the UDItype result is split through the
    union into wh (__h) and wl (__l).  UMUL_TIME and UDIV_TIME are set to
    different values depending on whether this macro is available.  */
 628 #define umul_ppmm(wh, wl, u, v) \
 629   do {                                                                  \
 630     union {UDItype __ll;                                                \
 631            struct {USItype __h, __l;} __i;                              \
 632           } __x;                                                        \
 633     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
 634     (wh) = __x.__i.__h;                                                 \
 635     (wl) = __x.__i.__l;                                                 \
 636   } while (0)
 637 #define UMUL_TIME 8
 638 #define UDIV_TIME 60
 639 #else
 640 #define UMUL_TIME 40
 641 #define UDIV_TIME 80
 642 #endif
 /* Branch-free binary search for the highest set bit.  count starts at 1.
    For field widths 16, 8, 4 and 2 the asm tests the upper field of the
    working copy (__tmp, preloaded with x): if it is zero the width is added
    to count, otherwise the value is shifted down and the add is skipped.
    Finally bit 1 of what remains is extracted and subtracted from count.  */
 643 #define count_leading_zeros(count, x) \
 644   do {                                                                  \
 645     USItype __tmp;                                                      \
 646     __asm__ (                                                           \
 647        "ldi             1,%0\n"                                         \
 648 "       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
 649 "       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
 650 "       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
 651 "       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
 652 "       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
 653 "       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
 654 "       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
 655 "       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
 656 "       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
 657 "       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
 658 "       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
 659 "       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
 660 "       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
 661 "       sub             %0,%1,%0        ; Subtract it.\n"               \
 662         : "=r" (count), "=r" (__tmp) : "1" (x));                        \
 663   } while (0)
 664 #endif /* hppa */
 665
 666 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
 667    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
 668    is just a case of no direct support for 2.0n but treating it like 1.0. */
 669 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
 /* Double-word add for the 64-bit case: same operand layout as the 32-bit
    hppa macro, but the high-word instruction is "add,dc".  */
 670 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 671   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
 672            : "=r" (sh), "=&r" (sl)                                      \
 673            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
 /* Double-word subtract for the 64-bit case: same operand layout as the
    32-bit hppa macro, but the high-word instruction is "sub,db".  */
 674 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 675   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
 676            : "=r" (sh), "=&r" (sl)                                      \
 677            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 678 #endif /* hppa */
 679
 680 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
 681 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
 /* Double-word add with alr (low words) followed by alcr (high words).  sl
    is preloaded with al and sh with ah.  The variant for a compile-time
    constant bl (alfi) is commented out inside the macro body.  */
 682 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 683   do {                                                                  \
 684 /*  if (__builtin_constant_p (bl))                                      \
 685       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                            \
 686                : "=r" (sh), "=&r" (sl)                                  \
 687                : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
 688     else                                                                \
 689 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                              \
 690                : "=r" (sh), "=&r" (sl)                                  \
 691                : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
 692   } while (0)
 /* Double-word subtract with slr (low words) followed by slbr (high words).
    sl is preloaded with al and sh with ah.  The variant for a compile-time
    constant bl (slfi) is commented out inside the macro body.  */
 693 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 694   do {                                                                  \
 695 /*  if (__builtin_constant_p (bl))                                      \
 696       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                            \
 697                : "=r" (sh), "=&r" (sl)                                  \
 698                : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
 699     else                                                                \
 700 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                              \
 701                : "=r" (sh), "=&r" (sl)                                  \
 702                : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
 703   } while (0)
 704 #if __GMP_GNUC_PREREQ (4,5)
 /* Three alternatives for umul_ppmm follow.  When __GMP_GNUC_PREREQ (4,5)
    holds, a plain C 64-bit multiply is used and split through a union.
    Otherwise one of two mlr-based versions applies: the first is disabled
    with "#if 0", the second pins its operands to registers 0 and 1.  */
 705 #define umul_ppmm(xh, xl, m0, m1)                                       \
 706   do {                                                                  \
 707     union {UDItype __ll;                                                \
 708            struct {USItype __h, __l;} __i;                              \
 709           } __x;                                                        \
 710     __x.__ll = (UDItype) (m0) * (UDItype) (m1);                         \
 711     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 712   } while (0)
 713 #else
 714 #if 0
 715 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 716    with a new enough processor pretending we have 32-bit registers.  */
 717 #define umul_ppmm(xh, xl, m0, m1)                                       \
 718   do {                                                                  \
 719     union {UDItype __ll;                                                \
 720            struct {USItype __h, __l;} __i;                              \
 721           } __x;                                                        \
 722     __asm__ ("mlr\t%0,%2"                                               \
 723              : "=r" (__x.__ll)                                          \
 724              : "%0" (m0), "r" (m1));                                    \
 725     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 726   } while (0)
 727 #else
 728 #define umul_ppmm(xh, xl, m0, m1)                                       \
 729   do {                                                                  \
 730   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
 731      DImode for the product, since that would be allocated to a single 64-bit
 732      register, whereas mlr uses the low 32-bits of an even-odd register pair.
 733   */                                                                    \
 734     register USItype __r0 __asm__ ("0");                                \
 735     register USItype __r1 __asm__ ("1") = (m0);                         \
 736     __asm__ ("mlr\t%0,%3"                                               \
 737              : "=r" (__r0), "=r" (__r1)                                 \
 738              : "r" (__r1), "r" (m1));                                   \
 739     (xh) = __r0; (xl) = __r1;                                           \
 740   } while (0)
 741 #endif /* if 0 */
 742 #endif
 743 #if 0
 /* Two dlr-based alternatives for udiv_qrnnd.  The first (disabled) passes
    the dividend as a DImode union.  The active one loads n1 into register 0
    and n0 into register 1, runs dlr, then stores register 1 to q and
    register 0 to r.  */
 744 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 745    with a new enough processor pretending we have 32-bit registers.  */
 746 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 747   do {                                                                  \
 748     union {UDItype __ll;                                                \
 749            struct {USItype __h, __l;} __i;                              \
 750           } __x;                                                        \
 751     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 752     __asm__ ("dlr\t%0,%2"                                               \
 753              : "=r" (__x.__ll)                                          \
 754              : "0" (__x.__ll), "r" (d));                                \
 755     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 756   } while (0)
 757 #else
 758 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 759   do {                                                                  \
 760     register USItype __r0 __asm__ ("0") = (n1);                         \
 761     register USItype __r1 __asm__ ("1") = (n0);                         \
 762     __asm__ ("dlr\t%0,%4"                                               \
 763              : "=r" (__r0), "=r" (__r1)                                 \
 764              : "r" (__r0), "r" (__r1), "r" (d));                        \
 765     (q) = __r1; (r) = __r0;                                             \
 766   } while (0)
 767 #endif /* if 0 */
 768 #else /* if __zarch__ */
 769 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
 /* Multiply with "mr" into a DItype held in a union, then store the first
    struct member (__h) to xh and the second (__l) to xl.  m0 is tied to the
    output operand.  */
 770 #define smul_ppmm(xh, xl, m0, m1)                                       \
 771   do {                                                                  \
 772     union {DItype __ll;                                                 \
 773            struct {USItype __h, __l;} __i;                              \
 774           } __x;                                                        \
 775     __asm__ ("mr\t%0,%2"                                                \
 776              : "=r" (__x.__ll)                                          \
 777              : "%0" (m0), "r" (m1));                                    \
 778     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 779   } while (0)
 780 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
 /* Divide with "dr".  The dividend is built in a union from n1 (__h) and n0
    (__l) and passed in and out as one DItype; afterwards __l is stored to q
    and __h to r.  */
 781 #define sdiv_qrnnd(q, r, n1, n0, d)                                     \
 782   do {                                                                  \
 783     union {DItype __ll;                                                 \
 784            struct {USItype __h, __l;} __i;                              \
 785           } __x;                                                        \
 786     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 787     __asm__ ("dr\t%0,%2"                                                \
 788              : "=r" (__x.__ll)                                          \
 789              : "0" (__x.__ll), "r" (d));                                \
 790     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 791   } while (0)
 792 #endif /* if __zarch__ */
 793 #endif
 794
 795 #if defined (__s390x__) && W_TYPE_SIZE == 64
 796 /* We need to cast operands with register constraints, otherwise their types
 797    will be assumed to be SImode by gcc.  For these machines, such operations
 798    will insert a value into the low 32 bits, and leave the high 32 bits with
 799    garbage.  */
 /* 64-bit double-word add: algr on the low words, then alcgr on the high
    words.  Every input is cast to UDItype, for the reason given in the
    comment preceding this macro.  */
 800 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 801   do {                                                                  \
 802     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                              \
 803                : "=r" (sh), "=&r" (sl)                                  \
 804                : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
 805                  "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
 806   } while (0)
 /* 64-bit double-word subtract: slgr on the low words, then slbgr on the
    high words.  Every input is cast to UDItype.  */
 807 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 808   do {                                                                  \
 809     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                              \
 810              : "=r" (sh), "=&r" (sl)                                    \
 811              : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
 812                "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
 813   } while (0)
 /* Multiply with mlgr into a TImode integer held in a union, then store the
    first UDItype member (__h) to xh and the second (__l) to xl.  */
 814 #define umul_ppmm(xh, xl, m0, m1)                                       \
 815   do {                                                                  \
 816     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 817            struct {UDItype __h, __l;} __i;                              \
 818           } __x;                                                        \
 819     __asm__ ("mlgr\t%0,%2"                                              \
 820              : "=r" (__x.__ll)                                          \
 821              : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
 822     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 823   } while (0)
 /* Divide with dlgr.  The dividend is built in a TImode union from n1 (__h)
    and n0 (__l); after the instruction __l is stored to q and __h to r.  */
 824 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 825   do {                                                                  \
 826     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 827            struct {UDItype __h, __l;} __i;                              \
 828           } __x;                                                        \
 829     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 830     __asm__ ("dlgr\t%0,%2"                                              \
 831              : "=r" (__x.__ll)                                          \
 832              : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
 833     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 834   } while (0)
 835 #if 0 /* FIXME: Enable for z10 (?) */
 /* Disabled.  Would run flogr on x with a TImode union as the output and
    store the union's first UDItype member (__h) to cnt.  */
 836 #define count_leading_zeros(cnt, x)                                     \
 837   do {                                                                  \
 838     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 839            struct {UDItype __h, __l;} __i;                              \
 840           } __clr_cnt;                                                  \
 841     __asm__ ("flogr\t%0,%1"                                             \
 842              : "=r" (__clr_cnt.__ll)                                    \
 843              : "r" (x) __CLOBBER_CC);                                   \
 844     (cnt) = __clr_cnt.__i.__h;                                          \
 845   } while (0)
 846 #endif
 847 #endif
 848
 849 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
 850    so we don't need __CLOBBER_CC.  */
 851 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
 /* Double-word add: addl adds bl into sl (preloaded with al), then adcl adds
    bh plus the carry into sh (preloaded with ah).  The %k modifier forces
    the 32-bit register name.  */
 852 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 853   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
 854            : "=r" (sh), "=&r" (sl)                                      \
 855            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 856              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
 /* Double-word subtract: subl subtracts bl from sl (preloaded with al), then
    sbbl subtracts bh and the borrow from sh (preloaded with ah).  */
 857 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 858   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
 859            : "=r" (sh), "=&r" (sl)                                      \
 860            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 861              "1" ((USItype)(al)), "g" ((USItype)(bl)))
 /* 32x32->64 multiply with mull: u is placed in the "a" register, v is a
    register or memory operand; w0 is read back from "a" and w1 from "d".  */
 862 #define umul_ppmm(w1, w0, u, v) \
 863   __asm__ ("mull %3"                                                    \
 864            : "=a" (w0), "=d" (w1)                                       \
 865            : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
 /* 64/32 divide with divl: n0 is placed in the "a" register and n1 in the
    "d" register, dx is a register or memory operand; q is read back from
    "a" and r from "d".  */
 866 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
 867   __asm__ ("divl %4"                 /* stringification in K&R C */     \
 868            : "=a" (q), "=d" (r)                                         \
 869            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
 870
 871 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
 872 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
 873    significant 1 bit is, hence the use of the following alternatives.  bsfl
 874    is slow too, between 18 and 42 depending where the least significant 1
 875    bit is, so let the generic count_trailing_zeros below make use of the
 876    count_leading_zeros here too.  */
 877
 878 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
 879 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
 880    cache miss reading from __clz_tab.  For P55 it's favoured over the float
 881    below so as to avoid mixing MMX and x87, since the penalty for switching
 882    between the two is about 100 cycles.
 883
 884    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
 885    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
 886    follows, but as of gcc 2.95.2 it results in conditional jumps.
 887
 888        __shift = -(__n < 0x1000000);
 889        __shift -= (__n < 0x10000);
 890        __shift -= (__n < 0x100);
 891
 892    The middle two sbbl and cmpl's pair, and with luck something gcc
 893    generates might pair with the first cmpl and the last sbbl.  The "32+1"
 894    constant could be folded into __clz_tab[], but it doesn't seem worth
 895    making a different table just for that.  */
 896
 /* Table-driven count_leading_zeros.  The cmpl/sbbl sequence leaves 0, -1,
    -2 or -3 in __shift, which is then turned into a right-shift amount of
    25, 17, 9 or 1; the shifted value indexes __clz_tab.  n is evaluated
    once, into __n.  */
 897 #define count_leading_zeros(c,n)                                        \
 898   do {                                                                  \
 899     USItype  __n = (n);                                                 \
 900     USItype  __shift;                                                   \
 901     __asm__ ("cmpl  $0x1000000, %1\n"                                   \
 902              "sbbl  %0, %0\n"                                           \
 903              "cmpl  $0x10000, %1\n"                                     \
 904              "sbbl  $0, %0\n"                                           \
 905              "cmpl  $0x100, %1\n"                                       \
 906              "sbbl  $0, %0\n"                                           \
 907              : "=&r" (__shift) : "r"  (__n));                           \
 908     __shift = __shift*8 + 24 + 1;                                       \
 909     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
 910   } while (0)
 911 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 912 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
 913
 914 #else /* ! pentiummmx || LONGLONG_STANDALONE */
 915 /* The following should be a fixed 14 cycles or so.  Some scheduling
 916    opportunities should be available between the float load/store too.  This
 917    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
 918    apparently suggested by the Intel optimizing manual (don't know exactly
 919    where).  gcc 2.95 or up will be best for this, so the "double" is
 920    correctly aligned on the stack.  */
 /* Float-based count_leading_zeros.  n (asserted non-zero) is converted to
    double; the second 32-bit half of its representation, shifted right by
    20, is subtracted from 0x3FF + 31 and the result is stored to c.  This
    assumes the second unsigned overlays the half of the double that holds
    the exponent.  */
 921 #define count_leading_zeros(c,n)                                        \
 922   do {                                                                  \
 923     union {                                                             \
 924       unsigned  __half[2];                                              \
 925       double    __dbl;                                                  \
 926     } __cvt;                                                            \
 927     ASSERT ((n) != 0);                                                  \
 928     __cvt.__dbl = (UWtype) (n);                                         \
 929     (c) = (0x3FF + 31) - (__cvt.__half[1] >> 20);                       \
 930   } while (0)
 931 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
 932 #endif /* pentiummmx */
 933
 934 #else /* ! pentium */
 935
 936 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
 /* Forward to count_leading_zeros_gcc_clz, which is defined elsewhere.  */
 937 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
 938 #endif /* gcc clz */
 939
 940 /* On P6, gcc prior to 3.0 generates a partial register stall for
 941    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
 942    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
 943    cost of one extra instruction.  Do this for "i386" too, since that means
 944    generic x86.  */
 945 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
 946   && (HAVE_HOST_CPU_i386                                                \
 947       || HAVE_HOST_CPU_i686                                             \
 948       || HAVE_HOST_CPU_pentiumpro                                       \
 949       || HAVE_HOST_CPU_pentium2                                         \
 950       || HAVE_HOST_CPU_pentium3)
 /* bsrl puts the index of the highest set bit of x (asserted non-zero) in
    __cbtmp; count is computed as 31 - __cbtmp rather than with xor, for the
    reason given in the comment above this conditional.  */
 951 #define count_leading_zeros(count, x)                                   \
 952   do {                                                                  \
 953     USItype __cbtmp;                                                    \
 954     ASSERT ((x) != 0);                                                  \
 955     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 956     (count) = 31 - __cbtmp;                                             \
 957   } while (0)
 958 #endif /* gcc<3 asm bsrl */
 959
 960 #ifndef count_leading_zeros
 /* Fallback when no earlier definition applied: bsrl puts the index of the
    highest set bit of x (asserted non-zero) in __cbtmp, and count is that
    index xor 31.  */
 961 #define count_leading_zeros(count, x)                                   \
 962   do {                                                                  \
 963     USItype __cbtmp;                                                    \
 964     ASSERT ((x) != 0);                                                  \
 965     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 966     (count) = __cbtmp ^ 31;                                             \
 967   } while (0)
 968 #endif /* asm bsrl */
 969
 970 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
 /* Forward to count_trailing_zeros_gcc_ctz, which is defined elsewhere.  */
 971 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
 972 #endif /* gcc ctz */
 973
 974 #ifndef count_trailing_zeros
 /* Fallback when no earlier definition applied: bsfl writes the index of the
    lowest set bit of x (asserted non-zero) directly into count; %k0 forces
    the 32-bit register name for the destination.  */
 975 #define count_trailing_zeros(count, x)                                  \
 976   do {                                                                  \
 977     ASSERT ((x) != 0);                                                  \
 978     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
 979   } while (0)
 980 #endif /* asm bsfl */
 981
 982 #endif /* ! pentium */
 983
 984 #ifndef UMUL_TIME
 /* Fallback values for UMUL_TIME and UDIV_TIME, used only when nothing
    earlier has defined them.  */
 985 #define UMUL_TIME 10
 986 #endif
 987 #ifndef UDIV_TIME
 988 #define UDIV_TIME 40
 989 #endif
 990 #endif /* 80x86 */
 991
 992 #if defined (__amd64__) && W_TYPE_SIZE == 64
 /* 64-bit double-word add: addq adds bl into sl (preloaded with al), then
    adcq adds bh plus the carry into sh (preloaded with ah).  The %q modifier
    forces the 64-bit register name.  */
 993 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 994   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
 995            : "=r" (sh), "=&r" (sl)                                      \
 996            : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
 997              "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
 /* 64-bit double-word subtract: subq subtracts bl from sl (preloaded with
    al), then sbbq subtracts bh and the borrow from sh (preloaded with ah).  */
 998 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 999   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
1000            : "=r" (sh), "=&r" (sl)                                      \
1001            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
1002              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
/* 64x64->128 multiply with mulq: u is placed in the "a" register, v is a
   register or memory operand; w0 is read back from "a" and w1 from "d".  */
1003 #define umul_ppmm(w1, w0, u, v) \
1004   __asm__ ("mulq %3"                                                    \
1005            : "=a" (w0), "=d" (w1)                                       \
1006            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
/* 128/64 divide with divq: n0 is placed in the "a" register and n1 in the
   "d" register, dx is a register or memory operand; q is read back from "a"
   and r from "d".  */
1007 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1008   __asm__ ("divq %4"                 /* stringification in K&R C */     \
1009            : "=a" (q), "=d" (r)                                         \
1010            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1011 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
/* bsrq puts the index of the highest set bit of x (asserted non-zero) in
   the 64-bit temporary __cbtmp; count is that index xor 63.  */
1012 #define count_leading_zeros(count, x)                                   \
1013   do {                                                                  \
1014     UDItype __cbtmp;                                                    \
1015     ASSERT ((x) != 0);                                                  \
1016     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
1017     (count) = __cbtmp ^ 63;                                             \
1018   } while (0)
1019 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1020    count is only an int. */
/* bsfq writes the index of the lowest set bit of x (asserted non-zero)
   directly into count, addressed as a 64-bit register through %q0.  */
1021 #define count_trailing_zeros(count, x)                                  \
1022   do {                                                                  \
1023     ASSERT ((x) != 0);                                                  \
1024     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
1025   } while (0)
1026 #endif /* __amd64__ */
1027
1028 #if defined (__i860__) && W_TYPE_SIZE == 32
/* Two-instruction right shift: "shr %3,r0,r0" is issued with the count c and
   a discarded result, then "shrd" combines h and l into r.  Presumably r
   receives the low word of (h,l) shifted right by c -- confirm against the
   i860 manual.  Outputs: r.  Inputs: h, l (registers), c (register or
   immediate).  */
1029 #define rshift_rhlc(r,h,l,c) \
1030   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
1031            : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1032 #endif /* i860 */
1033
1034 #if defined (__i960__) && W_TYPE_SIZE == 32
/* Double-word add: "cmpo 1,0" is issued first (presumably to put the carry
   condition in a known state), then one addc computes sl from al and bl and
   a second addc computes sh from ah and bh.  */
1035 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1036   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
1037            : "=r" (sh), "=&r" (sl)                                      \
1038            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
/* Double-word subtract: "cmpo 0,0" is issued first (presumably to put the
   carry condition in a known state), then one subc computes sl from al and
   bl and a second subc computes sh from ah and bh.  */
1039 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1040   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
1041            : "=r" (sh), "=&r" (sl)                                      \
1042            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
/* Multiply u by v with emul into a UDItype held in a union.  The struct
   lists __l first, so the second member (__h) is stored to w1 and the first
   (__l) to w0.  */
1043 #define umul_ppmm(w1, w0, u, v) \
1044   ({union {UDItype __ll;                                                \
1045            struct {USItype __l, __h;} __i;                              \
1046           } __x;                                                        \
1047   __asm__ ("emul %2,%1,%0"                                              \
1048            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
1049   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Statement expression whose value is the UDItype result of emul applied to
   u and v.  */
1050 #define __umulsidi3(u, v) \
1051   ({UDItype __w;                                                        \
1052     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
1053     __w; })
/* Divide the double word (nh,nl) by d with ediv.  The dividend is assembled
   in __nn (nh into __h, nl into __l) and the instruction's 64-bit result is
   received in __rq; its low half (__l) is stored to r and its high half
   (__h) to q.  Operand order in the template follows the emul macros:
   second input, first input, output.  */
1054 #define udiv_qrnnd(q, r, nh, nl, d) \
1055   do {                                                                  \
1056     union {UDItype __ll;                                                \
1057            struct {USItype __l, __h;} __i;                              \
1058           } __nn, __rq;                                                 \
1059     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
1060     __asm__ ("ediv %2,%1,%0"                                            \
1061            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
1062     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
1063   } while (0)
/* scanbit writes a bit index for x into __cbtmp and count is that value xor
   31.  COUNT_LEADING_ZEROS_0 is deliberately (-32), as the "sic" notes.  */
1064 #define count_leading_zeros(count, x) \
1065   do {                                                                  \
1066     USItype __cbtmp;                                                    \
1067     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
1068     (count) = __cbtmp ^ 31;                                             \
1069   } while (0)
1070 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1071 #if defined (__i960mx)          /* what is the proper symbol to test??? */
1072 #define rshift_rhlc(r,h,l,c) \
1073   do {                                                                  \
1074     union {UDItype __ll;                                                \
1075            struct {USItype __l, __h;} __i;                              \
1076           } __nn;                                                       \
1077     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
1078     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
1079   }
1080 #endif /* i960mx */
1081 #endif /* i960 */
1082
1083 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1084      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1085      || defined (__mc5307__)) && W_TYPE_SIZE == 32
/* Double-word add: add%.l adds bl into sl (preloaded with al), then
   addx%.l adds bh into sh (preloaded with ah).  Both results and bh must be
   in "d" registers; bl may be any general operand.  */
1086 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1087   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
1088            : "=d" (sh), "=&d" (sl)                                      \
1089            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
1090              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Double-word subtract: sub%.l subtracts bl from sl (preloaded with al),
   then subx%.l subtracts bh from sh (preloaded with ah).  */
1091 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1092   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
1093            : "=d" (sh), "=&d" (sl)                                      \
1094            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
1095              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1096 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1097 #if defined (__mc68020__) || defined(mc68020) \
1098      || defined (__mc68030__) || defined (mc68030) \
1099      || defined (__mc68040__) || defined (mc68040) \
1100      || defined (__mcpu32__) || defined (mcpu32) \
1101      || defined (__NeXT__)
/* umul_ppmm (68020 and later): a single `mulu.l` with the register pair
   %1:%0, i.e. W1:W0.  U is tied to the W0 register ("%0", commutative with
   V); V may be a data register, memory or an immediate ("dmi").  */
1102 #define umul_ppmm(w1, w0, u, v) \
1103   __asm__ ("mulu%.l %3,%1:%0"                                           \
1104            : "=d" (w0), "=d" (w1)                                       \
1105            : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1106 #define UMUL_TIME 45
/* udiv_qrnnd (68020 and later): a single `divu.l` on the register pair %1:%0.
   On entry N0 is tied to %0 and N1 to %1; on exit %0 is stored in Q and %1 in
   R.  D (%4) may be a data register, memory or an immediate.  */
1107 #define udiv_qrnnd(q, r, n1, n0, d) \
1108   __asm__ ("divu%.l %4,%1:%0"                                           \
1109            : "=d" (q), "=d" (r)                                         \
1110            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1111 #define UDIV_TIME 90
/* sdiv_qrnnd (68020 and later): same operand layout as the divu.l form, but
   using the signed `divs.l`: N0 enters in %0 and N1 in %1, and %0/%1 are
   stored in Q/R afterwards.  */
1112 #define sdiv_qrnnd(q, r, n1, n0, d) \
1113   __asm__ ("divs%.l %4,%1:%0"                                           \
1114            : "=d" (q), "=d" (r)                                         \
1115            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1116 #else /* for other 68k family members use 16x16->32 multiplication */
/* umul_ppmm for 68k parts without a 32x32->64 multiply.  Builds the product
   of A and B in XH:XL from four 16x16 `mulu.w` partial products.
   Operand map: %0 = xh, %1 = xl, %2 = scratch that starts out holding A
   (tied through "%2"), %3 = scratch, %5 = B.
   The two cross products are summed with `add.l %3,%2`; when that add
   carries, the `jcc 1f` is not taken and 0x10000 is added to the high
   product.  The sum is then split with swap/clr.w into a part added to the
   low word (%1) and a part added, with extend, to the high word (%0).
   %0, %1 and %3 are early-clobber because they are written while inputs are
   still being read.  */
1117 #define umul_ppmm(xh, xl, a, b) \
1118   do { USItype __umul_tmp1, __umul_tmp2;                                \
1119         __asm__ ("| Inlined umul_ppmm\n"                                \
1120 "       move%.l %5,%3\n"                                                \
1121 "       move%.l %2,%0\n"                                                \
1122 "       move%.w %3,%1\n"                                                \
1123 "       swap    %3\n"                                                   \
1124 "       swap    %0\n"                                                   \
1125 "       mulu%.w %2,%1\n"                                                \
1126 "       mulu%.w %3,%0\n"                                                \
1127 "       mulu%.w %2,%3\n"                                                \
1128 "       swap    %2\n"                                                   \
1129 "       mulu%.w %5,%2\n"                                                \
1130 "       add%.l  %3,%2\n"                                                \
1131 "       jcc     1f\n"                                                   \
1132 "       add%.l  %#0x10000,%0\n"                                         \
1133 "1:     move%.l %2,%3\n"                                                \
1134 "       clr%.w  %2\n"                                                   \
1135 "       swap    %2\n"                                                   \
1136 "       swap    %3\n"                                                   \
1137 "       clr%.w  %3\n"                                                   \
1138 "       add%.l  %3,%1\n"                                                \
1139 "       addx%.l %2,%0\n"                                                \
1140 "       | End inlined umul_ppmm"                                        \
1141               : "=&d" (xh), "=&d" (xl),                                 \
1142                 "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
1143               : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
1144   } while (0)
1145 #define UMUL_TIME 100
1146 #define UDIV_TIME 400
1147 #endif /* not mc68020 */
1148 /* The '020, '030, '040 and '060 have bitfield insns.
1149    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1150    exclude bfffo on that chip (bitfield insns not available).  */
1151 #if (defined (__mc68020__) || defined (mc68020)    \
1152      || defined (__mc68030__) || defined (mc68030) \
1153      || defined (__mc68040__) || defined (mc68040) \
1154      || defined (__mc68060__) || defined (mc68060) \
1155      || defined (__NeXT__))                        \
1156   && ! defined (__mcpu32__)
/* count_leading_zeros (68020+ bit-field insn): `bfffo` on X, with both the
   bit-field offset and the width given as the constant operand %2 (0); the
   result goes to COUNT.
   NOTE(review): a width field of 0 presumably encodes the full 32 bits --
   confirm against the M68000 family reference.  */
1157 #define count_leading_zeros(count, x) \
1158   __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
1159            : "=d" (count)                                               \
1160            : "od" ((USItype) (x)), "n" (0))
1161 #define COUNT_LEADING_ZEROS_0 32
1162 #endif
1163 #endif /* mc68000 */
1164
1165 #if defined (__m88000__) && W_TYPE_SIZE == 32
/* add_ssaaaa (m88k): `addu.co` adds AL and BL into SL producing a carry-out,
   then `addu.ci` adds AH and BH into SH consuming that carry.  SL is
   early-clobber because it is written before AH/BH are read.
   NOTE(review): "rJ" with the %r modifier presumably lets a constant zero be
   printed as r0 -- confirm against GCC's m88k port.  */
1166 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1167   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
1168            : "=r" (sh), "=&r" (sl)                                      \
1169            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
/* sub_ddmmss (m88k): `subu.co` computes AL - BL into SL producing a
   carry/borrow-out, then `subu.ci` computes AH - BH into SH consuming it.
   SL is early-clobber because it is written before AH/BH are read.  */
1170 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1171   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
1172            : "=r" (sh), "=&r" (sl)                                      \
1173            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* count_leading_zeros (m88k): runs `ff1` on X into a scratch USItype and
   stores scratch ^ 31 in COUNT.
   NOTE(review): presumably ff1 returns the index of the most significant set
   bit -- confirm against the MC88100 manual.  */
1174 #define count_leading_zeros(count, x) \
1175   do {                                                                  \
1176     USItype __cbtmp;                                                    \
1177     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
1178     (count) = __cbtmp ^ 31;                                             \
1179   } while (0)
1180 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1181 #if defined (__m88110__)
/* umul_ppmm (m88110): `mulu.d` writes the double-word product of U and V
   into a 64-bit union; the halves are then read back through the struct
   view, __h (declared first) into WH and __l into WL.  */
1182 #define umul_ppmm(wh, wl, u, v) \
1183   do {                                                                  \
1184     union {UDItype __ll;                                                \
1185            struct {USItype __h, __l;} __i;                              \
1186           } __x;                                                        \
1187     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1188     (wh) = __x.__i.__h;                                                 \
1189     (wl) = __x.__i.__l;                                                 \
1190   } while (0)
/* udiv_qrnnd (m88110): packs N1 (high) and N0 (low) into a 64-bit union,
   divides it by D with `divu.d`, and takes the low word of the double-word
   result as the quotient Q.  The remainder is reconstructed in C as
   N0 - Q * D.
   The quotient word is reached through the struct view of the union
   (__q.__i.__l); the union itself has no __l member.
   Note that N0 and D are each evaluated twice.  */
1191 #define udiv_qrnnd(q, r, n1, n0, d) \
1192   ({union {UDItype __ll;                                                \
1193            struct {USItype __h, __l;} __i;                              \
1194           } __x, __q;                                                   \
1195   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1196   __asm__ ("divu.d %0,%1,%2"                                            \
1197            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1198   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1199 #define UMUL_TIME 5
1200 #define UDIV_TIME 25
1201 #else
1202 #define UMUL_TIME 17
1203 #define UDIV_TIME 150
1204 #endif /* __m88110__ */
1205 #endif /* __m88000__ */
1206
1207 #if defined (__mips) && W_TYPE_SIZE == 32
1208 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm (32-bit MIPS, plain C form): widens U to UDItype, multiplies by
   V, then assigns the product shifted right by 32 to W1 and the product
   itself (converted to W0's type) to W0.  No inline asm is involved.  */
1209 #define umul_ppmm(w1, w0, u, v) \
1210   do {                                                                  \
1211     UDItype __ll = (UDItype)(u) * (v);                                  \
1212     w1 = __ll >> 32;                                                    \
1213     w0 = __ll;                                                          \
1214   } while (0)
1215 #endif
1216 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
/* umul_ppmm (32-bit MIPS, asm form): a bare `multu`; the "=l" and "=h"
   output constraints bind W0 and W1 to the multiplier result registers.
   NOTE(review): "l"/"h" are presumably GCC's LO/HI register constraints --
   confirm against the GCC MIPS machine-constraint list.  */
1217 #define umul_ppmm(w1, w0, u, v) \
1218   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1219 #endif
1220 #if !defined (umul_ppmm)
/* umul_ppmm (32-bit MIPS, last-resort form): `multu` followed by explicit
   `mflo` into W0 and `mfhi` into W1, using only general "d" registers.  */
1221 #define umul_ppmm(w1, w0, u, v) \
1222   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1223            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1224 #endif
1225 #define UMUL_TIME 10
1226 #define UDIV_TIME 100
1227 #endif /* __mips */
1228
1229 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1230 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm (64-bit MIPS, plain C form): declares a local unsigned type with
   mode(TI), widens U to it, multiplies by V, then assigns the product
   shifted right by 64 to W1 and the product itself (converted to W0's type)
   to W0.  */
1231 #define umul_ppmm(w1, w0, u, v) \
1232   do {                                                                  \
1233     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1234     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1235     w1 = __ll >> 64;                                                    \
1236     w0 = __ll;                                                          \
1237   } while (0)
1238 #endif
1239 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
/* umul_ppmm (64-bit MIPS, asm form): a bare `dmultu` on U and V cast to
   UDItype; the "=l" and "=h" output constraints bind W0 and W1 to the
   multiplier result registers.
   NOTE(review): "l"/"h" are presumably GCC's LO/HI register constraints --
   confirm against the GCC MIPS machine-constraint list.  */
1240 #define umul_ppmm(w1, w0, u, v) \
1241   __asm__ ("dmultu %2,%3"                                               \
1242            : "=l" (w0), "=h" (w1)                                       \
1243            : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1244 #endif
1245 #if !defined (umul_ppmm)
/* umul_ppmm (64-bit MIPS, last-resort form): `dmultu` followed by explicit
   `mflo` into W0 and `mfhi` into W1, using only general "d" registers.  */
1246 #define umul_ppmm(w1, w0, u, v) \
1247   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1248            : "=d" (w0), "=d" (w1)                                       \
1249            : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1250 #endif
1251 #define UMUL_TIME 20
1252 #define UDIV_TIME 140
1253 #endif /* __mips */
1254
1255 #if defined (__mmix__) && W_TYPE_SIZE == 64
/* umul_ppmm (MMIX): one `MULU` instruction; W0 is the named destination
   register and W1 is obtained through the "=z" output constraint.
   NOTE(review): "z" is presumably the special register that receives the
   high half of MULU -- confirm against GCC's MMIX constraint list.  */
1256 #define umul_ppmm(w1, w0, u, v) \
1257   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1258 #endif
1259
1260 #if defined (__ns32000__) && W_TYPE_SIZE == 32
/* umul_ppmm (ns32000): `meid` multiplies V into a 64-bit destination whose
   starting value is U (tied through "%0").  The result is read back through
   the union's struct view, __h into W1 and __l into W0 (__l is declared
   first).  Written as a GNU statement expression.  */
1261 #define umul_ppmm(w1, w0, u, v) \
1262   ({union {UDItype __ll;                                                \
1263            struct {USItype __l, __h;} __i;                              \
1264           } __x;                                                        \
1265   __asm__ ("meid %2,%0"                                                 \
1266            : "=g" (__x.__ll)                                            \
1267            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1268   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* __umulsidi3 (ns32000): same `meid` multiply as umul_ppmm, but the 64-bit
   product is the value of the statement expression (a UDItype) instead of
   being split into two words.  */
1269 #define __umulsidi3(u, v) \
1270   ({UDItype __w;                                                        \
1271     __asm__ ("meid %2,%0"                                               \
1272              : "=g" (__w)                                               \
1273              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1274     __w; })
/* udiv_qrnnd (ns32000): packs N1 (high) and N0 (low) into a 64-bit union,
   runs `deid` with divisor D in place on that value, then reads the
   remainder R from the low word (__l) and the quotient Q from the high word
   (__h) of the same union.  */
1275 #define udiv_qrnnd(q, r, n1, n0, d) \
1276   ({union {UDItype __ll;                                                \
1277            struct {USItype __l, __h;} __i;                              \
1278           } __x;                                                        \
1279   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1280   __asm__ ("deid %2,%0"                                                 \
1281            : "=g" (__x.__ll)                                            \
1282            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1283   (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* count_trailing_zeros (ns32000): `ffsd` scans X with COUNT's register
   preloaded with 0 (the "0" tie on the constant input) and leaves the result
   in COUNT.
   NOTE(review): presumably the preload is the starting bit offset for the
   scan -- confirm against the NS32000 manual.  */
1284 #define count_trailing_zeros(count,x) \
1285   do {                                                                  \
1286     __asm__ ("ffsd      %2,%0"                                          \
1287              : "=r" (count)                                             \
1288              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1289   } while (0)
1290 #endif /* __ns32000__ */
1291
1292 /* In the past we had a block of various #defines tested
1293        _ARCH_PPC    - AIX
1294        _ARCH_PWR    - AIX
1295        __powerpc__  - gcc
1296        __POWERPC__  - BEOS
1297        __ppc__      - Darwin
1298        PPC          - old gcc, GNU/Linux, SysV
1299    The plain PPC test was not good for vxWorks, since PPC is defined on all
1300    CPUs there (eg. m68k too), as a constant one is expected to compare
1301    CPU_FAMILY against.
1302
1303    At any rate, this was pretty unattractive and a bit fragile.  The use of
1304    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1305    getting the desired effect.
1306
1307    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1308    the system vendor compilers.  (Is that vendor compilers with inline asm,
1309    or what?)  */
1310
1311 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1312   && W_TYPE_SIZE == 32
/* add_ssaaaa (32-bit POWER/PowerPC): two-word add.  The low words are added
   with a carry-setting add (the %I modifier selects the immediate form when
   BL is a constant).  The high word is then formed with
     - `addze` when BH is the compile-time constant 0,
     - `addme` when BH is the compile-time constant ~0,
     - `adde` on AH and BH otherwise.
   SL is early-clobber because it is written before AH/BH are read.  */
1313 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1314   do {                                                                  \
1315     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1316       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1317              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1318     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1319       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1320              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1321     else                                                                \
1322       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1323              : "=r" (sh), "=&r" (sl)                                    \
1324              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1325   } while (0)
/* sub_ddmmss (32-bit POWER/PowerPC): two-word subtract.  The low words are
   subtracted with a carry-setting subtract-from (the %I modifier selects the
   immediate form when AL is a constant).  The high word is then formed with
     - `subfze` on BH when AH is the compile-time constant 0,
     - `subfme` on BH when AH is the compile-time constant ~0,
     - `addme`  on AH when BH is the compile-time constant 0,
     - `addze`  on AH when BH is the compile-time constant ~0,
     - `subfe`  on BH and AH otherwise.
   SL is early-clobber because it is written before the high-word inputs are
   read.  */
1326 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1327   do {                                                                  \
1328     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1329       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                      \
1330                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1331     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1332       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                      \
1333                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1334     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1335       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                       \
1336                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1337     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1338       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                       \
1339                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1340     else                                                                \
1341       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                    \
1342                : "=r" (sh), "=&r" (sl)                                  \
1343                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1344   } while (0)
/* count_leading_zeros (32-bit POWER/PowerPC): a single `cntlzw` from X into
   COUNT.  */
1345 #define count_leading_zeros(count, x) \
1346   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1347 #define COUNT_LEADING_ZEROS_0 32
1348 #if HAVE_HOST_CPU_FAMILY_powerpc
1349 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm (32-bit PowerPC, plain C form): widens U to UDItype, multiplies
   by V, then assigns the product shifted right by 32 to W1 and the product
   itself (converted to W0's type) to W0.  No inline asm is involved.  */
1350 #define umul_ppmm(w1, w0, u, v) \
1351   do {                                                                  \
1352     UDItype __ll = (UDItype)(u) * (v);                                  \
1353     w1 = __ll >> 32;                                                    \
1354     w0 = __ll;                                                          \
1355   } while (0)
1356 #endif
1357 #if !defined (umul_ppmm)
/* umul_ppmm (32-bit PowerPC, asm form): M0 and M1 are first copied into
   USItype temporaries so that each argument is evaluated exactly once and is
   converted to the word type.  `mulhwu` then produces the high word PH from
   the temporaries, and the low word PL is the ordinary C product of the same
   temporaries.  "%r" marks the two multiplicands as commutative.  */
1358 #define umul_ppmm(ph, pl, m0, m1) \
1359   do {                                                                  \
1360     USItype __m0 = (m0), __m1 = (m1);                                   \
1361     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1362     (pl) = __m0 * __m1;                                                 \
1363   } while (0)
1364 #endif
1365 #define UMUL_TIME 15
/* smul_ppmm (32-bit PowerPC): signed counterpart of the mulhwu form.  M0 and
   M1 are first copied into SItype temporaries so that each argument is
   evaluated exactly once and is converted to the signed word type.  `mulhw`
   then produces the high word PH from the temporaries, and the low word PL
   is the C product of the same temporaries.  */
1366 #define smul_ppmm(ph, pl, m0, m1) \
1367   do {                                                                  \
1368     SItype __m0 = (m0), __m1 = (m1);                                    \
1369     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1370     (pl) = __m0 * __m1;                                                 \
1371   } while (0)
1372 #define SMUL_TIME 14
1373 #define UDIV_TIME 120
1374 #else
1375 #define UMUL_TIME 8
/* smul_ppmm (POWER, non-PowerPC branch): a single `mul`; XH is the named
   destination register and XL is obtained through the "=q" constraint.
   NOTE(review): "q" is presumably the POWER MQ register -- confirm against
   GCC's rs6000 constraint list.  */
1376 #define smul_ppmm(xh, xl, m0, m1) \
1377   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1378 #define SMUL_TIME 4
/* sdiv_qrnnd (POWER, non-PowerPC branch): a single `div` of NH by D into Q.
   NL is passed in, and R is read back from, the "q"-constrained operand
   (NL is tied to output %1).
   NOTE(review): "q" is presumably the POWER MQ register holding the low
   dividend word before and the remainder after -- confirm.  */
1379 #define sdiv_qrnnd(q, r, nh, nl, d) \
1380   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1381 #define UDIV_TIME 100
1382 #endif
1383 #endif /* 32-bit POWER architecture variants.  */
1384
1385 /* We should test _IBMR2 here when we add assembly support for the system
1386    vendor compilers.  */
1387 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1388 #if !defined (_LONG_LONG_LIMB)
1389 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1390    use adde etc only when not _LONG_LONG_LIMB.  */
/* add_ssaaaa (64-bit PowerPC): two-word add on operands cast to UDItype.
   The low words are added with a carry-setting add (the %I modifier selects
   the immediate form when BL is a constant).  The high word is then formed
   with
     - `addze` when BH is the compile-time constant 0,
     - `addme` when BH is the compile-time constant ~0,
     - `adde` on AH and BH otherwise.
   SL is early-clobber because it is written before AH/BH are read.  */
1391 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1392   do {                                                                  \
1393     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1394       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1395                : "=r" (sh), "=&r" (sl)                                  \
1396                : "r"  ((UDItype)(ah)),                                  \
1397                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1398     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1399       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1400                : "=r" (sh), "=&r" (sl)                                  \
1401                : "r"  ((UDItype)(ah)),                                  \
1402                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1403     else                                                                \
1404       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1405                : "=r" (sh), "=&r" (sl)                                  \
1406                : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),            \
1407                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1408   } while (0)
1409 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1410    This might seem strange, but gcc folds away the dead code late.  */
/* sub_ddmmss (64-bit PowerPC): two-word subtract on operands cast to UDItype.

   Fast path: when BL is a compile-time constant whose negation fits a signed
   16-bit immediate, the low word is computed as AL + (-BL) with `addic`.
   BL == 0 is excluded on purpose: AL + 0 never carries, so addic would leave
   CA clear, whereas a subtraction of 0 never borrows and must leave CA set;
   taking the fast path for 0 would make the high word one too small.
   AL uses a plain "r" constraint in the addic forms because addic needs a
   register as its first source.

   General path: the low word uses a carry-setting subtract-from (the %I
   modifier selects the immediate form when AL is a constant).

   In both paths the high word is formed with
     - `subfze` on BH when AH is the compile-time constant 0,
     - `subfme` on BH when AH is the compile-time constant ~0,
     - `addme`  on AH when BH is the compile-time constant 0,
     - `addze`  on AH when BH is the compile-time constant ~0,
     - `subfe`  on BH and AH otherwise.
   SL is early-clobber because it is written before the high-word inputs are
   read.  */
1411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1412   do {                                                                  \
1413     if (__builtin_constant_p (bl) && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
1414         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1415           __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"                     \
1416                    : "=r" (sh), "=&r" (sl)                              \
1417                    :                       "r" ((UDItype)(bh)),         \
1418                      "r"  ((UDItype)(al)), "*rI" (-((UDItype)(bl))));   \
1419         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1420           __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"                     \
1421                    : "=r" (sh), "=&r" (sl)                              \
1422                    :                       "r" ((UDItype)(bh)),         \
1423                      "r"  ((UDItype)(al)), "*rI" (-((UDItype)(bl))));   \
1424         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1425           __asm__ ("addic %1,%3,%4\n\taddme %0,%2"                      \
1426                    : "=r" (sh), "=&r" (sl)                              \
1427                    : "r"  ((UDItype)(ah)),                              \
1428                      "r"  ((UDItype)(al)), "*rI" (-((UDItype)(bl))));   \
1429         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1430           __asm__ ("addic %1,%3,%4\n\taddze %0,%2"                      \
1431                    : "=r" (sh), "=&r" (sl)                              \
1432                    : "r"  ((UDItype)(ah)),                              \
1433                      "r"  ((UDItype)(al)), "*rI" (-((UDItype)(bl))));   \
1434         else                                                            \
1435           __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"                   \
1436                    : "=r" (sh), "=&r" (sl)                              \
1437                    : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),         \
1438                      "r"  ((UDItype)(al)), "*rI" (-((UDItype)(bl))));   \
1439     } else {                                                            \
1440         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1441           __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                  \
1442                    : "=r" (sh), "=&r" (sl)                              \
1443                    :                       "r" ((UDItype)(bh)),         \
1444                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1445         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1446           __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                  \
1447                    : "=r" (sh), "=&r" (sl)                              \
1448                    :                       "r" ((UDItype)(bh)),         \
1449                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1450         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1451           __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                   \
1452                    : "=r" (sh), "=&r" (sl)                              \
1453                    : "r"  ((UDItype)(ah)),                              \
1454                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1455         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1456           __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                   \
1457                    : "=r" (sh), "=&r" (sl)                              \
1458                    : "r"  ((UDItype)(ah)),                              \
1459                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1460         else                                                            \
1461           __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                \
1462                    : "=r" (sh), "=&r" (sl)                              \
1463                    : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),         \
1464                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1465     }                                                                   \
1466   } while (0)
1467 #endif /* ! _LONG_LONG_LIMB */
/* count_leading_zeros (64-bit PowerPC): a single `cntlzd` from X into
   COUNT.  */
1468 #define count_leading_zeros(count, x) \
1469   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1470 #define COUNT_LEADING_ZEROS_0 64
1471 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
/* umul_ppmm (64-bit PowerPC, plain C form).  Currently compiled out by the
   `#if 0 &&` guard around it, which notes that it results in libcalls.
   Widens U to a local unsigned mode(TI) type, multiplies by V, then assigns
   the product shifted right by 64 to W1 and the product itself (converted to
   W0's type) to W0.  */
1472 #define umul_ppmm(w1, w0, u, v) \
1473   do {                                                                  \
1474     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1475     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1476     w1 = __ll >> 64;                                                    \
1477     w0 = __ll;                                                          \
1478   } while (0)
1479 #endif
1480 #if !defined (umul_ppmm)
/* umul_ppmm (64-bit PowerPC, asm form): M0 and M1 are copied into UDItype
   temporaries (one evaluation each); `mulhdu` produces the high word PH from
   the temporaries and the low word PL is their ordinary C product.  "%r"
   marks the two multiplicands as commutative.  */
1481 #define umul_ppmm(ph, pl, m0, m1) \
1482   do {                                                                  \
1483     UDItype __m0 = (m0), __m1 = (m1);                                   \
1484     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1485     (pl) = __m0 * __m1;                                                 \
1486   } while (0)
1487 #endif
1488 #define UMUL_TIME 15
/* smul_ppmm (64-bit PowerPC): signed counterpart of the mulhdu form.  M0 and
   M1 are copied into DItype temporaries (one evaluation each); `mulhd`
   produces the high word PH and the low word PL is the C product of the
   temporaries.  */
1489 #define smul_ppmm(ph, pl, m0, m1) \
1490   do {                                                                  \
1491     DItype __m0 = (m0), __m1 = (m1);                                    \
1492     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1493     (pl) = __m0 * __m1;                                                 \
1494   } while (0)
1495 #define SMUL_TIME 14  /* ??? */
1496 #define UDIV_TIME 120 /* ??? */
1497 #endif /* 64-bit PowerPC.  */
1498
1499 #if defined (__pyr__) && W_TYPE_SIZE == 32
/* add_ssaaaa (Pyramid): `addw` adds BL (%5) into the register shared by AL
   and SL (tied to output %1), then `addwc` adds BH (%3) into the register
   shared by AH and SH (tied to output %0).  SL is early-clobber because it
   is written before BH is read.  */
1500 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1501   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1502            : "=r" (sh), "=&r" (sl)                                      \
1503            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1504              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss (Pyramid): `subw` subtracts BL (%5) from the register shared by
   AL and SL (tied to output %1), then `subwb` subtracts BH (%3) from the
   register shared by AH and SH (tied to output %0).  SL is early-clobber
   because it is written before BH is read.  */
1505 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1506   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1507            : "=r" (sh), "=&r" (sl)                                      \
1508            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1509              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1510 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
/* umul_ppmm (Pyramid): `movw` copies U into the second register of the
   64-bit operand (%R0), then `uemul` multiplies that operand by V.  The
   result is read back through the union's struct view, __h (declared first)
   into W1 and __l into W0.  The 64-bit output is early-clobber because it is
   written before V is read.  */
1511 #define umul_ppmm(w1, w0, u, v) \
1512   ({union {UDItype __ll;                                                \
1513            struct {USItype __h, __l;} __i;                              \
1514           } __x;                                                        \
1515   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1516            : "=&r" (__x.__ll)                                           \
1517            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1518   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1519 #endif /* __pyr__ */
1520
1521 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* add_ssaaaa (IBM RT/ROMP): `a` adds BL (%5) into the register shared by AL
   and SL (tied to output %1), then `ae` adds BH (%3) into the register
   shared by AH and SH (tied to output %0).  SL is early-clobber because it
   is written before BH is read.  */
1522 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1523   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1524            : "=r" (sh), "=&r" (sl)                                      \
1525            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1526              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
/* sub_ddmmss (IBM RT/ROMP): `s` subtracts BL (%5) from the register shared
   by AL and SL (tied to output %1), then `se` subtracts BH (%3) from the
   register shared by AH and SH (tied to output %0).  SL is early-clobber
   because it is written before BH is read.  */
1527 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1528   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1529            : "=r" (sh), "=&r" (sl)                                      \
1530            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1531              "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* smul_ppmm (IBM RT/ROMP): clears r2 (`s r2,r2`), moves M0 into r10 with
   `mts`, executes sixteen `m r2,%3` multiply-step instructions against M1,
   then copies r2 into PH with `cas` and reads r10 back into PL with `mfs`.
   r2 is listed as clobbered because it is used as the accumulator.
   NOTE(review): r10 is presumably the multiplier-quotient system register
   and each `m` step presumably retires two multiplier bits -- confirm
   against the ROMP reference.  */
1532 #define smul_ppmm(ph, pl, m0, m1) \
1533   __asm__ (                                                             \
1534        "s       r2,r2\n"                                                \
1535 "       mts r10,%2\n"                                                   \
1536 "       m       r2,%3\n"                                                \
1537 "       m       r2,%3\n"                                                \
1538 "       m       r2,%3\n"                                                \
1539 "       m       r2,%3\n"                                                \
1540 "       m       r2,%3\n"                                                \
1541 "       m       r2,%3\n"                                                \
1542 "       m       r2,%3\n"                                                \
1543 "       m       r2,%3\n"                                                \
1544 "       m       r2,%3\n"                                                \
1545 "       m       r2,%3\n"                                                \
1546 "       m       r2,%3\n"                                                \
1547 "       m       r2,%3\n"                                                \
1548 "       m       r2,%3\n"                                                \
1549 "       m       r2,%3\n"                                                \
1550 "       m       r2,%3\n"                                                \
1551 "       m       r2,%3\n"                                                \
1552 "       cas     %0,r2,r0\n"                                             \
1553 "       mfs     r10,%1"                                                 \
1554            : "=r" (ph), "=r" (pl)                                       \
1555            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1556            : "r2")
1557 #define UMUL_TIME 20
1558 #define UDIV_TIME 200
/* count_leading_zeros (IBM RT/ROMP): handles the word as two 16-bit halves.
   If X >= 0x10000, `clz` is applied to X >> 16; otherwise `clz` is applied to
   X itself and 16 is added to the result.
   NOTE(review): this implies clz only examines the low 16 bits -- confirm
   against the ROMP reference.
   NOTE(review): the range test compares X without the USItype cast used in
   the asm operands, so a negative signed X would take the else branch;
   X is also evaluated more than once.  */
1559 #define count_leading_zeros(count, x) \
1560   do {                                                                  \
1561     if ((x) >= 0x10000)                                                 \
1562       __asm__ ("clz     %0,%1"                                          \
1563                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1564     else                                                                \
1565       {                                                                 \
1566         __asm__ ("clz   %0,%1"                                          \
1567                  : "=r" (count) : "r" ((USItype)(x)));                  \
1568         (count) += 16;                                                  \
1569       }                                                                 \
1570   } while (0)
1571 #endif /* RT/ROMP */
1572
1573 #if defined (__riscv64) && W_TYPE_SIZE == 64
/* umul_ppmm (64-bit RISC-V): U and V are copied into UDItype temporaries
   (one evaluation each).  The low word PL is their ordinary C product; the
   high word PH comes from `mulhu`.
   RISC-V assembly lists the destination first (mulhu rd, rs1, rs2), so the
   template names %0 (PH, the only output) first, followed by the two inputs
   %1 and %2.  "%r" marks the multiplicands as commutative.  */
1574 #define umul_ppmm(ph, pl, u, v) \
1575   do {                                                                  \
1576     UDItype __u = (u), __v = (v);                                       \
1577     (pl) = __u * __v;                                                   \
1578     __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v));  \
1579   } while (0)
1580 #endif
1581
1582 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* umul_ppmm (SH2/SH3/SH4): `dmulu.l` multiplies U by V, then `sts macl`
   copies the low result into W0 and `sts mach` copies the high result into
   W1.  Both macl and mach are declared as clobbered.  */
1583 #define umul_ppmm(w1, w0, u, v) \
1584   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1585            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1586 #define UMUL_TIME 5
1587 #endif
1588
1589 #if defined (__sparc__) && W_TYPE_SIZE == 32
/* add_ssaaaa (32-bit SPARC): `addcc` adds AL and BL into SL and sets the
   condition codes, then `addx` adds AH and BH into SH with the carry.
   SL is early-clobber because it is written before AH/BH are read.
   __CLOBBER_CC (defined elsewhere in this file) supplies the clobber list.
   NOTE(review): "rJ" with the %r modifier presumably lets a constant zero be
   printed as %g0 -- confirm against GCC's SPARC port.  */
1590 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1591   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1592            : "=r" (sh), "=&r" (sl)                                      \
1593            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1594            __CLOBBER_CC)
/* sub_ddmmss (32-bit SPARC): `subcc` computes AL - BL into SL and sets the
   condition codes, then `subx` computes AH - BH into SH with the borrow.
   SL is early-clobber because it is written before AH/BH are read.
   __CLOBBER_CC (defined elsewhere in this file) supplies the clobber list.  */
1595 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1596   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1597            : "=r" (sh), "=&r" (sl)                                      \
1598            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1599            __CLOBBER_CC)
1600 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1601    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1602 #if defined (__sparc_v9__) || defined (__sparcv9)
1603 /* Perhaps we should use floating-point operations here?  */
1604 #if 0
1605 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1606    Perhaps we simply need to explicitly zero-extend the inputs?  */
/* umul_ppmm (SPARC v9, 64-bit multiply form).  Currently compiled out by the
   enclosing `#if 0`.  `mulx` puts the product of U and V in %g1; `srl` by 0
   extracts the value stored in W0 and `srlx` by 32 the value stored in W1.
   %g1 is declared as clobbered.  */
1607 #define umul_ppmm(w1, w0, u, v) \
1608   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1609            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1610 #else
1611 /* Use v8 umul until above bug is fixed.  */
/* umul_ppmm (SPARC v9 builds, using the v8 instruction): `umul` writes the
   low word of U * V to W0, then `rd %y` reads the Y register into W1.  */
1612 #define umul_ppmm(w1, w0, u, v) \
1613   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1614 #endif
1615 /* Use a plain v8 divide for v9.  */
/* udiv_qrnnd (SPARC v9 builds, using the v8 divide): N1 is moved into the Y
   register, three nops follow, then `udiv` divides by D with N0 as the
   register operand, leaving the quotient in a scratch USItype.  The
   remainder is reconstructed in C as N0 - quotient * D.
   N0 and D are each evaluated twice.
   NOTE(review): the nops presumably cover the latency of the write to Y --
   confirm against the SPARC v8 manual.  */
1616 #define udiv_qrnnd(q, r, n1, n0, d) \
1617   do {                                                                  \
1618     USItype __q;                                                        \
1619     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1620              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1621     (r) = (n0) - __q * (d);                                             \
1622     (q) = __q;                                                          \
1623   } while (0)
1624 #else
1625 #if defined (__sparc_v8__)   /* gcc normal */                           \
1626   || defined (__sparcv8)     /* gcc solaris */                          \
1627   || HAVE_HOST_CPU_supersparc
1628 /* Don't match immediate range because, 1) it is not often useful,
1629    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1630    while we want to match a 13 bit interval, sign extended to 32 bits,
1631    but INTERPRETED AS UNSIGNED.  */
1632 #define umul_ppmm(w1, w0, u, v) /* the umul result goes to w0; w1 is then read from %y */ \
1633   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1634 #define UMUL_TIME 5
1635
1636 #if HAVE_HOST_CPU_supersparc
1637 #define UDIV_TIME 60            /* SuperSPARC timing */
1638 #else
1639 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1640    dividends and will trap to the kernel for the rest. */
1641 #define udiv_qrnnd(q, r, n1, n0, d) /* q comes from the udiv instruction; r = n0 - q*d */ \
1642   do {                                                                  \
1643     USItype __q;                                                        \
1644     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" /* n1 is moved to %y before udiv n0,d */ \
1645              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1646     (r) = (n0) - __q * (d);     /* remainder recomputed in C */         \
1647     (q) = __q;                                                          \
1648   } while (0)
1649 #define UDIV_TIME 25
1650 #endif /* HAVE_HOST_CPU_supersparc */
1651
1652 #else /* ! __sparc_v8__ */
1653 #if defined (__sparclite__)
1654 /* This has hardware multiply but not divide.  It also has two additional
1655    instructions scan (ffs from high bit) and divscc.  */
1656 #define umul_ppmm(w1, w0, u, v) /* the umul result goes to w0; w1 is then read from %y */ \
1657   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1658 #define UMUL_TIME 5
1659 #define udiv_qrnnd(q, r, n1, n0, d) /* sparclite: n1 goes to %y, then 32 unrolled divscc steps; q = last divscc result, r is read from %y */ \
1660   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1661 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1662 "       tst     %%g0\n"                                                 \
1663 "       divscc  %3,%4,%%g1\n"                                           \
1664 "       divscc  %%g1,%4,%%g1\n"                                         \
1665 "       divscc  %%g1,%4,%%g1\n"                                         \
1666 "       divscc  %%g1,%4,%%g1\n"                                         \
1667 "       divscc  %%g1,%4,%%g1\n"                                         \
1668 "       divscc  %%g1,%4,%%g1\n"                                         \
1669 "       divscc  %%g1,%4,%%g1\n"                                         \
1670 "       divscc  %%g1,%4,%%g1\n"                                         \
1671 "       divscc  %%g1,%4,%%g1\n"                                         \
1672 "       divscc  %%g1,%4,%%g1\n"                                         \
1673 "       divscc  %%g1,%4,%%g1\n"                                         \
1674 "       divscc  %%g1,%4,%%g1\n"                                         \
1675 "       divscc  %%g1,%4,%%g1\n"                                         \
1676 "       divscc  %%g1,%4,%%g1\n"                                         \
1677 "       divscc  %%g1,%4,%%g1\n"                                         \
1678 "       divscc  %%g1,%4,%%g1\n"                                         \
1679 "       divscc  %%g1,%4,%%g1\n"                                         \
1680 "       divscc  %%g1,%4,%%g1\n"                                         \
1681 "       divscc  %%g1,%4,%%g1\n"                                         \
1682 "       divscc  %%g1,%4,%%g1\n"                                         \
1683 "       divscc  %%g1,%4,%%g1\n"                                         \
1684 "       divscc  %%g1,%4,%%g1\n"                                         \
1685 "       divscc  %%g1,%4,%%g1\n"                                         \
1686 "       divscc  %%g1,%4,%%g1\n"                                         \
1687 "       divscc  %%g1,%4,%%g1\n"                                         \
1688 "       divscc  %%g1,%4,%%g1\n"                                         \
1689 "       divscc  %%g1,%4,%%g1\n"                                         \
1690 "       divscc  %%g1,%4,%%g1\n"                                         \
1691 "       divscc  %%g1,%4,%%g1\n"                                         \
1692 "       divscc  %%g1,%4,%%g1\n"                                         \
1693 "       divscc  %%g1,%4,%%g1\n"                                         \
1694 "       divscc  %%g1,%4,%0\n"                                           \
1695 "       rd      %%y,%1\n"                                               \
1696 "       bl,a 1f\n"              /* annulled branch: the add in its delay slot only takes effect when the branch is taken */ \
1697 "       add     %1,%4,%1\n"     /* r += d */                            \
1698 "1:     ! End of inline udiv_qrnnd"                                     \
1699            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1700            : "%g1" __AND_CLOBBER_CC)
1701 #define UDIV_TIME 37
1702 #define count_leading_zeros(count, x) /* one sparclite scan instruction; see the caveat about x == 0 below */ \
1703   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1704 /* Early sparclites return 63 for an argument of 0, but they warn that future
1705    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1706    undefined.  */
1707 #endif /* __sparclite__ */
1708 #endif /* __sparc_v8__ */
1709 #endif /* __sparc_v9__ */
1710 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1711 #ifndef umul_ppmm
1712 #define umul_ppmm(w1, w0, u, v) /* v7: u goes to %y, 32 mulscc steps on v plus one final step with operand 0 */ \
1713   __asm__ ("! Inlined umul_ppmm\n"                                      \
1714 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1715 "       sra     %3,31,%%g2      ! Don't move this insn\n"  /* %g2 = v >> 31 (arithmetic shift) */ \
1716 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"  /* %g2 = u & %g2 */ \
1717 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1718 "       mulscc  %%g1,%3,%%g1\n"                                         \
1719 "       mulscc  %%g1,%3,%%g1\n"                                         \
1720 "       mulscc  %%g1,%3,%%g1\n"                                         \
1721 "       mulscc  %%g1,%3,%%g1\n"                                         \
1722 "       mulscc  %%g1,%3,%%g1\n"                                         \
1723 "       mulscc  %%g1,%3,%%g1\n"                                         \
1724 "       mulscc  %%g1,%3,%%g1\n"                                         \
1725 "       mulscc  %%g1,%3,%%g1\n"                                         \
1726 "       mulscc  %%g1,%3,%%g1\n"                                         \
1727 "       mulscc  %%g1,%3,%%g1\n"                                         \
1728 "       mulscc  %%g1,%3,%%g1\n"                                         \
1729 "       mulscc  %%g1,%3,%%g1\n"                                         \
1730 "       mulscc  %%g1,%3,%%g1\n"                                         \
1731 "       mulscc  %%g1,%3,%%g1\n"                                         \
1732 "       mulscc  %%g1,%3,%%g1\n"                                         \
1733 "       mulscc  %%g1,%3,%%g1\n"                                         \
1734 "       mulscc  %%g1,%3,%%g1\n"                                         \
1735 "       mulscc  %%g1,%3,%%g1\n"                                         \
1736 "       mulscc  %%g1,%3,%%g1\n"                                         \
1737 "       mulscc  %%g1,%3,%%g1\n"                                         \
1738 "       mulscc  %%g1,%3,%%g1\n"                                         \
1739 "       mulscc  %%g1,%3,%%g1\n"                                         \
1740 "       mulscc  %%g1,%3,%%g1\n"                                         \
1741 "       mulscc  %%g1,%3,%%g1\n"                                         \
1742 "       mulscc  %%g1,%3,%%g1\n"                                         \
1743 "       mulscc  %%g1,%3,%%g1\n"                                         \
1744 "       mulscc  %%g1,%3,%%g1\n"                                         \
1745 "       mulscc  %%g1,%3,%%g1\n"                                         \
1746 "       mulscc  %%g1,%3,%%g1\n"                                         \
1747 "       mulscc  %%g1,%3,%%g1\n"                                         \
1748 "       mulscc  %%g1,%3,%%g1\n"                                         \
1749 "       mulscc  %%g1,%3,%%g1\n"                                         \
1750 "       mulscc  %%g1,0,%%g1\n"                                          \
1751 "       add     %%g1,%%g2,%0\n"  /* w1 = %g1 + %g2 */                   \
1752 "       rd      %%y,%1"          /* w0 = %y */                          \
1753            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1754            : "%g1", "%g2" __AND_CLOBBER_CC)
1755 #define UMUL_TIME 39            /* 39 instructions */
1756 #endif
1757 #if !defined (udiv_qrnnd)
1758 #if !defined (LONGLONG_STANDALONE)
1759 #define udiv_qrnnd(q, r, n1, n0, d) /* out-of-line division: quotient returned, remainder stored through the pointer */ \
1760   do { UWtype __udiv_rem;                                               \
1761     (q) = __MPN(udiv_qrnnd) (&__udiv_rem, (n1), (n0), (d));             \
1762     (r) = __udiv_rem;                                                   \
1763   } while (0)
1764 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1765 #if !defined (UDIV_TIME)
1766 #define UDIV_TIME 140
1767 #endif
1768 #endif /* LONGLONG_STANDALONE */
1769 #endif /* udiv_qrnnd */
1770 #endif /* __sparc__ */
1771
1772 #if defined (__sparc__) && W_TYPE_SIZE == 64
1773 #define add_ssaaaa(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) + (bh,bl) on 64-bit words */ \
1774   __asm__ (                                                             \
1775        "addcc   %r4,%5,%1\n"     /* sl = al + bl */                     \
1776       " addccc  %r6,%7,%%g0\n"   /* adds al >> 32 and bl >> 32 with carry; the sum is discarded (%g0), only the flags matter */ \
1777       " addc    %r2,%3,%0"       /* sh = ah + bh + carry */             \
1778        : "=r" (sh), "=&r" (sl)                                          \
1779        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),                   \
1780          "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),                   \
1781          "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)        \
1782            __CLOBBER_CC)
1783 #define sub_ddmmss(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) - (bh,bl) on 64-bit words */ \
1784   __asm__ (                                                             \
1785        "subcc   %r4,%5,%1\n"     /* sl = al - bl */                     \
1786       " subccc  %r6,%7,%%g0\n"   /* subtracts bl >> 32 from al >> 32 with borrow; the result is discarded (%g0), only the flags matter */ \
1787       " subc    %r2,%3,%0"       /* sh = ah - bh - borrow */            \
1788        : "=r" (sh), "=&r" (sl)                                          \
1789        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),                    \
1790          "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),                    \
1791          "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)         \
1792            __CLOBBER_CC)
1793 #if __VIS__ >= 0x300
1794 #undef add_ssaaaa
1795 #define add_ssaaaa(sh, sl, ah, al, bh, bl) /* replaces the three-instruction version above when __VIS__ >= 0x300 */ \
1796   __asm__ (                                                             \
1797        "addcc   %r4, %5, %1\n"   /* sl = al + bl */                     \
1798       " addxc   %r2, %r3, %0"    /* sh = ah + bh, presumably plus the 64-bit carry -- confirm against the ISA manual */ \
1799           : "=r" (sh), "=&r" (sl)                                       \
1800        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),                   \
1801          "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1802 #define umul_ppmm(ph, pl, m0, m1) /* pl from a plain C multiply, ph from the umulxhi instruction */ \
1803   do {                                                                  \
1804     UDItype __m0 = (m0), __m1 = (m1);   /* each argument is evaluated once */ \
1805     (pl) = __m0 * __m1;                                                 \
1806     __asm__ ("umulxhi\t%2, %1, %0"                                      \
1807              : "=r" (ph)                                                \
1808              : "%r" (__m0), "r" (__m1));                                \
1809   } while (0)
1810 #define count_leading_zeros(count, x) /* a single lzd instruction */ \
1811   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1812 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1813 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1814 #endif
1815 #endif
1816
1817 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1818 #define add_ssaaaa(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) + (bh,bl) */ \
1819   __asm__ ("addl2 %5,%1\n\tadwc %3,%0" /* low words first, then high words */ \
1820            : "=g" (sh), "=&g" (sl)                                      \
1821            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)), /* "0": ah is placed in the operand of sh */ \
1822              "%1" ((USItype)(al)), "g" ((USItype)(bl))) /* "%1": al is placed in the operand of sl */
1823 #define sub_ddmmss(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) - (bh,bl) */ \
1824   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" /* low words first, then high words */ \
1825            : "=g" (sh), "=&g" (sl)                                      \
1826            : "0" ((USItype)(ah)), "g" ((USItype)(bh)), /* "0": ah is placed in the operand of sh */ \
1827              "1" ((USItype)(al)), "g" ((USItype)(bl))) /* "1": al is placed in the operand of sl */
1828 #define smul_ppmm(xh, xl, m0, m1) /* emul product of m0 and m1, returned as xh (high) and xl (low) */ \
1829   do {                                                                  \
1830     union {UDItype __ll;        /* the double-word result of emul */    \
1831            struct {USItype __l, __h;} __i; /* the same storage viewed as two words, __l first */ \
1832           } __x;                                                        \
1833     USItype __m0 = (m0), __m1 = (m1);                                   \
1834     __asm__ ("emul %1,%2,$0,%0"                                         \
1835              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1836     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1837   } while (0)
1838 #define sdiv_qrnnd(q, r, n1, n0, d) /* ediv of the double word (n1,n0) by d: q and r are its two outputs */ \
1839   do {                                                                  \
1840     union {DItype __ll;                                                 \
1841            struct {SItype __l, __h;} __i;                               \
1842           } __x;                                                        \
1843     __x.__i.__h = n1; __x.__i.__l = n0; /* assemble the dividend through the union */ \
1844     __asm__ ("ediv %3,%2,%0,%1"                                         \
1845              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1846   } while (0)
1847 #if 0
1848 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1849    8800 maybe).  The macro is therefore compiled out by the #if 0 above. */
1850 #define count_trailing_zeros(count,x)                                   \
1851   do {                                                                  \
1852     __asm__ ("ffs 0, 31, %1, %0"                                        \
1853              : "=g" (count)                                             \
1854              : "g" ((USItype) (x)));                                    \
1855   } while (0)
1856 #endif
1857 #endif /* vax */
1858
1859 #if defined (__z8000__) && W_TYPE_SIZE == 16
1860 #define add_ssaaaa(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) + (bh,bl) */ \
1861   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3" /* low words first, then high words */ \
1862            : "=r" (sh), "=&r" (sl)                                      \
1863            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)), /* "0": ah is placed in the operand of sh */ \
1864              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) /* "%1": al is placed in the operand of sl */
1865 #define sub_ddmmss(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) - (bh,bl) */ \
1866   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3" /* low words first, then high words */ \
1867            : "=r" (sh), "=&r" (sl)                                      \
1868            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), /* "0": ah is placed in the operand of sh */ \
1869              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) /* "1": al is placed in the operand of sl */
1870 #define umul_ppmm(xh, xl, m0, m1) /* mult instruction followed by an adjustment of the high word */ \
1871   do {                                                                  \
1872     union {long int __ll;                                               \
1873            struct {unsigned int __h, __l;} __i;                         \
1874           } __x;                                                        \
1875     unsigned int __m0 = (m0), __m1 = (m1);                              \
1876     __asm__ ("mult      %S0,%H3"                                        \
1877              : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
1878              : "%1" (m0), "rQR" (m1)); /* NOTE(review): uses m0/m1, not the __m0/__m1 copies, so the arguments are expanded twice */ \
1879     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1880     (xh) += ((((signed int) __m0 >> 15) & __m1) /* adds __m1 when the top bit of __m0 is set ... */ \
1881              + (((signed int) __m1 >> 15) & __m0)); /* ... and __m0 when the top bit of __m1 is set */ \
1882   } while (0)
1883 #endif /* __z8000__ */
1884
1885 #endif /* __GNUC__ */
1886
1887 #endif /* NO_ASM */
1888
1889
1890 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1891 #if defined (__umulsidi3) && !defined (umul_ppmm)
1892 #define umul_ppmm(ph, pl, m0, m1) /* split the double-word product of __umulsidi3 into two words */ \
1893   do {                                                                  \
1894     UDWtype __umul_dw = __umulsidi3 (m0, m1);                           \
1895     (ph) = (UWtype) (__umul_dw >> W_TYPE_SIZE); /* upper word */        \
1896     (pl) = (UWtype) __umul_dw;                  /* lower word */        \
1897   } while (0)
1898 #endif
1899
1900 #ifndef __umulsidi3
1901 #define __umulsidi3(u, v) /* double-word product of u and v, built on umul_ppmm (GNU statement expression) */ \
1902   ({UWtype __umulsidi3_hi, __umulsidi3_lo;                              \
1903     umul_ppmm (__umulsidi3_hi, __umulsidi3_lo, u, v);                   \
1904     __umulsidi3_lo | ((UDWtype) __umulsidi3_hi << W_TYPE_SIZE); })
1905 #endif
1906
1907
1908 #ifdef __cplusplus
1909 #define __longlong_h_C "C"      /* turns the extern declarations below into extern "C" */
1910 #else
1911 #define __longlong_h_C          /* plain C: expands to nothing */
1912 #endif
1913
1914 /* When native mpn_umul_ppmm or mpn_udiv_qrnnd functions exist, the macros
1915    below call them.  The "_r" forms take their arguments "reversed", i.e.
1916    with the pointer last, which sometimes allows better parameter passing,
1917    in particular on 64-bit hppa. */
1918
1919 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1920 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1921
1922 #if HAVE_NATIVE_mpn_umul_ppmm && ! defined (umul_ppmm)  \
1923   && ! defined (LONGLONG_STANDALONE)
1924 #define umul_ppmm(wh, wl, u, v) /* wh = return value, wl = word stored through the pointer */ \
1925   do {                                                                  \
1926     UWtype __umul_ppmm_lo;                                              \
1927     (wh) = mpn_umul_ppmm (&__umul_ppmm_lo, (UWtype) (u), (UWtype) (v)); \
1928     (wl) = __umul_ppmm_lo;                                              \
1929   } while (0)
1930 #endif
1931
1932 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1933 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1934
1935 #if HAVE_NATIVE_mpn_umul_ppmm_r && ! defined (umul_ppmm)        \
1936   && ! defined (LONGLONG_STANDALONE)
1937 #define umul_ppmm(wh, wl, u, v) /* as the non-"_r" form, but the result pointer is passed last */ \
1938   do {                                                                  \
1939     UWtype __umul_r_lo;                                                 \
1940     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_r_lo);  \
1941     (wl) = __umul_r_lo;                                                 \
1942   } while (0)
1943 #endif
1944
1945 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1946 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1947
1948 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1949   && ! defined (LONGLONG_STANDALONE)
1950 #define udiv_qrnnd(q, r, n1, n0, d) /* q = return value of mpn_udiv_qrnnd, r = word it stores through the pointer */ \
1951   do { /* every argument is parenthesized before its cast, so an expression argument is converted as a whole */ \
1952     UWtype __udiv_qrnnd_r;                                              \
1953     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,                              \
1954                           (UWtype) (n1), (UWtype) (n0), (UWtype) (d));  \
1955     (r) = __udiv_qrnnd_r;                                               \
1956   } while (0)
1957 #endif
1958
1959 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1960 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1961
1962 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1963   && ! defined (LONGLONG_STANDALONE)
1964 #define udiv_qrnnd(q, r, n1, n0, d) /* q = return value of mpn_udiv_qrnnd_r, r = word it stores through the trailing pointer */ \
1965   do { /* every argument is parenthesized before its cast, so an expression argument is converted as a whole */ \
1966     UWtype __udiv_qrnnd_r;                                              \
1967     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
1968                             &__udiv_qrnnd_r);                           \
1969     (r) = __udiv_qrnnd_r;                                               \
1970   } while (0)
1971 #endif
1972
1973
1974 /* Portable C versions, used when no inline assembler variant was chosen.  */
1975
1976 #ifndef add_ssaaaa
1977 #define add_ssaaaa(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) + (bh,bl) */ \
1978   do {                                                                  \
1979     UWtype __add_lo;                                                    \
1980     __add_lo = (al) + (bl);     /* low word; may wrap around */         \
1981     (sh) = (ah) + (bh) + (__add_lo < (al)); /* a wrapped low word means a carry of 1 */ \
1982     (sl) = __add_lo;                                                    \
1983   } while (0)
1984 #endif
1985
1986 #ifndef sub_ddmmss
1987 #define sub_ddmmss(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) - (bh,bl) */ \
1988   do {                                                                  \
1989     UWtype __sub_lo;                                                    \
1990     __sub_lo = (al) - (bl);     /* low word; may wrap around */         \
1991     (sh) = (ah) - (bh) - ((al) < (bl)); /* borrow of 1 when al < bl */  \
1992     (sl) = __sub_lo;                                                    \
1993   } while (0)
1994 #endif
1995
1996 /* Without a umul_ppmm of our own, build one on top of smul_ppmm when that
1997    exists.  */
1998 #if defined (smul_ppmm) && !defined (umul_ppmm)
1999 #define umul_ppmm(w1, w0, u, v) /* high word of smul_ppmm plus one correction term per operand */ \
2000   do {                                                                  \
2001     UWtype __umul_hi;                                                   \
2002     UWtype __umul_u = (u), __umul_v = (v);                              \
2003     smul_ppmm (__umul_hi, w0, __umul_u, __umul_v);                      \
2004     (w1) = __umul_hi + (-(__umul_u >> (W_TYPE_SIZE - 1)) & __umul_v) /* adds v when the top bit of u is set */ \
2005                      + (-(__umul_v >> (W_TYPE_SIZE - 1)) & __umul_u); /* adds u when the top bit of v is set */ \
2006   } while (0)
2007 #endif
2008
2009 /* Last resort: umul_ppmm in plain C, put together from four half-word
2010    products.
2011    For reference, when this code is used for squaring (ie. u and v identical
2012    expressions), gcc recognises that the two cross products are the same and
2013    generates 3 multiplies, not 4.  The subsequent additions could be
2014    optimized a bit, but the only place GMP currently uses such a square is
2015    mpn_sqr_basecase, and chips obliged to use this generic C umul will have
2016    plenty of worse performance problems than a couple of extra instructions
2017    on the diagonal of sqr_basecase.  */
2018
2019 #ifndef umul_ppmm
2020 #define umul_ppmm(w1, w0, u, v)                                         \
2021   do {                                                                  \
2022     UWtype __umul_u = (u), __umul_v = (v);                              \
2023     UHWtype __umul_ul, __umul_uh, __umul_vl, __umul_vh;                 \
2024     UWtype __umul_ll, __umul_lh, __umul_hl, __umul_hh;                  \
2025     /* halves of both operands */                                       \
2026     __umul_ul = __ll_lowpart (__umul_u);                                \
2027     __umul_uh = __ll_highpart (__umul_u);                               \
2028     __umul_vl = __ll_lowpart (__umul_v);                                \
2029     __umul_vh = __ll_highpart (__umul_v);                               \
2030     /* the four partial products */                                     \
2031     __umul_ll = (UWtype) __umul_ul * __umul_vl;                         \
2032     __umul_lh = (UWtype) __umul_ul * __umul_vh;                         \
2033     __umul_hl = (UWtype) __umul_uh * __umul_vl;                         \
2034     __umul_hh = (UWtype) __umul_uh * __umul_vh;                         \
2035     /* sum the middle column */                                         \
2036     __umul_lh += __ll_highpart (__umul_ll); /* never carries */         \
2037     __umul_lh += __umul_hl;                 /* this one may carry */    \
2038     if (__umul_lh < __umul_hl)              /* wrapped, so it did */    \
2039       __umul_hh += __ll_B;                  /* carry goes to the upper half */ \
2040     /* assemble the two result words */                                 \
2041     (w1) = __umul_hh + __ll_highpart (__umul_lh);                       \
2042     (w0) = (__umul_lh << W_TYPE_SIZE/2) + __ll_lowpart (__umul_ll);     \
2043   } while (0)
2044 #endif
2045
2046 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2047    exist in one form or another).  */
2048 #ifndef smul_ppmm
2049 #define smul_ppmm(w1, w0, u, v) /* high word of umul_ppmm minus one correction term per operand */ \
2050   do {                                                                  \
2051     UWtype __smul_hi;                                                   \
2052     UWtype __smul_u = (u), __smul_v = (v);                              \
2053     umul_ppmm (__smul_hi, w0, __smul_u, __smul_v);                      \
2054     (w1) = __smul_hi - (-(__smul_u >> (W_TYPE_SIZE - 1)) & __smul_v) /* subtracts v when the top bit of u is set */ \
2055                      - (-(__smul_v >> (W_TYPE_SIZE - 1)) & __smul_u); /* subtracts u when the top bit of v is set */ \
2056   } while (0)
2057 #endif
2058
2059 /* Define this unconditionally, so it can be used for debugging.  */
2060 #define __udiv_qrnnd_c(q, r, n1, n0, d) /* plain C division of the two-word value (n1,n0) by d; the arguments are expanded several times */ \
2061   do {                                                                  \
2062     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
2063     /* preconditions: d is nonzero and n1 < d */                        \
2064     ASSERT ((d) != 0);                                                  \
2065     ASSERT ((n1) < (d));                                                \
2066     /* half-words of d; __d1 is the divisor of both steps below */      \
2067     __d1 = __ll_highpart (d);                                           \
2068     __d0 = __ll_lowpart (d);                                            \
2069     /* upper quotient half: estimate from __d1, then fix up with __d0 */\
2070     __q1 = (n1) / __d1;                                                 \
2071     __r1 = (n1) - __q1 * __d1;                                          \
2072     __m = __q1 * __d0;                                                  \
2073     __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
2074     if (__r1 < __m)                                                     \
2075       {                                                                 \
2076         __q1--, __r1 += (d);                                            \
2077         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2078           if (__r1 < __m)                                               \
2079             __q1--, __r1 += (d);                                        \
2080       }                                                                 \
2081     __r1 -= __m;                                                        \
2082     /* lower quotient half: same steps on __r1 and the low half of n0 */\
2083     __q0 = __r1 / __d1;                                                 \
2084     __r0 = __r1  - __q0 * __d1;                                         \
2085     __m = __q0 * __d0;                                                  \
2086     __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
2087     if (__r0 < __m)                                                     \
2088       {                                                                 \
2089         __q0--, __r0 += (d);                                            \
2090         if (__r0 >= (d))                                                \
2091           if (__r0 < __m)                                               \
2092             __q0--, __r0 += (d);                                        \
2093       }                                                                 \
2094     __r0 -= __m;                                                        \
2095     /* join the quotient halves; __r0 is the remainder */               \
2096     (q) = __q1 * __ll_B | __q0;                                         \
2097     (r) = __r0;                                                         \
2098   } while (0)
2099
2100 /* A processor that offers sdiv_qrnnd but no udiv_qrnnd gets its unsigned
2101    division from the udiv_w_sdiv helper function declared below.  */
2102 #if defined (sdiv_qrnnd) && !defined (udiv_qrnnd)
2103 #define udiv_qrnnd(q, r, nh, nl, d) /* quotient returned, remainder stored through the pointer */ \
2104   do {                                                                  \
2105     UWtype __udiv_w_sdiv_rem;                                           \
2106     (q) = __MPN(udiv_w_sdiv) (&__udiv_w_sdiv_rem, nh, nl, d);           \
2107     (r) = __udiv_w_sdiv_rem;                                            \
2108   } while (0)
2109 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2110 #endif
2111
2112 /* Nothing above supplied udiv_qrnnd: fall back to the C macro.  */
2113 #ifndef udiv_qrnnd
2114 #define udiv_qrnnd __udiv_qrnnd_c
2115 #define UDIV_NEEDS_NORMALIZATION 1
2116 #endif
2117
2118 #ifndef count_leading_zeros
2119 #define count_leading_zeros(count, x) /* plain C: pick a shift for x, then finish with a __clz_tab lookup */ \
2120   do {                                                                  \
2121     UWtype __clz_in = (x);                                              \
2122     UWtype __clz_sh;                                                    \
2123     /* choose __clz_sh from the size of __clz_in */                     \
2124     if (W_TYPE_SIZE == 32)                                              \
2125       {                                                                 \
2126         if (__clz_in < ((UWtype) 1 << __BITS4)) __clz_sh = 1;           \
2127         else if (__clz_in < ((UWtype) 1 << 2*__BITS4)) __clz_sh = __BITS4 + 1; \
2128         else if (__clz_in < ((UWtype) 1 << 3*__BITS4)) __clz_sh = 2*__BITS4 + 1; \
2129         else __clz_sh = 3*__BITS4 + 1;                                  \
2130       }                                                                 \
2131     else                                                                \
2132       {                                                                 \
2133         __clz_sh = W_TYPE_SIZE - 8;                                     \
2134         while (__clz_sh > 0 && ((__clz_in >> __clz_sh) & 0xff) == 0)    \
2135           __clz_sh -= 8;        /* step down one byte at a time */      \
2136         ++__clz_sh;                                                     \
2137       }                                                                 \
2138     /* the table lookup finishes the count */                           \
2139     (count) = W_TYPE_SIZE + 1 - __clz_sh - __clz_tab[__clz_in >> __clz_sh]; \
2140   } while (0)
2141 /* Unlike some asm versions, this one is well defined for x == 0. */
2142 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2143 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2144 #define COUNT_LEADING_ZEROS_SLOW
2145 #endif
2146
2147 /* __clz_tab is needed by mpn/x86/pentium/mod_1.asm in a fat binary, so request it in that configuration too.  */
2148 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2149 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2150 #endif
2151
2152 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB  /* e.g. set by the C count_leading_zeros above */
2153 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];  /* declaration only; valid indices are 0..128 */
2154 #endif
2155
2156 #if !defined (count_trailing_zeros)
2157 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2158 /* Define count_trailing_zeros on top of a count_leading_zeros that is not the slow C one (typically asm); x must be non-zero.  */
2159 #define count_trailing_zeros(count, x)                                  \
2160   do {                                                                  \
2161     UWtype __ctz_x = (x);  /* single evaluation of x */                 \
2162     UWtype __ctz_c;  /* leading-zero count of the isolated bit */       \
2163     ASSERT (__ctz_x != 0);  /* zero has no set bit to isolate */        \
2164     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); /* x & -x keeps only the lowest set bit */ \
2165     (count) = W_TYPE_SIZE - 1 - __ctz_c;  /* index from the low end */  \
2166   } while (0)
2167 #else
2168 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2169    __clz_tab is used without further ado, since the C count_leading_zeros
2170    above will have pulled in its declaration.  */
2171 #define count_trailing_zeros(count, x)                                  \
2172   do {                                                                  \
2173     UWtype __ctz_x = (x);  /* evaluated once, then shifted down */      \
2174     int __ctz_c;  /* bit offset of the byte in view, less 2 */          \
2175     /* Fast path: the lowest set bit is in the low byte (index <= 128). */ \
2176     if (LIKELY ((__ctz_x & 0xff) != 0))                                 \
2177       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;                      \
2178     else                                                                \
2179       { /* Skip zero low bytes; the counter starts with the -2 built in. */ \
2180         for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)  \
2181           {                                                             \
2182             __ctz_x >>= 8;  /* discard a zero byte */                   \
2183             if (LIKELY ((__ctz_x & 0xff) != 0))                         \
2184               break;  /* non-zero byte reached */                       \
2185           }                                                             \
2186         /* x == 0 is not rejected: the loop runs out, entry 0 is read. */ \
2187         (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];              \
2188       }                                                                 \
2189   } while (0)
2190 #endif
2191 #endif
2192
2193 #ifndef UDIV_NEEDS_NORMALIZATION
2194 #define UDIV_NEEDS_NORMALIZATION 0  /* default when nothing defined it earlier */
2195 #endif
2196
2197 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
2198    which case the latter should always be used.  Defaults to 0.  */
2199 #ifndef UDIV_PREINV_ALWAYS
2200 #define UDIV_PREINV_ALWAYS 0
2201 #endif
2202
2203 /* Give defaults for UMUL_TIME and UDIV_TIME when they are not already set.  */
2204 #ifndef UMUL_TIME
2205 #define UMUL_TIME 1
2206 #endif
2207
2208 #ifndef UDIV_TIME
2209 #define UDIV_TIME UMUL_TIME  /* division defaults to the multiply figure */
2210 #endif