src/longlong.h

   1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
   2
   3 Copyright 1991-2015 Free Software Foundation, Inc.
   4
   5 This file is free software; you can redistribute it and/or modify it under the
   6 terms of the GNU Lesser General Public License as published by the Free
   7 Software Foundation; either version 3 of the License, or (at your option) any
   8 later version.
   9
  10 This file is distributed in the hope that it will be useful, but WITHOUT ANY
  11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  12 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
  13 details.
  14
  15 You should have received a copy of the GNU Lesser General Public License
  16 along with this file.  If not, see http://www.gnu.org/licenses/.  */
  17
  18 /* You have to define the following before including this file:
  19
  20    UWtype -- An unsigned type, default type for operations (typically a "word")
  21    UHWtype -- An unsigned type, at least half the size of UWtype
  22    UDWtype -- An unsigned type, at least twice as large as UWtype
  23    W_TYPE_SIZE -- size in bits of UWtype
  24
  25    SItype, USItype -- Signed and unsigned 32 bit types
  26    DItype, UDItype -- Signed and unsigned 64 bit types
  27
  28    On a 32 bit machine UWtype should typically be USItype;
  29    on a 64 bit machine, UWtype should typically be UDItype.
  30
  31    Optionally, define:
  32
  33    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
  34    NO_ASM -- Disable inline asm
  35
  36
  37    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
  38    need to include gmp.h and gmp-impl.h, or certain things might not work as
  39    expected.
  40 */
  41
  42 #define __BITS4 (W_TYPE_SIZE / 4)  /* a quarter of the word width, in bits */
  43 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))  /* 2^(W_TYPE_SIZE/2), i.e. the half-word base */
  44 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))  /* t with everything above the low half-word masked off */
  45 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))  /* t shifted down by half a word */
  46
  47 /* This is used to make sure no undesirable sharing between different libraries
  48    that use this file takes place.  */
  49 #ifndef __MPN  /* a definition made before this point is kept as is */
  50 #define __MPN(x) __##x  /* default: token-paste "__" in front of x */
  51 #endif
  52
  53 /* Define auxiliary asm macros.
  54
  55    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
  56    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
  57    word product in HIGH_PROD and LOW_PROD.
  58
  59    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
  60    UDWtype product.  This is just a variant of umul_ppmm.
  61
  62    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  63    denominator) divides a UDWtype, composed by the UWtype integers
  64    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
  65    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
  66    than DENOMINATOR for correct operation.  If, in addition, the most
  67    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
  68    UDIV_NEEDS_NORMALIZATION is defined to 1.
  69
  70    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  71    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
  72    is rounded towards 0.
  73
  74    5) count_leading_zeros(count, x) counts the number of zero-bits from the
  75    msb to the first non-zero bit in the UWtype X.  This is the number of
  76    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
  77    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
  78
  79    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
  80    from the least significant end.
  81
  82    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
  83    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
  84    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
  85    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
  86    (i.e. carry out) is not stored anywhere, and is lost.
  87
  88    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
  89    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
  90    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
  91    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
  92    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
  93    and is lost.
  94
  95    If any of these macros are left undefined for a particular CPU,
  96    C macros are used.
  97
  98
  99    Notes:
 100
 101    For add_ssaaaa the two high and two low addends can both commute, but
 102    unfortunately gcc only supports one "%" commutative in each asm block.
 103    This has always been so but is only documented in recent versions
 104    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
 105    compiler error in certain rare circumstances.
 106
 107    Apparently it was only the last "%" that was ever actually respected, so
 108    the code has been updated to leave just that.  Clearly there's a free
 109    choice whether high or low should get it, if there's a reason to favour
 110    one over the other.  Also obviously when the constraints on the two
 111    operands are identical there's no benefit to the reloader in any "%" at
 112    all.
 113
 114    */
 115
 116 /* The CPUs come in alphabetical order below.
 117
 118    Please add support for more CPUs here, or improve the current support
 119    for the CPUs below!  */
 120
 121
 122 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
 123    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
 124    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
 125    __builtin_ctzll.
 126
 127    These builtins are only used where we have checked what code comes out;
 128    on some chips they're merely libgcc calls, and in that case we instead
 129    want an inline (either asm or generic C).
 130
 131    These builtins are better than an asm block of the same insn, since an
 132    asm block doesn't give gcc any information about scheduling or resource
 133    usage.  We keep an asm block for use on prior versions of gcc though.
 134
 135    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
 136    it's not used (for count_leading_zeros) because it generally gives extra
 137    code to ensure the result is 0 when the input is 0, which we don't need
 138    or want.  */
 139
 140 #ifdef _LONG_LONG_LIMB  /* this branch uses the "ll" builtin variant */
 141 #define count_leading_zeros_gcc_clz(count,x)    \
 142   do {                                          \
 143     ASSERT ((x) != 0);  /* x is expanded twice: here and in the builtin call */ \
 144     (count) = __builtin_clzll (x);              \
 145   } while (0)
 146 #else  /* _LONG_LONG_LIMB not defined: the "l" builtin variant */
 147 #define count_leading_zeros_gcc_clz(count,x)    \
 148   do {                                          \
 149     ASSERT ((x) != 0);  /* x is expanded twice: here and in the builtin call */ \
 150     (count) = __builtin_clzl (x);               \
 151   } while (0)
 152 #endif
 153
 154 #ifdef _LONG_LONG_LIMB  /* this branch uses the "ll" builtin variant */
 155 #define count_trailing_zeros_gcc_ctz(count,x)   \
 156   do {                                          \
 157     ASSERT ((x) != 0);  /* x is expanded twice: here and in the builtin call */ \
 158     (count) = __builtin_ctzll (x);              \
 159   } while (0)
 160 #else  /* _LONG_LONG_LIMB not defined: the "l" builtin variant */
 161 #define count_trailing_zeros_gcc_ctz(count,x)   \
 162   do {                                          \
 163     ASSERT ((x) != 0);  /* x is expanded twice: here and in the builtin call */ \
 164     (count) = __builtin_ctzl (x);               \
 165   } while (0)
 166 #endif
 167
 168
 169 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
 170    don't need to be under !NO_ASM */
 171 #if ! defined (NO_ASM)
 172
 173 #if defined (__alpha) && W_TYPE_SIZE == 64
 174 /* Most alpha-based machines, except Cray systems. */
 175 #if defined (__GNUC__)
 176 #if __GMP_GNUC_PREREQ (3,3)  /* this branch gets the high word from a gcc builtin */
 177 #define umul_ppmm(ph, pl, m0, m1) \
 178   do {                                                                  \
 179     UDItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 180     (ph) = __builtin_alpha_umulh (__m0, __m1);  /* high product word */ \
 181     (pl) = __m0 * __m1;          /* low word: product truncated to UDItype */ \
 182   } while (0)
 183 #else  /* otherwise the umulh instruction is issued through inline asm */
 184 #define umul_ppmm(ph, pl, m0, m1) \
 185   do {                                                                  \
 186     UDItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 187     __asm__ ("umulh %r1,%2,%0"         /* %0 = ph, %1 = __m0, %2 = __m1 */ \
 188              : "=r" (ph)                                                \
 189              : "%rJ" (__m0), "rI" (__m1));                              \
 190     (pl) = __m0 * __m1;          /* low word: product truncated to UDItype */ \
 191   } while (0)
 192 #endif
 193 #define UMUL_TIME 18
 194 #else /* ! __GNUC__ */
 195 #include <machine/builtins.h>  /* presumably where __UMULH is declared -- confirm */
 196 #define umul_ppmm(ph, pl, m0, m1) \
 197   do {                                                                  \
 198     UDItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 199     (ph) = __UMULH (__m0, __m1);       /* high product word */          \
 200     (pl) = __m0 * __m1;          /* low word: product truncated to UDItype */ \
 201   } while (0)
 202 #endif
 203 #ifndef LONGLONG_STANDALONE  /* the macro below needs helpers that are not defined in this part of the file */
 204 #define udiv_qrnnd(q, r, n1, n0, d) \
 205   do { UWtype __di;                                                     \
 206     __di = __MPN(invert_limb) (d);     /* d is expanded here ...  */    \
 207     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);  /* ... and again here */ \
 208   } while (0)
 209 #define UDIV_PREINV_ALWAYS  1
 210 #define UDIV_NEEDS_NORMALIZATION 1  /* see the udiv_qrnnd description near the top: msb of d must be 1 */
 211 #define UDIV_TIME 220
 212 #endif /* LONGLONG_STANDALONE */
 213
 214 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
 215    always goes into libgmp.so, even when not actually used.  */
 216 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 217
 218 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
 219 #define count_leading_zeros(COUNT,X) \
 220   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))  /* one ctlz instruction: X in (%1), COUNT out (%0) */
 221 #define count_trailing_zeros(COUNT,X) \
 222   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))  /* one cttz instruction: X in (%1), COUNT out (%0) */
 223 #endif /* clz/ctz using cix */
 224
 225 #if ! defined (count_leading_zeros)                             \
 226   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
 227 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
 228    "$31" is written explicitly in the asm, since an "r" constraint won't
 229    select reg 31.  There seems no need to worry about "r31" syntax for cray,
 230    since gcc itself (pre-release 3.4) emits just $31 in various places.  */
 231 #define ALPHA_CMPBGE_0(dst, src)  /* __asm__, not plain asm: also accepted in gcc's strict ISO modes */ \
 232   do { __asm__ ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
 233 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
 234    them, locating the highest non-zero byte.  A second __clz_tab lookup
 235    counts the leading zero bits in that byte, giving the result.  */
 236 #define count_leading_zeros(count, x)                                   \
 237   do {                                                                  \
 238     UWtype  __clz__b, __clz__c, __clz__x = (x);  /* x is read once */   \
 239     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
 240     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
 241     __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
 242     __clz__x >>= __clz__b;        /* one bit less than a whole-byte shift */ \
 243     __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
 244     __clz__b = 65 - __clz__b;                                           \
 245     (count) = __clz__b - __clz__c;        /* 65 - shift - bit number */ \
 246   } while (0)
 247 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB  /* same empty definition as earlier in this section */
 248 #endif /* clz using cmpbge */
 249
 250 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)  /* nothing above defined it: call an external routine */
 251 #if HAVE_ATTRIBUTE_CONST
 252 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
 253 #else
 254 long __MPN(count_leading_zeros) (UDItype);  /* same prototype without the attribute */
 255 #endif
 256 #define count_leading_zeros(count, x) \
 257   ((count) = __MPN(count_leading_zeros) (x))  /* an expression, not a do/while statement */
 258 #endif /* clz using mpn */
 259 #endif /* __alpha */
 260
 261 #if defined (__AVR) && W_TYPE_SIZE == 8
 262 #define umul_ppmm(ph, pl, m0, m1) \
 263   do {                                                                  \
 264     unsigned short __p = (unsigned short) (m0) * (m1);  /* the cast binds to (m0) only */ \
 265     (ph) = __p >> 8;              /* bits 8 and up of the product */    \
 266     (pl) = __p;   /* whole product; only the low byte survives if pl is 8 bits wide */ \
 267   } while (0)
 268 #endif /* AVR */
 269
 270 #if defined (_CRAY) && W_TYPE_SIZE == 64
 271 #include <intrinsics.h>  /* presumably declares _leadz and _int_mult_upper -- confirm */
 272 #define UDIV_PREINV_ALWAYS  1  /* NOTE: set unconditionally here, though udiv_qrnnd below is only defined under _CRAYIEEE */
 273 #define UDIV_NEEDS_NORMALIZATION 1
 274 #define UDIV_TIME 220
 275 long __MPN(count_leading_zeros) (UDItype);  /* prototype only; the macro below calls _leadz instead */
 276 #define count_leading_zeros(count, x) \
 277   ((count) = _leadz ((UWtype) (x)))
 278 #if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
 279 #define umul_ppmm(ph, pl, m0, m1) \
 280   do {                                                                  \
 281     UDItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 282     (ph) = _int_mult_upper (__m0, __m1);   /* high product word */      \
 283     (pl) = __m0 * __m1;          /* low word: product truncated to UDItype */ \
 284   } while (0)
 285 #ifndef LONGLONG_STANDALONE
 286 #define udiv_qrnnd(q, r, n1, n0, d) \
 287   do { UWtype __di;                                                     \
 288     __di = __MPN(invert_limb) (d);     /* d is expanded here ...  */    \
 289     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);  /* ... and again here */ \
 290   } while (0)
 291 #endif /* LONGLONG_STANDALONE */
 292 #endif /* _CRAYIEEE */
 293 #endif /* _CRAY */
 294
 295 #if defined (__ia64) && W_TYPE_SIZE == 64
 296 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
 297    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
 298    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
 299    register, which takes an extra cycle.  */
 300 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
 301   do {                                          \
 302     UWtype __x;                                 \
 303     __x = (al) - (bl);  /* low difference, held in a temporary until sh is stored */ \
 304     if ((al) < (bl))    /* borrow needed; al and bl are expanded a second time */ \
 305       (sh) = (ah) - (bh) - 1;                   \
 306     else                                        \
 307       (sh) = (ah) - (bh);                       \
 308     (sl) = __x;         /* sl is written last */ \
 309   } while (0)
 310 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
 311 /* Do both product parts in assembly, since that gives better code with
 312    all gcc versions.  Some callers will just use the upper part, and in
 313    that situation we waste an instruction, but not any cycles.  */
 314 #define umul_ppmm(ph, pl, m0, m1) \
 315     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
 316              : "=&f" (ph), "=f" (pl)   /* %0 = ph, %1 = pl */           \
 317              : "f" (m0), "f" (m1))     /* %2 = m0, %3 = m1; all four use "f" constraints */
 318 #define UMUL_TIME 14
 319 #define count_leading_zeros(count, x) \
 320   do {                                                                  \
 321     UWtype _x = (x), _y, _a, _c;       /* x is read once */             \
 322     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
 323     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
 324     _c = (_a - 1) << 3;                /* multiply (_a - 1) by 8 */     \
 325     _x >>= _c;                                                          \
 326     if (_x >= 1 << 4)   /* from here: narrow down by 4, then 2, then 1 bit */ \
 327       _x >>= 4, _c += 4;                                                \
 328     if (_x >= 1 << 2)                                                   \
 329       _x >>= 2, _c += 2;                                                \
 330     _c += _x >> 1;                                                      \
 331     (count) =  W_TYPE_SIZE - 1 - _c;                                    \
 332   } while (0)
 333 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
 334    based, and we don't need a special case for x==0 here */
 335 #define count_trailing_zeros(count, x)                                  \
 336   do {                                                                  \
 337     UWtype __ctz_x = (x);              /* x is read once */             \
 338     __asm__ ("popcnt %0 = %1"                                           \
 339              : "=r" (count)                                             \
 340              : "r" ((__ctz_x-1) & ~__ctz_x));  /* ones exactly in x's trailing-zero positions */ \
 341   } while (0)
 342 #endif
 343 #if defined (__INTEL_COMPILER)
 344 #include <ia64intrin.h>  /* presumably declares _m64_xmahu -- confirm */
 345 #define umul_ppmm(ph, pl, m0, m1)                                       \
 346   do {                                                                  \
 347     UWtype __m0 = (m0), __m1 = (m1);   /* each argument is read once */ \
 348     ph = _m64_xmahu (__m0, __m1, 0);   /* high word; note ph is not parenthesized here */ \
 349     pl = __m0 * __m1;                  /* low word; pl is not parenthesized either */ \
 350   } while (0)
 351 #endif
 352 #ifndef LONGLONG_STANDALONE
 353 #define udiv_qrnnd(q, r, n1, n0, d) \
 354   do { UWtype __di;                                                     \
 355     __di = __MPN(invert_limb) (d);     /* d is expanded here ...  */    \
 356     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);  /* ... and again here */ \
 357   } while (0)
 358 #define UDIV_PREINV_ALWAYS  1
 359 #define UDIV_NEEDS_NORMALIZATION 1
 360 #endif
 361 #define UDIV_TIME 220  /* defined whether or not LONGLONG_STANDALONE is set */
 362 #endif  /* closes the __ia64 section */
 363
 364
 365 #if defined (__GNUC__)
 366
 367 /* We sometimes need to clobber "cc" with gcc2, but that would not be
 368    understood by gcc1.  Use cpp to avoid major code duplication.  */
 369 #if __GNUC__ < 2
 370 #define __CLOBBER_CC  /* expands to nothing */
 371 #define __AND_CLOBBER_CC  /* expands to nothing */
 372 #else /* __GNUC__ >= 2 */
 373 #define __CLOBBER_CC : "cc"  /* starts a clobber list holding only "cc" */
 374 #define __AND_CLOBBER_CC , "cc"  /* appends "cc" to an existing clobber list */
 375 #endif /* __GNUC__ < 2 */
 376
 377 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
 378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 379   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"  /* low words (%1,%4,%5) first, then high words (%0,%2,%3) */ \
 380            : "=r" (sh), "=&r" (sl)                                      \
 381            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
 382 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 383   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"  /* same operand layout as add_ssaaaa, without the "%" */ \
 384            : "=r" (sh), "=&r" (sl)                                      \
 385            : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
 386 #define umul_ppmm(xh, xl, m0, m1) \
 387   do {                                                                  \
 388     USItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 389     __asm__ ("multiplu %0,%1,%2"       /* writes xl */                  \
 390              : "=r" (xl)                                                \
 391              : "r" (__m0), "r" (__m1));                                 \
 392     __asm__ ("multmu %0,%1,%2"         /* writes xh */                  \
 393              : "=r" (xh)                                                \
 394              : "r" (__m0), "r" (__m1));                                 \
 395   } while (0)
 396 #define udiv_qrnnd(q, r, n1, n0, d) \
 397   __asm__ ("dividu %0,%3,%4"  /* template names q, n0 and d; r and n1 are tied together via "1" */ \
 398            : "=r" (q), "=q" (r)                                         \
 399            : "1" (n1), "r" (n0), "r" (d))
 400 #define count_leading_zeros(count, x) \
 401     __asm__ ("clz %0,%1"                                                \
 402              : "=r" (count)                                             \
 403              : "r" (x))
 404 #define COUNT_LEADING_ZEROS_0 32
 405 #endif /* __a29k__ */
 406
 407 #if defined (__arc__)  /* unlike the neighbouring sections, no W_TYPE_SIZE test here */
 408 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 409   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"  /* low words (%1,%4,%5) first, then high words (%0,%2,%3) */ \
 410            : "=r" (sh),                                                 \
 411              "=&r" (sl)                                                 \
 412            : "r"  ((USItype) (ah)),    /* every input is cast to USItype */ \
 413              "rIJ" ((USItype) (bh)),                                    \
 414              "%r" ((USItype) (al)),                                     \
 415              "rIJ" ((USItype) (bl)))
 416 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 417   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"  /* same operand layout as add_ssaaaa, without the "%" */ \
 418            : "=r" (sh),                                                 \
 419              "=&r" (sl)                                                 \
 420            : "r" ((USItype) (ah)),                                      \
 421              "rIJ" ((USItype) (bh)),                                    \
 422              "r" ((USItype) (al)),                                      \
 423              "rIJ" ((USItype) (bl)))
 424 #endif
 425
 426 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) /* skips __thumb__ without __thumb2__ */ \
 427     && W_TYPE_SIZE == 32
 428 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 429   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"  /* low words (%1,%4,%5) first, then high words (%0,%2,%3) */ \
 430            : "=r" (sh), "=&r" (sl)     /* %0 = sh, %1 = sl */           \
 431            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
 432 #define sub_ddmmss(sh, sl, ah, al, bh, bl) /* picks an asm form according to which operands are compile-time constants */ \
 433   do {                                                                  \
 434     if (__builtin_constant_p (al))   /* al constant: it takes the "rI" slot and the low subtract is rsbs */ \
 435       {                                                                 \
 436         if (__builtin_constant_p (ah))   /* ah constant too: high part uses rsc with ah as "rI" */ \
 437           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 438                    : "=r" (sh), "=&r" (sl)                              \
 439                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 440         else                                                            \
 441           __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
 442                    : "=r" (sh), "=&r" (sl)                              \
 443                    : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 444       }                                                                 \
 445     else if (__builtin_constant_p (ah))  /* ah constant, al not: high part uses rsc */ \
 446       {                                                                 \
 447         if (__builtin_constant_p (bl))                                  \
 448           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 449                    : "=r" (sh), "=&r" (sl)                              \
 450                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 451         else                                                            \
 452           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 453                    : "=r" (sh), "=&r" (sl)                              \
 454                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 455       }                                                                 \
 456     else if (__builtin_constant_p (bl))  /* bl constant, al and ah not: low part is a plain subs */ \
 457       {                                                                 \
 458         if (__builtin_constant_p (bh))                                  \
 459           __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
 460                    : "=r" (sh), "=&r" (sl)                              \
 461                    : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 462         else                                                            \
 463           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 464                    : "=r" (sh), "=&r" (sl)                              \
 465                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 466       }                                                                 \
 467     else /* only bh might be a constant */                              \
 468       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
 469                : "=r" (sh), "=&r" (sl)                                  \
 470                : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
 471     } while (0)
 472 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
 473     || defined (__ARM_ARCH_3__)
 474 #define umul_ppmm(xh, xl, a, b)  /* built from four mul instructions on 16-bit halves */ \
 475   do {                                                                  \
 476     register USItype __t0, __t1, __t2;   /* scratch outputs %2, %3, %4 */ \
 477     __asm__ ("%@ Inlined umul_ppmm\n"                                   \
 478            "    mov     %2, %5, lsr #16\n"                              \
 479            "    mov     %0, %6, lsr #16\n"                              \
 480            "    bic     %3, %5, %2, lsl #16\n"                          \
 481            "    bic     %4, %6, %0, lsl #16\n"                          \
 482            "    mul     %1, %3, %4\n"                                   \
 483            "    mul     %4, %2, %4\n"                                   \
 484            "    mul     %3, %0, %3\n"                                   \
 485            "    mul     %0, %2, %0\n"                                   \
 486            "    adds    %3, %4, %3\n"                                   \
 487            "    addcs   %0, %0, #65536\n"                               \
 488            "    adds    %1, %1, %3, lsl #16\n"                          \
 489            "    adc     %0, %0, %3, lsr #16"                            \
 490            : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),  /* %0 = xh, %1 = xl (casts on outputs) */ \
 491              "=&r" (__t0), "=&r" (__t1), "=r" (__t2)                    \
 492            : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);  /* %5 = a, %6 = b */ \
 493   } while (0)
 494 #define UMUL_TIME 20
 495 #define udiv_qrnnd(q, r, n1, n0, d) \
 496   do { UWtype __r;                                                      \
 497     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));  /* quotient is the return value */ \
 498     (r) = __r;                         /* remainder comes back through the pointer */ \
 499   } while (0)
 500 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
 501 #define UDIV_TIME 200
 502 #else /* ARMv4 or newer */
 503 #define umul_ppmm(xh, xl, a, b) \
 504   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))  /* %0 = xl, %1 = xh */
 505 #define UMUL_TIME 5
 506 #define smul_ppmm(xh, xl, a, b) \
 507   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))  /* %0 = xl, %1 = xh */
 508 #ifndef LONGLONG_STANDALONE
 509 #define udiv_qrnnd(q, r, n1, n0, d) \
 510   do { UWtype __di;                                                     \
 511     __di = __MPN(invert_limb) (d);     /* d is expanded here ...  */    \
 512     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);  /* ... and again here */ \
 513   } while (0)
 514 #define UDIV_PREINV_ALWAYS  1
 515 #define UDIV_NEEDS_NORMALIZATION 1
 516 #define UDIV_TIME 70
 517 #endif /* LONGLONG_STANDALONE */
 518 #endif /* defined(__ARM_ARCH_2__) ... */
 519 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)  /* forwards to the gcc builtin wrapper */
 520 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)  /* forwards to the gcc builtin wrapper */
 521 #define COUNT_LEADING_ZEROS_0 32
 522 #endif /* __arm__ */
 523
 524 #if defined (__aarch64__) && W_TYPE_SIZE == 64
 525 /* FIXME: Extend the immediate range for the low word by using both
 526    ADDS and SUBS, since they set carry in the same way.  */
 527 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 528   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"  /* low words (%1,%4,%5) first, then high words (%0,%2,%3) */ \
 529            : "=r" (sh), "=&r" (sl)                                      \
 530            : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),  /* every input is cast to UDItype */ \
 531              "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
 532 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 533   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"  /* same operand layout; two constraint alternatives per operand */ \
 534            : "=r,r" (sh), "=&r,&r" (sl)                                 \
 535            : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),          \
 536              "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC)
 537 #define umul_ppmm(ph, pl, m0, m1) \
 538   do {                                                                  \
 539     UDItype __m0 = (m0), __m1 = (m1);  /* each argument is read once */ \
 540     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
 541     (pl) = __m0 * __m1;          /* low word: product truncated to UDItype */ \
 542   } while (0)
 543 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)  /* forwards to the gcc builtin wrapper */
 544 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)  /* forwards to the gcc builtin wrapper */
 545 #define COUNT_LEADING_ZEROS_0 64
 546 #endif /* __aarch64__ */
 547
 548 #if defined (__clipper__) && W_TYPE_SIZE == 32
 549 #define umul_ppmm(w1, w0, u, v) \
 550   ({union {UDItype __ll;               /* whole product ...  */         \
 551            struct {USItype __l, __h;} __i;  /* ... overlaid by two halves, __l declared first */ \
 552           } __x;                                                        \
 553   __asm__ ("mulwux %2,%0"                                               \
 554            : "=r" (__x.__ll)                                            \
 555            : "%0" ((USItype)(u)), "r" ((USItype)(v)));  /* u shares a register with the output ("0") */ \
 556   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
 557 #define smul_ppmm(w1, w0, u, v) \
 558   ({union {DItype __ll;                /* signed counterpart of the union above */ \
 559            struct {SItype __l, __h;} __i;                               \
 560           } __x;                                                        \
 561   __asm__ ("mulwx %2,%0"                                                \
 562            : "=r" (__x.__ll)                                            \
 563            : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
 564   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
 565 #define __umulsidi3(u, v) \
 566   ({UDItype __w;                       /* statement expression whose value is __w */ \
 567     __asm__ ("mulwux %2,%0"                                             \
 568              : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
 569     __w; })
 570 #endif /* __clipper__ */
 571
 572 /* Fujitsu vector computers.  */
 573 #if defined (__uxp__) && W_TYPE_SIZE == 32
 574 #define umul_ppmm(ph, pl, u, v) \
 575   do {                                                                  \
 576     union {UDItype __ll;               /* whole product ...  */         \
 577            struct {USItype __h, __l;} __i;  /* ... overlaid by two halves, __h declared first */ \
 578           } __x;                                                        \
 579     __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
 580     (ph) = __x.__i.__h;                                                 \
 581     (pl) = __x.__i.__l;                                                 \
 582   } while (0)
 583 #define smul_ppmm(ph, pl, u, v) \
 584   do {                                                                  \
 585     union {UDItype __ll;               /* same unsigned union types as umul_ppmm above */ \
 586            struct {USItype __h, __l;} __i;                              \
 587           } __x;                                                        \
 588     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
 589     (ph) = __x.__i.__h;                                                 \
 590     (pl) = __x.__i.__l;                                                 \
 591   } while (0)
 592 #endif
 593
 594 #if defined (__gmicro__) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "add.w" of (bl) into the low word, then "addx" of (bh) into
        the high word.  (ah) is tied to output (sh) and (al) to output (sl);
        (sl) is early-clobber ("=&g") because it is written before (bh) is
        read.  */
 595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 596   __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
 597            : "=g" (sh), "=&g" (sl)                                      \
 598            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 599              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss: "sub.w" on the low word, then "subx" on the high word, with
        the same operand layout as add_ssaaaa.  (al) has no "%" here, so the
        compiler may not swap it with (bl).  */
 600 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 601   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
 602            : "=g" (sh), "=&g" (sl)                                      \
 603            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 604              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* umul_ppmm: single "mulx"; (m0) enters in the operand that becomes (ph),
        (pl) is a register output.  */
 605 #define umul_ppmm(ph, pl, m0, m1) \
 606   __asm__ ("mulx %3,%0,%1"                                              \
 607            : "=g" (ph), "=r" (pl)                                       \
 608            : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
     /* udiv_qrnnd: single "divx"; (nl) enters in the operand that becomes (q)
        and (nh) in the operand that becomes (r).  */
 609 #define udiv_qrnnd(q, r, nh, nl, d) \
 610   __asm__ ("divx %4,%0,%1"                                              \
 611            : "=g" (q), "=r" (r)                                         \
 612            : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
     /* count_leading_zeros: "bsch/1" on (x); the output operand (count) is
        preloaded with 0 through the "0" matching constraint.  */
 613 #define count_leading_zeros(count, x) \
 614   __asm__ ("bsch/1 %1,%0"                                               \
 615            : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
 616 #endif
 617
 618 #if defined (__hppa) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "add" of (bl) and (al) into (sl), then "addc" of (ah) and
        (bh) into (sh).  (sl) is early-clobber because it is written before
        the high-word inputs are read.  */
 619 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 620   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
 621            : "=r" (sh), "=&r" (sl)                                      \
 622            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
     /* sub_ddmmss: "sub" of (al) and (bl) into (sl), then "subb" of (ah) and
        (bh) into (sh).  */
 623 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 624   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
 625            : "=r" (sh), "=&r" (sl)                                      \
 626            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 627 #if defined (_PA_RISC1_1)
     /* umul_ppmm (only when _PA_RISC1_1 is defined): one "xmpyu" with all three
        operands under "f" constraints; the UDItype result is split through the
        union, __h to (wh) and __l to (wl).  */
 628 #define umul_ppmm(wh, wl, u, v) \
 629   do {                                                                  \
 630     union {UDItype __ll;                                                \
 631            struct {USItype __h, __l;} __i;                              \
 632           } __x;                                                        \
 633     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
 634     (wh) = __x.__i.__h;                                                 \
 635     (wl) = __x.__i.__l;                                                 \
 636   } while (0)
 637 #define UMUL_TIME 8
 638 #define UDIV_TIME 60
 639 #else
 640 #define UMUL_TIME 40
 641 #define UDIV_TIME 80
 642 #endif
     /* count_leading_zeros: (count) starts at 1 and has 16, 8, 4 and 2 added
        for each step where the tested upper bits are zero; otherwise the value
        is shifted down instead.  The final extracted bit is subtracted.
        __tmp receives a copy of (x) (tied by "1") and is what the asm
        modifies.  */
 643 #define count_leading_zeros(count, x) \
 644   do {                                                                  \
 645     USItype __tmp;                                                      \
 646     __asm__ (                                                           \
 647        "ldi             1,%0\n"                                         \
 648 "       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
 649 "       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
 650 "       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
 651 "       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
 652 "       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
 653 "       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
 654 "       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
 655 "       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
 656 "       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
 657 "       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
 658 "       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
 659 "       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
 660 "       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
 661 "       sub             %0,%1,%0        ; Subtract it.\n"               \
 662         : "=r" (count), "=r" (__tmp) : "1" (x));                        \
 663   } while (0)
 664 #endif /* hppa */
 665
 666 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
 667    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
 668    is just a case of no direct support for 2.0n but treating it like 1.0. */
 669 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
     /* add_ssaaaa: same operand layout as the 32-bit hppa version, but the
        high-word instruction is "add,dc".  */
 670 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 671   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
 672            : "=r" (sh), "=&r" (sl)                                      \
 673            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
     /* sub_ddmmss: same operand layout as the 32-bit hppa version, but the
        high-word instruction is "sub,db".  */
 674 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 675   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
 676            : "=r" (sh), "=&r" (sl)                                      \
 677            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 678 #endif /* hppa */
 679
 680 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
 681 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
     /* add_ssaaaa: "alr" of (bl) into the low word, then "alcr" of (bh) into
        the high word; (ah) is tied to (sh) and (al) to (sl).  The branch for a
        compile-time constant (bl) is commented out, so the register form is
        always used.  */
 682 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 683   do {                                                                  \
 684 /*  if (__builtin_constant_p (bl))                                      \
 685       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                            \
 686                : "=r" (sh), "=&r" (sl)                                  \
 687                : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
 688     else                                                                \
 689 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                              \
 690                : "=r" (sh), "=&r" (sl)                                  \
 691                : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
 692   } while (0)
     /* sub_ddmmss: "slr" on the low word, then "slbr" on the high word, with
        the same operand layout as add_ssaaaa.  The constant-(bl) branch is
        likewise commented out.  */
 693 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 694   do {                                                                  \
 695 /*  if (__builtin_constant_p (bl))                                      \
 696       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                            \
 697                : "=r" (sh), "=&r" (sl)                                  \
 698                : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
 699     else                                                                \
 700 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                              \
 701                : "=r" (sh), "=&r" (sl)                                  \
 702                : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
 703   } while (0)
 704 #if __GMP_GNUC_PREREQ (4,5)
     /* umul_ppmm, GCC 4.5 or later: plain C.  Both operands are widened to
        UDItype, multiplied, and the product is split through the union, __h
        to (xh) and __l to (xl).  */
 705 #define umul_ppmm(xh, xl, m0, m1)                                       \
 706   do {                                                                  \
 707     union {UDItype __ll;                                                \
 708            struct {USItype __h, __l;} __i;                              \
 709           } __x;                                                        \
 710     __x.__ll = (UDItype) (m0) * (UDItype) (m1);                         \
 711     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 712   } while (0)
 713 #else
 714 #if 0
 715 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 716    with a new enough processor pretending we have 32-bit registers.  */
 717 #define umul_ppmm(xh, xl, m0, m1)                                       \
 718   do {                                                                  \
 719     union {UDItype __ll;                                                \
 720            struct {USItype __h, __l;} __i;                              \
 721           } __x;                                                        \
 722     __asm__ ("mlr\t%0,%2"                                               \
 723              : "=r" (__x.__ll)                                          \
 724              : "%0" (m0), "r" (m1));                                    \
 725     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 726   } while (0)
 727 #else
     /* umul_ppmm, older GCC: "mlr" on variables pinned to registers "0" and
        "1".  (m0) is loaded into __r1 beforehand; afterwards (xh) is read
        from __r0 and (xl) from __r1.  */
 728 #define umul_ppmm(xh, xl, m0, m1)                                       \
 729   do {                                                                  \
 730   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
 731      DImode for the product, since that would be allocated to a single 64-bit
 732      register, whereas mlr uses the low 32-bits of an even-odd register pair.
 733   */                                                                    \
 734     register USItype __r0 __asm__ ("0");                                \
 735     register USItype __r1 __asm__ ("1") = (m0);                         \
 736     __asm__ ("mlr\t%0,%3"                                               \
 737              : "=r" (__r0), "=r" (__r1)                                 \
 738              : "r" (__r1), "r" (m1));                                   \
 739     (xh) = __r0; (xl) = __r1;                                           \
 740   } while (0)
 741 #endif /* if 0 */
 742 #endif
 743 #if 0
 744 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 745    with a new enough processor pretending we have 32-bit registers.  */
 746 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 747   do {                                                                  \
 748     union {UDItype __ll;                                                \
 749            struct {USItype __h, __l;} __i;                              \
 750           } __x;                                                        \
 751     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 752     __asm__ ("dlr\t%0,%2"                                               \
 753              : "=r" (__x.__ll)                                          \
 754              : "0" (__x.__ll), "r" (d));                                \
 755     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 756   } while (0)
 757 #else
     /* udiv_qrnnd: "dlr" on variables pinned to registers "0" and "1".  (n1)
        is loaded into __r0 and (n0) into __r1; afterwards (q) is read from
        __r1 and (r) from __r0.  */
 758 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 759   do {                                                                  \
 760     register USItype __r0 __asm__ ("0") = (n1);                         \
 761     register USItype __r1 __asm__ ("1") = (n0);                         \
 762     __asm__ ("dlr\t%0,%4"                                               \
 763              : "=r" (__r0), "=r" (__r1)                                 \
 764              : "r" (__r0), "r" (__r1), "r" (d));                        \
 765     (q) = __r1; (r) = __r0;                                             \
 766   } while (0)
 767 #endif /* if 0 */
 768 #else /* if __zarch__ */
     /* Without zarch only the signed primitives are defined here: smul_ppmm
        uses "mr" and sdiv_qrnnd uses "dr", each on a DItype that is split or
        assembled through a union of two USItype halves.  */
 769 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
 770 #define smul_ppmm(xh, xl, m0, m1)                                       \
 771   do {                                                                  \
 772     union {DItype __ll;                                                 \
 773            struct {USItype __h, __l;} __i;                              \
 774           } __x;                                                        \
 775     __asm__ ("mr\t%0,%2"                                                \
 776              : "=r" (__x.__ll)                                          \
 777              : "%0" (m0), "r" (m1));                                    \
 778     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 779   } while (0)
 780 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
 781 #define sdiv_qrnnd(q, r, n1, n0, d)                                     \
 782   do {                                                                  \
 783     union {DItype __ll;                                                 \
 784            struct {USItype __h, __l;} __i;                              \
 785           } __x;                                                        \
 786     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 787     __asm__ ("dr\t%0,%2"                                                \
 788              : "=r" (__x.__ll)                                          \
 789              : "0" (__x.__ll), "r" (d));                                \
 790     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 791   } while (0)
 792 #endif /* if __zarch__ */
 793 #endif
 794
 795 #if defined (__s390x__) && W_TYPE_SIZE == 64
 796 /* We need to cast operands with register constraints, otherwise their types
 797    will be assumed to be SImode by gcc.  For these machines, such operations
 798    will insert a value into the low 32 bits, and leave the high 32 bits with
 799    garbage.  */
     /* add_ssaaaa: "algr" of (bl) into the low word, then "alcgr" of (bh) into
        the high word; (ah) is tied to (sh) and (al) to (sl).  */
 800 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 801   do {                                                                  \
 802     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                              \
 803                : "=r" (sh), "=&r" (sl)                                  \
 804                : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
 805                  "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
 806   } while (0)
     /* sub_ddmmss: "slgr" on the low word, then "slbgr" on the high word, with
        the same operand layout as add_ssaaaa.  */
 807 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 808   do {                                                                  \
 809     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                              \
 810              : "=r" (sh), "=&r" (sl)                                    \
 811              : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
 812                "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
 813   } while (0)
     /* umul_ppmm: "mlgr" writing a TImode integer; the union splits it into
        two UDItype halves, __h to (xh) and __l to (xl).  */
 814 #define umul_ppmm(xh, xl, m0, m1)                                       \
 815   do {                                                                  \
 816     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 817            struct {UDItype __h, __l;} __i;                              \
 818           } __x;                                                        \
 819     __asm__ ("mlgr\t%0,%2"                                              \
 820              : "=r" (__x.__ll)                                          \
 821              : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
 822     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 823   } while (0)
     /* udiv_qrnnd: the union is loaded with (n1) in __h and (n0) in __l, then
        "dlgr" operates on it in place; (q) is read back from __l and (r) from
        __h.  */
 824 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 825   do {                                                                  \
 826     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 827            struct {UDItype __h, __l;} __i;                              \
 828           } __x;                                                        \
 829     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 830     __asm__ ("dlgr\t%0,%2"                                              \
 831              : "=r" (__x.__ll)                                          \
 832              : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
 833     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 834   } while (0)
 835 #if 0 /* FIXME: Enable for z10 (?) */
     /* Disabled count_leading_zeros: "flogr" into a TImode union, with (cnt)
        taken from the __h half.  */
 836 #define count_leading_zeros(cnt, x)                                     \
 837   do {                                                                  \
 838     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 839            struct {UDItype __h, __l;} __i;                              \
 840           } __clr_cnt;                                                  \
 841     __asm__ ("flogr\t%0,%1"                                             \
 842              : "=r" (__clr_cnt.__ll)                                    \
 843              : "r" (x) __CLOBBER_CC);                                   \
 844     (cnt) = __clr_cnt.__i.__h;                                          \
 845   } while (0)
 846 #endif
 847 #endif
 848
 849 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
 850    so we don't need __CLOBBER_CC.  */
 851 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "addl" of (bl) into the low word, then "adcl" of (bh) into
        the high word; (ah) is tied to (sh) and (al) to (sl).  */
 852 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 853   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
 854            : "=r" (sh), "=&r" (sl)                                      \
 855            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 856              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss: "subl" on the low word, then "sbbl" on the high word, with
        the same operand layout as add_ssaaaa.  */
 857 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 858   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
 859            : "=r" (sh), "=&r" (sl)                                      \
 860            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 861              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* umul_ppmm: "mull" with (u) tied to the "a" output; (w0) comes back
        through "a" and (w1) through "d".  */
 862 #define umul_ppmm(w1, w0, u, v) \
 863   __asm__ ("mull %3"                                                    \
 864            : "=a" (w0), "=d" (w1)                                       \
 865            : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
     /* udiv_qrnnd: "divl" with (n0) tied to the "a" output and (n1) to the "d"
        output; (q) comes back through "a" and (r) through "d".  */
 866 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
 867   __asm__ ("divl %4"                 /* stringification in K&R C */     \
 868            : "=a" (q), "=d" (r)                                         \
 869            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
 870
 871 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
 872 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
 873    significant 1 bit is, hence the use of the following alternatives.  bsfl
 874    is slow too, between 18 and 42 depending where the least significant 1
 875    bit is, so let the generic count_trailing_zeros below make use of the
 876    count_leading_zeros here too.  */
 877
 878 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
 879 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
 880    cache miss reading from __clz_tab.  For P55 it's favoured over the float
 881    below so as to avoid mixing MMX and x87, since the penalty for switching
 882    between the two is about 100 cycles.
 883
 884    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
 885    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
 886    follows, but as of gcc 2.95.2 it results in conditional jumps.
 887
 888        __shift = -(__n < 0x1000000);
 889        __shift -= (__n < 0x10000);
 890        __shift -= (__n < 0x100);
 891
 892    The middle two sbbl and cmpl's pair, and with luck something gcc
 893    generates might pair with the first cmpl and the last sbbl.  The "32+1"
 894    constant could be folded into __clz_tab[], but it doesn't seem worth
 895    making a different table just for that.  */
 896
 897 #define count_leading_zeros(c,n)                                        \
 898   do {                                                                  \
 899     USItype  __n = (n);                                                 \
 900     USItype  __shift;                                                   \
 901     __asm__ ("cmpl  $0x1000000, %1\n"                                   \
 902              "sbbl  %0, %0\n"                                           \
 903              "cmpl  $0x10000, %1\n"                                     \
 904              "sbbl  $0, %0\n"                                           \
 905              "cmpl  $0x100, %1\n"                                       \
 906              "sbbl  $0, %0\n"                                           \
 907              : "=&r" (__shift) : "r"  (__n));                           \
 908     __shift = __shift*8 + 24 + 1;                                       \
 909     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
 910   } while (0)
 911 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 912 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
 913
 914 #else /* ! pentiummmx || LONGLONG_STANDALONE */
 915 /* The following should be a fixed 14 cycles or so.  Some scheduling
 916    opportunities should be available between the float load/store too.  This
 917    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
 918    apparently suggested by the Intel optimizing manual (don't know exactly
 919    where).  gcc 2.95 or up will be best for this, so the "double" is
 920    correctly aligned on the stack.  */
     /* The value is stored as a double and read back as two unsigned words;
        (c) is computed from a[1] shifted right by 20.  */
 921 #define count_leading_zeros(c,n)                                        \
 922   do {                                                                  \
 923     union {                                                             \
 924       double    d;                                                      \
 925       unsigned  a[2];                                                   \
 926     } __u;                                                              \
 927     ASSERT ((n) != 0);                                                  \
 928     __u.d = (UWtype) (n);                                               \
 929     (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
 930   } while (0)
 931 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
 932 #endif /* pentiummmx */
 933
 934 #else /* ! pentium */
 935
 936 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
 937 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
 938 #endif /* gcc clz */
 939
 940 /* On P6, gcc prior to 3.0 generates a partial register stall for
 941    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
 942    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
 943    cost of one extra instruction.  Do this for "i386" too, since that means
 944    generic x86.  */
 945 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
 946   && (HAVE_HOST_CPU_i386                                                \
 947       || HAVE_HOST_CPU_i686                                             \
 948       || HAVE_HOST_CPU_pentiumpro                                       \
 949       || HAVE_HOST_CPU_pentium2                                         \
 950       || HAVE_HOST_CPU_pentium3)
 951 #define count_leading_zeros(count, x)                                   \
 952   do {                                                                  \
 953     USItype __cbtmp;                                                    \
 954     ASSERT ((x) != 0);                                                  \
 955     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 956     (count) = 31 - __cbtmp;                                             \
 957   } while (0)
 958 #endif /* gcc<3 asm bsrl */
 959
     /* Fallback when nothing above defined count_leading_zeros: "bsrl" into a
        temporary, then (count) = temporary ^ 31.  */
 960 #ifndef count_leading_zeros
 961 #define count_leading_zeros(count, x)                                   \
 962   do {                                                                  \
 963     USItype __cbtmp;                                                    \
 964     ASSERT ((x) != 0);                                                  \
 965     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 966     (count) = __cbtmp ^ 31;                                             \
 967   } while (0)
 968 #endif /* asm bsrl */
 969
 970 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
 971 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
 972 #endif /* gcc ctz */
 973
     /* Fallback count_trailing_zeros: "bsfl" writes (count) directly; "%k0"
        selects the 32-bit name of the output register.  */
 974 #ifndef count_trailing_zeros
 975 #define count_trailing_zeros(count, x)                                  \
 976   do {                                                                  \
 977     ASSERT ((x) != 0);                                                  \
 978     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
 979   } while (0)
 980 #endif /* asm bsfl */
 981
 982 #endif /* ! pentium */
 983
 984 #ifndef UMUL_TIME
 985 #define UMUL_TIME 10
 986 #endif
 987 #ifndef UDIV_TIME
 988 #define UDIV_TIME 40
 989 #endif
 990 #endif /* 80x86 */
 991
 992 #if defined (__amd64__) && W_TYPE_SIZE == 64
     /* add_ssaaaa: "addq" of (bl) into the low word, then "adcq" of (bh) into
        the high word; (ah) is tied to (sh) and (al) to (sl).  */
 993 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 994   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
 995            : "=r" (sh), "=&r" (sl)                                      \
 996            : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
 997              "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* sub_ddmmss: "subq" on the low word, then "sbbq" on the high word, with
        the same operand layout as add_ssaaaa.  */
 998 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 999   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
1000            : "=r" (sh), "=&r" (sl)                                      \
1001            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
1002              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* umul_ppmm: "mulq" with (u) tied to the "a" output; (w0) comes back
        through "a" and (w1) through "d".  */
1003 #define umul_ppmm(w1, w0, u, v) \
1004   __asm__ ("mulq %3"                                                    \
1005            : "=a" (w0), "=d" (w1)                                       \
1006            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
     /* udiv_qrnnd: "divq" with (n0) tied to the "a" output and (n1) to the "d"
        output; (q) comes back through "a" and (r) through "d".  */
1007 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1008   __asm__ ("divq %4"                 /* stringification in K&R C */     \
1009            : "=a" (q), "=d" (r)                                         \
1010            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1011 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1012 #define count_leading_zeros(count, x)                                   \
1013   do {                                                                  \
1014     UDItype __cbtmp;                                                    \
1015     ASSERT ((x) != 0);                                                  \
1016     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
1017     (count) = __cbtmp ^ 63;                                             \
1018   } while (0)
1019 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1020    count is only an int. */
1021 #define count_trailing_zeros(count, x)                                  \
1022   do {                                                                  \
1023     ASSERT ((x) != 0);                                                  \
1024     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
1025   } while (0)
1026 #endif /* __amd64__ */
1027
1028 #if defined (__i860__) && W_TYPE_SIZE == 32
     /* rshift_rhlc: "shr" with (c) and r0, then "shrd" of (h) and (l) into
        (r).  The ":" that starts the output operand list is required;
        without it the "=r" string is concatenated onto the asm template and
        the statement does not parse.  */
1029 #define rshift_rhlc(r,h,l,c) \
1030   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
1031            : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1032 #endif /* i860 */
1033
1034 #if defined (__i960__) && W_TYPE_SIZE == 32
     /* add_ssaaaa: "cmpo 1,0" first, then "addc" of (bl) and (al) into (sl)
        and "addc" of (bh) and (ah) into (sh).  */
1035 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1036   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
1037            : "=r" (sh), "=&r" (sl)                                      \
1038            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
     /* sub_ddmmss: "cmpo 0,0" first, then "subc" on the low words into (sl)
        and "subc" on the high words into (sh).  */
1039 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1040   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
1041            : "=r" (sh), "=&r" (sl)                                      \
1042            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
     /* umul_ppmm: "emul" writes a UDItype that the union splits, __h to (w1)
        and __l to (w0).  Note that __l is declared before __h here.  */
1043 #define umul_ppmm(w1, w0, u, v) \
1044   ({union {UDItype __ll;                                                \
1045            struct {USItype __l, __h;} __i;                              \
1046           } __x;                                                        \
1047   __asm__ ("emul %2,%1,%0"                                              \
1048            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
1049   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
     /* __umulsidi3: statement expression whose value is the UDItype written by
        "emul" on (u) and (v).  */
1050 #define __umulsidi3(u, v) \
1051   ({UDItype __w;                                                        \
1052     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
1053     __w; })
     /* udiv_qrnnd: __nn is loaded with (nh) in __h and (nl) in __l and passed
        to "ediv" together with (d); the result lands in __rq, from which (r)
        is read out of __l and (q) out of __h.  __rq has to be declared next
        to __nn, and the template names its operands by number in the same
        order as the "emul" templates above (%2 = (d), %1 = __nn, %0 = __rq).  */
1054 #define udiv_qrnnd(q, r, nh, nl, d) \
1055   do {                                                                  \
1056     union {UDItype __ll;                                                \
1057            struct {USItype __l, __h;} __i;                              \
1058           } __nn, __rq;                                                 \
1059     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
1060     __asm__ ("ediv %2,%1,%0"                                            \
1061            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
1062     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
1063   } while (0)
     /* count_leading_zeros: "scanbit" into a temporary, then
        (count) = temporary ^ 31.  */
1064 #define count_leading_zeros(count, x) \
1065   do {                                                                  \
1066     USItype __cbtmp;                                                    \
1067     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
1068     (count) = __cbtmp ^ 31;                                             \
1069   } while (0)
1070 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1071 #if defined (__i960mx)          /* what is the proper symbol to test??? */
     /* rshift_rhlc: (h) and (l) are packed into __nn and handed to "shre"
        together with (c); the result goes to (r).  The block is closed with
        "while (0)" so the expansion is one complete statement.  */
1072 #define rshift_rhlc(r,h,l,c) \
1073   do {                                                                  \
1074     union {UDItype __ll;                                                \
1075            struct {USItype __l, __h;} __i;                              \
1076           } __nn;                                                       \
1077     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
1078     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
1079   } while (0)
1080 #endif /* i960mx */
1081 #endif /* i960 */
1082
1083 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1084      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1085      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1086 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1087   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
1088            : "=d" (sh), "=&d" (sl)                                      \
1089            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
1090              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction on m68k.  sub.l
   subtracts bl from the register holding al (result sl), then subx.l
   subtracts bh from the register holding ah (result sh).  Unlike add_ssaaaa
   the al operand is not marked commutative, since subtraction is ordered.  */
1091 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1092   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
1093            : "=d" (sh), "=&d" (sl)                                      \
1094            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
1095              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1096 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1097 #if defined (__mc68020__) || defined(mc68020) \
1098      || defined (__mc68030__) || defined (mc68030) \
1099      || defined (__mc68040__) || defined (mc68040) \
1100      || defined (__mcpu32__) || defined (mcpu32) \
1101      || defined (__NeXT__)
/* umul_ppmm(w1, w0, u, v): 32x32->64 unsigned multiply using a single
   "mulu.l" with a register pair destination: w1 receives the high word and
   w0 the low word.  U starts out in the register that becomes w0 ("%0"); V
   may be a data register, memory or an immediate ("dmi").  */
1102 #define umul_ppmm(w1, w0, u, v) \
1103   __asm__ ("mulu%.l %3,%1:%0"                                           \
1104            : "=d" (w0), "=d" (w1)                                       \
1105            : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1106 #define UMUL_TIME 45
/* udiv_qrnnd(q, r, n1, n0, d): 64/32 unsigned divide using "divu.l" on a
   register pair.  n0 is loaded into the register that becomes q and n1 into
   the register that becomes r; D may be a data register, memory or an
   immediate.  */
1107 #define udiv_qrnnd(q, r, n1, n0, d) \
1108   __asm__ ("divu%.l %4,%1:%0"                                           \
1109            : "=d" (q), "=d" (r)                                         \
1110            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1111 #define UDIV_TIME 90
/* sdiv_qrnnd(q, r, n1, n0, d): same operand layout as udiv_qrnnd above the
   UDIV_TIME define, but issues the signed "divs.l" instruction.  */
1112 #define sdiv_qrnnd(q, r, n1, n0, d) \
1113   __asm__ ("divs%.l %4,%1:%0"                                           \
1114            : "=d" (q), "=d" (r)                                         \
1115            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1116 #else /* for other 68k family members use 16x16->32 multiplication */
/* umul_ppmm(xh, xl, a, b): 32x32->64 unsigned multiply built from four
   16x16->32 "mulu.w" partial products, for 68k parts without mulu.l.
   Operands: %0 = xh, %1 = xl, %2 = __umul_tmp1 (initially a), %3 =
   __umul_tmp2, %5 = b.  After the four multiplies %1 holds alow*blow, %0
   holds ahigh*bhigh, and %3 and %2 hold the two cross products.  The cross
   products are summed (a carry out of that sum adds 0x10000 to the high
   word), then the sum is split into halves that are added into xl and, with
   carry via addx.l, into xh.  */
1117 #define umul_ppmm(xh, xl, a, b) \
1118   do { USItype __umul_tmp1, __umul_tmp2;                                \
1119         __asm__ ("| Inlined umul_ppmm\n"                                \
1120 "       move%.l %5,%3\n"                                                \
1121 "       move%.l %2,%0\n"                                                \
1122 "       move%.w %3,%1\n"                                                \
1123 "       swap    %3\n"                                                   \
1124 "       swap    %0\n"                                                   \
1125 "       mulu%.w %2,%1\n"                                                \
1126 "       mulu%.w %3,%0\n"                                                \
1127 "       mulu%.w %2,%3\n"                                                \
1128 "       swap    %2\n"                                                   \
1129 "       mulu%.w %5,%2\n"                                                \
1130 "       add%.l  %3,%2\n"                                                \
1131 "       jcc     1f\n"                                                   \
1132 "       add%.l  %#0x10000,%0\n"                                         \
1133 "1:     move%.l %2,%3\n"                                                \
1134 "       clr%.w  %2\n"                                                   \
1135 "       swap    %2\n"                                                   \
1136 "       swap    %3\n"                                                   \
1137 "       clr%.w  %3\n"                                                   \
1138 "       add%.l  %3,%1\n"                                                \
1139 "       addx%.l %2,%0\n"                                                \
1140 "       | End inlined umul_ppmm"                                        \
1141               : "=&d" (xh), "=&d" (xl),                                 \
1142                 "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
1143               : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
1144   } while (0)
1145 #define UMUL_TIME 100
1146 #define UDIV_TIME 400
1147 #endif /* not mc68020 */
1148 /* The '020, '030, '040 and '060 have bitfield insns.
1149    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1150    exclude bfffo on that chip (bitfield insns not available).  */
1151 #if (defined (__mc68020__) || defined (mc68020)    \
1152      || defined (__mc68030__) || defined (mc68030) \
1153      || defined (__mc68040__) || defined (mc68040) \
1154      || defined (__mc68060__) || defined (mc68060) \
1155      || defined (__NeXT__))                        \
1156   && ! defined (__mcpu32__)
/* count_leading_zeros(count, x): use the m68k bit-field instruction "bfffo"
   on X.  The constant operand 0 supplies both the bit-field offset and the
   bit-field width (NOTE(review): a width of 0 presumably encodes a 32-bit
   field -- confirm against the 68020 manual).  COUNT_LEADING_ZEROS_0 is 32
   for this implementation.  */
1157 #define count_leading_zeros(count, x) \
1158   __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
1159            : "=d" (count)                                               \
1160            : "od" ((USItype) (x)), "n" (0))
1161 #define COUNT_LEADING_ZEROS_0 32
1162 #endif
1163 #endif /* mc68000 */
1164
1165 #if defined (__m88000__) && W_TYPE_SIZE == 32
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition on m88k.  "addu.co"
   adds the low words (al, bl) into sl and "addu.ci" adds the high words
   (ah, bh) into sh.  The "rJ" constraints together with the %r operand
   modifier let constant operands be used without loading them first
   (NOTE(review): presumably J is the constant zero -- confirm).  */
1166 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1167   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
1168            : "=r" (sh), "=&r" (sl)                                      \
1169            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction on m88k.
   "subu.co" computes the low word sl from al and bl; "subu.ci" computes the
   high word sh from ah and bh.  */
1170 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1171   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
1172            : "=r" (sh), "=&r" (sl)                                      \
1173            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* count_leading_zeros(count, x): run the m88k "ff1" instruction on X into a
   temporary and store that temporary XORed with 31 in COUNT.
   COUNT_LEADING_ZEROS_0 is deliberately 63 for this port (see "sic").  */
1174 #define count_leading_zeros(count, x) \
1175   do {                                                                  \
1176     USItype __cbtmp;                                                    \
1177     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
1178     (count) = __cbtmp ^ 31;                                             \
1179   } while (0)
1180 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1181 #if defined (__m88110__)
/* umul_ppmm(wh, wl, u, v): m88110 "mulu.d" writes a double-word product into
   the UDItype member of a union; the high and low words are then read back
   through the {__h, __l} struct view and stored in wh and wl.  */
1182 #define umul_ppmm(wh, wl, u, v) \
1183   do {                                                                  \
1184     union {UDItype __ll;                                                \
1185            struct {USItype __h, __l;} __i;                              \
1186           } __x;                                                        \
1187     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1188     (wh) = __x.__i.__h;                                                 \
1189     (wl) = __x.__i.__l;                                                 \
1190   } while (0)
/* udiv_qrnnd(q, r, n1, n0, d): m88110 double-word divide.  n1:n0 is packed
   into __x and divided by D with "divu.d"; the result lands in __q.  The
   quotient is the low word of __q, and the remainder is recomputed in C as
   n0 - quotient * d.  The low word must be read through the struct view
   (__q.__i.__l): the union itself has no member named __l.  */
1191 #define udiv_qrnnd(q, r, n1, n0, d) \
1192   ({union {UDItype __ll;                                                \
1193            struct {USItype __h, __l;} __i;                              \
1194           } __x, __q;                                                   \
1195   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1196   __asm__ ("divu.d %0,%1,%2"                                            \
1197            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1198   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1199 #define UMUL_TIME 5
1200 #define UDIV_TIME 25
1201 #else
1202 #define UMUL_TIME 17
1203 #define UDIV_TIME 150
1204 #endif /* __m88110__ */
1205 #endif /* __m88000__ */
1206
1207 #if defined (__mips) && W_TYPE_SIZE == 32
1208 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm(w1, w0, u, v): plain C version.  Widen U to UDItype, multiply by
   V, then hand the upper 32 bits of the product to w1 and the (truncated)
   product to w0.  */
1209 #define umul_ppmm(w1, w0, u, v) \
1210   do {                                                                  \
1211     UDItype __umul_prod = (UDItype)(u) * (v);                           \
1212     (w1) = __umul_prod >> 32;                                           \
1213     (w0) = __umul_prod;                                                 \
1214   } while (0)
1215 #endif
1216 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
/* umul_ppmm(w1, w0, u, v): MIPS "multu"; the "=l" and "=h" output
   constraints bind w0 and w1 directly to the multiplier's LO and HI result
   registers, so no explicit mflo/mfhi is written here.  */
1217 #define umul_ppmm(w1, w0, u, v) \
1218   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1219 #endif
1220 #if !defined (umul_ppmm)
/* umul_ppmm(w1, w0, u, v): fallback when neither earlier definition applied.
   Issues "multu" and then copies the result out explicitly with mflo (to w0)
   and mfhi (to w1) into general registers.  */
1221 #define umul_ppmm(w1, w0, u, v) \
1222   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1223            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1224 #endif
1225 #define UMUL_TIME 10
1226 #define UDIV_TIME 100
1227 #endif /* __mips */
1228
1229 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1230 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm(w1, w0, u, v): plain C version for 64-bit words.  The product is
   formed in a 128-bit (mode TI) unsigned type; its upper 64 bits go to w1
   and the (truncated) product to w0.  */
1231 #define umul_ppmm(w1, w0, u, v) \
1232   do {                                                                  \
1233     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1234     __ll_UTItype __umul_prod = (__ll_UTItype)(u) * (v);                 \
1235     (w1) = __umul_prod >> 64;                                           \
1236     (w0) = __umul_prod;                                                 \
1237   } while (0)
1238 #endif
1239 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
/* umul_ppmm(w1, w0, u, v): 64-bit MIPS "dmultu"; w0 and w1 are bound
   directly to the LO and HI result registers through the "=l" and "=h"
   constraints.  */
1240 #define umul_ppmm(w1, w0, u, v) \
1241   __asm__ ("dmultu %2,%3"                                               \
1242            : "=l" (w0), "=h" (w1)                                       \
1243            : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1244 #endif
1245 #if !defined (umul_ppmm)
/* umul_ppmm(w1, w0, u, v): fallback 64-bit MIPS version; "dmultu" followed
   by explicit mflo (to w0) and mfhi (to w1).  */
1246 #define umul_ppmm(w1, w0, u, v) \
1247   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1248            : "=d" (w0), "=d" (w1)                                       \
1249            : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1250 #endif
1251 #define UMUL_TIME 20
1252 #define UDIV_TIME 140
1253 #endif /* __mips */
1254
1255 #if defined (__mmix__) && W_TYPE_SIZE == 64
/* umul_ppmm(w1, w0, u, v): MMIX "MULU" writes w0 as its explicit destination;
   w1 is bound through the "=z" constraint (NOTE(review): presumably the
   special register that receives the high half of the product -- confirm
   against the GCC MMIX constraint documentation).  */
1256 #define umul_ppmm(w1, w0, u, v) \
1257   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1258 #endif
1259
1260 #if defined (__ns32000__) && W_TYPE_SIZE == 32
/* umul_ppmm(w1, w0, u, v): ns32000 "meid" multiplies in place: U is tied to
   the UDItype result operand and V is the other factor.  The two result
   words are read back through the {__l, __h} struct view (low word
   first).  */
1261 #define umul_ppmm(w1, w0, u, v) \
1262   ({union {UDItype __ll;                                                \
1263            struct {USItype __l, __h;} __i;                              \
1264           } __x;                                                        \
1265   __asm__ ("meid %2,%0"                                                 \
1266            : "=g" (__x.__ll)                                            \
1267            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1268   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* __umulsidi3(u, v): statement expression that yields the UDItype result of
   "meid" applied to U and V, without splitting it into two words.  */
1269 #define __umulsidi3(u, v) \
1270   ({UDItype __w;                                                        \
1271     __asm__ ("meid %2,%0"                                               \
1272              : "=g" (__w)                                               \
1273              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1274     __w; })
/* udiv_qrnnd(q, r, n1, n0, d): ns32000 "deid" divides the double word n1:n0
   (packed into __x) by D in place.  Afterwards the low word of __x is stored
   in r and the high word in q.  */
1275 #define udiv_qrnnd(q, r, n1, n0, d) \
1276   ({union {UDItype __ll;                                                \
1277            struct {USItype __l, __h;} __i;                              \
1278           } __x;                                                        \
1279   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1280   __asm__ ("deid %2,%0"                                                 \
1281            : "=g" (__x.__ll)                                            \
1282            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1283   (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* count_trailing_zeros(count, x): ns32000 "ffsd" on X.  COUNT doubles as an
   input that is preloaded with 0 (the "0" matching constraint), which the
   instruction then updates.  */
1284 #define count_trailing_zeros(count,x) \
1285   do {                                                                  \
1286     __asm__ ("ffsd      %2,%0"                                          \
1287              : "=r" (count)                                             \
1288              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1289   } while (0)
1290 #endif /* __ns32000__ */
1291
1292 /* In the past we had a block here that tested various #defines:
1293        _ARCH_PPC    - AIX
1294        _ARCH_PWR    - AIX
1295        __powerpc__  - gcc
1296        __POWERPC__  - BEOS
1297        __ppc__      - Darwin
1298        PPC          - old gcc, GNU/Linux, SysV
1299    The plain PPC test was not good for vxWorks, since PPC is defined on all
1300    CPUs there (e.g. on m68k too), as a constant that one is expected to
1301    compare CPU_FAMILY against.
1302
1303    At any rate, this was pretty unattractive and a bit fragile.  The use of
1304    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1305    getting the desired effect.
1306
1307    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1308    the system vendor compilers.  (Is that vendor compilers with inline asm,
1309    or what?)  */
1310
1311 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1312   && W_TYPE_SIZE == 32
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition for 32-bit
   POWER/PowerPC.  The low words are always added with a carry-setting add
   (the %I modifier selects the immediate form when bl is a suitable
   constant).  The high word uses addze when bh is the compile-time constant
   0, addme when it is the constant ~0, and adde otherwise.  */
1313 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1314   do {                                                                  \
1315     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1316       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1317              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1318     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1319       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1320              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1321     else                                                                \
1322       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1323              : "=r" (sh), "=&r" (sl)                                    \
1324              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1325   } while (0)
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction for 32-bit
   POWER/PowerPC.  The low word is always a carry-setting subtract-from (the
   %I modifier selects the immediate form when al is a suitable constant).
   The high-word instruction is chosen from compile-time knowledge of the
   high operands: subfze/subfme when ah is the constant 0/~0, addme/addze
   when bh is the constant 0/~0, and subfe in the general case.  */
1326 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1327   do {                                                                  \
1328     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1329       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                      \
1330                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1331     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1332       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                      \
1333                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1334     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1335       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                       \
1336                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1337     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1338       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                       \
1339                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1340     else                                                                \
1341       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                    \
1342                : "=r" (sh), "=&r" (sl)                                  \
1343                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1344   } while (0)
/* count_leading_zeros(count, x): single "cntlzw" instruction on X.
   COUNT_LEADING_ZEROS_0 is 32 for this implementation.  */
1345 #define count_leading_zeros(count, x) \
1346   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1347 #define COUNT_LEADING_ZEROS_0 32
1348 #if HAVE_HOST_CPU_FAMILY_powerpc
1349 #if __GMP_GNUC_PREREQ (4,4)
/* umul_ppmm(w1, w0, u, v): plain C version.  Widen U to UDItype, multiply by
   V, then hand the upper 32 bits of the product to w1 and the (truncated)
   product to w0.  */
1350 #define umul_ppmm(w1, w0, u, v) \
1351   do {                                                                  \
1352     UDItype __umul_prod = (UDItype)(u) * (v);                           \
1353     (w1) = __umul_prod >> 32;                                           \
1354     (w0) = __umul_prod;                                                 \
1355   } while (0)
1356 #endif
1357 #if !defined (umul_ppmm)
/* umul_ppmm(ph, pl, m0, m1): the high word of the product comes from
   "mulhwu", the low word from an ordinary C multiply.  Both use the local
   USItype copies __m0/__m1, so each macro argument is evaluated exactly once
   and is converted to USItype before reaching the asm (matching the 64-bit
   variant of this macro).  */
1358 #define umul_ppmm(ph, pl, m0, m1) \
1359   do {                                                                  \
1360     USItype __m0 = (m0), __m1 = (m1);                                   \
1361     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1362     (pl) = __m0 * __m1;                                                 \
1363   } while (0)
1364 #endif
1365 #define UMUL_TIME 15
/* smul_ppmm(ph, pl, m0, m1): signed variant; the high word comes from
   "mulhw".  The asm is fed the local SItype copies so each argument is
   evaluated once.  The low word is computed in USItype: its bits are the
   same as for the signed product, but unsigned arithmetic wraps instead of
   invoking undefined behaviour on overflow.  */
1366 #define smul_ppmm(ph, pl, m0, m1) \
1367   do {                                                                  \
1368     SItype __m0 = (m0), __m1 = (m1);                                    \
1369     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1370     (pl) = (USItype) __m0 * (USItype) __m1;                             \
1371   } while (0)
1372 #define SMUL_TIME 14
1373 #define UDIV_TIME 120
1374 #else
1375 #define UMUL_TIME 8
/* smul_ppmm(xh, xl, m0, m1): POWER (non-PowerPC) "mul" instruction.  xh is
   the explicit destination; xl is bound through the "=q" constraint
   (NOTE(review): presumably the MQ register -- confirm against the GCC rs6000
   constraint documentation).  */
1376 #define smul_ppmm(xh, xl, m0, m1) \
1377   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1378 #define SMUL_TIME 4
/* sdiv_qrnnd(q, r, nh, nl, d): POWER (non-PowerPC) "div" instruction.  q is
   the explicit destination; nl is preloaded into the "q"-constrained
   register that also returns r (matching constraint "1").  */
1379 #define sdiv_qrnnd(q, r, nh, nl, d) \
1380   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1381 #define UDIV_TIME 100
1382 #endif
1383 #endif /* 32-bit POWER architecture variants.  */
1384
1385 /* We should test _IBMR2 here when we add assembly support for the system
1386    vendor compilers.  */
1387 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1388 #if !defined (_LONG_LONG_LIMB)
1389 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1390    use adde etc only when not _LONG_LONG_LIMB.  */
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition for 64-bit PowerPC.
   Same structure as the 32-bit version, with every operand cast to UDItype:
   a carry-setting add for the low words, then addze / addme / adde for the
   high word depending on whether bh is the compile-time constant 0, the
   constant ~0, or anything else.  */
1391 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1392   do {                                                                  \
1393     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1394       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1395                : "=r" (sh), "=&r" (sl)                                  \
1396                : "r"  ((UDItype)(ah)),                                  \
1397                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1398     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1399       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1400                : "=r" (sh), "=&r" (sl)                                  \
1401                : "r"  ((UDItype)(ah)),                                  \
1402                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1403     else                                                                \
1404       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1405                : "=r" (sh), "=&r" (sl)                                  \
1406                : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),            \
1407                  "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));           \
1408   } while (0)
1409 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1410    This might seem strange, but gcc folds away the dead code late.  */
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction for 64-bit
   PowerPC, (sh,sl) = (ah,al) - (bh,bl).

   Fast path: when bl is a nonzero compile-time constant whose negation fits
   a 16-bit signed immediate, the low word is formed as al + (-bl) with
   "addic".  bl == 0 must not take this path: al + 0 never carries, so CA
   would be 0 and the following subf?e/add?e would subtract a borrow that did
   not happen (the subfc form leaves CA = 1 in that case).  al must be in a
   register on this path because addic takes a register as its source
   operand.

   General path: the low word uses subfc (or subfic when al is a suitable
   constant, via %I).

   On both paths the high-word instruction is subfze/subfme when ah is the
   constant 0/~0, addme/addze when bh is the constant 0/~0, and subfe
   otherwise.  */
1411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1412   do {                                                                  \
1413     if (__builtin_constant_p (bl) && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
1414         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1415           __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"                     \
1416                    : "=r" (sh), "=&r" (sl)                              \
1417                    :                       "r" ((UDItype)(bh)),         \
1418                      "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));    \
1419         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1420           __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"                     \
1421                    : "=r" (sh), "=&r" (sl)                              \
1422                    :                       "r" ((UDItype)(bh)),         \
1423                      "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));    \
1424         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1425           __asm__ ("addic %1,%3,%4\n\taddme %0,%2"                      \
1426                    : "=r" (sh), "=&r" (sl)                              \
1427                    : "r"  ((UDItype)(ah)),                              \
1428                      "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));    \
1429         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1430           __asm__ ("addic %1,%3,%4\n\taddze %0,%2"                      \
1431                    : "=r" (sh), "=&r" (sl)                              \
1432                    : "r"  ((UDItype)(ah)),                              \
1433                      "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));    \
1434         else                                                            \
1435           __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"                   \
1436                    : "=r" (sh), "=&r" (sl)                              \
1437                    : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),         \
1438                      "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));    \
1439     } else {                                                            \
1440         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1441           __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                  \
1442                    : "=r" (sh), "=&r" (sl)                              \
1443                    :                       "r" ((UDItype)(bh)),         \
1444                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1445         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1446           __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                  \
1447                    : "=r" (sh), "=&r" (sl)                              \
1448                    :                       "r" ((UDItype)(bh)),         \
1449                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1450         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1451           __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                   \
1452                    : "=r" (sh), "=&r" (sl)                              \
1453                    : "r"  ((UDItype)(ah)),                              \
1454                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1455         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1456           __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                   \
1457                    : "=r" (sh), "=&r" (sl)                              \
1458                    : "r"  ((UDItype)(ah)),                              \
1459                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1460         else                                                            \
1461           __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                \
1462                    : "=r" (sh), "=&r" (sl)                              \
1463                    : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),         \
1464                      "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));        \
1465     }                                                                   \
1466   } while (0)
1467 #endif /* ! _LONG_LONG_LIMB */
/* count_leading_zeros(count, x): single "cntlzd" instruction on X.
   COUNT_LEADING_ZEROS_0 is 64 for this implementation.  */
1468 #define count_leading_zeros(count, x) \
1469   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1470 #define COUNT_LEADING_ZEROS_0 64
1471 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
/* umul_ppmm(w1, w0, u, v): plain C version using a 128-bit (mode TI)
   product; upper 64 bits to w1, truncated product to w0.  Currently
   compiled out by the enclosing "#if 0".  */
1472 #define umul_ppmm(w1, w0, u, v) \
1473   do {                                                                  \
1474     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1475     __ll_UTItype __umul_prod = (__ll_UTItype)(u) * (v);                 \
1476     (w1) = __umul_prod >> 64;                                           \
1477     (w0) = __umul_prod;                                                 \
1478   } while (0)
1479 #endif
1480 #if !defined (umul_ppmm)
/* umul_ppmm(ph, pl, m0, m1): the arguments are first copied into UDItype
   locals so each is evaluated once.  "mulhdu" produces the high word of the
   product in ph; the low word pl is an ordinary C multiply of the copies.  */
1481 #define umul_ppmm(ph, pl, m0, m1) \
1482   do {                                                                  \
1483     UDItype __m0 = (m0), __m1 = (m1);                                   \
1484     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1485     (pl) = __m0 * __m1;                                                 \
1486   } while (0)
1487 #endif
1488 #define UMUL_TIME 15
/* smul_ppmm(ph, pl, m0, m1): signed variant; "mulhd" produces the high word
   from the DItype copies of the arguments.  The low word is computed in
   UDItype: its bits are the same as for the signed product, but unsigned
   arithmetic wraps instead of invoking undefined behaviour on overflow.  */
1489 #define smul_ppmm(ph, pl, m0, m1) \
1490   do {                                                                  \
1491     DItype __m0 = (m0), __m1 = (m1);                                    \
1492     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1493     (pl) = (UDItype) __m0 * (UDItype) __m1;                             \
1494   } while (0)
1495 #define SMUL_TIME 14  /* ??? */
1496 #define UDIV_TIME 120 /* ??? */
1497 #endif /* 64-bit PowerPC.  */
1498
1499 #if defined (__pyr__) && W_TYPE_SIZE == 32
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition on Pyramid.  "addw"
   adds bl into the register holding al (result sl); "addwc" then adds bh
   into the register holding ah (result sh).  */
1500 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1501   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1502            : "=r" (sh), "=&r" (sl)                                      \
1503            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1504              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction on Pyramid.
   "subw" subtracts bl from the register holding al (result sl); "subwb" then
   subtracts bh from the register holding ah (result sh).  */
1505 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1506   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1507            : "=r" (sh), "=&r" (sl)                                      \
1508            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1509              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1510 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
/* umul_ppmm(w1, w0, u, v): Pyramid "uemul".  U is first moved into one half
   of the UDItype result operand (selected by the %R0 operand modifier), then
   "uemul" multiplies by V in place.  The high and low words are read back
   through the {__h, __l} struct view.  */
1511 #define umul_ppmm(w1, w0, u, v) \
1512   ({union {UDItype __ll;                                                \
1513            struct {USItype __h, __l;} __i;                              \
1514           } __x;                                                        \
1515   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1516            : "=&r" (__x.__ll)                                           \
1517            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1518   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1519 #endif /* __pyr__ */
1520
1521 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition on RT/ROMP.  "a"
   adds bl into the register holding al (result sl); "ae" then adds bh into
   the register holding ah (result sh).  All operands must be in
   registers.  */
1522 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1523   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1524            : "=r" (sh), "=&r" (sl)                                      \
1525            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1526              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction on RT/ROMP.  "s"
   subtracts bl from the register holding al (result sl); "se" then subtracts
   bh from the register holding ah (result sh).  */
1527 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1528   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1529            : "=r" (sh), "=&r" (sl)                                      \
1530            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1531              "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* smul_ppmm(ph, pl, m0, m1): RT/ROMP multiply built from multiply-step
   instructions.  r2 is cleared, m0 is moved to special register r10 with
   "mts", and sixteen "m r2,%3" steps are issued with m1 as the operand.
   Finally r2 is copied to ph with "cas" and r10 is read back into pl with
   "mfs".  r2 is declared as clobbered.  */
1532 #define smul_ppmm(ph, pl, m0, m1) \
1533   __asm__ (                                                             \
1534        "s       r2,r2\n"                                                \
1535 "       mts r10,%2\n"                                                   \
1536 "       m       r2,%3\n"                                                \
1537 "       m       r2,%3\n"                                                \
1538 "       m       r2,%3\n"                                                \
1539 "       m       r2,%3\n"                                                \
1540 "       m       r2,%3\n"                                                \
1541 "       m       r2,%3\n"                                                \
1542 "       m       r2,%3\n"                                                \
1543 "       m       r2,%3\n"                                                \
1544 "       m       r2,%3\n"                                                \
1545 "       m       r2,%3\n"                                                \
1546 "       m       r2,%3\n"                                                \
1547 "       m       r2,%3\n"                                                \
1548 "       m       r2,%3\n"                                                \
1549 "       m       r2,%3\n"                                                \
1550 "       m       r2,%3\n"                                                \
1551 "       m       r2,%3\n"                                                \
1552 "       cas     %0,r2,r0\n"                                             \
1553 "       mfs     r10,%1"                                                 \
1554            : "=r" (ph), "=r" (pl)                                       \
1555            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1556            : "r2")
1557 #define UMUL_TIME 20
1558 #define UDIV_TIME 200
/* count_leading_zeros(count, x): the RT/ROMP "clz" is applied to one 16-bit
   half at a time.  If X is at least 0x10000 the upper half (x >> 16) is
   scanned; otherwise X itself is scanned and 16 is added to the result.
   Note that X is evaluated more than once.  */
1559 #define count_leading_zeros(count, x) \
1560   do {                                                                  \
1561     if ((x) >= 0x10000)                                                 \
1562       __asm__ ("clz     %0,%1"                                          \
1563                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1564     else                                                                \
1565       {                                                                 \
1566         __asm__ ("clz   %0,%1"                                          \
1567                  : "=r" (count) : "r" ((USItype)(x)));                  \
1568         (count) += 16;                                                  \
1569       }                                                                 \
1570   } while (0)
1571 #endif /* RT/ROMP */
1572
1573 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* umul_ppmm(w1, w0, u, v): SuperH "dmulu.l" leaves its result in the MACL and
   MACH registers; these are copied out with "sts" into w0 and w1
   respectively and are declared as clobbered.  */
1574 #define umul_ppmm(w1, w0, u, v) \
1575   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1576            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1577 #define UMUL_TIME 5
1578 #endif
1579
1580 #if defined (__sparc__) && W_TYPE_SIZE == 32
/* add_ssaaaa(sh, sl, ah, al, bh, bl): two-word addition on 32-bit SPARC.
   "addcc" adds the low words into sl and sets the condition codes; "addx"
   adds the high words into sh.  The first source of each instruction uses
   "rJ" with the %r modifier, the second may be an "I" immediate.
   __CLOBBER_CC (defined elsewhere in this file) declares the condition-code
   clobber.  */
1581 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1582   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1583            : "=r" (sh), "=&r" (sl)                                      \
1584            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1585            __CLOBBER_CC)
/* sub_ddmmss(sh, sl, ah, al, bh, bl): two-word subtraction on 32-bit SPARC.
   "subcc" computes the low word sl from al and bl and sets the condition
   codes; "subx" computes the high word sh from ah and bh.  */
1586 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1587   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1588            : "=r" (sh), "=&r" (sl)                                      \
1589            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1590            __CLOBBER_CC)
1591 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1592    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1593 #if defined (__sparc_v9__) || defined (__sparcv9)
1594 /* Perhaps we should use floating-point operations here?  */
1595 #if 0
1596 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1597    Perhaps we simply need to explicitly zero-extend the inputs?  */
/* umul_ppmm(w1, w0, u, v): v9 variant, currently compiled out by the
   enclosing "#if 0".  "mulx" forms the product in %g1 (declared clobbered);
   "srl ...,0" extracts the low 32 bits into w0 and "srlx ...,32" the high
   32 bits into w1.  */
1598 #define umul_ppmm(w1, w0, u, v) \
1599   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1600            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1601 #else
1602 /* Use v8 umul until above bug is fixed.  */
/* umul_ppmm(w1, w0, u, v): v8 "umul" writes the low word of the product to
   w0; the high word is then read from the %y register into w1.  */
1603 #define umul_ppmm(w1, w0, u, v) \
1604   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1605 #endif
1606 /* Use a plain v8 divide for v9.  */
/* udiv_qrnnd(q, r, n1, n0, d): n1 is moved into the %y register, three nops
   follow, then "udiv" divides using n0 and D and leaves the quotient in a
   temporary.  The remainder is recomputed in C as n0 - quotient * d.  */
1607 #define udiv_qrnnd(q, r, n1, n0, d) \
1608   do {                                                                  \
1609     USItype __q;                                                        \
1610     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1611              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1612     (r) = (n0) - __q * (d);                                             \
1613     (q) = __q;                                                          \
1614   } while (0)
1615 #else
1616 #if defined (__sparc_v8__)   /* gcc normal */                           \
1617   || defined (__sparcv8)     /* gcc solaris */                          \
1618   || HAVE_HOST_CPU_supersparc
1619 /* Don't match immediate range because, 1) it is not often useful,
1620    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1621    while we want to match a 13 bit interval, sign extended to 32 bits,
1622    but INTERPRETED AS UNSIGNED.  */
/* v8 hardware multiply: umul of u and v writes w0, then w1 is read from
   %y.  Both inputs use plain "r" constraints (no immediates), for the
   reason given in the comment above.  */
1623 #define umul_ppmm(w1, w0, u, v) \
1624   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1625 #define UMUL_TIME 5
1626
1627 #if HAVE_HOST_CPU_supersparc
1628 #define UDIV_TIME 60            /* SuperSPARC timing */
1629 #else
1630 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1631    dividends and will trap to the kernel for the rest. */
/* v8 hardware divide of (n1,n0) by d: n1 goes into %y, three nops follow,
   then udiv of n0 by d yields __q.  The remainder is computed in C as
   n0 - __q * d, so n0 and d are each expanded twice.  */
1632 #define udiv_qrnnd(q, r, n1, n0, d) \
1633   do {                                                                  \
1634     USItype __q;                                                        \
1635     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1636              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1637     (r) = (n0) - __q * (d);                                             \
1638     (q) = __q;                                                          \
1639   } while (0)
1640 #define UDIV_TIME 25
1641 #endif /* HAVE_HOST_CPU_supersparc */
1642
1643 #else /* ! __sparc_v8__ */
1644 #if defined (__sparclite__)
1645 /* This has hardware multiply but not divide.  It also has two additional
1646    instructions scan (ffs from high bit) and divscc.  */
/* SPARClite multiply: same umul / rd %y sequence as the v8 version; w0 is
   written by umul and w1 is read from %y.  */
1647 #define umul_ppmm(w1, w0, u, v) \
1648   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1649 #define UMUL_TIME 5
/* SPARClite division of (n1,n0) by d built from divscc steps.
   wr loads n1 into %y and tst %g0 sets the condition codes before the
   chain starts.  There are 32 divscc instructions: the first takes n0 as
   its source, the middle ones iterate on %g1, and the last one writes q.
   r is then read from %y, and the annulled "bl,a" branch executes the
   "add r,d,r" in its delay slot only when the branch is taken.
   %g1 and the condition codes are clobbered.  */
1650 #define udiv_qrnnd(q, r, n1, n0, d) \
1651   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1652 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1653 "       tst     %%g0\n"                                                 \
1654 "       divscc  %3,%4,%%g1\n"                                           \
1655 "       divscc  %%g1,%4,%%g1\n"                                         \
1656 "       divscc  %%g1,%4,%%g1\n"                                         \
1657 "       divscc  %%g1,%4,%%g1\n"                                         \
1658 "       divscc  %%g1,%4,%%g1\n"                                         \
1659 "       divscc  %%g1,%4,%%g1\n"                                         \
1660 "       divscc  %%g1,%4,%%g1\n"                                         \
1661 "       divscc  %%g1,%4,%%g1\n"                                         \
1662 "       divscc  %%g1,%4,%%g1\n"                                         \
1663 "       divscc  %%g1,%4,%%g1\n"                                         \
1664 "       divscc  %%g1,%4,%%g1\n"                                         \
1665 "       divscc  %%g1,%4,%%g1\n"                                         \
1666 "       divscc  %%g1,%4,%%g1\n"                                         \
1667 "       divscc  %%g1,%4,%%g1\n"                                         \
1668 "       divscc  %%g1,%4,%%g1\n"                                         \
1669 "       divscc  %%g1,%4,%%g1\n"                                         \
1670 "       divscc  %%g1,%4,%%g1\n"                                         \
1671 "       divscc  %%g1,%4,%%g1\n"                                         \
1672 "       divscc  %%g1,%4,%%g1\n"                                         \
1673 "       divscc  %%g1,%4,%%g1\n"                                         \
1674 "       divscc  %%g1,%4,%%g1\n"                                         \
1675 "       divscc  %%g1,%4,%%g1\n"                                         \
1676 "       divscc  %%g1,%4,%%g1\n"                                         \
1677 "       divscc  %%g1,%4,%%g1\n"                                         \
1678 "       divscc  %%g1,%4,%%g1\n"                                         \
1679 "       divscc  %%g1,%4,%%g1\n"                                         \
1680 "       divscc  %%g1,%4,%%g1\n"                                         \
1681 "       divscc  %%g1,%4,%%g1\n"                                         \
1682 "       divscc  %%g1,%4,%%g1\n"                                         \
1683 "       divscc  %%g1,%4,%%g1\n"                                         \
1684 "       divscc  %%g1,%4,%%g1\n"                                         \
1685 "       divscc  %%g1,%4,%0\n"                                           \
1686 "       rd      %%y,%1\n"                                               \
1687 "       bl,a 1f\n"                                                      \
1688 "       add     %1,%4,%1\n"                                             \
1689 "1:     ! End of inline udiv_qrnnd"                                     \
1690            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1691            : "%g1" __AND_CLOBBER_CC)
1692 #define UDIV_TIME 37
/* SPARClite leading-zero count using the scan instruction with x as the
   source and the constant 1 as the second operand; the result goes to
   count.  See the note below about an argument of 0.  */
1693 #define count_leading_zeros(count, x) \
1694   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1695 /* Early sparclites return 63 for an argument of 0, but they warn that future
1696    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1697    undefined.  */
1698 #endif /* __sparclite__ */
1699 #endif /* __sparc_v8__ */
1700 #endif /* __sparc_v9__ */
1701 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1702 #ifndef umul_ppmm
/* v7 multiply built from multiply-step instructions.
   wr loads u into %y.  The sra/and pair leaves in %g2 either u (when the
   top bit of v is set) or 0, and andcc clears %g1 and the condition codes.
   32 mulscc steps with v follow, then a final mulscc with 0.  The high
   word w1 is %g1 plus the %g2 correction; the low word w0 is read from %y.
   %g1, %g2 and the condition codes are clobbered.  The "%rI" constraint
   marks u as commutative with v.  */
1703 #define umul_ppmm(w1, w0, u, v) \
1704   __asm__ ("! Inlined umul_ppmm\n"                                      \
1705 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1706 "       sra     %3,31,%%g2      ! Don't move this insn\n"               \
1707 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
1708 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1709 "       mulscc  %%g1,%3,%%g1\n"                                         \
1710 "       mulscc  %%g1,%3,%%g1\n"                                         \
1711 "       mulscc  %%g1,%3,%%g1\n"                                         \
1712 "       mulscc  %%g1,%3,%%g1\n"                                         \
1713 "       mulscc  %%g1,%3,%%g1\n"                                         \
1714 "       mulscc  %%g1,%3,%%g1\n"                                         \
1715 "       mulscc  %%g1,%3,%%g1\n"                                         \
1716 "       mulscc  %%g1,%3,%%g1\n"                                         \
1717 "       mulscc  %%g1,%3,%%g1\n"                                         \
1718 "       mulscc  %%g1,%3,%%g1\n"                                         \
1719 "       mulscc  %%g1,%3,%%g1\n"                                         \
1720 "       mulscc  %%g1,%3,%%g1\n"                                         \
1721 "       mulscc  %%g1,%3,%%g1\n"                                         \
1722 "       mulscc  %%g1,%3,%%g1\n"                                         \
1723 "       mulscc  %%g1,%3,%%g1\n"                                         \
1724 "       mulscc  %%g1,%3,%%g1\n"                                         \
1725 "       mulscc  %%g1,%3,%%g1\n"                                         \
1726 "       mulscc  %%g1,%3,%%g1\n"                                         \
1727 "       mulscc  %%g1,%3,%%g1\n"                                         \
1728 "       mulscc  %%g1,%3,%%g1\n"                                         \
1729 "       mulscc  %%g1,%3,%%g1\n"                                         \
1730 "       mulscc  %%g1,%3,%%g1\n"                                         \
1731 "       mulscc  %%g1,%3,%%g1\n"                                         \
1732 "       mulscc  %%g1,%3,%%g1\n"                                         \
1733 "       mulscc  %%g1,%3,%%g1\n"                                         \
1734 "       mulscc  %%g1,%3,%%g1\n"                                         \
1735 "       mulscc  %%g1,%3,%%g1\n"                                         \
1736 "       mulscc  %%g1,%3,%%g1\n"                                         \
1737 "       mulscc  %%g1,%3,%%g1\n"                                         \
1738 "       mulscc  %%g1,%3,%%g1\n"                                         \
1739 "       mulscc  %%g1,%3,%%g1\n"                                         \
1740 "       mulscc  %%g1,%3,%%g1\n"                                         \
1741 "       mulscc  %%g1,0,%%g1\n"                                          \
1742 "       add     %%g1,%%g2,%0\n"                                         \
1743 "       rd      %%y,%1"                                                 \
1744            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1745            : "%g1", "%g2" __AND_CLOBBER_CC)
1746 #define UMUL_TIME 39            /* 39 instructions */
1747 #endif
1748 #ifndef udiv_qrnnd
1749 #ifndef LONGLONG_STANDALONE
/* No inline divide is available here, so hand the work to the external
   __MPN(udiv_qrnnd) routine: it returns the quotient and stores the
   remainder through its first argument.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __udiv_v7_rem;                                               \
    (q) = __MPN(udiv_qrnnd) (&__udiv_v7_rem, (n1), (n0), (d));          \
    (r) = __udiv_v7_rem;                                                \
  } while (0)
/* External routine used by the udiv_qrnnd macro above: takes a pointer for
   the remainder followed by n1, n0 and d, and returns the quotient.  */
1755 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1756 #ifndef UDIV_TIME
1757 #define UDIV_TIME 140
1758 #endif
1759 #endif /* LONGLONG_STANDALONE */
1760 #endif /* udiv_qrnnd */
1761 #endif /* __sparc__ */
1762
1763 #if defined (__sparc__) && W_TYPE_SIZE == 64
/* Two-word add for 64-bit SPARC in three instructions.
   addcc adds the low words (al, bl) into sl.  addccc then adds the upper
   32 bits of al and bl (operands 6 and 7) with carry and discards the sum
   into %g0, keeping only the condition codes.  Finally addc adds ah and bh
   with carry into sh.
   NOTE(review): presumably the middle step exists because addc consumes
   the 32-bit carry rather than the 64-bit one -- confirm against the ISA.
   al and bl are each expanded twice (full value and >> 32).  */
1764 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1765   __asm__ (                                                             \
1766        "addcc   %r4,%5,%1\n"                                            \
1767       " addccc  %r6,%7,%%g0\n"                                          \
1768       " addc    %r2,%3,%0"                                              \
1769        : "=r" (sh), "=&r" (sl)                                          \
1770        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),                   \
1771          "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),                   \
1772          "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)        \
1773            __CLOBBER_CC)
/* Two-word subtract for 64-bit SPARC, mirroring add_ssaaaa above:
   subcc on the low words writes sl, subccc on the upper 32 bits of al and
   bl discards its result into %g0 (only the condition codes are kept), and
   subc on ah and bh writes sh.  Unlike the add, none of the inputs carries
   the commutative '%' modifier.  al and bl are each expanded twice.  */
1774 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1775   __asm__ (                                                             \
1776        "subcc   %r4,%5,%1\n"                                            \
1777       " subccc  %r6,%7,%%g0\n"                                          \
1778       " subc    %r2,%3,%0"                                              \
1779        : "=r" (sh), "=&r" (sl)                                          \
1780        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),                    \
1781          "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),                    \
1782          "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)         \
1783            __CLOBBER_CC)
1784 #if __VIS__ >= 0x300
/* When __VIS__ >= 0x300, replace the three-instruction add_ssaaaa defined
   above with a two-instruction form: addcc on the low words writes sl and
   addxc on the high words writes sh.  Here bh uses the "rJ" constraint
   (register or zero) rather than "rI".  */
1785 #undef add_ssaaaa
1786 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1787   __asm__ (                                                             \
1788        "addcc   %r4, %5, %1\n"                                          \
1789       " addxc   %r2, %r3, %0"                                           \
1790           : "=r" (sh), "=&r" (sl)                                       \
1791        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),                   \
1792          "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
/* 64x64 multiply: the low word pl is an ordinary C multiplication of the
   two operands (copied into UDItype locals so each argument is evaluated
   once); the high word ph comes from the umulxhi instruction applied to
   the same two locals.  */
1793 #define umul_ppmm(ph, pl, m0, m1) \
1794   do {                                                                  \
1795     UDItype __m0 = (m0), __m1 = (m1);                                   \
1796     (pl) = __m0 * __m1;                                                 \
1797     __asm__ ("umulxhi\t%2, %1, %0"                                      \
1798              : "=r" (ph)                                                \
1799              : "%r" (__m0), "r" (__m1));                                \
1800   } while (0)
/* Leading-zero count via a single lzd instruction from x into count.  */
1801 #define count_leading_zeros(count, x) \
1802   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1803 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1804 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1805 #endif
1806 #endif
1807
1808 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
/* VAX two-word add.  The matching constraints "0" and "%1" place ah in the
   sh operand and al in the sl operand; addl2 then adds bl into sl and adwc
   adds bh into sh.  */
1809 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1810   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
1811            : "=g" (sh), "=&g" (sl)                                      \
1812            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1813              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* VAX two-word subtract.  ah and al start out in the sh and sl operands
   (matching constraints "0" and "1"); subl2 subtracts bl from sl and sbwc
   subtracts bh from sh.  */
1814 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1815   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
1816            : "=g" (sh), "=&g" (sl)                                      \
1817            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1818              "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* VAX multiply through emul.  m0 and m1 are copied into USItype locals,
   emul (with a literal 0 as its third operand) writes the double-word
   result into the union member __ll, and the two halves are read back
   through the overlaid struct: __l (declared first) becomes xl and __h
   becomes xh.  */
1819 #define smul_ppmm(xh, xl, m0, m1) \
1820   do {                                                                  \
1821     union {UDItype __ll;                                                \
1822            struct {USItype __l, __h;} __i;                              \
1823           } __x;                                                        \
1824     USItype __m0 = (m0), __m1 = (m1);                                   \
1825     __asm__ ("emul %1,%2,$0,%0"                                         \
1826              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1827     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1828   } while (0)
/* VAX divide through ediv.  n1 and n0 are packed into the high (__h) and
   low (__l) halves of a double-word union, and ediv is issued with d and
   that double word as inputs and q and r as outputs.
   Note that n1 and n0 are used without surrounding parentheses in the two
   assignments, so callers should pass simple expressions.  */
1829 #define sdiv_qrnnd(q, r, n1, n0, d) \
1830   do {                                                                  \
1831     union {DItype __ll;                                                 \
1832            struct {SItype __l, __h;} __i;                               \
1833           } __x;                                                        \
1834     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
1835     __asm__ ("ediv %3,%2,%0,%1"                                         \
1836              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1837   } while (0)
1838 #if 0
1839 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1840    8800 maybe). */
/* Disabled (#if 0): trailing-zero count through the ffs instruction, with
   operands 0 and 31, x cast to USItype as the source, and count as the
   destination.  */
1841 #define count_trailing_zeros(count,x)                                   \
1842   do {                                                                  \
1843     __asm__ ("ffs 0, 31, %1, %0"                                        \
1844              : "=g" (count)                                             \
1845              : "g" ((USItype) (x)));                                    \
1846   } while (0)
1847 #endif
1848 #endif /* vax */
1849
1850 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* Z8000 two-word add (16-bit words).  ah and al start in the sh and sl
   operands through the matching constraints "0" and "%1"; add adds bl into
   sl and adc adds bh into sh.  */
1851 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1852   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
1853            : "=r" (sh), "=&r" (sl)                                      \
1854            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
1855              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* Z8000 two-word subtract.  ah and al start in the sh and sl operands
   (matching constraints "0" and "1"); sub subtracts bl from sl and sbc
   subtracts bh from sh.  */
1856 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1857   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
1858            : "=r" (sh), "=&r" (sl)                                      \
1859            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
1860              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* Z8000 multiply.  The mult instruction produces the two result words in
   __x.__i.__h and __x.__i.__l, which are copied to xh and xl.  The high
   word is then adjusted in C: for each operand whose bit 15 is set
   (arithmetic shift of the value viewed as signed int), the other operand
   is added to xh.
   NOTE(review): this adjustment looks like the usual signed-to-unsigned
   high-word correction, implying mult is a signed multiply -- confirm.
   m0 and m1 are expanded twice each: once to initialise __m0/__m1 and once
   as asm operands.  The union member __ll is not referenced.  */
1861 #define umul_ppmm(xh, xl, m0, m1) \
1862   do {                                                                  \
1863     union {long int __ll;                                               \
1864            struct {unsigned int __h, __l;} __i;                         \
1865           } __x;                                                        \
1866     unsigned int __m0 = (m0), __m1 = (m1);                              \
1867     __asm__ ("mult      %S0,%H3"                                        \
1868              : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
1869              : "%1" (m0), "rQR" (m1));                                  \
1870     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1871     (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
1872              + (((signed int) __m1 >> 15) & __m0));                     \
1873   } while (0)
1874 #endif /* __z8000__ */
1875
1876 #endif /* __GNUC__ */
1877
1878 #endif /* NO_ASM */
1879
1880
1881 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1882 #if !defined (umul_ppmm) && defined (__umulsidi3)
/* Build umul_ppmm on top of an already available __umulsidi3: the
   double-word product is split into its high word (ph) and low word (pl).
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else), and the output
   arguments are parenthesized.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDWtype __ll = __umulsidi3 (m0, m1);                                \
    (ph) = (UWtype) (__ll >> W_TYPE_SIZE);                              \
    (pl) = (UWtype) __ll;                                               \
  } while (0)
1889 #endif
1890
1891 #if !defined (__umulsidi3)
/* Fallback __umulsidi3: run umul_ppmm and glue the two result words back
   together into one UDWtype value (high word shifted up by W_TYPE_SIZE,
   low word OR-ed in).  Uses a GNU statement expression.  */
#define __umulsidi3(u, v) \
  ({ UWtype __umulsidi3_hi, __umulsidi3_lo;                             \
     UDWtype __umulsidi3_p;                                             \
     umul_ppmm (__umulsidi3_hi, __umulsidi3_lo, u, v);                  \
     __umulsidi3_p = (UDWtype) __umulsidi3_hi << W_TYPE_SIZE;           \
     __umulsidi3_p |= __umulsidi3_lo;                                   \
     __umulsidi3_p; })
1896 #endif
1897
1898
/* Linkage specifier placed after "extern" in the declarations below:
   expands to "C" under C++ and to nothing in C.  */
1899 #if defined (__cplusplus)
1900 #define __longlong_h_C "C"
1901 #else
1902 #define __longlong_h_C
1903 #endif
1904
1905 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1906    forms have "reversed" arguments, meaning the pointer is last, which
1907    sometimes allows better parameter passing, in particular on 64-bit
1908    hppa. */
1909
/* External multiply routine.  As used by the macro below, it returns the
   high product word and stores the low word through its pointer (first)
   argument.  */
1910 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1911 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1912
1913 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1914   && ! defined (LONGLONG_STANDALONE)
/* umul_ppmm implemented by calling mpn_umul_ppmm: both operands are
   converted to UWtype, the call's return value becomes the high word and
   the value stored through the pointer becomes the low word.  */
#define umul_ppmm(wh, wl, u, v)                                         \
  do {                                                                  \
    UWtype __umul_mpn_a = (UWtype) (u);                                 \
    UWtype __umul_mpn_b = (UWtype) (v);                                 \
    UWtype __umul_mpn_lo;                                               \
    (wh) = mpn_umul_ppmm (&__umul_mpn_lo, __umul_mpn_a, __umul_mpn_b);  \
    (wl) = __umul_mpn_lo;                                               \
  } while (0)
1921 #endif
1922
/* "Reversed" form of the external multiply routine: the pointer that
   receives the low word is the last argument; the return value is used as
   the high word by the macro below.  */
1923 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1924 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1925
1926 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1927   && ! defined (LONGLONG_STANDALONE)
/* umul_ppmm implemented by calling mpn_umul_ppmm_r (pointer argument
   last): the return value is the high word, the stored value the low
   word.  Both operands are converted to UWtype first.  */
#define umul_ppmm(wh, wl, u, v)                                         \
  do {                                                                  \
    UWtype __umul_mpnr_a = (UWtype) (u);                                \
    UWtype __umul_mpnr_b = (UWtype) (v);                                \
    UWtype __umul_mpnr_lo;                                              \
    (wh) = mpn_umul_ppmm_r (__umul_mpnr_a, __umul_mpnr_b,               \
                            &__umul_mpnr_lo);                           \
    (wl) = __umul_mpnr_lo;                                              \
  } while (0)
1934 #endif
1935
/* External divide routine.  As used by the macro below, it returns the
   quotient and stores the remainder through its pointer (first)
   argument; the remaining arguments are n1, n0 and d.  */
1936 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1937 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1938
1939 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1940   && ! defined (LONGLONG_STANDALONE)
/* udiv_qrnnd implemented by calling mpn_udiv_qrnnd: the return value is
   the quotient and the value stored through the pointer is the remainder.
   Every input, including d, is parenthesized before the UWtype cast so
   that an expression argument is converted as a whole rather than having
   the cast bind to its first operand only.  */
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd_r;                                              \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,                              \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) (d));  \
    (r) = __udiv_qrnnd_r;                                               \
  } while (0)
1948 #endif
1949
/* "Reversed" form of the external divide routine: n1, n0 and d come first
   and the pointer that receives the remainder is last; the return value
   is used as the quotient by the macro below.  */
1950 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1951 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1952
1953 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1954   && ! defined (LONGLONG_STANDALONE)
/* udiv_qrnnd implemented by calling mpn_udiv_qrnnd_r (pointer argument
   last): the return value is the quotient and the stored value is the
   remainder.  Every input, including d, is parenthesized before the
   UWtype cast so that an expression argument is converted as a whole.  */
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd_r;                                              \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
                            &__udiv_qrnnd_r);                           \
    (r) = __udiv_qrnnd_r;                                               \
  } while (0)
1962 #endif
1963
1964
1965 /* If this machine has no inline assembler, use C macros.  */
1966
1967 #if !defined (add_ssaaaa)
/* Portable two-word add: (sh,sl) = (ah,al) + (bh,bl).
   The low-word sum wraps exactly when it ends up smaller than al, which
   gives the carry into the high word.  sl is stored last so that the
   outputs may name the same objects as the inputs.  al is expanded
   twice.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __add_lo = (al) + (bl);                                      \
    int __add_cy = __add_lo < (al);                                     \
    (sh) = (ah) + (bh) + __add_cy;                                      \
    (sl) = __add_lo;                                                    \
  } while (0)
1975 #endif
1976
1977 #if !defined (sub_ddmmss)
/* Portable two-word subtract: (sh,sl) = (ah,al) - (bh,bl).
   A borrow out of the low word occurs exactly when al < bl.  sl is stored
   last so that the outputs may name the same objects as the inputs.  al
   and bl are each expanded twice.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    int __sub_bw = (al) < (bl);                                         \
    UWtype __sub_lo = (al) - (bl);                                      \
    (sh) = (ah) - (bh) - __sub_bw;                                      \
    (sl) = __sub_lo;                                                    \
  } while (0)
1985 #endif
1986
1987 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1988    smul_ppmm.  */
1989 #if !defined (umul_ppmm) && defined (smul_ppmm)
/* Unsigned multiply derived from the signed one.  smul_ppmm supplies the
   low word directly; its high word is corrected by adding v when the top
   bit of u is set and adding u when the top bit of v is set.  */
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __umul_s_u = (u), __umul_s_v = (v);                          \
    UWtype __umul_s_hi;                                                 \
    smul_ppmm (__umul_s_hi, w0, __umul_s_u, __umul_s_v);                \
    (w1) = __umul_s_hi                                                  \
           + ((__umul_s_u >> (W_TYPE_SIZE - 1)) ? __umul_s_v : 0)       \
           + ((__umul_s_v >> (W_TYPE_SIZE - 1)) ? __umul_s_u : 0);      \
  } while (0)
1998 #endif
1999
/* Last resort: umul_ppmm in plain C, built from four half-word products.

   Each operand is split into a low and a high half; ll, lh, hl and hh are
   the products low*low, low*high, high*low and high*high.  The two cross
   products are accumulated into a middle term whose possible wrap-around
   is carried into hh at the position of __ll_B.

   For reference, when this code is used for squaring (ie. u and v
   identical expressions), gcc recognises that the two cross products lh
   and hl are the same and generates 3 multiplies, not 4.  The subsequent
   additions could be optimized a bit, but the only place GMP currently
   uses such a square is mpn_sqr_basecase, and chips obliged to use this
   generic C umul will have plenty of worse performance problems than a
   couple of extra instructions on the diagonal of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __umul_c_u = (u), __umul_c_v = (v);                          \
    UHWtype __umul_c_ul = __ll_lowpart (__umul_c_u);                    \
    UHWtype __umul_c_uh = __ll_highpart (__umul_c_u);                   \
    UHWtype __umul_c_vl = __ll_lowpart (__umul_c_v);                    \
    UHWtype __umul_c_vh = __ll_highpart (__umul_c_v);                   \
    UWtype __umul_c_ll = (UWtype) __umul_c_ul * __umul_c_vl;            \
    UWtype __umul_c_lh = (UWtype) __umul_c_ul * __umul_c_vh;            \
    UWtype __umul_c_hl = (UWtype) __umul_c_uh * __umul_c_vl;            \
    UWtype __umul_c_hh = (UWtype) __umul_c_uh * __umul_c_vh;            \
    UWtype __umul_c_mid;                                                \
                                                                        \
    /* adding the high half of ll cannot wrap */                        \
    __umul_c_mid = __umul_c_lh + __ll_highpart (__umul_c_ll);           \
    /* adding the other cross product can */                            \
    __umul_c_mid += __umul_c_hl;                                        \
    if (__umul_c_mid < __umul_c_hl)                                     \
      __umul_c_hh += __ll_B;    /* carry lands one half-word up */      \
                                                                        \
    (w1) = __umul_c_hh + __ll_highpart (__umul_c_mid);                  \
    (w0) = (__umul_c_mid << W_TYPE_SIZE/2) + __ll_lowpart (__umul_c_ll);\
  } while (0)
#endif
2036
2037 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2038    exist in one form or another).  */
2039 #if !defined (smul_ppmm)
/* Signed multiply derived from the unsigned one.  umul_ppmm supplies the
   low word directly; its high word is corrected by subtracting v when the
   top bit of u is set and subtracting u when the top bit of v is set.  */
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __smul_u_a = (u), __smul_u_b = (v);                          \
    UWtype __smul_u_hi;                                                 \
    umul_ppmm (__smul_u_hi, w0, __smul_u_a, __smul_u_b);                \
    (w1) = __smul_u_hi                                                  \
           - ((__smul_u_a >> (W_TYPE_SIZE - 1)) ? __smul_u_b : 0)       \
           - ((__smul_u_b >> (W_TYPE_SIZE - 1)) ? __smul_u_a : 0);      \
  } while (0)
2048 #endif
2049
2050 /* Define this unconditionally, so it can be used for debugging.  */
/* Plain C division of the two-word value (n1,n0) by d, producing quotient
   q and remainder r, working in half-word digits.

   Preconditions (checked with ASSERT): d != 0 and n1 < d.

   d is split into its high half __d1 and low half __d0.  The high
   quotient digit __q1 is estimated as n1 / __d1; the estimate is then
   decremented (adding d back to the partial remainder) at most twice,
   with the second adjustment skipped if the first addition wrapped
   around.  The same steps are repeated with the resulting partial
   remainder and the low half of n0 to obtain __q0.  Finally
   q = __q1 * __ll_B | __q0 and r = __r0.

   NOTE(review): limiting the correction to two steps presumably relies on
   d being normalized (top bit set); the fallback that selects this macro
   defines UDIV_NEEDS_NORMALIZATION -- confirm.
   n1, n0 and d are each expanded several times, so callers should pass
   side-effect-free expressions.  */
2051 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2052   do {                                                                  \
2053     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
2054                                                                         \
2055     ASSERT ((d) != 0);                                                  \
2056     ASSERT ((n1) < (d));                                                \
2057                                                                         \
2058     __d1 = __ll_highpart (d);                                           \
2059     __d0 = __ll_lowpart (d);                                            \
2060                                                                         \
2061     __q1 = (n1) / __d1;                                                 \
2062     __r1 = (n1) - __q1 * __d1;                                          \
2063     __m = __q1 * __d0;                                                  \
2064     __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
2065     if (__r1 < __m)                                                     \
2066       {                                                                 \
2067         __q1--, __r1 += (d);                                            \
2068         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2069           if (__r1 < __m)                                               \
2070             __q1--, __r1 += (d);                                        \
2071       }                                                                 \
2072     __r1 -= __m;                                                        \
2073                                                                         \
2074     __q0 = __r1 / __d1;                                                 \
2075     __r0 = __r1  - __q0 * __d1;                                         \
2076     __m = __q0 * __d0;                                                  \
2077     __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
2078     if (__r0 < __m)                                                     \
2079       {                                                                 \
2080         __q0--, __r0 += (d);                                            \
2081         if (__r0 >= (d))                                                \
2082           if (__r0 < __m)                                               \
2083             __q0--, __r0 += (d);                                        \
2084       }                                                                 \
2085     __r0 -= __m;                                                        \
2086                                                                         \
2087     (q) = __q1 * __ll_B | __q0;                                         \
2088     (r) = __r0;                                                         \
2089   } while (0)
2090
2091 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2092    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2093 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
/* Unsigned divide routed through __MPN(udiv_w_sdiv): the call returns the
   quotient and stores the remainder through its first argument.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __udiv_w_rem;                                                \
    (q) = __MPN(udiv_w_sdiv) (&__udiv_w_rem, nh, nl, d);                \
    (r) = __udiv_w_rem;                                                 \
  } while (0)
/* Routine used by the udiv_qrnnd macro above: takes a pointer for the
   remainder followed by the high word, low word and divisor, and returns
   the quotient.  */
2100 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2101 #endif
2102
2103 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2104 #if !defined (udiv_qrnnd)
/* Final fallback: alias udiv_qrnnd to the plain C __udiv_qrnnd_c and set
   UDIV_NEEDS_NORMALIZATION to 1 alongside it.  */
2105 #define UDIV_NEEDS_NORMALIZATION 1
2106 #define udiv_qrnnd __udiv_qrnnd_c
2107 #endif
2108
2109 #if !defined (count_leading_zeros)
/* Plain C leading-zero count using the __clz_tab lookup table.
   A shift amount is chosen so that the value shifted right by it is small
   enough to index __clz_tab: for 32-bit words by comparing against
   1 << __BITS4, 1 << 2*__BITS4 and 1 << 3*__BITS4, otherwise by scanning
   down from the top in 8-bit steps until a non-zero byte is found.  The
   count is then derived from the shift amount and the table entry.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __clz_x = (x);                                               \
    UWtype __clz_sh;                                                    \
                                                                        \
    if (W_TYPE_SIZE == 32)                                              \
      {                                                                 \
        if (__clz_x < ((UWtype) 1 << __BITS4))                          \
          __clz_sh = 1;                                                 \
        else if (__clz_x < ((UWtype) 1 << 2*__BITS4))                   \
          __clz_sh = __BITS4 + 1;                                       \
        else if (__clz_x < ((UWtype) 1 << 3*__BITS4))                   \
          __clz_sh = 2*__BITS4 + 1;                                     \
        else                                                            \
          __clz_sh = 3*__BITS4 + 1;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        __clz_sh = W_TYPE_SIZE - 8;                                     \
        while (__clz_sh > 0 && ((__clz_x >> __clz_sh) & 0xff) == 0)     \
          __clz_sh -= 8;                                                \
        ++__clz_sh;                                                     \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __clz_sh                                \
              - __clz_tab[__clz_x >> __clz_sh];                         \
  } while (0)
2132 /* This version gives a well-defined value for zero. */
/* COUNT_LEADING_ZEROS_0: the value associated with a zero argument for the
   C version above (NOTE(review): this matches the macro only if
   __clz_tab[0] is 1 -- the table is defined elsewhere; confirm).
   COUNT_LEADING_ZEROS_NEED_CLZ_TAB: requests the __clz_tab declaration
   further down.
   COUNT_LEADING_ZEROS_SLOW: makes count_trailing_zeros below use its own
   plain C version instead of going through count_leading_zeros.  */
2133 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2134 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2135 #define COUNT_LEADING_ZEROS_SLOW
2136 #endif
2137
2138 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2139 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2140 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2141 #endif
2142
2143 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
/* 129-entry lookup table used by the plain C count_leading_zeros and
   count_trailing_zeros macros; only declared here, defined elsewhere.  */
2144 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2145 #endif
2146
2147 #if !defined (count_trailing_zeros)
2148 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2149 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
/* Trailing-zero count expressed through count_leading_zeros: x & -x keeps
   only the lowest set bit of x, and the position of that bit is
   W_TYPE_SIZE - 1 minus its leading-zero count.  x must be non-zero
   (checked with ASSERT).  */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_in = (x);                                              \
    UWtype __ctz_lsb, __ctz_lz;                                         \
    ASSERT (__ctz_in != 0);                                             \
    __ctz_lsb = __ctz_in & -__ctz_in;                                   \
    count_leading_zeros (__ctz_lz, __ctz_lsb);                          \
    (count) = W_TYPE_SIZE - 1 - __ctz_lz;                               \
  } while (0)
2158 #else
2159 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2160    We use clz_tab without ado, since the C count_leading_zeros above will have
2161    pulled it in.  */
2162 #define count_trailing_zeros(count, x)                                  \
2163   do {                                                                  \
2164     UWtype __ctz_x = (x);                                               \
2165     int __ctz_c;                                                        \
2166                                                                         \
2167     if (LIKELY ((__ctz_x & 0xff) != 0))                                 \
2168       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;                      \
2169     else                                                                \
2170       {                                                                 \
2171         for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)  \
2172           {                                                             \
2173             __ctz_x >>= 8;                                              \
2174             if (LIKELY ((__ctz_x & 0xff) != 0))                         \
2175               break;                                                    \
2176           }                                                             \
2177                                                                         \
2178         (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];              \
2179       }                                                                 \
2180   } while (0)
2181 #endif
2182 #endif
2183
/* Default to 0 unless UDIV_NEEDS_NORMALIZATION was already defined before
   this point.
   NOTE(review): presumably a non-zero value means udiv_qrnnd wants a
   normalized divisor (most significant bit set) -- confirm against the
   udiv_qrnnd definitions.  */
2184 #ifndef UDIV_NEEDS_NORMALIZATION
2185 #define UDIV_NEEDS_NORMALIZATION 0
2186 #endif
2187
2188 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2189    that hence the latter should always be used.  Defaults to 0 (no) when
        nothing before this point has defined it.  */
2190 #ifndef UDIV_PREINV_ALWAYS
2191 #define UDIV_PREINV_ALWAYS 0
2192 #endif
2193
2194 /* Give defaults for UMUL_TIME and UDIV_TIME.  UMUL_TIME falls back to 1.
        UDIV_TIME falls back to the UMUL_TIME macro itself rather than to a
        number, so it expands to whatever UMUL_TIME is defined as at the point
        where UDIV_TIME is used.
        NOTE(review): presumably these are relative cost estimates; the units
        are not stated here -- confirm.  */
2195 #ifndef UMUL_TIME
2196 #define UMUL_TIME 1
2197 #endif
2198
2199 #ifndef UDIV_TIME
2200 #define UDIV_TIME UMUL_TIME
2201 #endif