src/longlong.h

   1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
   2
   3 Copyright 1991-2013 Free Software Foundation, Inc.
   4
   5 This file is free software; you can redistribute it and/or modify it under the
   6 terms of the GNU Lesser General Public License as published by the Free
   7 Software Foundation; either version 3 of the License, or (at your option) any
   8 later version.
   9
  10 This file is distributed in the hope that it will be useful, but WITHOUT ANY
  11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  12 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
  13 details.
  14
  15 You should have received a copy of the GNU Lesser General Public License
  16 along with this file.  If not, see http://www.gnu.org/licenses/.  */
  17
  18 /* You have to define the following before including this file:
  19
  20    UWtype -- An unsigned type, default type for operations (typically a "word")
  21    UHWtype -- An unsigned type, at least half the size of UWtype
  22    UDWtype -- An unsigned type, at least twice as large as UWtype
  23    W_TYPE_SIZE -- size in bits of UWtype
  24
  25    SItype, USItype -- Signed and unsigned 32 bit types
  26    DItype, UDItype -- Signed and unsigned 64 bit types
  27
  28    On a 32 bit machine UWtype should typically be USItype;
  29    on a 64 bit machine, UWtype should typically be UDItype.
  30
  31    Optionally, define:
  32
  33    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
  34    NO_ASM -- Disable inline asm
  35
  36
  37    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
  38    need to include gmp.h and gmp-impl.h, or certain things might not work as
  39    expected.
  40 */
  41
  /* Half-word helpers.  __BITS4 is a quarter of the word width in bits and
     __ll_B is 2^(W_TYPE_SIZE/2), the base of a half-word digit.
     __ll_lowpart and __ll_highpart yield the lower and upper half of T after
     converting it to UWtype.  */
  #define __BITS4 (W_TYPE_SIZE >> 2)
  #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE >> 1))
  #define __ll_lowpart(t) ((__ll_B - 1) & (UWtype) (t))
  #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE >> 1))
  46
  /* Symbol-name wrapper: prefixes X with "__" unless the includer has already
     supplied its own __MPN, so that different libraries using this file do not
     end up sharing symbols.  */
  #if ! defined (__MPN)
  #define __MPN(x) __##x
  #endif
  52
  53 /* Define auxiliary asm macros.
  54
  55    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
  56    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
  57    word product in HIGH_PROD and LOW_PROD.
  58
  59    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
  60    UDWtype product.  This is just a variant of umul_ppmm.
  61
  62    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  63    denominator) divides a UDWtype, composed by the UWtype integers
  64    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
  65    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
  66    than DENOMINATOR for correct operation.  If, in addition, the most
  67    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
  68    UDIV_NEEDS_NORMALIZATION is defined to 1.
  69
  70    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
  71    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
  72    is rounded towards 0.
  73
  74    5) count_leading_zeros(count, x) counts the number of zero-bits from the
  75    msb to the first non-zero bit in the UWtype X.  This is the number of
  76    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
  77    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
  78
  79    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
  80    from the least significant end.
  81
  82    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
  83    high_addend_2, low_addend_2) adds two two-word UWtype integers,
  84    composed by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and
  85    LOW_ADDEND_2 respectively.  The result is placed in HIGH_SUM and LOW_SUM.
  86    Overflow (i.e. carry out) is not stored anywhere, and is lost.
  87
  88    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
  89    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
  90    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
  91    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
  92    and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
  93    and is lost.
  94
  95    If any of these macros are left undefined for a particular CPU,
  96    C macros are used.
  97
  98
  99    Notes:
 100
 101    For add_ssaaaa the two high and two low addends can both commute, but
 102    unfortunately gcc only supports one "%" commutative in each asm block.
 103    This has always been so but is only documented in recent versions
 104    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
 105    compiler error in certain rare circumstances.
 106
 107    Apparently it was only the last "%" that was ever actually respected, so
 108    the code has been updated to leave just that.  Clearly there's a free
 109    choice whether high or low should get it, if there's a reason to favour
 110    one over the other.  Also obviously when the constraints on the two
 111    operands are identical there's no benefit to the reloader in any "%" at
 112    all.
 113
 114    */
 115
 116 /* The CPUs come in alphabetical order below.
 117
 118    Please add support for more CPUs here, or improve the current support
 119    for the CPUs below!  */
 120
 121
 122 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
 123    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
 124    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
 125    __builtin_ctzll.
 126
 127    These builtins are only used after checking what code comes out: on some
 128    chips they're merely libgcc calls, in which case we instead want an
 129    inline (either asm or generic C).
 130
 131    These builtins are better than an asm block of the same insn, since an
 132    asm block doesn't give gcc any information about scheduling or resource
 133    usage.  We keep an asm block for use on prior versions of gcc though.
 134
 135    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
 136    it's not used (for count_leading_zeros) because it generally gives extra
 137    code to ensure the result is 0 when the input is 0, which we don't need
 138    or want.  */
 139
 /* count_leading_zeros_gcc_clz (count, x): stores in COUNT the result of the
    gcc count-leading-zeros builtin applied to X, using __builtin_clzll when
    _LONG_LONG_LIMB is defined and __builtin_clzl otherwise.  X must be
    non-zero, which is checked with ASSERT.  X is first copied into a UWtype
    local so that the argument is evaluated exactly once, whether or not
    ASSERT expands to code that looks at it.  */
 #ifdef _LONG_LONG_LIMB
 #define count_leading_zeros_gcc_clz(count,x)    \
   do {                                          \
     UWtype __clz_gcc_x = (x);                   \
     ASSERT (__clz_gcc_x != 0);                  \
     (count) = __builtin_clzll (__clz_gcc_x);    \
   } while (0)
 #else
 #define count_leading_zeros_gcc_clz(count,x)    \
   do {                                          \
     UWtype __clz_gcc_x = (x);                   \
     ASSERT (__clz_gcc_x != 0);                  \
     (count) = __builtin_clzl (__clz_gcc_x);     \
   } while (0)
 #endif
 153
 /* count_trailing_zeros_gcc_ctz (count, x): stores in COUNT the result of the
    gcc count-trailing-zeros builtin applied to X, using __builtin_ctzll when
    _LONG_LONG_LIMB is defined and __builtin_ctzl otherwise.  X must be
    non-zero, which is checked with ASSERT.  X is first copied into a UWtype
    local so that the argument is evaluated exactly once, whether or not
    ASSERT expands to code that looks at it.  */
 #ifdef _LONG_LONG_LIMB
 #define count_trailing_zeros_gcc_ctz(count,x)   \
   do {                                          \
     UWtype __ctz_gcc_x = (x);                   \
     ASSERT (__ctz_gcc_x != 0);                  \
     (count) = __builtin_ctzll (__ctz_gcc_x);    \
   } while (0)
 #else
 #define count_trailing_zeros_gcc_ctz(count,x)   \
   do {                                          \
     UWtype __ctz_gcc_x = (x);                   \
     ASSERT (__ctz_gcc_x != 0);                  \
     (count) = __builtin_ctzl (__ctz_gcc_x);     \
   } while (0)
 #endif
 167
 168
 169 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
 170    don't need to be under !NO_ASM */
 171 #if ! defined (NO_ASM)
 172
 173 #if defined (__alpha) && W_TYPE_SIZE == 64
 174 /* Most alpha-based machines, except Cray systems. */
 175 #if defined (__GNUC__)
 176 #if __GMP_GNUC_PREREQ (3,3)
 177 #define umul_ppmm(ph, pl, m0, m1) \
 178   do {                                                                  \
 179     UDItype __m0 = (m0), __m1 = (m1);                                   \
 180     (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
 181     (pl) = __m0 * __m1;                                                 \
 182   } while (0)
 183 #else
 184 #define umul_ppmm(ph, pl, m0, m1) \
 185   do {                                                                  \
 186     UDItype __m0 = (m0), __m1 = (m1);                                   \
 187     __asm__ ("umulh %r1,%2,%0"                                          \
 188              : "=r" (ph)                                                \
 189              : "%rJ" (m0), "rI" (m1));                                  \
 190     (pl) = __m0 * __m1;                                                 \
 191   } while (0)
 192 #endif
 193 #define UMUL_TIME 18
 194 #else /* ! __GNUC__ */
 195 #include <machine/builtins.h>
 196 #define umul_ppmm(ph, pl, m0, m1) \
 197   do {                                                                  \
 198     UDItype __m0 = (m0), __m1 = (m1);                                   \
 199     (ph) = __UMULH (m0, m1);                                            \
 200     (pl) = __m0 * __m1;                                                 \
 201   } while (0)
 202 #endif
 /* udiv_qrnnd (q, r, n1, n0, d): obtains an inverse of D from
    __MPN(invert_limb) and passes it, together with the unchanged operands, to
    udiv_qrnnd_preinv (not defined in this part of the file).  D is expanded
    twice, so it should be free of side effects.  The whole group is skipped
    when LONGLONG_STANDALONE is defined.  UDIV_NEEDS_NORMALIZATION is 1: as
    described in the comment on udiv_qrnnd near the top of the file, the most
    significant bit of the denominator must then be set.  */
 203 #ifndef LONGLONG_STANDALONE
 204 #define udiv_qrnnd(q, r, n1, n0, d) \
 205   do { UWtype __di;                                                     \
 206     __di = __MPN(invert_limb) (d);                                      \
 207     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 208   } while (0)
 209 #define UDIV_PREINV_ALWAYS  1
 210 #define UDIV_NEEDS_NORMALIZATION 1
 211 #define UDIV_TIME 220
 212 #endif /* LONGLONG_STANDALONE */
 213
 214 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
 215    always goes into libgmp.so, even when not actually used.  */
 216 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 217
 /* Single-instruction count_leading_zeros and count_trailing_zeros for gcc
    when HAVE_HOST_CPU_alpha_CIX is non-zero: each macro emits one ctlz or
    cttz instruction with X as a register input and COUNT as the register
    output.  */
 218 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
 219 #define count_leading_zeros(COUNT,X) \
 220   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
 221 #define count_trailing_zeros(COUNT,X) \
 222   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
 223 #endif /* clz/ctz using cix */
 224
 #if ! defined (count_leading_zeros)                             \
   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
    "$31" is written explicitly in the asm, since an "r" constraint won't
    select reg 31.  There seems no need to worry about "r31" syntax for cray,
    since gcc itself (pre-release 3.4) emits just $31 in various places.
    The __asm__ spelling is used, as in the rest of this file, so that the
    macro also works when gcc is run with -ansi or a -std=c* option, where the
    plain asm keyword is not recognised.  */
 #define ALPHA_CMPBGE_0(dst, src)                                        \
   do { __asm__ ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
    them, locating the highest non-zero byte.  A second __clz_tab lookup
    counts the leading zero bits in that byte, giving the result.  X is
    copied into a local first, so it is evaluated once.  */
 #define count_leading_zeros(count, x)                                   \
   do {                                                                  \
     UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
     __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
     __clz__x >>= __clz__b;                                              \
     __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
     __clz__b = 65 - __clz__b;                                           \
     (count) = __clz__b - __clz__c;                                      \
   } while (0)
 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 #endif /* clz using cmpbge */
 249
 /* Last resort when no count_leading_zeros macro has been defined yet and
    LONGLONG_STANDALONE is not set: declare the out-of-line routine
    __MPN(count_leading_zeros), with __attribute__ ((const)) when
    HAVE_ATTRIBUTE_CONST is non-zero, and make count_leading_zeros assign its
    return value to COUNT.  */
 250 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
 251 #if HAVE_ATTRIBUTE_CONST
 252 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
 253 #else
 254 long __MPN(count_leading_zeros) (UDItype);
 255 #endif
 256 #define count_leading_zeros(count, x) \
 257   ((count) = __MPN(count_leading_zeros) (x))
 258 #endif /* clz using mpn */
 259 #endif /* __alpha */
 260
 261 #if defined (__AVR) && W_TYPE_SIZE == 8
 /* umul_ppmm (ph, pl, m0, m1): the product of M0 and M1 is formed in an
    unsigned short.  PH receives that value shifted right by eight bits, and
    PL is assigned the whole value, so it keeps whatever part fits its own
    type.  */
 #define umul_ppmm(ph, pl, m0, m1) \
   do {                                                                  \
     unsigned short __prod;                                              \
     __prod = (unsigned short) (m0) * (m1);                              \
     (ph) = __prod >> 8;                                                 \
     (pl) = __prod;                                                      \
   } while (0)
 268 #endif /* AVR */
 269
 #if defined (_CRAY) && W_TYPE_SIZE == 64
 #include <intrinsics.h>
 #define UDIV_PREINV_ALWAYS  1
 #define UDIV_NEEDS_NORMALIZATION 1
 #define UDIV_TIME 220
 long __MPN(count_leading_zeros) (UDItype);
 /* count_leading_zeros: the _leadz intrinsic applied to X converted to
    UWtype.  */
 #define count_leading_zeros(count, x) \
   ((count) = _leadz ((UWtype) (x)))
 #if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
 /* umul_ppmm: high word from the _int_mult_upper intrinsic, low word from an
    ordinary C multiplication.  Both are computed from the local copies __m0
    and __m1, so M0 and M1 are each evaluated exactly once.  */
 #define umul_ppmm(ph, pl, m0, m1) \
   do {                                                                  \
     UDItype __m0 = (m0), __m1 = (m1);                                   \
     (ph) = _int_mult_upper (__m0, __m1);                                \
     (pl) = __m0 * __m1;                                                 \
   } while (0)
 #ifndef LONGLONG_STANDALONE
 /* udiv_qrnnd: computes an inverse of D with __MPN(invert_limb) and hands it
    to udiv_qrnnd_preinv along with the original operands.  */
 #define udiv_qrnnd(q, r, n1, n0, d) \
   do { UWtype __di;                                                     \
     __di = __MPN(invert_limb) (d);                                      \
     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
   } while (0)
 #endif /* LONGLONG_STANDALONE */
 #endif /* _CRAYIEEE */
 #endif /* _CRAY */
 294
 295 #if defined (__ia64) && W_TYPE_SIZE == 64
 /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction
    SH:SL = AH:AL - BH:BL, with the borrow out of the high word discarded.

    This form encourages gcc (pre-release 3.4 at least) to emit predicated
    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
    register, which takes an extra cycle.

    The four inputs are copied into UWtype locals before use, so each argument
    is evaluated exactly once and the borrow test "al < bl" is always an
    unsigned UWtype comparison, whatever the types of the argument
    expressions.  SL is written last.  */
 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
   do {                                          \
     UWtype __sub_ah = (ah), __sub_al = (al);    \
     UWtype __sub_bh = (bh), __sub_bl = (bl);    \
     UWtype __sub_lo = __sub_al - __sub_bl;      \
     if (__sub_al < __sub_bl)                    \
       (sh) = __sub_ah - __sub_bh - 1;           \
     else                                        \
       (sh) = __sub_ah - __sub_bh;               \
     (sl) = __sub_lo;                            \
   } while (0)
 310 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
 311 /* Do both product parts in assembly, since that gives better code with
 312    all gcc versions.  Some callers will just use the upper part, and in
 313    that situation we waste an instruction, but not any cycles.  */
 314 #define umul_ppmm(ph, pl, m0, m1) \
 315     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
 316              : "=&f" (ph), "=f" (pl)                                    \
 317              : "f" (m0), "f" (m1))
 318 #define UMUL_TIME 14
 /* count_leading_zeros: X is copied into _x.  The two asm statements (mux1
    with @rev, then czx1.l on -_y | _y) produce _a; (_a - 1) << 3 is then used
    as a right-shift count that is a multiple of 8.  The C steps that follow
    narrow what is left of _x by 4, 2 and 1 bits while adding the same amounts
    to _c, and the result is W_TYPE_SIZE - 1 - _c.  */
 319 #define count_leading_zeros(count, x) \
 320   do {                                                                  \
 321     UWtype _x = (x), _y, _a, _c;                                        \
 322     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
 323     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
 324     _c = (_a - 1) << 3;                                                 \
 325     _x >>= _c;                                                          \
 326     if (_x >= 1 << 4)                                                   \
 327       _x >>= 4, _c += 4;                                                \
 328     if (_x >= 1 << 2)                                                   \
 329       _x >>= 2, _c += 2;                                                \
 330     _c += _x >> 1;                                                      \
 331     (count) =  W_TYPE_SIZE - 1 - _c;                                    \
 332   } while (0)
 333 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
 334    based, and we don't need a special case for x==0 here.  (x-1) & ~x has
 335    a 1 bit in exactly the positions below the lowest set bit of x, so the
 335    popcnt of that value is the number of trailing zeros.  */
 335 #define count_trailing_zeros(count, x)                                  \
 336   do {                                                                  \
 337     UWtype __ctz_x = (x);                                               \
 338     __asm__ ("popcnt %0 = %1"                                           \
 339              : "=r" (count)                                             \
 340              : "r" ((__ctz_x-1) & ~__ctz_x));                           \
 341   } while (0)
 342 #endif
 #if defined (__INTEL_COMPILER)
 #include <ia64intrin.h>
 /* umul_ppmm (ph, pl, m0, m1): high word from the _m64_xmahu intrinsic (called
    with a third argument of 0), low word from an ordinary C multiplication.
    M0 and M1 are evaluated once, into locals.  The PH and PL output arguments
    are parenthesized, as in the other umul_ppmm variants in this file.  */
 #define umul_ppmm(ph, pl, m0, m1)                                       \
   do {                                                                  \
     UWtype _m0 = (m0), _m1 = (m1);                                      \
     (ph) = _m64_xmahu (_m0, _m1, 0);                                    \
     (pl) = _m0 * _m1;                                                   \
   } while (0)
 #endif
 /* udiv_qrnnd (q, r, n1, n0, d): obtains an inverse of D from
    __MPN(invert_limb) and passes it, with the unchanged operands, to
    udiv_qrnnd_preinv (not defined in this part of the file).  D is expanded
    twice.  Only defined when LONGLONG_STANDALONE is not; UDIV_TIME is defined
    either way.  */
 352 #ifndef LONGLONG_STANDALONE
 353 #define udiv_qrnnd(q, r, n1, n0, d) \
 354   do { UWtype __di;                                                     \
 355     __di = __MPN(invert_limb) (d);                                      \
 356     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 357   } while (0)
 358 #define UDIV_PREINV_ALWAYS  1
 359 #define UDIV_NEEDS_NORMALIZATION 1
 360 #endif
 361 #define UDIV_TIME 220
 362 #endif
 363
 364
 365 #if defined (__GNUC__)
 366
 /* gcc 2 and later understand a "cc" clobber in asm statements, gcc 1 does
    not.  __CLOBBER_CC opens a clobber list consisting of "cc", and
    __AND_CLOBBER_CC appends "cc" to a clobber list that already has entries.
    Both expand to nothing for gcc 1, which avoids duplicating every asm
    block.  */
 #if __GNUC__ >= 2
 #define __CLOBBER_CC : "cc"
 #define __AND_CLOBBER_CC , "cc"
 #else /* __GNUC__ < 2 */
 #define __CLOBBER_CC
 #define __AND_CLOBBER_CC
 #endif /* __GNUC__ >= 2 */
 376
 /* AMD 29000 (__a29k__ or _AM29K), 32-bit words.
    add_ssaaaa / sub_ddmmss: low words with add / sub, high words with
    addc / subc; BH and BL use the "rI" constraint, and SL is an "=&r"
    output.
    umul_ppmm: two separate asm statements, multiplu for XL and multmu for XH,
    both working on the local copies __m0 and __m1.
    udiv_qrnnd: one dividu instruction; N1 is tied to the same location as R
    (constraint "1"), and R uses the "q" constraint.
    count_leading_zeros: one clz instruction; COUNT_LEADING_ZEROS_0 is 32.  */
 377 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
 378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 379   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
 380            : "=r" (sh), "=&r" (sl)                                      \
 381            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
 382 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 383   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
 384            : "=r" (sh), "=&r" (sl)                                      \
 385            : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
 386 #define umul_ppmm(xh, xl, m0, m1) \
 387   do {                                                                  \
 388     USItype __m0 = (m0), __m1 = (m1);                                   \
 389     __asm__ ("multiplu %0,%1,%2"                                        \
 390              : "=r" (xl)                                                \
 391              : "r" (__m0), "r" (__m1));                                 \
 392     __asm__ ("multmu %0,%1,%2"                                          \
 393              : "=r" (xh)                                                \
 394              : "r" (__m0), "r" (__m1));                                 \
 395   } while (0)
 396 #define udiv_qrnnd(q, r, n1, n0, d) \
 397   __asm__ ("dividu %0,%3,%4"                                            \
 398            : "=r" (q), "=q" (r)                                         \
 399            : "1" (n1), "r" (n0), "r" (d))
 400 #define count_leading_zeros(count, x) \
 401     __asm__ ("clz %0,%1"                                                \
 402              : "=r" (count)                                             \
 403              : "r" (x))
 404 #define COUNT_LEADING_ZEROS_0 32
 405 #endif /* __a29k__ */
 406
 /* ARC.  add_ssaaaa and sub_ddmmss each use a flag-setting low-word
    instruction (add.f / sub.f) followed by adc / sbc on the high words.  All
    inputs are cast to USItype; BH and BL use the "rIJ" constraint and SL is
    an "=&r" output.  Unlike most other sections, this one is not conditional
    on W_TYPE_SIZE.  */
 407 #if defined (__arc__)
 408 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 409   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
 410            : "=r" (sh),                                                 \
 411              "=&r" (sl)                                                 \
 412            : "r"  ((USItype) (ah)),                                     \
 413              "rIJ" ((USItype) (bh)),                                    \
 414              "%r" ((USItype) (al)),                                     \
 415              "rIJ" ((USItype) (bl)))
 416 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 417   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
 418            : "=r" (sh),                                                 \
 419              "=&r" (sl)                                                 \
 420            : "r" ((USItype) (ah)),                                      \
 421              "rIJ" ((USItype) (bh)),                                    \
 422              "r" ((USItype) (al)),                                      \
 423              "rIJ" ((USItype) (bl)))
 424 #endif
 425
 426 #if defined (__arm__) && W_TYPE_SIZE == 32
 /* add_ssaaaa: adds on the low words, adc on the high words.  BH and BL use
    the "rI" constraint, SL is an "=&r" output, and "cc" is clobbered through
    __CLOBBER_CC.  */
 427 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 428   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
 429            : "=r" (sh), "=&r" (sl)                                      \
 430            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
 /* sub_ddmmss: __builtin_constant_p on AL, AH, BL and BH selects one of
    several instruction pairs, so that an operand known to be a compile-time
    constant is given the "rI" constraint.  The reversed forms rsbs / rsc are
    used when the "rI" operand is on the minuend side (AL or AH), and the
    plain subs / sbc forms when it is on the subtrahend side (BL or BH).
    Every variant clobbers "cc" through __CLOBBER_CC and uses an "=&r" output
    for SL.  */
 431 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 432   do {                                                                  \
 433     if (__builtin_constant_p (al))                                      \
 434       {                                                                 \
 435         if (__builtin_constant_p (ah))                                  \
 436           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 437                    : "=r" (sh), "=&r" (sl)                              \
 438                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 439         else                                                            \
 440           __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
 441                    : "=r" (sh), "=&r" (sl)                              \
 442                    : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 443       }                                                                 \
 444     else if (__builtin_constant_p (ah))                                 \
 445       {                                                                 \
 446         if (__builtin_constant_p (bl))                                  \
 447           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 448                    : "=r" (sh), "=&r" (sl)                              \
 449                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 450         else                                                            \
 451           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
 452                    : "=r" (sh), "=&r" (sl)                              \
 453                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
 454       }                                                                 \
 455     else if (__builtin_constant_p (bl))                                 \
 456       {                                                                 \
 457         if (__builtin_constant_p (bh))                                  \
 458           __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
 459                    : "=r" (sh), "=&r" (sl)                              \
 460                    : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 461         else                                                            \
 462           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
 463                    : "=r" (sh), "=&r" (sl)                              \
 464                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
 465       }                                                                 \
 466     else /* only bh might be a constant */                              \
 467       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
 468                : "=r" (sh), "=&r" (sl)                                  \
 469                : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
 470     } while (0)
 /* The leading "1 ||" makes this condition always true, so the umull / smull
    versions are always selected and the #else branch below is never compiled.
    In the selected branch, umul_ppmm and smul_ppmm are single widening
    multiply instructions whose two outputs are both "=&r", and udiv_qrnnd
    (unless LONGLONG_STANDALONE is defined) goes through __MPN(invert_limb)
    and udiv_qrnnd_preinv.  */
 471 #if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
 472 #define umul_ppmm(xh, xl, a, b) \
 473   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
 474 #define UMUL_TIME 5
 475 #define smul_ppmm(xh, xl, a, b) \
 476   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
 477 #ifndef LONGLONG_STANDALONE
 478 #define udiv_qrnnd(q, r, n1, n0, d) \
 479   do { UWtype __di;                                                     \
 480     __di = __MPN(invert_limb) (d);                                      \
 481     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
 482   } while (0)
 483 #define UDIV_PREINV_ALWAYS  1
 484 #define UDIV_NEEDS_NORMALIZATION 1
 485 #define UDIV_TIME 70
 486 #endif /* LONGLONG_STANDALONE */
 487 #else
 /* Fallback umul_ppmm built from 16-bit halves with plain mul instructions,
    using r0, r1 and r2 as scratch registers (listed as clobbers).  The "cc"
    clobber is not listed although adds / addcs / adc are used.
    NOTE(review): the third mul reads %0 (xh), but no earlier instruction in
    the template writes %0 -- verify against a known-good version before ever
    enabling this branch.  */
 488 #define umul_ppmm(xh, xl, a, b) \
 489   __asm__ ("%@ Inlined umul_ppmm\n"                                     \
 490 "       mov     %|r0, %2, lsr #16\n"                                    \
 491 "       mov     %|r2, %3, lsr #16\n"                                    \
 492 "       bic     %|r1, %2, %|r0, lsl #16\n"                              \
 493 "       bic     %|r2, %3, %|r2, lsl #16\n"                              \
 494 "       mul     %1, %|r1, %|r2\n"                                       \
 495 "       mul     %|r2, %|r0, %|r2\n"                                     \
 496 "       mul     %|r1, %0, %|r1\n"                                       \
 497 "       mul     %0, %|r0, %0\n"                                         \
 498 "       adds    %|r1, %|r2, %|r1\n"                                     \
 499 "       addcs   %0, %0, #65536\n"                                       \
 500 "       adds    %1, %1, %|r1, lsl #16\n"                                \
 501 "       adc     %0, %0, %|r1, lsr #16"                                  \
 502            : "=&r" (xh), "=r" (xl)                                      \
 503            : "r" (a), "r" (b)                                           \
 504            : "r0", "r1", "r2")
 505 #define UMUL_TIME 20
 506 #ifndef LONGLONG_STANDALONE
 /* udiv_qrnnd via the out-of-line routine __MPN(udiv_qrnnd), which returns
    the quotient and stores the remainder through its first argument.  */
 507 #define udiv_qrnnd(q, r, n1, n0, d) \
 508   do { UWtype __r;                                                      \
 509     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
 510     (r) = __r;                                                          \
 511   } while (0)
 512 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
 513 #define UDIV_TIME 200
 514 #endif /* LONGLONG_STANDALONE */
 515 #endif
 516 /* This is a bizarre test, but GCC doesn't define a useful common symbol.
 516    count_leading_zeros is defined as a single clz instruction only when one
 516    of the listed __ARM_ARCH_* macros (architecture 5 variants and later) is
 516    defined.  COUNT_LEADING_ZEROS_0 is then 32.  */
 517 #if defined (__ARM_ARCH_5__)  || defined (__ARM_ARCH_5T__) || \
 518     defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
 519     defined (__ARM_ARCH_6__)  || defined (__ARM_ARCH_6J__) || \
 520     defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \
 521     defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \
 522     defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__)  || \
 523     defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \
 524     defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
 525 #define count_leading_zeros(count, x) \
 526   __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
 527 #define COUNT_LEADING_ZEROS_0 32
 528 #endif
 529 #endif /* __arm__ */
 530
 #if defined (__aarch64__) && W_TYPE_SIZE == 64
 /* add_ssaaaa / sub_ddmmss: adds + adc and subs + sbc on the low and high
    words, clobbering "cc" through __CLOBBER_CC.  SL is an "=&r" output.
    FIXME: Extend the immediate range for the low word by using both
    ADDS and SUBS, since they set carry in the same way.  */
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"                     \
            : "=r" (sh), "=&r" (sl)                                      \
            : "rZ" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"                     \
            : "=r,r" (sh), "=&r,&r" (sl)                                 \
            : "rZ,rZ" (ah), "rZ,rZ" (bh), "r,Z" (al), "rI,r" (bl) __CLOBBER_CC)
 /* umul_ppmm: high word from a umulh instruction, low word from an ordinary C
    multiplication.  Both use the local copies __m0 and __m1, so M0 and M1 are
    each evaluated exactly once.  */
 #define umul_ppmm(ph, pl, m0, m1) \
   do {                                                                  \
     UDItype __m0 = (m0), __m1 = (m1);                                   \
     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
     (pl) = __m0 * __m1;                                                 \
   } while (0)
 /* count_leading_zeros: a single clz instruction; COUNT_LEADING_ZEROS_0 is
    64.  */
 #define count_leading_zeros(count, x) \
   __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
 #define COUNT_LEADING_ZEROS_0 64
 #endif /* __aarch64__ */
 552
 /* Clipper, 32-bit words.  umul_ppmm and smul_ppmm run mulwux / mulwx with
    the double-word result landing in a union, then read the high and low
    words back through the struct members __h and __l (declared in the order
    __l, __h).  U is tied to the output operand (constraint "%0").
    __umulsidi3 uses the same mulwux instruction and yields the UDItype
    result directly.  All three are gcc statement expressions.  */
 553 #if defined (__clipper__) && W_TYPE_SIZE == 32
 554 #define umul_ppmm(w1, w0, u, v) \
 555   ({union {UDItype __ll;                                                \
 556            struct {USItype __l, __h;} __i;                              \
 557           } __x;                                                        \
 558   __asm__ ("mulwux %2,%0"                                               \
 559            : "=r" (__x.__ll)                                            \
 560            : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
 561   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
 562 #define smul_ppmm(w1, w0, u, v) \
 563   ({union {DItype __ll;                                                 \
 564            struct {SItype __l, __h;} __i;                               \
 565           } __x;                                                        \
 566   __asm__ ("mulwx %2,%0"                                                \
 567            : "=r" (__x.__ll)                                            \
 568            : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
 569   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
 570 #define __umulsidi3(u, v) \
 571   ({UDItype __w;                                                        \
 572     __asm__ ("mulwux %2,%0"                                             \
 573              : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
 574     __w; })
 575 #endif /* __clipper__ */
 576
 577 /* Fujitsu vector computers.  */
 578 #if defined (__uxp__) && W_TYPE_SIZE == 32
 579 #define umul_ppmm(ph, pl, u, v) \
 580   do {                                                                  \
 581     union {UDItype __ll;                                                \
 582            struct {USItype __h, __l;} __i;                              \
 583           } __x;                                                        \
 584     __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
 585     (ph) = __x.__i.__h;                                                 \
 586     (pl) = __x.__i.__l;                                                 \
 587   } while (0)
     /* smul_ppmm (Fujitsu __uxp__): same shape as the umul_ppmm just before
        it, but using "mult.l" (presumably the signed form).  Note that the
        union is still declared with the unsigned UDItype/USItype types; ph
        receives __h and pl receives __l.  */
 588 #define smul_ppmm(ph, pl, u, v) \
 589   do {                                                                  \
 590     union {UDItype __ll;                                                \
 591            struct {USItype __h, __l;} __i;                              \
 592           } __x;                                                        \
 593     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
 594     (ph) = __x.__i.__h;                                                 \
 595     (pl) = __x.__i.__l;                                                 \
 596   } while (0)
 597 #endif
 598
 599 #if defined (__gmicro__) && W_TYPE_SIZE == 32
     /* add_ssaaaa (gmicro): two-instruction double-word add.  "add.w" adds bl
        into sl (preloaded with al through the "%1" tie), then "addx" adds bh
        into sh (preloaded with ah) -- presumably including the carry.  sl is
        early-clobber ("=&g") because it is written before bh is read.  */
 600 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 601   __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
 602            : "=g" (sh), "=&g" (sl)                                      \
 603            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 604              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss (gmicro): two-instruction double-word subtract.  "sub.w"
        subtracts bl from sl (preloaded with al), then "subx" subtracts bh
        from sh (preloaded with ah) -- presumably including the borrow.  Unlike
        the add, al carries no "%" since the operands cannot be swapped.  */
 605 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 606   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
 607            : "=g" (sh), "=&g" (sl)                                      \
 608            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 609              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* umul_ppmm (gmicro): single "mulx" taking m1 (%3), the ph operand (%0,
        preloaded with m0 through the "%0" tie) and the pl operand (%1).  pl
        must be in a register ("=r"); ph may be any general operand.  */
 610 #define umul_ppmm(ph, pl, m0, m1) \
 611   __asm__ ("mulx %3,%0,%1"                                              \
 612            : "=g" (ph), "=r" (pl)                                       \
 613            : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
     /* udiv_qrnnd (gmicro): single "divx" with divisor d (%4).  nl is
        preloaded into the q operand ("0" tie) and nh into the r operand
        ("1" tie); on exit those same operands hold q and r.  */
 614 #define udiv_qrnnd(q, r, nh, nl, d) \
 615   __asm__ ("divx %4,%0,%1"                                              \
 616            : "=g" (q), "=r" (r)                                         \
 617            : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
     /* count_leading_zeros (gmicro): "bsch/1" on (USItype) x with the count
        operand preloaded with 0 through the "0" tie; the result is left in
        count.  Presumably a bit search for the first 1 bit -- confirm against
        the gmicro instruction reference.  */
 618 #define count_leading_zeros(count, x) \
 619   __asm__ ("bsch/1 %1,%0"                                               \
 620            : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
 621 #endif
 622
 623 #if defined (__hppa) && W_TYPE_SIZE == 32
     /* add_ssaaaa (hppa, 32-bit): "add" forms sl from al and bl, then "addc"
        forms sh from ah and bh (presumably add-with-carry).  bl may be an
        immediate ("rI"); the %I5 modifier presumably selects the immediate
        form of the instruction in that case.  sl is early-clobber since it
        is written before ah/bh are read.  */
 624 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 625   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
 626            : "=r" (sh), "=&r" (sl)                                      \
 627            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
     /* sub_ddmmss (hppa, 32-bit): "sub" forms sl from al and bl, then "subb"
        forms sh from ah and bh (presumably subtract-with-borrow).  Here it is
        al, the minuend, that may be an immediate ("rI", with %I4 choosing the
        instruction form); sl is early-clobber as in add_ssaaaa.  */
 628 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 629   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
 630            : "=r" (sh), "=&r" (sl)                                      \
 631            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 632 #if defined (_PA_RISC1_1)
     /* umul_ppmm (hppa with _PA_RISC1_1): "xmpyu" with both inputs and the
        UDItype output constrained to the "f" register class.  The union then
        hands the __h word to wh and the __l word to wl.  */
 633 #define umul_ppmm(wh, wl, u, v) \
 634   do {                                                                  \
 635     union {UDItype __ll;                                                \
 636            struct {USItype __h, __l;} __i;                              \
 637           } __x;                                                        \
 638     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
 639     (wh) = __x.__i.__h;                                                 \
 640     (wl) = __x.__i.__l;                                                 \
 641   } while (0)
 642 #define UMUL_TIME 8
 643 #define UDIV_TIME 60
 644 #else
 645 #define UMUL_TIME 40
 646 #define UDIV_TIME 80
 647 #endif
     /* count_leading_zeros (hppa, 32-bit): branch-free binary search.  count
        starts at 1; for field widths 16, 8, 4 and 2 the asm tests whether the
        upper part of the remaining value is zero and either adds the width to
        count or shifts the value down (the conditional "extru" forms skip the
        following instruction, as the embedded comments describe).  The final
        extracted bit is subtracted from count.  __tmp is a scratch copy of x,
        tied to it with "1", so x itself is not modified.  */
 648 #define count_leading_zeros(count, x) \
 649   do {                                                                  \
 650     USItype __tmp;                                                      \
 651     __asm__ (                                                           \
 652        "ldi             1,%0\n"                                         \
 653 "       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
 654 "       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
 655 "       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
 656 "       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
 657 "       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
 658 "       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
 659 "       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
 660 "       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
 661 "       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
 662 "       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
 663 "       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
 664 "       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
 665 "       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
 666 "       sub             %0,%1,%0        ; Subtract it.\n"               \
 667         : "=r" (count), "=r" (__tmp) : "1" (x));                        \
 668   } while (0)
 669 #endif /* hppa */
 670
 671 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
 672    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
 673    is just a case of no direct support for 2.0n but treating it like 1.0. */
 674 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
     /* add_ssaaaa (hppa, 64-bit limbs): same operand layout as the 32-bit
        version, but the high-word instruction is "add,dc" instead of "addc"
        (presumably the doubleword-carry form).  */
 675 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 676   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
 677            : "=r" (sh), "=&r" (sl)                                      \
 678            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
     /* sub_ddmmss (hppa, 64-bit limbs): same operand layout as the 32-bit
        version, but the high-word instruction is "sub,db" instead of "subb"
        (presumably the doubleword-borrow form).  */
 679 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 680   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
 681            : "=r" (sh), "=&r" (sl)                                      \
 682            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
 683 #endif /* hppa */
 684
 685 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
 686 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
     /* add_ssaaaa (s390, 32-bit limbs, zarch): "alr" adds bl into sl (tied to
        al), then "alcr" adds bh into sh (tied to ah).  The commented-out
        branch would use "alfi" with an immediate when bl is a compile-time
        constant; it is disabled, so only the register form is emitted.
        __CLOBBER_CC is defined outside this section.  */
 687 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 688   do {                                                                  \
 689 /*  if (__builtin_constant_p (bl))                                      \
 690       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                            \
 691                : "=r" (sh), "=&r" (sl)                                  \
 692                : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
 693     else                                                                \
 694 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                              \
 695                : "=r" (sh), "=&r" (sl)                                  \
 696                : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
 697   } while (0)
     /* sub_ddmmss (s390, 32-bit limbs, zarch): "slr" subtracts bl from sl
        (tied to al), then "slbr" subtracts bh from sh (tied to ah).  As in
        add_ssaaaa, the constant-operand "slfi" variant is commented out and
        only the register form is emitted.  */
 698 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 699   do {                                                                  \
 700 /*  if (__builtin_constant_p (bl))                                      \
 701       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                            \
 702                : "=r" (sh), "=&r" (sl)                                  \
 703                : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
 704     else                                                                \
 705 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                              \
 706                : "=r" (sh), "=&r" (sl)                                  \
 707                : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
 708   } while (0)
 709 #if __GMP_GNUC_PREREQ (4,5)
     /* umul_ppmm (s390 zarch, gcc >= 4.5): no asm at all.  Both operands are
        widened to UDItype and multiplied in C; the union then splits the
        product, __h (declared first) going to xh and __l to xl.  */
 710 #define umul_ppmm(xh, xl, m0, m1)                                       \
 711   do {                                                                  \
 712     union {UDItype __ll;                                                \
 713            struct {USItype __h, __l;} __i;                              \
 714           } __x;                                                        \
 715     __x.__ll = (UDItype) (m0) * (UDItype) (m1);                         \
 716     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 717   } while (0)
 718 #else
 719 #if 0
 720 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 721    with a new enough processor pretending we have 32-bit registers.  */
     /* umul_ppmm (disabled by the enclosing "#if 0"): "mlr" with the UDItype
        union member as the output and m0 tied to it; the halves are then read
        back through __h and __l.  Kept for reference only.  */
 722 #define umul_ppmm(xh, xl, m0, m1)                                       \
 723   do {                                                                  \
 724     union {UDItype __ll;                                                \
 725            struct {USItype __h, __l;} __i;                              \
 726           } __x;                                                        \
 727     __asm__ ("mlr\t%0,%2"                                               \
 728              : "=r" (__x.__ll)                                          \
 729              : "%0" (m0), "r" (m1));                                    \
 730     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 731   } while (0)
 732 #else
     /* umul_ppmm (s390 zarch, gcc < 4.5): pins two USItype locals to
        registers "0" and "1" so that "mlr" sees an explicit register pair.
        m0 is loaded into __r1 beforehand; after the instruction __r0 is
        stored to xh and __r1 to xl.  */
 733 #define umul_ppmm(xh, xl, m0, m1)                                       \
 734   do {                                                                  \
 735   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
 736      DImode for the product, since that would be allocated to a single 64-bit
 737      register, whereas mlr uses the low 32-bits of an even-odd register pair.
 738   */                                                                    \
 739     register USItype __r0 __asm__ ("0");                                \
 740     register USItype __r1 __asm__ ("1") = (m0);                         \
 741     __asm__ ("mlr\t%0,%3"                                               \
 742              : "=r" (__r0), "=r" (__r1)                                 \
 743              : "r" (__r1), "r" (m1));                                   \
 744     (xh) = __r0; (xl) = __r1;                                           \
 745   } while (0)
 746 #endif /* if 0 */
 747 #endif
 748 #if 0
 749 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
 750    with a new enough processor pretending we have 32-bit registers.  */
     /* udiv_qrnnd (disabled by the enclosing "#if 0"): packs n1:n0 into a
        UDItype through the union, runs "dlr" with that value as both input
        and output, then reads q from __l and r from __h.  Kept for reference
        only.  */
 751 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 752   do {                                                                  \
 753     union {UDItype __ll;                                                \
 754            struct {USItype __h, __l;} __i;                              \
 755           } __x;                                                        \
 756     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 757     __asm__ ("dlr\t%0,%2"                                               \
 758              : "=r" (__x.__ll)                                          \
 759              : "0" (__x.__ll), "r" (d));                                \
 760     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 761   } while (0)
 762 #else
     /* udiv_qrnnd (s390 zarch): loads n1 into a local pinned to register "0"
        and n0 into one pinned to register "1", runs "dlr" on that pair with
        divisor d, then takes q from __r1 and r from __r0.  */
 763 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 764   do {                                                                  \
 765     register USItype __r0 __asm__ ("0") = (n1);                         \
 766     register USItype __r1 __asm__ ("1") = (n0);                         \
 767     __asm__ ("dlr\t%0,%4"                                               \
 768              : "=r" (__r0), "=r" (__r1)                                 \
 769              : "r" (__r0), "r" (__r1), "r" (d));                        \
 770     (q) = __r1; (r) = __r0;                                             \
 771   } while (0)
 772 #endif /* if 0 */
 773 #else /* if __zarch__ */
 774 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
     /* smul_ppmm (s390 without zarch): "mr" with the DItype union member as
        output and m0 tied to it; xh is read from __h and xl from __l.  The
        union halves are declared USItype even though __ll is signed.  */
 775 #define smul_ppmm(xh, xl, m0, m1)                                       \
 776   do {                                                                  \
 777     union {DItype __ll;                                                 \
 778            struct {USItype __h, __l;} __i;                              \
 779           } __x;                                                        \
 780     __asm__ ("mr\t%0,%2"                                                \
 781              : "=r" (__x.__ll)                                          \
 782              : "%0" (m0), "r" (m1));                                    \
 783     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 784   } while (0)
 785 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
     /* sdiv_qrnnd (s390 without zarch): packs n1:n0 into the DItype union
        member, runs "dr" with that value as both input and output and d as
        divisor, then reads q from __l and r from __h.  */
 786 #define sdiv_qrnnd(q, r, n1, n0, d)                                     \
 787   do {                                                                  \
 788     union {DItype __ll;                                                 \
 789            struct {USItype __h, __l;} __i;                              \
 790           } __x;                                                        \
 791     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 792     __asm__ ("dr\t%0,%2"                                                \
 793              : "=r" (__x.__ll)                                          \
 794              : "0" (__x.__ll), "r" (d));                                \
 795     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 796   } while (0)
 797 #endif /* if __zarch__ */
 798 #endif
 799
 800 #if defined (__s390x__) && W_TYPE_SIZE == 64
 801 /* We need to cast operands with register constraints, otherwise their types
 802    will be assumed to be SImode by gcc.  For these machines, such operations
 803    will insert a value into the low 32 bits, and leave the high 32 bits with
 804    garbage.  */
     /* add_ssaaaa (s390x, 64-bit limbs): "algr" adds bl into sl (tied to al),
        then "alcgr" adds bh into sh (tied to ah).  Every input is cast to
        UDItype so the compiler treats it as a full 64-bit register value.  */
 805 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
 806   do {                                                                  \
 807     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                              \
 808                : "=r" (sh), "=&r" (sl)                                  \
 809                : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
 810                  "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
 811   } while (0)
     /* sub_ddmmss (s390x, 64-bit limbs): "slgr" subtracts bl from sl (tied to
        al), then "slbgr" subtracts bh from sh (tied to ah).  Inputs are cast
        to UDItype for the same reason as in add_ssaaaa.  */
 812 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
 813   do {                                                                  \
 814     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                              \
 815              : "=r" (sh), "=&r" (sl)                                    \
 816              : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
 817                "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
 818   } while (0)
     /* umul_ppmm (s390x): "mlgr" writes into a TImode (128-bit) integer that
        shares storage with two UDItype halves; m0 is tied to the output.  xh
        is read from __h and xl from __l.  */
 819 #define umul_ppmm(xh, xl, m0, m1)                                       \
 820   do {                                                                  \
 821     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 822            struct {UDItype __h, __l;} __i;                              \
 823           } __x;                                                        \
 824     __asm__ ("mlgr\t%0,%2"                                              \
 825              : "=r" (__x.__ll)                                          \
 826              : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
 827     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
 828   } while (0)
     /* udiv_qrnnd (s390x): packs n1:n0 into a TImode integer through the
        union, runs "dlgr" with that value as both input and output and
        (UDItype) d as divisor, then reads q from __l and r from __h.  */
 829 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
 830   do {                                                                  \
 831     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 832            struct {UDItype __h, __l;} __i;                              \
 833           } __x;                                                        \
 834     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
 835     __asm__ ("dlgr\t%0,%2"                                              \
 836              : "=r" (__x.__ll)                                          \
 837              : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
 838     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
 839   } while (0)
 840 #if 0 /* FIXME: Enable for z10 (?) */
     /* count_leading_zeros (disabled by the enclosing "#if 0"): "flogr"
        writes a TImode result for x; only the __h half is used, as cnt.  x is
        passed uncast, unlike the other s390x macros in this section.  */
 841 #define count_leading_zeros(cnt, x)                                     \
 842   do {                                                                  \
 843     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
 844            struct {UDItype __h, __l;} __i;                              \
 845           } __clr_cnt;                                                  \
 846     __asm__ ("flogr\t%0,%1"                                             \
 847              : "=r" (__clr_cnt.__ll)                                    \
 848              : "r" (x) __CLOBBER_CC);                                   \
 849     (cnt) = __clr_cnt.__i.__h;                                          \
 850   } while (0)
 851 #endif
 852 #endif
 853
 854 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
     /* add_ssaaaa (x86, 32-bit): "addl" adds bl into sl (tied to al), then
        "adcl" adds bh into sh (tied to ah).  The %k modifier prints the
        32-bit register name even if sh/sl have another width.  sl is
        early-clobber because it is written before bh is read.  */
 855 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 856   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
 857            : "=r" (sh), "=&r" (sl)                                      \
 858            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
 859              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss (x86, 32-bit): "subl" subtracts bl from sl (tied to al),
        then "sbbl" subtracts bh from sh (tied to ah).  Operand layout matches
        add_ssaaaa except that al carries no "%", since the operands of a
        subtraction cannot be swapped.  */
 860 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 861   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
 862            : "=r" (sh), "=&r" (sl)                                      \
 863            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
 864              "1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* umul_ppmm (x86, 32-bit): one-operand "mull" by v.  u is preloaded into
        the "a" register through the "%0" tie; afterwards w0 is taken from the
        "a" register and w1 from the "d" register.  */
 865 #define umul_ppmm(w1, w0, u, v) \
 866   __asm__ ("mull %3"                                                    \
 867            : "=a" (w0), "=d" (w1)                                       \
 868            : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
     /* udiv_qrnnd (x86, 32-bit): one-operand "divl" by dx.  n0 is preloaded
        into the "a" register and n1 into the "d" register; afterwards q is
        taken from "a" and r from "d".  */
 869 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
 870   __asm__ ("divl %4"                 /* stringification in K&R C */     \
 871            : "=a" (q), "=d" (r)                                         \
 872            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
 873
 874 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
 875 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
 876    significant 1 bit is, hence the use of the following alternatives.  bsfl
 877    is slow too, between 18 and 42 depending where the least significant 1
 878    bit is, so let the generic count_trailing_zeros below make use of the
 879    count_leading_zeros here too.  */
 880
 881 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
 882 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
 883    cache miss reading from __clz_tab.  For P55 it's favoured over the float
 884    below so as to avoid mixing MMX and x87, since the penalty for switching
 885    between the two is about 100 cycles.
 886
 887    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
 888    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
 889    follows, but as of gcc 2.95.2 it results in conditional jumps.
 890
 891        __shift = -(__n < 0x1000000);
 892        __shift -= (__n < 0x10000);
 893        __shift -= (__n < 0x100);
 894
 895    The middle two sbbl and cmpl's pair, and with luck something gcc
 896    generates might pair with the first cmpl and the last sbbl.  The "32+1"
 897    constant could be folded into __clz_tab[], but it doesn't seem worth
 898    making a different table just for that.  */
 899
     /* count_leading_zeros (pentiummmx, not LONGLONG_STANDALONE): the asm
        compares __n against 0x1000000, 0x10000 and 0x100 and accumulates the
        borrows into __shift (0 down to -3, per the comment above).  __shift
        is then rescaled to a bit position (1, 9, 17 or 25) and the remaining
        bits are resolved with a lookup in __clz_tab, which is defined
        elsewhere.  n is evaluated once, into __n.  */
 900 #define count_leading_zeros(c,n)                                        \
 901   do {                                                                  \
 902     USItype  __n = (n);                                                 \
 903     USItype  __shift;                                                   \
 904     __asm__ ("cmpl  $0x1000000, %1\n"                                   \
 905              "sbbl  %0, %0\n"                                           \
 906              "cmpl  $0x10000, %1\n"                                     \
 907              "sbbl  $0, %0\n"                                           \
 908              "cmpl  $0x100, %1\n"                                       \
 909              "sbbl  $0, %0\n"                                           \
 910              : "=&r" (__shift) : "r"  (__n));                           \
 911     __shift = __shift*8 + 24 + 1;                                       \
 912     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
 913   } while (0)
 914 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 915 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
 916
 917 #else /* ! pentiummmx || LONGLONG_STANDALONE */
 918 /* The following should be a fixed 14 cycles or so.  Some scheduling
 919    opportunities should be available between the float load/store too.  This
 920    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
 921    apparently suggested by the Intel optimizing manual (don't know exactly
 922    where).  gcc 2.95 or up will be best for this, so the "double" is
 923    correctly aligned on the stack.  */
     /* count_leading_zeros (pentium, float method): converts n to double and
        reads the upper 32-bit word of its representation through the union
        (a[1]; this assumes the high word is stored second and unsigned is
        32 bits).  Shifting that word right by 20 leaves the biased exponent,
        from which c = 0x3FF + 31 - exponent.  n is asserted non-zero and is
        evaluated more than once.  */
 924 #define count_leading_zeros(c,n)                                        \
 925   do {                                                                  \
 926     union {                                                             \
 927       double    d;                                                      \
 928       unsigned  a[2];                                                   \
 929     } __u;                                                              \
 930     ASSERT ((n) != 0);                                                  \
 931     __u.d = (UWtype) (n);                                               \
 932     (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
 933   } while (0)
 934 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
 935 #endif /* pentiummmx */
 936
 937 #else /* ! pentium */
 938
 939 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
     /* With gcc >= 3.4, forward to count_leading_zeros_gcc_clz, which is
        defined outside this section.  */
 940 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
 941 #endif /* gcc clz */
 942
 943 /* On P6, gcc prior to 3.0 generates a partial register stall for
 944    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
 945    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
 946    cost of one extra instruction.  Do this for "i386" too, since that means
 947    generic x86.  */
 948 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
 949   && (HAVE_HOST_CPU_i386                                                \
 950       || HAVE_HOST_CPU_i686                                             \
 951       || HAVE_HOST_CPU_pentiumpro                                       \
 952       || HAVE_HOST_CPU_pentium2                                         \
 953       || HAVE_HOST_CPU_pentium3)
     /* count_leading_zeros (gcc < 3 on the listed P6-class/generic hosts):
        "bsrl" puts its result for x in __cbtmp, and count is 31 - __cbtmp.
        The subtraction is used instead of an xor to avoid the partial
        register stall described in the comment above.  x is asserted
        non-zero and is evaluated more than once.  */
 954 #define count_leading_zeros(count, x)                                   \
 955   do {                                                                  \
 956     USItype __cbtmp;                                                    \
 957     ASSERT ((x) != 0);                                                  \
 958     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 959     (count) = 31 - __cbtmp;                                             \
 960   } while (0)
 961 #endif /* gcc<3 asm bsrl */
 962
 963 #ifndef count_leading_zeros
     /* count_leading_zeros (x86 fallback, used only if nothing above defined
        it): "bsrl" puts its result for x in __cbtmp, and count is
        __cbtmp ^ 31, which equals 31 - __cbtmp for values in 0..31.  x is
        asserted non-zero and is evaluated more than once.  */
 964 #define count_leading_zeros(count, x)                                   \
 965   do {                                                                  \
 966     USItype __cbtmp;                                                    \
 967     ASSERT ((x) != 0);                                                  \
 968     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
 969     (count) = __cbtmp ^ 31;                                             \
 970   } while (0)
 971 #endif /* asm bsrl */
 972
 973 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
     /* With gcc >= 3.4, forward to count_trailing_zeros_gcc_ctz, which is
        defined outside this section.  */
 974 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
 975 #endif /* gcc ctz */
 976
 977 #ifndef count_trailing_zeros
     /* count_trailing_zeros (x86 fallback): "bsfl" writes its result for x
        straight into count; %k0 prints the 32-bit register name whatever the
        width of count.  x is asserted non-zero and is evaluated more than
        once.  */
 978 #define count_trailing_zeros(count, x)                                  \
 979   do {                                                                  \
 980     ASSERT ((x) != 0);                                                  \
 981     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
 982   } while (0)
 983 #endif /* asm bsfl */
 984
 985 #endif /* ! pentium */
 986
 987 #ifndef UMUL_TIME
 988 #define UMUL_TIME 10
 989 #endif
 990 #ifndef UDIV_TIME
 991 #define UDIV_TIME 40
 992 #endif
 993 #endif /* 80x86 */
 994
 995 #if defined (__amd64__) && W_TYPE_SIZE == 64
     /* add_ssaaaa (amd64, 64-bit limbs): "addq" adds bl into sl (tied to al),
        then "adcq" adds bh into sh (tied to ah).  %q prints the 64-bit
        register name; bh and bl may be a register, memory or an "e"
        immediate.  sl is early-clobber because it is written before bh is
        read.  */
 996 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 997   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
 998            : "=r" (sh), "=&r" (sl)                                      \
 999            : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
1000              "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* sub_ddmmss (amd64, 64-bit limbs): "subq" subtracts bl from sl (tied to
        al), then "sbbq" subtracts bh from sh (tied to ah).  Same operand
        layout as add_ssaaaa, without the commutative "%" on al.  */
1001 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1002   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
1003            : "=r" (sh), "=&r" (sl)                                      \
1004            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
1005              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
     /* umul_ppmm (amd64): one-operand "mulq" by v.  u is preloaded into the
        "a" register through the "%0" tie; afterwards w0 is taken from the "a"
        register and w1 from the "d" register.  */
1006 #define umul_ppmm(w1, w0, u, v) \
1007   __asm__ ("mulq %3"                                                    \
1008            : "=a" (w0), "=d" (w1)                                       \
1009            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
     /* udiv_qrnnd (amd64): one-operand "divq" by dx.  n0 is preloaded into
        the "a" register and n1 into the "d" register; afterwards q is taken
        from "a" and r from "d".  */
1010 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1011   __asm__ ("divq %4"                 /* stringification in K&R C */     \
1012            : "=a" (q), "=d" (r)                                         \
1013            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1014 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
     /* count_leading_zeros (amd64): "bsrq" puts its result for x in the
        64-bit temporary __cbtmp, and count is __cbtmp ^ 63, which equals
        63 - __cbtmp for values in 0..63.  x is asserted non-zero and is
        evaluated more than once.  */
1015 #define count_leading_zeros(count, x)                                   \
1016   do {                                                                  \
1017     UDItype __cbtmp;                                                    \
1018     ASSERT ((x) != 0);                                                  \
1019     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
1020     (count) = __cbtmp ^ 63;                                             \
1021   } while (0)
1022 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1023    count is only an int. */
     /* count_trailing_zeros (amd64): "bsfq" writes its result for x straight
        into count, with %q0 printing the 64-bit register name.  x is asserted
        non-zero and is evaluated more than once.  */
1024 #define count_trailing_zeros(count, x)                                  \
1025   do {                                                                  \
1026     ASSERT ((x) != 0);                                                  \
1027     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
1028   } while (0)
1029 #endif /* x86_64 */
1030
1031 #if defined (__i860__) && W_TYPE_SIZE == 32
1032 #define rshift_rhlc(r,h,l,c) \
1033   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
1034            "=r" (r) : "r" (h), "r" (l), "rn" (c))
1035 #endif /* i860 */
1036
1037 #if defined (__i960__) && W_TYPE_SIZE == 32
     /* add_ssaaaa (i960): "cmpo 1,0" runs first (presumably to put the carry
        flag in a known state), then one "addc" forms sl from al and bl and a
        second "addc" forms sh from ah and bh.  sl is early-clobber because it
        is written before ah/bh are read.  */
1038 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1039   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
1040            : "=r" (sh), "=&r" (sl)                                      \
1041            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
     /* sub_ddmmss (i960): "cmpo 0,0" runs first (presumably to put the carry
        flag in a known state), then one "subc" forms sl from al and bl and a
        second "subc" forms sh from ah and bh.  */
1042 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1043   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
1044            : "=r" (sh), "=&r" (sl)                                      \
1045            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
     /* umul_ppmm (i960): "emul" writes a UDItype result for u and v into
        __x.__ll; the union (with __l declared first) then hands __h to w1 and
        __l to w0.  u and v are passed uncast and may be swapped ("%dI").  */
1046 #define umul_ppmm(w1, w0, u, v) \
1047   ({union {UDItype __ll;                                                \
1048            struct {USItype __l, __h;} __i;                              \
1049           } __x;                                                        \
1050   __asm__ ("emul %2,%1,%0"                                              \
1051            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
1052   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
     /* __umulsidi3 (i960): statement expression using the same "emul"
        instruction and constraints as umul_ppmm, but evaluating to the whole
        UDItype result __w instead of splitting it.  */
1053 #define __umulsidi3(u, v) \
1054   ({UDItype __w;                                                        \
1055     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
1056     __w; })
1057 #define udiv_qrnnd(q, r, nh, nl, d) \
1058   do {                                                                  \
1059     union {UDItype __ll;                                                \
1060            struct {USItype __l, __h;} __i;                              \
1061           } __nn;                                                       \
1062     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
1063     __asm__ ("ediv %d,%n,%0"                                            \
1064            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
1065     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
1066   } while (0)
     /* count_leading_zeros (i960): "scanbit" puts its result for x in
        __cbtmp, and count is __cbtmp ^ 31.  There is no non-zero assertion
        here; the value produced for x == 0 is recorded separately in
        COUNT_LEADING_ZEROS_0 just below.  */
1067 #define count_leading_zeros(count, x) \
1068   do {                                                                  \
1069     USItype __cbtmp;                                                    \
1070     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
1071     (count) = __cbtmp ^ 31;                                             \
1072   } while (0)
1073 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1074 #if defined (__i960mx)          /* what is the proper symbol to test??? */
1075 #define rshift_rhlc(r,h,l,c) \
1076   do {                                                                  \
1077     union {UDItype __ll;                                                \
1078            struct {USItype __l, __h;} __i;                              \
1079           } __nn;                                                       \
1080     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
1081     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
1082   }
1083 #endif /* i960mx */
1084 #endif /* i960 */
1085
1086 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1087      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1088      || defined (__mc5307__)) && W_TYPE_SIZE == 32
     /* add_ssaaaa (m68k family): "add%.l" adds bl into sl (tied to al), then
        "addx%.l" adds bh into sh (tied to ah).  sh, sl and bh are restricted
        to "d" registers while bl may be any general operand; sl is
        early-clobber because it is written before bh is read.  */
1089 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1090   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
1091            : "=d" (sh), "=&d" (sl)                                      \
1092            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
1093              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
     /* sub_ddmmss (m68k family): "sub%.l" subtracts bl from sl (tied to al),
        then "subx%.l" subtracts bh from sh (tied to ah).  Same operand layout
        as add_ssaaaa, without the commutative "%" on al.  */
1094 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1095   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
1096            : "=d" (sh), "=&d" (sl)                                      \
1097            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
1098              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1099 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1100 #if defined (__mc68020__) || defined(mc68020) \
1101      || defined (__mc68030__) || defined (mc68030) \
1102      || defined (__mc68040__) || defined (mc68040) \
1103      || defined (__mcpu32__) || defined (mcpu32) \
1104      || defined (__NeXT__)
/* (w1,w0) = u * v as one mulu.l writing the register pair %1:%0.
   u is preloaded into the low result register (constraint "%0"); v may be
   a data register, memory or an immediate ("dmi").  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"                                           \
           : "=d" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
/* Unsigned divide of the two-word value (n1,n0) by d with one divu.l:
   n0 is preloaded into operand 0, which returns q, and n1 into operand 1,
   which returns r.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
/* Signed counterpart of udiv_qrnnd using divs.l; the operand layout is
   identical.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1119 #else /* for other 68k family members use 16x16->32 multiplication */
/* (xh,xl) = a * b assembled from four 16x16->32 mulu.w partial products.
   Operand map: %0 = xh, %1 = xl, %2 = scratch preloaded with a (tied via
   "%2"), %3 = second scratch, %5 = b.
   Sequence: low*low goes to %1 and high*high to %0; the two cross products
   are formed in %2 and %3 and summed, and a carry out of that sum adds
   0x10000 to the high word.  The cross sum is then split into its upper
   half (kept in %2) and its lower half shifted up 16 bits (%3), which are
   added into %1 and, with carry via addx, into %0.
   Every output except %2 is early-clobbered because it is written while
   the inputs are still live.  */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;                                \
        __asm__ ("| Inlined umul_ppmm\n"                                \
"       move%.l %5,%3\n"                                                \
"       move%.l %2,%0\n"                                                \
"       move%.w %3,%1\n"                                                \
"       swap    %3\n"                                                   \
"       swap    %0\n"                                                   \
"       mulu%.w %2,%1\n"                                                \
"       mulu%.w %3,%0\n"                                                \
"       mulu%.w %2,%3\n"                                                \
"       swap    %2\n"                                                   \
"       mulu%.w %5,%2\n"                                                \
"       add%.l  %3,%2\n"                                                \
"       jcc     1f\n"                                                   \
"       add%.l  %#0x10000,%0\n"                                         \
"1:     move%.l %2,%3\n"                                                \
"       clr%.w  %2\n"                                                   \
"       swap    %2\n"                                                   \
"       swap    %3\n"                                                   \
"       clr%.w  %3\n"                                                   \
"       add%.l  %3,%1\n"                                                \
"       addx%.l %2,%0\n"                                                \
"       | End inlined umul_ppmm"                                        \
              : "=&d" (xh), "=&d" (xl),                                 \
                "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
              : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
  } while (0)
/* No udiv_qrnnd is defined in this branch; only the timing figures are.  */
#define UMUL_TIME 100
#define UDIV_TIME 400
1150 #endif /* not mc68020 */
1151 /* The '020, '030, '040 and '060 have bitfield insns.
1152    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1153    exclude bfffo on that chip (bitfield insns not available).  */
1154 #if (defined (__mc68020__) || defined (mc68020)    \
1155      || defined (__mc68030__) || defined (mc68030) \
1156      || defined (__mc68040__) || defined (mc68040) \
1157      || defined (__mc68060__) || defined (mc68060) \
1158      || defined (__NeXT__))                        \
1159   && ! defined (__mcpu32__)
/* count = number of leading zero bits of x, from a single bfffo.
   Both the bit-field offset and the width are printed from operand 2, the
   constant 0 (NOTE(review): a width of 0 presumably encodes the full 32
   bits -- confirm against the 68020 manual).  x may live in a data
   register or offsettable memory ("od").  */
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
           : "=d" (count)                                               \
           : "od" ((USItype) (x)), "n" (0))
/* By longlong.h convention: the result count_leading_zeros gives for 0.  */
#define COUNT_LEADING_ZEROS_0 32
1165 #endif
1166 #endif /* mc68000 */
1167
1168 #if defined (__m88000__) && W_TYPE_SIZE == 32
/* Two-word add: (sh,sl) = (ah,al) + (bh,bl).  addu.co forms the low word
   from al and bl (operands 4 and 5), addu.ci forms the high word from ah
   and bh (operands 2 and 3); the .co/.ci suffixes presumably mean carry-out
   and carry-in.  sl is early-clobbered since it is written before the
   high-word inputs are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
/* Two-word subtract: (sh,sl) = (ah,al) - (bh,bl); same layout as
   add_ssaaaa with subu.co/subu.ci and no commutative operand.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* count = leading zero bits of x: the raw `ff1' result is XORed with 31 to
   turn it into a leading-zero count.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
/* By longlong.h convention: the result count_leading_zeros gives for 0.  */
#define COUNT_LEADING_ZEROS_0 63 /* sic */
1184 #if defined (__m88110__)
/* (wh,wl) = u * v.  mulu.d produces a 64-bit result that is stored in the
   union and then split; __h is declared before __l, i.e. the high word is
   the first member of the overlay.  */
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
/* q = (n1,n0) / d, r = (n1,n0) % d.
   The dividend is assembled in __x (high word first), divu.d leaves a
   64-bit quotient in __q, and its low word is taken as q.  The remainder
   is reconstructed as n0 - q * d in single-word arithmetic.
   The quotient's low word must be read as __q.__i.__l: the union has no
   direct member __l, so the former `__q.__l' spelling did not compile.
   Note that n0 and d are each evaluated twice.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x, __q;                                                   \
  __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
  __asm__ ("divu.d %0,%1,%2"                                            \
           : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UMUL_TIME 5
#define UDIV_TIME 25
1204 #else
/* Timing figures used when __m88110__ is not defined; no umul_ppmm or
   udiv_qrnnd is supplied in that case.  */
#define UMUL_TIME 17
#define UDIV_TIME 150
1207 #endif /* __m88110__ */
1208 #endif /* __m88000__ */
1209
1210 #if defined (__mips) && W_TYPE_SIZE == 32
1211 #if __GMP_GNUC_PREREQ (4,4)
/* (w1,w0) = u * v in plain C: widen u to 64 bits, multiply by v, then
   hand back the upper and lower 32-bit halves of the product.  */
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    UDItype __prod = (UDItype)(u);                                      \
    __prod *= (v);                                                      \
    w1 = __prod >> 32;                                                  \
    w0 = __prod;                                                        \
  } while (0)
1218 #endif
/* Asm fallbacks, each used only if no umul_ppmm has been defined yet.
   This variant issues multu alone and lets the compiler fetch the two
   halves through the "=l" and "=h" output constraints.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Last resort: move the halves into ordinary registers explicitly with
   mflo (w0) and mfhi (w1).  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
1230 #endif /* __mips */
1231
1232 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1233 #if __GMP_GNUC_PREREQ (4,4)
/* (w1,w0) = u * v in plain C: widen u to a 128-bit unsigned integer
   (declared through the TI machine mode), multiply by v, then hand back
   the upper and lower 64-bit halves of the product.  */
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
    __ll_UTItype __prod = (__ll_UTItype)(u);                            \
    __prod *= (v);                                                      \
    w1 = __prod >> 64;                                                  \
    w0 = __prod;                                                        \
  } while (0)
1241 #endif
/* Asm fallbacks, each used only if no umul_ppmm has been defined yet.
   This variant issues dmultu alone and lets the compiler fetch the two
   halves through the "=l" and "=h" output constraints.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Last resort: move the halves into ordinary registers explicitly with
   mflo (w0) and mfhi (w1).  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
1253 #endif /* __mips */
1254
1255 #if defined (__mmix__) && W_TYPE_SIZE == 64
/* (w1,w0) = u * v with one MULU.  The instruction names only w0 as its
   destination; w1 is obtained through the "=z" constraint
   (NOTE(review): presumably the special register that receives the high
   half -- confirm against GCC's MMIX constraint list).  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1258 #endif
1259
1260 #if defined (__ns32000__) && W_TYPE_SIZE == 32
/* (w1,w0) = u * v.  `meid' writes a 64-bit result into the union, which is
   preloaded with u through the "%0" tie; the result is then split, with
   __l declared before __h (low word is the first member of the overlay).
   Uses a GNU statement expression.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("meid %2,%0"                                                 \
           : "=g" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Same multiplication, but yields the whole 64-bit product as the value
   of the statement expression instead of storing two halves.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("meid %2,%0"                                               \
             : "=g" (__w)                                               \
             : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
    __w; })
/* Divide the two-word value (n1,n0) by d with `deid'.  The dividend is
   assembled in the union and passed in and out through the same operand
   ("0" tie); afterwards the low member holds r and the high member q.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
  __asm__ ("deid %2,%0"                                                 \
           : "=r" (__x.__ll)                                            \
           : "0" (__x.__ll), "g" ((USItype)(d)));                       \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* count = number of trailing zero bits of x via `ffsd'.  The output
   register is preloaded with 0 through the "0" tie
   (NOTE(review): presumably the bit offset at which the scan starts --
   confirm against the NS32000 manual).  */
#define count_trailing_zeros(count,x) \
  do {                                                                  \
    __asm__ ("ffsd      %2,%0"                                          \
             : "=r" (count)                                             \
             : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
  } while (0)
1290 #endif /* __ns32000__ */
1291
1292 /* In the past we had a block of various #defines tested
1293        _ARCH_PPC    - AIX
1294        _ARCH_PWR    - AIX
1295        __powerpc__  - gcc
1296        __POWERPC__  - BEOS
1297        __ppc__      - Darwin
1298        PPC          - old gcc, GNU/Linux, SysV
1299    The plain PPC test was not good for vxWorks, since PPC is defined on all
1300    CPUs there (eg. m68k too), as a constant one is expected to compare
1301    CPU_FAMILY against.
1302
1303    At any rate, this was pretty unattractive and a bit fragile.  The use of
1304    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1305    getting the desired effect.
1306
1307    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1308    the system vendor compilers.  (Is that vendor compilers with inline asm,
1309    or what?)  */
1310
1311 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1312   && W_TYPE_SIZE == 32
/* Two-word add: (sh,sl) = (ah,al) + (bh,bl).
   Three variants are selected on bh at compile time:
     - bh is the constant 0:        high word via aze/addze on ah alone;
     - bh is the constant all-ones: high word via ame/addme on ah alone;
     - otherwise:                   high word via ae/adde on ah and bh.
   In the first two cases bh is not passed to the asm at all.
   `{old|new}' in the templates offers two spellings of each mnemonic for
   the assembler dialect in use; %I<n> presumably appends `i' when operand
   n is an immediate (bl accepts "rI").  sl is early-clobbered because the
   low word is written before ah/bh are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
/* Two-word subtract: (sh,sl) = (ah,al) - (bh,bl).
   The low word always comes from sf/subfc with al (register or immediate,
   "rI") and bl.  The high word is specialised on compile-time constants:
     - ah == 0:        sfze/subfze applied to bh;
     - ah == all-ones: sfme/subfme applied to bh;
     - bh == 0:        ame/addme applied to ah;
     - bh == all-ones: aze/addze applied to ah;
     - otherwise:      sfe/subfe on bh and ah.
   In each specialised case the constant high-word operand is not passed to
   the asm.  Note the reversed operand order in the templates: the subf
   family names the subtrahend first.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (ah) && (ah) == 0)                         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)                    \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else                                                                \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
  } while (0)
/* count = number of leading zero bits of the 32-bit word x, from a single
   cntlz/cntlzw (two spellings of the mnemonic, chosen by dialect).  */
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
/* By longlong.h convention: the result count_leading_zeros gives for 0.  */
#define COUNT_LEADING_ZEROS_0 32
1348 #if HAVE_HOST_CPU_FAMILY_powerpc
1349 #if __GMP_GNUC_PREREQ (4,4)
/* (w1,w0) = u * v in plain C: widen u to 64 bits, multiply by v, then
   hand back the upper and lower 32-bit halves of the product.  */
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    UDItype __prod = (UDItype)(u);                                      \
    __prod *= (v);                                                      \
    w1 = __prod >> 32;                                                  \
    w0 = __prod;                                                        \
  } while (0)
1356 #endif
1357 #if !defined (umul_ppmm)
/* (ph,pl) = m0 * m1, unsigned.  mulhwu supplies the high word; the low
   word is the ordinary truncating C product.
   Both the asm and the C product read the captured temporaries __m0 and
   __m1, so each macro argument is evaluated exactly once and is converted
   to USItype before it reaches the instruction.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
1364 #endif
#define UMUL_TIME 15
/* (ph,pl) = m0 * m1, signed.  mulhw supplies the high word; the low word
   is the truncated product.
   The asm reads the captured temporaries __m0 and __m1 so each macro
   argument is evaluated exactly once.  The low word is multiplied in
   unsigned arithmetic and converted back to SItype: the bit pattern is the
   same, but the signed-overflow undefined behaviour of a direct
   SItype * SItype product is avoided.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    SItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = (SItype) ((USItype) __m0 * (USItype) __m1);                  \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
1374 #else
/* Definitions for the branch where HAVE_HOST_CPU_FAMILY_powerpc is false.
   No umul_ppmm is supplied here, only its timing figure.  */
#define UMUL_TIME 8
/* (xh,xl) = m0 * m1 with one `mul': the instruction names xh as its
   destination and xl is collected through the "=q" constraint
   (NOTE(review): presumably the MQ register -- confirm).  */
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define SMUL_TIME 4
/* Signed divide of (nh,nl) by d with one `div': q is the named
   destination, nl is preloaded into the "q"-class operand 1, which
   returns r.  */
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
1382 #endif
1383 #endif /* 32-bit POWER architecture variants.  */
1384
1385 /* We should test _IBMR2 here when we add assembly support for the system
1386    vendor compilers.  */
1387 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1388 #if !defined (_LONG_LONG_LIMB)
1389 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1390    use adde etc only when not _LONG_LONG_LIMB.  */
/* Two-word add on 64-bit words: (sh,sl) = (ah,al) + (bh,bl).
   Three variants are selected on bh at compile time:
     - bh is the constant 0:        high word via aze/addze on ah alone;
     - bh is the constant all-ones: high word via ame/addme on ah alone;
     - otherwise:                   high word via ae/adde on ah and bh.
   In the first two cases bh is not passed to the asm at all.  sl is
   early-clobbered because the low word is written before ah/bh are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
/* Two-word subtract on 64-bit words: (sh,sl) = (ah,al) - (bh,bl).
   When bl is a compile-time constant in (-0x8000, 0x8000], the low word is
   formed by adding the immediate -bl with ai/addic; otherwise it is formed
   by sf/subfc.  In both halves the high word is specialised on ah or bh
   being the constant 0 or all-ones, exactly as in the 32-bit variant.
   bl is parenthesized everywhere it is used, so an expression argument
   such as `a - b' is range-checked and negated as a whole.
   We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                        \
    if (__builtin_constant_p (bl) && (bl) > -0x8000 && (bl) <= 0x8000) {      \
        if (__builtin_constant_p (ah) && (ah) == 0)                           \
          __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"               \
                   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-(bl))); \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
          __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"               \
                   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-(bl))); \
        else if (__builtin_constant_p (bh) && (bh) == 0)                      \
          __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"                 \
                   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-(bl))); \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
          __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"                 \
                   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-(bl))); \
        else                                                                  \
          __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"              \
                   : "=r" (sh), "=&r" (sl)                                    \
                   : "r" (ah), "r" (bh), "rI" (al), "*rI" (-(bl)));           \
      } else {                                                                \
        if (__builtin_constant_p (ah) && (ah) == 0)                           \
          __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"         \
                   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
          __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"         \
                   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
        else if (__builtin_constant_p (bh) && (bh) == 0)                      \
          __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"           \
                   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
          __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"           \
                   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
        else                                                                  \
          __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"        \
                   : "=r" (sh), "=&r" (sl)                                    \
                   : "r" (ah), "r" (bh), "rI" (al), "r" (bl));                \
      }                                                                       \
  } while (0)
1444 #endif /* ! _LONG_LONG_LIMB */
/* count = number of leading zero bits of the 64-bit word x, from a single
   cntlzd.  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
/* By longlong.h convention: the result count_leading_zeros gives for 0.  */
#define COUNT_LEADING_ZEROS_0 64
/* Plain-C umul_ppmm through a 128-bit (TI mode) product.  Compiled out by
   the leading `0 &&'; the asm version further down is used instead.  */
#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
    w1 = __ll >> 64;                                                    \
    w0 = __ll;                                                          \
  } while (0)
#endif
1457 #if !defined (umul_ppmm)
/* (ph,pl) = m0 * m1, unsigned, on 64-bit words.  mulhdu supplies the high
   word; the low word is the ordinary truncating C product.
   Both the asm and the C product read the captured temporaries __m0 and
   __m1, so each macro argument is evaluated exactly once and is converted
   to UDItype before it reaches the instruction.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
1464 #endif
#define UMUL_TIME 15
/* (ph,pl) = m0 * m1, signed, on 64-bit words.  mulhd supplies the high
   word; the low word is the truncated product.
   The asm reads the captured temporaries __m0 and __m1 so each macro
   argument is evaluated exactly once.  The low word is multiplied in
   unsigned arithmetic and converted back to DItype: the bit pattern is the
   same, but the signed-overflow undefined behaviour of a direct
   DItype * DItype product is avoided.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    DItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = (DItype) ((UDItype) __m0 * (UDItype) __m1);                  \
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
1474 #endif /* 64-bit PowerPC.  */
1475
1476 #if defined (__pyr__) && W_TYPE_SIZE == 32
/* Two-word add: (sh,sl) = (ah,al) + (bh,bl).  Operand 1 is preloaded with
   al ("%1") and bl is added into it with addw; operand 0 is preloaded with
   ah ("0") and bh is added into it with addwc (presumably add with carry).
   sl is early-clobbered because it is written before bh is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Two-word subtract: (sh,sl) = (ah,al) - (bh,bl); same layout with
   subw/subwb and no commutative operand.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
/* (w1,w0) = u * v.  u is first moved into %R0 (presumably the second
   register of the 64-bit pair backing the union), then uemul multiplies by
   v and leaves the 64-bit product in the union, which is split with __h as
   the first member.  The output is early-clobbered because it is written
   before v is read.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
           : "=&r" (__x.__ll)                                           \
           : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1496 #endif /* __pyr__ */
1497
1498 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* Two-word add: (sh,sl) = (ah,al) + (bh,bl).  Operand 1 is preloaded with
   al ("%1") and bl is added into it with `a'; operand 0 is preloaded with
   ah ("0") and bh is added into it with `ae' (presumably add extended,
   i.e. with carry).  sl is early-clobbered because it is written before bh
   is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "r" ((USItype)(bl)))
/* Two-word subtract: (sh,sl) = (ah,al) - (bh,bl); same layout with s/se
   and no commutative operand.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* (ph,pl) = m0 * m1, signed, built from multiply-step instructions.
   r2 is cleared and used as the accumulator (hence the "r2" clobber), m0
   is moved into special register r10 with mts, and sixteen `m r2,%3' steps
   are issued against m1.  Finally r2 is copied to ph with cas and r10 is
   read back into pl with mfs.
   NOTE(review): each `m' presumably retires two multiplier bits, giving
   32 bits over 16 steps -- confirm against the ROMP manual.
   Only timing figures, not a umul_ppmm, are defined alongside.  */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (                                                             \
       "s       r2,r2\n"                                                \
"       mts r10,%2\n"                                                   \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       cas     %0,r2,r0\n"                                             \
"       mfs     r10,%1"                                                 \
           : "=r" (ph), "=r" (pl)                                       \
           : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
           : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
/* count = number of leading zero bits of the 32-bit word x.
   `clz' is applied to one 16-bit half at a time: if anything is set in the
   upper half, that half is shifted down and counted; otherwise the lower
   half is counted and 16 is added.
   x is captured once in an USItype temporary, so the argument is evaluated
   a single time and the half-selection test is made on the same unsigned
   value that is handed to the instruction (a negative signed argument
   would otherwise compare below 0x10000).  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __clz_x = (x);                                              \
    if (__clz_x >= 0x10000)                                             \
      __asm__ ("clz     %0,%1"                                          \
               : "=r" (count) : "r" (__clz_x >> 16));                   \
    else                                                                \
      {                                                                 \
        __asm__ ("clz   %0,%1"                                          \
                 : "=r" (count) : "r" (__clz_x));                       \
        (count) += 16;                                                  \
      }                                                                 \
  } while (0)
1548 #endif /* RT/ROMP */
1549
1550 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* (w1,w0) = u * v.  dmulu.l leaves the product in the MACH:MACL register
   pair, which is then copied out with sts: macl to w0 and mach to w1.
   Both are listed as clobbers.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
           : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
1555 #endif
1556
1557 #if defined (__sparc__) && W_TYPE_SIZE == 32
/* Two-word add: (sh,sl) = (ah,al) + (bh,bl).  addcc forms the low word
   from al and bl and sets the condition codes; addx forms the high word
   from ah and bh (presumably taking in the carry).  bh and bl may be
   immediates ("rI"); the "J" alternative on ah/al together with the %r
   modifier presumably allows a constant zero to be emitted as %g0.
   __CLOBBER_CC is defined elsewhere in this file.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
           __CLOBBER_CC)
/* Two-word subtract: (sh,sl) = (ah,al) - (bh,bl); same layout with
   subcc/subx and no commutative operand.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
           __CLOBBER_CC)
1568 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1569    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1570 #if defined (__sparc_v9__) || defined (__sparcv9)
1571 /* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
/* Disabled v9 variant: one 64-bit mulx into %g1, then the low and high
   32-bit halves are extracted with srl and srlx.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
           "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
/* umul writes the low word to w0; the high word is read back from the %y
   register into w1.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
/* q = (n1,n0) / d, r = remainder.  n1 is moved into %y, followed by three
   nops (presumably to cover the write-to-%y latency), then udiv divides by
   d with n0 as the low dividend word.  The remainder is reconstructed in C
   as n0 - q * d, so n0 and d are each evaluated twice.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
1592 #else
1593 #if defined (__sparc_v8__)   /* gcc normal */                           \
1594   || defined (__sparcv8)     /* gcc solaris */                          \
1595   || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
/* (w1,w0) = u * v: umul writes the low word to w0 and the high word is
   read back from the %y register into w1.  Both inputs are register-only
   for the reason given above.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
1603
/* On SuperSPARC only the timing figure is defined; no udiv_qrnnd is
   supplied here, for the reason given in the comment below.  */
#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60            /* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
/* q = (n1,n0) / d, r = remainder.  n1 is moved into %y, followed by three
   nops (presumably to cover the write-to-%y latency), then udiv divides by
   d with n0 as the low dividend word.  The remainder is reconstructed in C
   as n0 - q * d, so n0 and d are each evaluated twice.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */
1619
1620 #else /* ! __sparc_v8__ */
1621 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
/* (w1,w0) = u * v: umul writes the low word to w0 and the high word is
   read back from the %y register into w1.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
/* udiv_qrnnd (q, r, n1, n0, d) for SPARClite, built from the divscc
   divide-step instruction.  N1 is written to %y, then 32 divscc steps are
   chained through %g1 starting from N0; the destination of the last step
   is Q.  R is read back from %y and, when the "bl" branch is taken, D is
   added to it (the add sits in the annulled delay slot of that branch).

   Q and R are earlyclobber ("=&r"): both are written before the final
   "add %1,%4,%1" reads D, so neither may be allocated to the register that
   holds an input.  Clobbers %g1, plus whatever __AND_CLOBBER_CC names.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"                                     \
"       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
"       tst     %%g0\n"                                                 \
"       divscc  %3,%4,%%g1\n"                                           \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%0\n"                                           \
"       rd      %%y,%1\n"                                               \
"       bl,a 1f\n"                                                      \
"       add     %1,%4,%1\n"                                             \
"1:     ! End of inline udiv_qrnnd"                                     \
           : "=&r" (q), "=&r" (r) : "r" (n1), "r" (n0), "rI" (d)        \
           : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
/* count_leading_zeros (count, x): one SPARClite "scan" instruction with X
   as the first source operand, the constant 1 as the second, and COUNT as
   the destination.  */
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
1676 #endif /* __sparc_v8__ */
1677 #endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
/* umul_ppmm (w1, w0, u, v) without a hardware multiply instruction.

   U is written to %y, %g1 is set up by "andcc %g0,0,%g1", and then 32
   "mulscc" steps with V as the operand, plus one final step with operand
   0, run through %g1.  %g2 is computed beforehand as U masked by the
   arithmetic shift "sra V,31", i.e. it is U when the top bit of V is set
   and 0 otherwise; it is added to %g1 to form W1.  W0 is read from %y.

   The instructions marked "Don't move this insn" are positioned relative
   to the "wr" (see the "0-3 delay insn" remark on that line).  Clobbers
   %g1 and %g2, plus whatever __AND_CLOBBER_CC names.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"                                      \
"       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
"       sra     %3,31,%%g2      ! Don't move this insn\n"               \
"       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
"       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,0,%%g1\n"                                          \
"       add     %%g1,%%g2,%0\n"                                         \
"       rd      %%y,%1"                                                 \
           : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
           : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39            /* 39 instructions */
#endif
1725 #ifndef udiv_qrnnd
1726 #ifndef LONGLONG_STANDALONE
/* udiv_qrnnd (q, r, n1, n0, d): no inline version was selected above, so
   call the out-of-line __MPN(udiv_qrnnd) routine.  Its return value is
   stored in Q; the word it stores through its first (pointer) argument is
   stored in R.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __udiv_qrnnd_rem;                                            \
    (q) = __MPN(udiv_qrnnd) (&__udiv_qrnnd_rem, (n1), (n0), (d));       \
    (r) = __udiv_qrnnd_rem;                                             \
  } while (0)
/* Prototype of the out-of-line divide used by the udiv_qrnnd macro above:
   the first argument is a pointer through which a word is returned (the
   macro assigns it to R), and the function result is assigned to Q.  */
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
/* Only set UDIV_TIME if no earlier definition exists.  */
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
1736 #endif /* LONGLONG_STANDALONE */
1737 #endif /* udiv_qrnnd */
1738 #endif /* __sparc__ */
1739
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* add_ssaaaa (sh, sl, ah, al, bh, bl) with 64-bit words.  Three
   instructions: "addcc" of AL and BL into SL; "addccc" of the upper 32
   bits of AL and BL (passed as the extra operands %6 and %7) with the
   result discarded into %g0; and "addc" of AH and BH into SH.
   NOTE(review): presumably the middle instruction exists to turn the
   32-bit carry left by addcc into the carry out of the full 64-bit low
   word, which the final addc then consumes -- confirm against the SPARC
   V9 manual.
   SL is earlyclobber because it is written by the first instruction,
   before the remaining inputs have been read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "addcc   %r4,%5,%1\n"                                            \
      " addccc  %r6,%7,%%g0\n"                                          \
      " addc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
            "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
           __CLOBBER_CC)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): same three-instruction shape as
   add_ssaaaa above, using subcc / subccc / subc.  The operands carry no
   '%' (commutative) markers here.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "subcc   %r4,%5,%1\n"                                            \
      " subccc  %r6,%7,%%g0\n"                                          \
      " subc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),         \
            "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
           __CLOBBER_CC)
#endif
1760
#if defined (__vax__) && W_TYPE_SIZE == 32
/* add_ssaaaa (sh, sl, ah, al, bh, bl): two-operand adds.  AH and AL are
   tied to the outputs SH and SL by the matching constraints "0" and "%1",
   so "addl2" adds BL into SL and "adwc" then adds BH into SH.  SL is
   earlyclobber since it is written before BH is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* sub_ddmmss (sh, sl, ah, al, bh, bl): same operand layout as add_ssaaaa,
   using "subl2" on the low words and "sbwc" on the high words.  The low
   input is tied with "1" (no commutative marker).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* smul_ppmm (xh, xl, m0, m1): one "emul" of M0 by M1 with a literal 0 as
   the third operand, writing a double-word result into __x.__ll.  The two
   halves are then read back through the union, with __l declared first
   and __h second.  M0 and M1 are copied to locals first, so each is
   evaluated once.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("emul %1,%2,$0,%0"                                         \
             : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
/* sdiv_qrnnd (q, r, n1, n0, d): N1 and N0 are packed into a double-word
   union (N1 into __h, N0 into __l) which is handed to "ediv" together with
   D; the instruction's last two operands are Q and R.
   NOTE(review): n1 and n0 are used unparenthesized on the right-hand side
   of the assignments below, so pass simple expressions.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("ediv %3,%2,%0,%1"                                         \
             : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
  } while (0)
/* Disabled ("#if 0"): with this compiled out, no VAX-specific
   count_trailing_zeros is defined in this section.  */
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)                                   \
  do {                                                                  \
    __asm__ ("ffs 0, 31, %1, %0"                                        \
             : "=g" (count)                                             \
             : "g" ((USItype) (x)));                                    \
  } while (0)
#endif
#endif /* __vax__ */
1802
#if defined (__z8000__) && W_TYPE_SIZE == 16
/* add_ssaaaa (sh, sl, ah, al, bh, bl): AH and AL are tied to the outputs
   SH and SL ("0" and "%1"); "add" adds BL into SL and "adc" then adds BH
   into SH.  SL is earlyclobber since it is written before BH is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
             "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* sub_ddmmss (sh, sl, ah, al, bh, bl): same operand layout as add_ssaaaa,
   using "sub" on the low words and "sbc" on the high words.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
             "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* umul_ppmm (xh, xl, m0, m1): unsigned product built from the "mult"
   instruction.  After the multiply, XH is adjusted by adding M1 when the
   top bit of M0 is set and M0 when the top bit of M1 is set (the
   "(signed int) x >> 15" terms act as all-ones / all-zeros masks), which
   is the usual signed-to-unsigned high word correction.

   M0 and M1 are copied into __m0 / __m1 once, and those copies are what
   the asm operands use, so each macro argument is evaluated exactly one
   time.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {long int __ll;                                               \
           struct {unsigned int __h, __l;} __i;                         \
          } __x;                                                        \
    unsigned int __m0 = (m0), __m1 = (m1);                              \
    __asm__ ("mult      %S0,%H3"                                        \
             : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
             : "%1" (__m0), "rQR" (__m1));                              \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
    (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
             + (((signed int) __m1 >> 15) & __m0));                     \
  } while (0)
1827 #endif /* __z8000__ */
1828
1829 #endif /* __GNUC__ */
1830
1831 #endif /* NO_ASM */
1832
1833
1834 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1835 #if !defined (umul_ppmm) && defined (__umulsidi3)
/* umul_ppmm (ph, pl, m0, m1): take the double-word product from
   __umulsidi3 and split it into its high word PH and low word PL.
   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. as the body of an if that has an else).  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDWtype __ll = __umulsidi3 (m0, m1);                                \
    (ph) = (UWtype) (__ll >> W_TYPE_SIZE);                              \
    (pl) = (UWtype) __ll;                                               \
  } while (0)
1842 #endif
1843
#if !defined (__umulsidi3)
/* __umulsidi3 (u, v): expression form of umul_ppmm.  Runs umul_ppmm into
   two temporaries and yields (high << W_TYPE_SIZE) | low as a UDWtype.
   Uses a GNU statement expression.  */
#define __umulsidi3(u, v) \
  ({ UWtype __umulsidi3_hi, __umulsidi3_lo;                             \
     umul_ppmm (__umulsidi3_hi, __umulsidi3_lo, u, v);                  \
     ((UDWtype) __umulsidi3_hi << W_TYPE_SIZE) | __umulsidi3_lo;        \
  })
#endif
1850
1851
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

/* The name and prototype are declared unconditionally; only the umul_ppmm
   macro that calls the function is conditional on HAVE_NATIVE_mpn_umul_ppmm.
   That macro stores the function result as the high word and the word
   written through the pointer argument as the low word.  */
#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1859
1860 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1861   && ! defined (LONGLONG_STANDALONE)
/* umul_ppmm (wh, wl, u, v) via the native mpn_umul_ppmm function: U and V
   are converted to UWtype, the function result becomes WH and the word it
   stores through its pointer argument becomes WL.  */
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__u = (u);                                              \
    UWtype __umul_ppmm__v = (v);                                              \
    UWtype __umul_ppmm__lo;                                                   \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__lo, __umul_ppmm__u, __umul_ppmm__v);  \
    (wl) = __umul_ppmm__lo;                                                   \
  } while (0)
1868 #endif
1869
/* "Reversed" variant of mpn_umul_ppmm: the pointer argument comes last.
   Declared unconditionally; only the macro using it is conditional.  */
#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1872
1873 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1874   && ! defined (LONGLONG_STANDALONE)
/* umul_ppmm (wh, wl, u, v) via the native mpn_umul_ppmm_r function (pointer
   argument last): U and V are converted to UWtype, the function result
   becomes WH and the word stored through the pointer becomes WL.  */
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__u = (u);                                              \
    UWtype __umul_ppmm__v = (v);                                              \
    UWtype __umul_ppmm__lo;                                                   \
    (wh) = mpn_umul_ppmm_r (__umul_ppmm__u, __umul_ppmm__v, &__umul_ppmm__lo);\
    (wl) = __umul_ppmm__lo;                                                   \
  } while (0)
1881 #endif
1882
/* Name and prototype of the out-of-line divide, declared unconditionally.
   The udiv_qrnnd macro built on it stores the function result as the
   quotient and the word written through the pointer argument as the
   remainder.  */
#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1885
1886 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1887   && ! defined (LONGLONG_STANDALONE)
/* udiv_qrnnd (q, r, n1, n0, d) via the native mpn_udiv_qrnnd function.  The
   function result becomes Q and the word it stores through its pointer
   argument becomes R.  Every input is parenthesized before the UWtype
   cast, so that the cast applies to the whole argument expression (e.g.
   "x >> k") rather than only to its first operand.  */
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) (d));  \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
1895 #endif
1896
/* "Reversed" variant of mpn_udiv_qrnnd: the pointer argument comes last.
   Declared unconditionally; only the macro using it is conditional.  */
#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1899
1900 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1901   && ! defined (LONGLONG_STANDALONE)
/* udiv_qrnnd (q, r, n1, n0, d) via the native mpn_udiv_qrnnd_r function
   (pointer argument last).  The function result becomes Q and the word
   stored through the pointer becomes R.  Every input is parenthesized
   before the UWtype cast, so that the cast applies to the whole argument
   expression rather than only to its first operand.  */
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
                            &__udiv_qrnnd__r);                          \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
1909 #endif
1910
1911
1912 /* If this machine has no inline assembler, use C macros.  */
1913
#if !defined (add_ssaaaa)
/* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition in plain C.
   (SH,SL) = (AH,AL) + (BH,BL), with the carry out of the low word detected
   by the wrapped low sum being smaller than AL.  AL is copied to a local
   so that every input is evaluated exactly once.  SL is assigned last.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __add_ssaaaa_al = (al);                                      \
    UWtype __add_ssaaaa_lo = __add_ssaaaa_al + (bl);                    \
    (sh) = (ah) + (bh) + (__add_ssaaaa_lo < __add_ssaaaa_al);           \
    (sl) = __add_ssaaaa_lo;                                             \
  } while (0)
#endif
1923
#if !defined (sub_ddmmss)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction in plain C.
   (SH,SL) = (AH,AL) - (BH,BL), with a borrow taken from the high word when
   AL < BL.  AL and BL are copied to locals so that every input is
   evaluated exactly once.  SL is assigned last.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __sub_ddmmss_al = (al);                                      \
    UWtype __sub_ddmmss_bl = (bl);                                      \
    UWtype __sub_ddmmss_lo = __sub_ddmmss_al - __sub_ddmmss_bl;         \
    (sh) = (ah) - (bh) - (__sub_ddmmss_al < __sub_ddmmss_bl);           \
    (sl) = __sub_ddmmss_lo;                                             \
  } while (0)
#endif
1933
1934 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1935    smul_ppmm.  */
1936 #if !defined (umul_ppmm) && defined (smul_ppmm)
/* umul_ppmm (w1, w0, u, v) on top of a signed smul_ppmm.  The low word is
   taken as is.  The high word of the signed product is corrected by adding
   V when the top bit of U is set and U when the top bit of V is set; the
   "-(x >> (W_TYPE_SIZE - 1))" terms are all-ones or all-zeros masks.  */
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __umul_su = (u), __umul_sv = (v);                            \
    UWtype __umul_shi;                                                  \
    UWtype __umul_fix;                                                  \
    smul_ppmm (__umul_shi, w0, __umul_su, __umul_sv);                   \
    __umul_fix = -(__umul_su >> (W_TYPE_SIZE - 1)) & __umul_sv;         \
    __umul_fix += -(__umul_sv >> (W_TYPE_SIZE - 1)) & __umul_su;        \
    (w1) = __umul_shi + __umul_fix;                                     \
  } while (0)
1945 #endif
1946
/* If we still don't have umul_ppmm, define it using plain C: split U and V
   into half words, form the four half-word products, and add them up with
   explicit carry handling.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises that the two cross products are the same and
   generates 3 multiplies, not 4.  The subsequent additions could be
   optimized a bit, but the only place GMP currently uses such a square is
   mpn_sqr_basecase, and chips obliged to use this generic C umul will have
   plenty of worse performance problems than a couple of extra instructions
   on the diagonal of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __umul_a = (u), __umul_b = (v);                              \
    UHWtype __umul_al = __ll_lowpart (__umul_a);                        \
    UHWtype __umul_ah = __ll_highpart (__umul_a);                       \
    UHWtype __umul_bl = __ll_lowpart (__umul_b);                        \
    UHWtype __umul_bh = __ll_highpart (__umul_b);                       \
    UWtype __umul_ll = (UWtype) __umul_al * __umul_bl;                  \
    UWtype __umul_lh = (UWtype) __umul_al * __umul_bh;                  \
    UWtype __umul_hl = (UWtype) __umul_ah * __umul_bl;                  \
    UWtype __umul_hh = (UWtype) __umul_ah * __umul_bh;                  \
    UWtype __umul_mid;                                                  \
                                                                        \
    /* Middle column.  Adding the high half of ll cannot carry...  */   \
    __umul_mid = __umul_lh + __ll_highpart (__umul_ll);                 \
    /* ...but adding the other cross product can; a wrapped sum is  */  \
    /* smaller than the addend, and the carry belongs at __ll_B.    */  \
    __umul_mid += __umul_hl;                                            \
    __umul_hh += (__umul_mid < __umul_hl) ? __ll_B : 0;                 \
                                                                        \
    (w1) = __umul_hh + __ll_highpart (__umul_mid);                      \
    (w0) = (__umul_mid << W_TYPE_SIZE/2) + __ll_lowpart (__umul_ll);    \
  } while (0)
#endif
1983
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  The low word is taken as is.  The high
   word of the unsigned product is corrected by subtracting V when the top
   bit of U is set and U when the top bit of V is set.  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __smul_u = (u), __smul_v = (v);                              \
    UWtype __smul_uhi;                                                  \
    UWtype __smul_fix;                                                  \
    umul_ppmm (__smul_uhi, w0, __smul_u, __smul_v);                     \
    __smul_fix = -(__smul_u >> (W_TYPE_SIZE - 1)) & __smul_v;           \
    __smul_fix += -(__smul_v >> (W_TYPE_SIZE - 1)) & __smul_u;          \
    (w1) = __smul_uhi - __smul_fix;                                     \
  } while (0)
#endif
1996
/* Define this unconditionally, so it can be used for debugging.

   __udiv_qrnnd_c (q, r, n1, n0, d): divide the two-word value (N1,N0) by D
   in plain C, giving quotient Q and remainder R.  Requires D != 0 and
   N1 < D (both checked with ASSERT).  The quotient is produced as two
   half-word digits: each digit is first estimated by dividing by the high
   half of D and then decremented at most twice.  When this macro is
   selected as udiv_qrnnd, UDIV_NEEDS_NORMALIZATION is set to 1.

   N1, N0 and D are copied to locals first, so each argument is evaluated
   exactly once whether or not ASSERT is compiled in.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __udiv_n1 = (n1), __udiv_n0 = (n0), __udiv_d = (d);          \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
                                                                        \
    ASSERT (__udiv_d != 0);                                             \
    ASSERT (__udiv_n1 < __udiv_d);                                      \
                                                                        \
    __d1 = __ll_highpart (__udiv_d);                                    \
    __d0 = __ll_lowpart (__udiv_d);                                     \
                                                                        \
    /* High quotient digit: estimate from the high half of the divisor, */\
    /* then correct against the low half.  */                           \
    __q1 = __udiv_n1 / __d1;                                            \
    __r1 = __udiv_n1 - __q1 * __d1;                                     \
    __m = __q1 * __d0;                                                  \
    __r1 = __r1 * __ll_B | __ll_highpart (__udiv_n0);                   \
    if (__r1 < __m)                                                     \
      {                                                                 \
        __q1--, __r1 += __udiv_d;                                       \
        if (__r1 >= __udiv_d) /* i.e. we didn't get carry when adding to __r1 */\
          if (__r1 < __m)                                               \
            __q1--, __r1 += __udiv_d;                                   \
      }                                                                 \
    __r1 -= __m;                                                        \
                                                                        \
    /* Low quotient digit: same steps on the partial remainder.  */     \
    __q0 = __r1 / __d1;                                                 \
    __r0 = __r1  - __q0 * __d1;                                         \
    __m = __q0 * __d0;                                                  \
    __r0 = __r0 * __ll_B | __ll_lowpart (__udiv_n0);                    \
    if (__r0 < __m)                                                     \
      {                                                                 \
        __q0--, __r0 += __udiv_d;                                       \
        if (__r0 >= __udiv_d)                                           \
          if (__r0 < __m)                                               \
            __q0--, __r0 += __udiv_d;                                   \
      }                                                                 \
    __r0 -= __m;                                                        \
                                                                        \
    (q) = __q1 * __ll_B | __q0;                                         \
    (r) = __r0;                                                         \
  } while (0)
2037
2038 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2039    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2040 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
/* udiv_qrnnd (q, r, nh, nl, d): forward to __MPN(udiv_w_sdiv).  The function
   result becomes Q and the word it stores through its pointer argument
   becomes R.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __udiv_w_sdiv_rem;                                           \
    (q) = __MPN(udiv_w_sdiv) (&__udiv_w_sdiv_rem, nh, nl, d);           \
    (r) = __udiv_w_sdiv_rem;                                            \
  } while (0)
/* Prototype of the helper called by the udiv_qrnnd macro above: pointer
   argument first, then the three UWtype inputs.  */
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2048 #endif
2049
2050 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
/* Last resort: nothing above supplied udiv_qrnnd, so alias it to the plain C
   __udiv_qrnnd_c and record that callers must normalize before using it.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
2055
#if !defined (count_leading_zeros)
/* count_leading_zeros (count, x) in plain C.  First a shift amount of the
   form 8*k + 1 is chosen so that X shifted right by it leaves the highest
   non-zero byte of X with its lowest bit dropped (or 0 if X is 0); for 32
   bit words this is a three-comparison tree, otherwise a loop over bytes
   from the top.  __clz_tab then supplies the bit position within that
   byte.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __clz_x = (x);                                               \
    UWtype __clz_sh;                                                    \
                                                                        \
    if (W_TYPE_SIZE == 32)                                              \
      {                                                                 \
        if (__clz_x < ((UWtype) 1 << __BITS4))                          \
          __clz_sh = 1;                                                 \
        else if (__clz_x < ((UWtype) 1 << 2*__BITS4))                   \
          __clz_sh = __BITS4 + 1;                                       \
        else if (__clz_x < ((UWtype) 1 << 3*__BITS4))                   \
          __clz_sh = 2*__BITS4 + 1;                                     \
        else                                                            \
          __clz_sh = 3*__BITS4 + 1;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        __clz_sh = W_TYPE_SIZE - 8;                                     \
        while (__clz_sh > 0 && ((__clz_x >> __clz_sh) & 0xff) == 0)     \
          __clz_sh -= 8;                                                \
        ++__clz_sh;                                                     \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __clz_sh                                \
              - __clz_tab[__clz_x >> __clz_sh];                         \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif
2084
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

/* Declare the lookup table only when something asked for it.  It has 129
   entries: the plain C count_trailing_zeros indexes it with the lowest set
   bit of a byte, which can be as large as 128.  */
#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif
2093
2094 #if !defined (count_trailing_zeros)
2095 #if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros: isolate the
   lowest set bit of X with "x & -x", count the zeros above it, and subtract
   that from W_TYPE_SIZE - 1.  X must be non-zero.  */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_in = (x);                                              \
    UWtype __ctz_lowbit;                                                \
    UWtype __ctz_lz;                                                    \
    ASSERT (__ctz_in != 0);                                             \
    __ctz_lowbit = __ctz_in & -__ctz_in;                                \
    count_leading_zeros (__ctz_lz, __ctz_lowbit);                       \
    (count) = W_TYPE_SIZE - 1 - __ctz_lz;                               \
  } while (0)
2105 #else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.

   The lowest set bit of the lowest non-zero byte is looked up in __clz_tab;
   the bias added to the table value is -2 when that byte is byte 0 and
   grows by 8 for every zero byte skipped.  */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_rest = (x);                                            \
    int __ctz_bias = -2;                                                \
                                                                        \
    if (! LIKELY ((__ctz_rest & 0xff) != 0))                            \
      {                                                                 \
        __ctz_bias = 8 - 2;                                             \
        while (__ctz_bias < W_TYPE_SIZE - 2)                            \
          {                                                             \
            __ctz_rest >>= 8;                                           \
            if (LIKELY ((__ctz_rest & 0xff) != 0))                      \
              break;                                                    \
            __ctz_bias += 8;                                            \
          }                                                             \
      }                                                                 \
                                                                        \
    (count) = __ctz_bias + __clz_tab[__ctz_rest & -__ctz_rest];         \
  } while (0)
2128 #endif
2129 #endif
2130
/* Defaults for the tuning and feature macros.  Each one is only set if it
   has not been defined earlier; UDIV_NEEDS_NORMALIZATION, for instance, is
   already 1 when udiv_qrnnd fell back to __udiv_qrnnd_c.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

/* With no better information, UDIV_TIME takes the value of UMUL_TIME.  */
#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif