1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
4 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
6 This file is free software; you can redistribute it and/or modify it under the
7 terms of the GNU Lesser General Public License as published by the Free
8 Software Foundation; either version 3 of the License, or (at your option) any
9 later version.
11 This file is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13 PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
14 details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with this file. If not, see http://www.gnu.org/licenses/. */
19 /* You have to define the following before including this file:
21 UWtype -- An unsigned type, default type for operations (typically a "word")
22 UHWtype -- An unsigned type, at least half the size of UWtype
23 UDWtype -- An unsigned type, at least twice as large as UWtype
24 W_TYPE_SIZE -- size in bits of UWtype
26 SItype, USItype -- Signed and unsigned 32 bit types
27 DItype, UDItype -- Signed and unsigned 64 bit types
29 On a 32 bit machine UWtype should typically be USItype;
30 on a 64 bit machine, UWtype should typically be UDItype.
32 Optionally, define:
34 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
35 NO_ASM -- Disable inline asm
38 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
39 need to include gmp.h and gmp-impl.h, or certain things might not work as
40 expected.
41 */
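/* A minimal sketch, for illustration only, of what the including project
   (gmp-impl.h in GMP's case) typically supplies before this file is
   included, assuming a 32-bit host; the exact type choices here are an
   assumption, only the sizes stated above are required.  */
#if 0
typedef          int       SItype;      /* signed 32 bits */
typedef unsigned int       USItype;     /* unsigned 32 bits */
typedef          long long DItype;      /* signed 64 bits */
typedef unsigned long long UDItype;     /* unsigned 64 bits */
#define UWtype      USItype             /* the operation ("word") type */
#define UHWtype     unsigned short      /* at least half a word */
#define UDWtype     UDItype             /* at least two words */
#define W_TYPE_SIZE 32
#endif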
43 #define __BITS4 (W_TYPE_SIZE / 4)
44 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
45 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
46 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
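/* For illustration: the generic C fallback (not shown in this excerpt)
   builds umul_ppmm from the half-word helpers above, essentially as the
   schoolbook sketch below; the macro name here is made up to avoid
   clashing with the real definitions.  */
#if 0
#define umul_ppmm_sketch(w1, w0, u, v) \
  do { \
    UWtype __ul = __ll_lowpart (u), __uh = __ll_highpart (u); \
    UWtype __vl = __ll_lowpart (v), __vh = __ll_highpart (v); \
    UWtype __x0 = __ul * __vl; \
    UWtype __x1 = __ul * __vh; \
    UWtype __x2 = __uh * __vl; \
    UWtype __x3 = __uh * __vh; \
    __x1 += __ll_highpart (__x0); /* cannot overflow */ \
    __x1 += __x2;                 /* but this can */ \
    if (__x1 < __x2)              /* carry into the top word */ \
      __x3 += __ll_B; \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
  } while (0)
#endif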
48 /* This is used to make sure no undesirable sharing between different libraries
49 that use this file takes place. */
50 #ifndef __MPN
51 #define __MPN(x) __##x
52 #endif
54 /* Define auxiliary asm macros.
56 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
57 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
58 word product in HIGH_PROD and LOW_PROD.
60 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
61 UDWtype product. This is just a variant of umul_ppmm.
63 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
64 denominator) divides a UDWtype, composed by the UWtype integers
65 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
66 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
67 than DENOMINATOR for correct operation. If the macro additionally requires the
68 most significant bit of DENOMINATOR to be 1, then the pre-processor symbol
69 UDIV_NEEDS_NORMALIZATION is defined to 1.
71 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
73 is rounded towards 0.
75 5) count_leading_zeros(count, x) counts the number of zero-bits from the
76 msb to the first non-zero bit in the UWtype X. This is the number of
77 steps X needs to be shifted left to set the msb. Undefined for X == 0,
78 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
80 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
81 from the least significant end.
83 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
84 high_addend_2, low_addend_2) adds two UWtype integers, composed by
85 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
86 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
87 (i.e. carry out) is not stored anywhere, and is lost.
89 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
90 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
91 composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
92 LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
93 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
94 and is lost.
96 If any of these macros are left undefined for a particular CPU,
97 C macros are used.
100 Notes:
102 For add_ssaaaa the two high and two low addends can both commute, but
103 unfortunately gcc only supports one "%" commutative in each asm block.
104 This has always been so but is only documented in recent versions
105 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
106 compiler error in certain rare circumstances.
108 Apparently it was only the last "%" that was ever actually respected, so
109 the code has been updated to leave just that. Clearly there's a free
110 choice whether high or low should get it, if there's a reason to favour
111 one over the other. Also obviously when the constraints on the two
112 operands are identical there's no benefit to the reloader in any "%" at
113 all.
114 */
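/* Usage sketch, for illustration only: multiply the two-word number
   {xh,xl} by the single word y using just umul_ppmm and add_ssaaaa as
   documented above.  The function name and calling convention are
   assumptions made up for this example.  */
#if 0
static void
mul_2x1 (UWtype *r2, UWtype *r1, UWtype *r0, UWtype xh, UWtype xl, UWtype y)
{
  UWtype p1h, p1l, p0h, p0l;
  umul_ppmm (p0h, p0l, xl, y);  /* xl * y -> {p0h, p0l} */
  umul_ppmm (p1h, p1l, xh, y);  /* xh * y -> {p1h, p1l} */
  *r0 = p0l;
  /* The overlapping middle words add; a carry out of r2 cannot occur
     since the full product fits in three words.  */
  add_ssaaaa (*r2, *r1, p1h, p1l, (UWtype) 0, p0h);
}
#endif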
117 /* The CPUs come in alphabetical order below.
119 Please add support for more CPUs here, or improve the current support
120 for the CPUs below! */
123 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
124 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
125 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
126 __builtin_ctzll.
128 These builtins are only used where we've checked what code comes out; on some
129 chips they're merely libgcc calls, in which case we instead want an inline
130 (either asm or generic C).
132 These builtins are better than an asm block of the same insn, since an
133 asm block doesn't give gcc any information about scheduling or resource
134 usage. We keep an asm block for use on prior versions of gcc though.
136 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
137 it's not used (for count_leading_zeros) because it generally gives extra
138 code to ensure the result is 0 when the input is 0, which we don't need
139 or want. */
141 #ifdef _LONG_LONG_LIMB
142 #define count_leading_zeros_gcc_clz(count,x) \
143 do { \
144 ASSERT ((x) != 0); \
145 (count) = __builtin_clzll (x); \
146 } while (0)
147 #else
148 #define count_leading_zeros_gcc_clz(count,x) \
149 do { \
150 ASSERT ((x) != 0); \
151 (count) = __builtin_clzl (x); \
152 } while (0)
153 #endif
155 #ifdef _LONG_LONG_LIMB
156 #define count_trailing_zeros_gcc_ctz(count,x) \
157 do { \
158 ASSERT ((x) != 0); \
159 (count) = __builtin_ctzll (x); \
160 } while (0)
161 #else
162 #define count_trailing_zeros_gcc_ctz(count,x) \
163 do { \
164 ASSERT ((x) != 0); \
165 (count) = __builtin_ctzl (x); \
166 } while (0)
167 #endif
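/* Usage sketch, for illustration only: on CPUs whose udiv_qrnnd defines
   UDIV_NEEDS_NORMALIZATION, the divisor must have its most significant
   bit set.  count_leading_zeros gives the shift that normalizes it; the
   quotient is unchanged and the remainder is shifted back down.  The
   helper name is made up; assumes d != 0 and n1 < d.  */
#if 0
static UWtype
udiv_normalizing (UWtype *rem, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *rem = r >> cnt;
  return q;
}
#endif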
170 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
171 don't need to be under !NO_ASM */
172 #if ! defined (NO_ASM)
174 #if defined (__alpha) && W_TYPE_SIZE == 64
175 /* Most alpha-based machines, except Cray systems. */
176 #if defined (__GNUC__)
177 #if __GMP_GNUC_PREREQ (3,3)
178 #define umul_ppmm(ph, pl, m0, m1) \
179 do { \
180 UDItype __m0 = (m0), __m1 = (m1); \
181 (ph) = __builtin_alpha_umulh (__m0, __m1); \
182 (pl) = __m0 * __m1; \
183 } while (0)
184 #else
185 #define umul_ppmm(ph, pl, m0, m1) \
186 do { \
187 UDItype __m0 = (m0), __m1 = (m1); \
188 __asm__ ("umulh %r1,%2,%0" \
189 : "=r" (ph) \
190 : "%rJ" (m0), "rI" (m1)); \
191 (pl) = __m0 * __m1; \
192 } while (0)
193 #endif
194 #define UMUL_TIME 18
195 #else /* ! __GNUC__ */
196 #include <machine/builtins.h>
197 #define umul_ppmm(ph, pl, m0, m1) \
198 do { \
199 UDItype __m0 = (m0), __m1 = (m1); \
200 (ph) = __UMULH (m0, m1); \
201 (pl) = __m0 * __m1; \
202 } while (0)
203 #endif
204 #ifndef LONGLONG_STANDALONE
205 #define udiv_qrnnd(q, r, n1, n0, d) \
206 do { UWtype __di; \
207 __di = __MPN(invert_limb) (d); \
208 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
209 } while (0)
210 #define UDIV_PREINV_ALWAYS 1
211 #define UDIV_NEEDS_NORMALIZATION 1
212 #define UDIV_TIME 220
213 #endif /* LONGLONG_STANDALONE */
215 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
216 always goes into libgmp.so, even when not actually used. */
217 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
219 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
220 #define count_leading_zeros(COUNT,X) \
221 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
222 #define count_trailing_zeros(COUNT,X) \
223 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
224 #endif /* clz/ctz using cix */
226 #if ! defined (count_leading_zeros) \
227 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
228 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
229 "$31" is written explicitly in the asm, since an "r" constraint won't
230 select reg 31. There seems no need to worry about "r31" syntax for cray,
231 since gcc itself (pre-release 3.4) emits just $31 in various places. */
232 #define ALPHA_CMPBGE_0(dst, src) \
233 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
234 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
235 them, locating the highest non-zero byte. A second __clz_tab lookup
236 counts the leading zero bits in that byte, giving the result. */
237 #define count_leading_zeros(count, x) \
238 do { \
239 UWtype __clz__b, __clz__c, __clz__x = (x); \
240 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
241 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
242 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
243 __clz__x >>= __clz__b; \
244 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
245 __clz__b = 65 - __clz__b; \
246 (count) = __clz__b - __clz__c; \
247 } while (0)
248 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
249 #endif /* clz using cmpbge */
251 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
252 #if HAVE_ATTRIBUTE_CONST
253 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
254 #else
255 long __MPN(count_leading_zeros) (UDItype);
256 #endif
257 #define count_leading_zeros(count, x) \
258 ((count) = __MPN(count_leading_zeros) (x))
259 #endif /* clz using mpn */
260 #endif /* __alpha */
262 #if defined (__AVR) && W_TYPE_SIZE == 8
263 #define umul_ppmm(ph, pl, m0, m1) \
264 do { \
265 unsigned short __p = (unsigned short) (m0) * (m1); \
266 (ph) = __p >> 8; \
267 (pl) = __p; \
268 } while (0)
269 #endif /* AVR */
271 #if defined (_CRAY) && W_TYPE_SIZE == 64
272 #include <intrinsics.h>
273 #define UDIV_PREINV_ALWAYS 1
274 #define UDIV_NEEDS_NORMALIZATION 1
275 #define UDIV_TIME 220
276 long __MPN(count_leading_zeros) (UDItype);
277 #define count_leading_zeros(count, x) \
278 ((count) = _leadz ((UWtype) (x)))
279 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
280 #define umul_ppmm(ph, pl, m0, m1) \
281 do { \
282 UDItype __m0 = (m0), __m1 = (m1); \
283 (ph) = _int_mult_upper (m0, m1); \
284 (pl) = __m0 * __m1; \
285 } while (0)
286 #ifndef LONGLONG_STANDALONE
287 #define udiv_qrnnd(q, r, n1, n0, d) \
288 do { UWtype __di; \
289 __di = __MPN(invert_limb) (d); \
290 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
291 } while (0)
292 #endif /* LONGLONG_STANDALONE */
293 #endif /* _CRAYIEEE */
294 #endif /* _CRAY */
296 #if defined (__ia64) && W_TYPE_SIZE == 64
297 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
298 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
299 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
300 register, which takes an extra cycle. */
301 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
302 do { \
303 UWtype __x; \
304 __x = (al) - (bl); \
305 if ((al) < (bl)) \
306 (sh) = (ah) - (bh) - 1; \
307 else \
308 (sh) = (ah) - (bh); \
309 (sl) = __x; \
310 } while (0)
311 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
312 /* Do both product parts in assembly, since that gives better code with
313 all gcc versions. Some callers will just use the upper part, and in
314 that situation we waste an instruction, but not any cycles. */
315 #define umul_ppmm(ph, pl, m0, m1) \
316 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
317 : "=&f" (ph), "=f" (pl) \
318 : "f" (m0), "f" (m1))
319 #define UMUL_TIME 14
320 #define count_leading_zeros(count, x) \
321 do { \
322 UWtype _x = (x), _y, _a, _c; \
323 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
324 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
325 _c = (_a - 1) << 3; \
326 _x >>= _c; \
327 if (_x >= 1 << 4) \
328 _x >>= 4, _c += 4; \
329 if (_x >= 1 << 2) \
330 _x >>= 2, _c += 2; \
331 _c += _x >> 1; \
332 (count) = W_TYPE_SIZE - 1 - _c; \
333 } while (0)
334 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
335 based, and we don't need a special case for x==0 here */
336 #define count_trailing_zeros(count, x) \
337 do { \
338 UWtype __ctz_x = (x); \
339 __asm__ ("popcnt %0 = %1" \
340 : "=r" (count) \
341 : "r" ((__ctz_x-1) & ~__ctz_x)); \
342 } while (0)
343 #endif
344 #if defined (__INTEL_COMPILER)
345 #include <ia64intrin.h>
346 #define umul_ppmm(ph, pl, m0, m1) \
347 do { \
348 UWtype _m0 = (m0), _m1 = (m1); \
349 ph = _m64_xmahu (_m0, _m1, 0); \
350 pl = _m0 * _m1; \
351 } while (0)
352 #endif
353 #ifndef LONGLONG_STANDALONE
354 #define udiv_qrnnd(q, r, n1, n0, d) \
355 do { UWtype __di; \
356 __di = __MPN(invert_limb) (d); \
357 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
358 } while (0)
359 #define UDIV_PREINV_ALWAYS 1
360 #define UDIV_NEEDS_NORMALIZATION 1
361 #endif
362 #define UDIV_TIME 220
363 #endif
366 #if defined (__GNUC__)
368 /* We sometimes need to clobber "cc" with gcc2, but that would not be
369 understood by gcc1. Use cpp to avoid major code duplication. */
370 #if __GNUC__ < 2
371 #define __CLOBBER_CC
372 #define __AND_CLOBBER_CC
373 #else /* __GNUC__ >= 2 */
374 #define __CLOBBER_CC : "cc"
375 #define __AND_CLOBBER_CC , "cc"
376 #endif /* __GNUC__ < 2 */
378 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
379 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
380 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
381 : "=r" (sh), "=&r" (sl) \
382 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
383 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
384 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
385 : "=r" (sh), "=&r" (sl) \
386 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
387 #define umul_ppmm(xh, xl, m0, m1) \
388 do { \
389 USItype __m0 = (m0), __m1 = (m1); \
390 __asm__ ("multiplu %0,%1,%2" \
391 : "=r" (xl) \
392 : "r" (__m0), "r" (__m1)); \
393 __asm__ ("multmu %0,%1,%2" \
394 : "=r" (xh) \
395 : "r" (__m0), "r" (__m1)); \
396 } while (0)
397 #define udiv_qrnnd(q, r, n1, n0, d) \
398 __asm__ ("dividu %0,%3,%4" \
399 : "=r" (q), "=q" (r) \
400 : "1" (n1), "r" (n0), "r" (d))
401 #define count_leading_zeros(count, x) \
402 __asm__ ("clz %0,%1" \
403 : "=r" (count) \
404 : "r" (x))
405 #define COUNT_LEADING_ZEROS_0 32
406 #endif /* __a29k__ */
408 #if defined (__arc__)
409 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
410 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
411 : "=r" (sh), \
412 "=&r" (sl) \
413 : "r" ((USItype) (ah)), \
414 "rIJ" ((USItype) (bh)), \
415 "%r" ((USItype) (al)), \
416 "rIJ" ((USItype) (bl)))
417 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
418 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rIJ" ((USItype) (bh)), \
423 "r" ((USItype) (al)), \
424 "rIJ" ((USItype) (bl)))
425 #endif
427 #if defined (__arm__) && W_TYPE_SIZE == 32
428 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
429 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
430 : "=r" (sh), "=&r" (sl) \
431 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
432 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
433 do { \
434 if (__builtin_constant_p (al)) \
435 { \
436 if (__builtin_constant_p (ah)) \
437 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
438 : "=r" (sh), "=&r" (sl) \
439 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
440 else \
441 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
442 : "=r" (sh), "=&r" (sl) \
443 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
445 else if (__builtin_constant_p (ah)) \
447 if (__builtin_constant_p (bl)) \
448 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
449 : "=r" (sh), "=&r" (sl) \
450 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
451 else \
452 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
453 : "=r" (sh), "=&r" (sl) \
454 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
456 else if (__builtin_constant_p (bl)) \
458 if (__builtin_constant_p (bh)) \
459 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
460 : "=r" (sh), "=&r" (sl) \
461 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
462 else \
463 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
464 : "=r" (sh), "=&r" (sl) \
465 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
467 else /* only bh might be a constant */ \
468 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
469 : "=r" (sh), "=&r" (sl) \
470 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
471 } while (0)
472 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
473 #define umul_ppmm(xh, xl, a, b) \
474 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
475 #define UMUL_TIME 5
476 #define smul_ppmm(xh, xl, a, b) \
477 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
478 #ifndef LONGLONG_STANDALONE
479 #define udiv_qrnnd(q, r, n1, n0, d) \
480 do { UWtype __di; \
481 __di = __MPN(invert_limb) (d); \
482 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
483 } while (0)
484 #define UDIV_PREINV_ALWAYS 1
485 #define UDIV_NEEDS_NORMALIZATION 1
486 #define UDIV_TIME 70
487 #endif /* LONGLONG_STANDALONE */
488 #else
489 #define umul_ppmm(xh, xl, a, b) \
490 __asm__ ("%@ Inlined umul_ppmm\n" \
491 " mov %|r0, %2, lsr #16\n" \
492 " mov %|r2, %3, lsr #16\n" \
493 " bic %|r1, %2, %|r0, lsl #16\n" \
494 " bic %|r2, %3, %|r2, lsl #16\n" \
495 " mul %1, %|r1, %|r2\n" \
496 " mul %|r2, %|r0, %|r2\n" \
497 " mul %|r1, %0, %|r1\n" \
498 " mul %0, %|r0, %0\n" \
499 " adds %|r1, %|r2, %|r1\n" \
500 " addcs %0, %0, #65536\n" \
501 " adds %1, %1, %|r1, lsl #16\n" \
502 " adc %0, %0, %|r1, lsr #16" \
503 : "=&r" (xh), "=r" (xl) \
504 : "r" (a), "r" (b) \
505 : "r0", "r1", "r2")
506 #define UMUL_TIME 20
507 #ifndef LONGLONG_STANDALONE
508 #define udiv_qrnnd(q, r, n1, n0, d) \
509 do { UWtype __r; \
510 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
511 (r) = __r; \
512 } while (0)
513 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
514 #define UDIV_TIME 200
515 #endif /* LONGLONG_STANDALONE */
516 #endif
517 /* This is a bizarre test, but GCC doesn't define a useful common symbol. */
518 #if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \
519 defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
520 defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \
521 defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \
522 defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \
523 defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__) || \
524 defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \
525 defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
526 #define count_leading_zeros(count, x) \
527 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
528 #define COUNT_LEADING_ZEROS_0 32
529 #endif
530 #endif /* __arm__ */
532 #if defined (__aarch64__) && W_TYPE_SIZE == 64
533 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
534 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
535 : "=r" (sh), "=&r" (sl) \
536 : "r" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
537 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
538 do { \
539 if (__builtin_constant_p (bl)) \
540 { \
541 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
542 : "=r" (sh), "=&r" (sl) \
543 : "r" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
544 } \
545 else /* only bh might be a constant */ \
546 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
547 : "=r" (sh), "=&r" (sl) \
548 : "r" (ah), "rZ" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
549 } while (0)
550 #define umul_ppmm(ph, pl, m0, m1) \
551 do { \
552 UDItype __m0 = (m0), __m1 = (m1); \
553 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1)); \
554 (pl) = __m0 * __m1; \
555 } while (0)
556 #define count_leading_zeros(count, x) \
557 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
558 #define COUNT_LEADING_ZEROS_0 64
559 #endif /* __aarch64__ */
561 #if defined (__clipper__) && W_TYPE_SIZE == 32
562 #define umul_ppmm(w1, w0, u, v) \
563 ({union {UDItype __ll; \
564 struct {USItype __l, __h;} __i; \
565 } __x; \
566 __asm__ ("mulwux %2,%0" \
567 : "=r" (__x.__ll) \
568 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
569 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
570 #define smul_ppmm(w1, w0, u, v) \
571 ({union {DItype __ll; \
572 struct {SItype __l, __h;} __i; \
573 } __x; \
574 __asm__ ("mulwx %2,%0" \
575 : "=r" (__x.__ll) \
576 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
577 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
578 #define __umulsidi3(u, v) \
579 ({UDItype __w; \
580 __asm__ ("mulwux %2,%0" \
581 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
582 __w; })
583 #endif /* __clipper__ */
585 /* Fujitsu vector computers. */
586 #if defined (__uxp__) && W_TYPE_SIZE == 32
587 #define umul_ppmm(ph, pl, u, v) \
588 do { \
589 union {UDItype __ll; \
590 struct {USItype __h, __l;} __i; \
591 } __x; \
592 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
593 (ph) = __x.__i.__h; \
594 (pl) = __x.__i.__l; \
595 } while (0)
596 #define smul_ppmm(ph, pl, u, v) \
597 do { \
598 union {UDItype __ll; \
599 struct {USItype __h, __l;} __i; \
600 } __x; \
601 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
602 (ph) = __x.__i.__h; \
603 (pl) = __x.__i.__l; \
604 } while (0)
605 #endif
607 #if defined (__gmicro__) && W_TYPE_SIZE == 32
608 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
609 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
610 : "=g" (sh), "=&g" (sl) \
611 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
612 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
613 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
614 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
615 : "=g" (sh), "=&g" (sl) \
616 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
617 "1" ((USItype)(al)), "g" ((USItype)(bl)))
618 #define umul_ppmm(ph, pl, m0, m1) \
619 __asm__ ("mulx %3,%0,%1" \
620 : "=g" (ph), "=r" (pl) \
621 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
622 #define udiv_qrnnd(q, r, nh, nl, d) \
623 __asm__ ("divx %4,%0,%1" \
624 : "=g" (q), "=r" (r) \
625 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
626 #define count_leading_zeros(count, x) \
627 __asm__ ("bsch/1 %1,%0" \
628 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
629 #endif
631 #if defined (__hppa) && W_TYPE_SIZE == 32
632 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
633 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
634 : "=r" (sh), "=&r" (sl) \
635 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
636 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
637 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
638 : "=r" (sh), "=&r" (sl) \
639 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
640 #if defined (_PA_RISC1_1)
641 #define umul_ppmm(wh, wl, u, v) \
642 do { \
643 union {UDItype __ll; \
644 struct {USItype __h, __l;} __i; \
645 } __x; \
646 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
647 (wh) = __x.__i.__h; \
648 (wl) = __x.__i.__l; \
649 } while (0)
650 #define UMUL_TIME 8
651 #define UDIV_TIME 60
652 #else
653 #define UMUL_TIME 40
654 #define UDIV_TIME 80
655 #endif
656 #define count_leading_zeros(count, x) \
657 do { \
658 USItype __tmp; \
659 __asm__ ( \
660 "ldi 1,%0\n" \
661 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
662 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
663 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
664 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
665 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
666 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
667 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
668 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
669 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
670 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
671 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
672 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
673 " extru %1,30,1,%1 ; Extract bit 1.\n" \
674 " sub %0,%1,%0 ; Subtract it.\n" \
675 : "=r" (count), "=r" (__tmp) : "1" (x)); \
676 } while (0)
677 #endif /* hppa */
679 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
680 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
681 is just a case of no direct support for 2.0n but treating it like 1.0. */
682 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB) \
683 && defined (_PA_RISC2_0) && defined (_LP64)
684 /* Note the _PA_RISC2_0 above is to exclude this code from GCC with
685 default -march options which doesn't support these instructions.
686 Also the width check for 'long' is to avoid ilp32 runtimes where
687 GNU/Linux and narrow HP-UX kernels are known to have issues with
688 clobbering of context between the add and add,dc instructions. */
689 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
690 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
691 : "=r" (sh), "=&r" (sl) \
692 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
693 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
694 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
695 : "=r" (sh), "=&r" (sl) \
696 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
697 #endif /* hppa */
699 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
700 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
701 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
702 do { \
703 /* if (__builtin_constant_p (bl)) \
704 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
705 : "=r" (sh), "=&r" (sl) \
706 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
707 else \
708 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
709 : "=r" (sh), "=&r" (sl) \
710 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
711 } while (0)
712 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
713 do { \
714 /* if (__builtin_constant_p (bl)) \
715 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
716 : "=r" (sh), "=&r" (sl) \
717 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
718 else \
719 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
720 : "=r" (sh), "=&r" (sl) \
721 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
722 } while (0)
723 #if __GMP_GNUC_PREREQ (4,5)
724 #define umul_ppmm(xh, xl, m0, m1) \
725 do { \
726 union {UDItype __ll; \
727 struct {USItype __h, __l;} __i; \
728 } __x; \
729 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
730 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
731 } while (0)
732 #else
733 #if 0
734 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
735 with a new enough processor pretending we have 32-bit registers. */
736 #define umul_ppmm(xh, xl, m0, m1) \
737 do { \
738 union {UDItype __ll; \
739 struct {USItype __h, __l;} __i; \
740 } __x; \
741 __asm__ ("mlr\t%0,%2" \
742 : "=r" (__x.__ll) \
743 : "%0" (m0), "r" (m1)); \
744 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
745 } while (0)
746 #else
747 #define umul_ppmm(xh, xl, m0, m1) \
748 do { \
749 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
750 DImode for the product, since that would be allocated to a single 64-bit
751 register, whereas mlr uses the low 32-bits of an even-odd register pair.
752 */ \
753 register USItype __r0 __asm__ ("0"); \
754 register USItype __r1 __asm__ ("1") = (m0); \
755 __asm__ ("mlr\t%0,%3" \
756 : "=r" (__r0), "=r" (__r1) \
757 : "r" (__r1), "r" (m1)); \
758 (xh) = __r0; (xl) = __r1; \
759 } while (0)
760 #endif /* if 0 */
761 #endif
762 #if 0
763 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
764 with a new enough processor pretending we have 32-bit registers. */
765 #define udiv_qrnnd(q, r, n1, n0, d) \
766 do { \
767 union {UDItype __ll; \
768 struct {USItype __h, __l;} __i; \
769 } __x; \
770 __x.__i.__h = n1; __x.__i.__l = n0; \
771 __asm__ ("dlr\t%0,%2" \
772 : "=r" (__x.__ll) \
773 : "0" (__x.__ll), "r" (d)); \
774 (q) = __x.__i.__l; (r) = __x.__i.__h; \
775 } while (0)
776 #else
777 #define udiv_qrnnd(q, r, n1, n0, d) \
778 do { \
779 register USItype __r0 __asm__ ("0") = (n1); \
780 register USItype __r1 __asm__ ("1") = (n0); \
781 __asm__ ("dlr\t%0,%4" \
782 : "=r" (__r0), "=r" (__r1) \
783 : "r" (__r0), "r" (__r1), "r" (d)); \
784 (q) = __r1; (r) = __r0; \
785 } while (0)
786 #endif /* if 0 */
787 #else /* if __zarch__ */
788 /* FIXME: this fails if gcc knows about the 64-bit registers. */
789 #define smul_ppmm(xh, xl, m0, m1) \
790 do { \
791 union {DItype __ll; \
792 struct {USItype __h, __l;} __i; \
793 } __x; \
794 __asm__ ("mr\t%0,%2" \
795 : "=r" (__x.__ll) \
796 : "%0" (m0), "r" (m1)); \
797 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
798 } while (0)
799 /* FIXME: this fails if gcc knows about the 64-bit registers. */
800 #define sdiv_qrnnd(q, r, n1, n0, d) \
801 do { \
802 union {DItype __ll; \
803 struct {USItype __h, __l;} __i; \
804 } __x; \
805 __x.__i.__h = n1; __x.__i.__l = n0; \
806 __asm__ ("dr\t%0,%2" \
807 : "=r" (__x.__ll) \
808 : "0" (__x.__ll), "r" (d)); \
809 (q) = __x.__i.__l; (r) = __x.__i.__h; \
810 } while (0)
811 #endif /* if __zarch__ */
812 #endif
814 #if defined (__s390x__) && W_TYPE_SIZE == 64
815 /* We need to cast operands with register constraints, otherwise their types
816 will be assumed to be SImode by gcc. For these machines, such operations
817 will insert a value into the low 32 bits, and leave the high 32 bits with
818 garbage. */
819 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
820 do { \
821 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
822 : "=r" (sh), "=&r" (sl) \
823 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
824 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
825 } while (0)
826 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
827 do { \
828 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
829 : "=r" (sh), "=&r" (sl) \
830 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
831 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
832 } while (0)
833 #define umul_ppmm(xh, xl, m0, m1) \
834 do { \
835 union {unsigned int __attribute__ ((mode(TI))) __ll; \
836 struct {UDItype __h, __l;} __i; \
837 } __x; \
838 __asm__ ("mlgr\t%0,%2" \
839 : "=r" (__x.__ll) \
840 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
841 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
842 } while (0)
843 #define udiv_qrnnd(q, r, n1, n0, d) \
844 do { \
845 union {unsigned int __attribute__ ((mode(TI))) __ll; \
846 struct {UDItype __h, __l;} __i; \
847 } __x; \
848 __x.__i.__h = n1; __x.__i.__l = n0; \
849 __asm__ ("dlgr\t%0,%2" \
850 : "=r" (__x.__ll) \
851 : "0" (__x.__ll), "r" ((UDItype)(d))); \
852 (q) = __x.__i.__l; (r) = __x.__i.__h; \
853 } while (0)
854 #if 0 /* FIXME: Enable for z10 (?) */
855 #define count_leading_zeros(cnt, x) \
856 do { \
857 union {unsigned int __attribute__ ((mode(TI))) __ll; \
858 struct {UDItype __h, __l;} __i; \
859 } __clr_cnt; \
860 __asm__ ("flogr\t%0,%1" \
861 : "=r" (__clr_cnt.__ll) \
862 : "r" (x) __CLOBBER_CC); \
863 (cnt) = __clr_cnt.__i.__h; \
864 } while (0)
865 #endif
866 #endif
868 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
869 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
870 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
871 : "=r" (sh), "=&r" (sl) \
872 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
873 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
874 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
875 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
876 : "=r" (sh), "=&r" (sl) \
877 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
878 "1" ((USItype)(al)), "g" ((USItype)(bl)))
879 #define umul_ppmm(w1, w0, u, v) \
880 __asm__ ("mull %3" \
881 : "=a" (w0), "=d" (w1) \
882 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
883 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
884 __asm__ ("divl %4" /* stringification in K&R C */ \
885 : "=a" (q), "=d" (r) \
886 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
888 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
889 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
890 significant 1 bit is, hence the use of the following alternatives. bsfl
891 is slow too, between 18 and 42 depending where the least significant 1
892 bit is, so let the generic count_trailing_zeros below make use of the
893 count_leading_zeros here too. */
895 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
896 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
897 cache miss reading from __clz_tab. For P55 it's favoured over the float
898 below so as to avoid mixing MMX and x87, since the penalty for switching
899 between the two is about 100 cycles.
901 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
902 16, -1 for 8, or 0 otherwise. This could be written equivalently as
903 follows, but as of gcc 2.95.2 it results in conditional jumps.
905 __shift = -(__n < 0x1000000);
906 __shift -= (__n < 0x10000);
907 __shift -= (__n < 0x100);
909 The middle two sbbl and cmpl's pair, and with luck something gcc
910 generates might pair with the first cmpl and the last sbbl. The "32+1"
911 constant could be folded into __clz_tab[], but it doesn't seem worth
912 making a different table just for that. */
914 #define count_leading_zeros(c,n) \
915 do { \
916 USItype __n = (n); \
917 USItype __shift; \
918 __asm__ ("cmpl $0x1000000, %1\n" \
919 "sbbl %0, %0\n" \
920 "cmpl $0x10000, %1\n" \
921 "sbbl $0, %0\n" \
922 "cmpl $0x100, %1\n" \
923 "sbbl $0, %0\n" \
924 : "=&r" (__shift) : "r" (__n)); \
925 __shift = __shift*8 + 24 + 1; \
926 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
927 } while (0)
928 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
929 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
931 #else /* ! pentiummmx || LONGLONG_STANDALONE */
932 /* The following should be a fixed 14 cycles or so. Some scheduling
933 opportunities should be available between the float load/store too. This
934 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
935 apparently suggested by the Intel optimizing manual (don't know exactly
936 where). gcc 2.95 or up will be best for this, so the "double" is
937 correctly aligned on the stack. */
938 #define count_leading_zeros(c,n) \
939 do { \
940 union { \
941 double d; \
942 unsigned a[2]; \
943 } __u; \
944 ASSERT ((n) != 0); \
945 __u.d = (UWtype) (n); \
946 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
947 } while (0)
948 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
949 #endif /* pentiummx */
951 #else /* ! pentium */
953 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
954 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
955 #endif /* gcc clz */
957 /* On P6, gcc prior to 3.0 generates a partial register stall for
958 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
959 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
960 cost of one extra instruction. Do this for "i386" too, since that means
961 generic x86. */
962 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
963 && (HAVE_HOST_CPU_i386 \
964 || HAVE_HOST_CPU_i686 \
965 || HAVE_HOST_CPU_pentiumpro \
966 || HAVE_HOST_CPU_pentium2 \
967 || HAVE_HOST_CPU_pentium3)
968 #define count_leading_zeros(count, x) \
969 do { \
970 USItype __cbtmp; \
971 ASSERT ((x) != 0); \
972 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
973 (count) = 31 - __cbtmp; \
974 } while (0)
975 #endif /* gcc<3 asm bsrl */
977 #ifndef count_leading_zeros
978 #define count_leading_zeros(count, x) \
979 do { \
980 USItype __cbtmp; \
981 ASSERT ((x) != 0); \
982 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
983 (count) = __cbtmp ^ 31; \
984 } while (0)
985 #endif /* asm bsrl */
987 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
988 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
989 #endif /* gcc ctz */
991 #ifndef count_trailing_zeros
992 #define count_trailing_zeros(count, x) \
993 do { \
994 ASSERT ((x) != 0); \
995 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
996 } while (0)
997 #endif /* asm bsfl */
999 #endif /* ! pentium */
1001 #ifndef UMUL_TIME
1002 #define UMUL_TIME 10
1003 #endif
1004 #ifndef UDIV_TIME
1005 #define UDIV_TIME 40
1006 #endif
1007 #endif /* 80x86 */
1009 #if defined (__amd64__) && W_TYPE_SIZE == 64
1010 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1011 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1012 : "=r" (sh), "=&r" (sl) \
1013 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1014 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1015 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1016 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1017 : "=r" (sh), "=&r" (sl) \
1018 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1019 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1020 #define umul_ppmm(w1, w0, u, v) \
1021 __asm__ ("mulq %3" \
1022 : "=a" (w0), "=d" (w1) \
1023 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1024 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1025 __asm__ ("divq %4" /* stringification in K&R C */ \
1026 : "=a" (q), "=d" (r) \
1027 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1028 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1029 #define count_leading_zeros(count, x) \
1030 do { \
1031 UDItype __cbtmp; \
1032 ASSERT ((x) != 0); \
1033 __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1034 (count) = __cbtmp ^ 63; \
1035 } while (0)
1036 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1037 count is only an int. */
1038 #define count_trailing_zeros(count, x) \
1039 do { \
1040 ASSERT ((x) != 0); \
1041 __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1042 } while (0)
1043 #endif /* x86_64 */
1045 #if defined (__i860__) && W_TYPE_SIZE == 32
1046 #define rshift_rhlc(r,h,l,c) \
1047 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1048 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1049 #endif /* i860 */
1051 #if defined (__i960__) && W_TYPE_SIZE == 32
1052 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1053 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1054 : "=r" (sh), "=&r" (sl) \
1055 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1056 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1057 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1058 : "=r" (sh), "=&r" (sl) \
1059 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1060 #define umul_ppmm(w1, w0, u, v) \
1061 ({union {UDItype __ll; \
1062 struct {USItype __l, __h;} __i; \
1063 } __x; \
1064 __asm__ ("emul %2,%1,%0" \
1065 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1066 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1067 #define __umulsidi3(u, v) \
1068 ({UDItype __w; \
1069 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1070 __w; })
1071 #define udiv_qrnnd(q, r, nh, nl, d) \
1072 do { \
1073 union {UDItype __ll; \
1074 struct {USItype __l, __h;} __i; \
1075 } __nn, __rq; \
1076 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1077 __asm__ ("ediv %d,%n,%0" \
1078 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1079 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1080 } while (0)
1081 #define count_leading_zeros(count, x) \
1082 do { \
1083 USItype __cbtmp; \
1084 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1085 (count) = __cbtmp ^ 31; \
1086 } while (0)
1087 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1088 #if defined (__i960mx) /* what is the proper symbol to test??? */
1089 #define rshift_rhlc(r,h,l,c) \
1090 do { \
1091 union {UDItype __ll; \
1092 struct {USItype __l, __h;} __i; \
1093 } __nn; \
1094 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1095 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1096 } while (0)
1097 #endif /* i960mx */
1098 #endif /* i960 */
1100 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1101 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1102 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1103 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1104 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1105 : "=d" (sh), "=&d" (sl) \
1106 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1107 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1108 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1109 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1110 : "=d" (sh), "=&d" (sl) \
1111 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1112 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1113 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1114 #if defined (__mc68020__) || defined(mc68020) \
1115 || defined (__mc68030__) || defined (mc68030) \
1116 || defined (__mc68040__) || defined (mc68040) \
1117 || defined (__mcpu32__) || defined (mcpu32) \
1118 || defined (__NeXT__)
1119 #define umul_ppmm(w1, w0, u, v) \
1120 __asm__ ("mulu%.l %3,%1:%0" \
1121 : "=d" (w0), "=d" (w1) \
1122 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1123 #define UMUL_TIME 45
1124 #define udiv_qrnnd(q, r, n1, n0, d) \
1125 __asm__ ("divu%.l %4,%1:%0" \
1126 : "=d" (q), "=d" (r) \
1127 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1128 #define UDIV_TIME 90
1129 #define sdiv_qrnnd(q, r, n1, n0, d) \
1130 __asm__ ("divs%.l %4,%1:%0" \
1131 : "=d" (q), "=d" (r) \
1132 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1133 #else /* for other 68k family members use 16x16->32 multiplication */
1134 #define umul_ppmm(xh, xl, a, b) \
1135 do { USItype __umul_tmp1, __umul_tmp2; \
1136 __asm__ ("| Inlined umul_ppmm\n" \
1137 " move%.l %5,%3\n" \
1138 " move%.l %2,%0\n" \
1139 " move%.w %3,%1\n" \
1140 " swap %3\n" \
1141 " swap %0\n" \
1142 " mulu%.w %2,%1\n" \
1143 " mulu%.w %3,%0\n" \
1144 " mulu%.w %2,%3\n" \
1145 " swap %2\n" \
1146 " mulu%.w %5,%2\n" \
1147 " add%.l %3,%2\n" \
1148 " jcc 1f\n" \
1149 " add%.l %#0x10000,%0\n" \
1150 "1: move%.l %2,%3\n" \
1151 " clr%.w %2\n" \
1152 " swap %2\n" \
1153 " swap %3\n" \
1154 " clr%.w %3\n" \
1155 " add%.l %3,%1\n" \
1156 " addx%.l %2,%0\n" \
1157 " | End inlined umul_ppmm" \
1158 : "=&d" (xh), "=&d" (xl), \
1159 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1160 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1161 } while (0)
1162 #define UMUL_TIME 100
1163 #define UDIV_TIME 400
1164 #endif /* not mc68020 */
1165 /* The '020, '030, '040 and '060 have bitfield insns.
1166 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1167 exclude bfffo on that chip (bitfield insns not available). */
1168 #if (defined (__mc68020__) || defined (mc68020) \
1169 || defined (__mc68030__) || defined (mc68030) \
1170 || defined (__mc68040__) || defined (mc68040) \
1171 || defined (__mc68060__) || defined (mc68060) \
1172 || defined (__NeXT__)) \
1173 && ! defined (__mcpu32__)
1174 #define count_leading_zeros(count, x) \
1175 __asm__ ("bfffo %1{%b2:%b2},%0" \
1176 : "=d" (count) \
1177 : "od" ((USItype) (x)), "n" (0))
1178 #define COUNT_LEADING_ZEROS_0 32
1179 #endif
1180 #endif /* mc68000 */
1182 #if defined (__m88000__) && W_TYPE_SIZE == 32
1183 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1184 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1185 : "=r" (sh), "=&r" (sl) \
1186 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1187 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1188 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1189 : "=r" (sh), "=&r" (sl) \
1190 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1191 #define count_leading_zeros(count, x) \
1192 do { \
1193 USItype __cbtmp; \
1194 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1195 (count) = __cbtmp ^ 31; \
1196 } while (0)
1197 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1198 #if defined (__m88110__)
1199 #define umul_ppmm(wh, wl, u, v) \
1200 do { \
1201 union {UDItype __ll; \
1202 struct {USItype __h, __l;} __i; \
1203 } __x; \
1204 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1205 (wh) = __x.__i.__h; \
1206 (wl) = __x.__i.__l; \
1207 } while (0)
1208 #define udiv_qrnnd(q, r, n1, n0, d) \
1209 ({union {UDItype __ll; \
1210 struct {USItype __h, __l;} __i; \
1211 } __x, __q; \
1212 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1213 __asm__ ("divu.d %0,%1,%2" \
1214 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1215 (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1216 #define UMUL_TIME 5
1217 #define UDIV_TIME 25
1218 #else
1219 #define UMUL_TIME 17
1220 #define UDIV_TIME 150
1221 #endif /* __m88110__ */
1222 #endif /* __m88000__ */
1224 #if defined (__mips) && W_TYPE_SIZE == 32
1225 #if __GMP_GNUC_PREREQ (4,4)
1226 #define umul_ppmm(w1, w0, u, v) \
1227 do { \
1228 UDItype __ll = (UDItype)(u) * (v); \
1229 w1 = __ll >> 32; \
1230 w0 = __ll; \
1231 } while (0)
1232 #endif
1233 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1234 #define umul_ppmm(w1, w0, u, v) \
1235 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1236 #endif
1237 #if !defined (umul_ppmm)
1238 #define umul_ppmm(w1, w0, u, v) \
1239 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1240 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1241 #endif
1242 #define UMUL_TIME 10
1243 #define UDIV_TIME 100
1244 #endif /* __mips */
1246 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1247 #if __GMP_GNUC_PREREQ (4,4)
1248 #define umul_ppmm(w1, w0, u, v) \
1249 do { \
1250 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1251 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1252 w1 = __ll >> 64; \
1253 w0 = __ll; \
1254 } while (0)
1255 #endif
1256 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1257 #define umul_ppmm(w1, w0, u, v) \
1258 __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1259 #endif
1260 #if !defined (umul_ppmm)
1261 #define umul_ppmm(w1, w0, u, v) \
1262 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1263 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1264 #endif
1265 #define UMUL_TIME 20
1266 #define UDIV_TIME 140
1267 #endif /* __mips */
1269 #if defined (__mmix__) && W_TYPE_SIZE == 64
1270 #define umul_ppmm(w1, w0, u, v) \
1271 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1272 #endif
1274 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1275 #define umul_ppmm(w1, w0, u, v) \
1276 ({union {UDItype __ll; \
1277 struct {USItype __l, __h;} __i; \
1278 } __x; \
1279 __asm__ ("meid %2,%0" \
1280 : "=g" (__x.__ll) \
1281 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1282 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1283 #define __umulsidi3(u, v) \
1284 ({UDItype __w; \
1285 __asm__ ("meid %2,%0" \
1286 : "=g" (__w) \
1287 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1288 __w; })
1289 #define udiv_qrnnd(q, r, n1, n0, d) \
1290 ({union {UDItype __ll; \
1291 struct {USItype __l, __h;} __i; \
1292 } __x; \
1293 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1294 __asm__ ("deid %2,%0" \
1295 : "=g" (__x.__ll) \
1296 : "0" (__x.__ll), "g" ((USItype)(d))); \
1297 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1298 #define count_trailing_zeros(count,x) \
1299 do { \
1300 __asm__ ("ffsd %2,%0" \
1301 : "=r" (count) \
1302 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1303 } while (0)
1304 #endif /* __ns32000__ */
1306 /* In the past we had a block of various #defines tested
1307 _ARCH_PPC - AIX
1308 _ARCH_PWR - AIX
1309 __powerpc__ - gcc
1310 __POWERPC__ - BEOS
1311 __ppc__ - Darwin
1312 PPC - old gcc, GNU/Linux, SysV
1313 The plain PPC test was not good for vxWorks, since PPC is defined on all
1314 CPUs there (eg. m68k too), as a constant that one is expected to compare
1315 CPU_FAMILY against.
1317 At any rate, this was pretty unattractive and a bit fragile. The use of
1318 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1319 getting the desired effect.
1321 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1322 the system vendor compilers. (Is that vendor compilers with inline asm,
1323 or what?) */
1325 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1326 && W_TYPE_SIZE == 32
1327 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1328 do { \
1329 if (__builtin_constant_p (bh) && (bh) == 0) \
1330 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1331 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1332 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1333 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1334 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1335 else \
1336 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1337 : "=r" (sh), "=&r" (sl) \
1338 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1339 } while (0)
1340 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1341 do { \
1342 if (__builtin_constant_p (ah) && (ah) == 0) \
1343 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1344 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1345 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1346 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1347 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1348 else if (__builtin_constant_p (bh) && (bh) == 0) \
1349 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1350 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1351 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1352 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1353 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1354 else \
1355 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1356 : "=r" (sh), "=&r" (sl) \
1357 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1358 } while (0)
1359 #define count_leading_zeros(count, x) \
1360 __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1361 #define COUNT_LEADING_ZEROS_0 32
1362 #if HAVE_HOST_CPU_FAMILY_powerpc
1363 #if __GMP_GNUC_PREREQ (4,4)
1364 #define umul_ppmm(w1, w0, u, v) \
1365 do { \
1366 UDItype __ll = (UDItype)(u) * (v); \
1367 w1 = __ll >> 32; \
1368 w0 = __ll; \
1369 } while (0)
1370 #endif
1371 #if !defined (umul_ppmm)
1372 #define umul_ppmm(ph, pl, m0, m1) \
1373 do { \
1374 USItype __m0 = (m0), __m1 = (m1); \
1375 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1376 (pl) = __m0 * __m1; \
1377 } while (0)
1378 #endif
1379 #define UMUL_TIME 15
1380 #define smul_ppmm(ph, pl, m0, m1) \
1381 do { \
1382 SItype __m0 = (m0), __m1 = (m1); \
1383 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1384 (pl) = __m0 * __m1; \
1385 } while (0)
1386 #define SMUL_TIME 14
1387 #define UDIV_TIME 120
1388 #else
1389 #define UMUL_TIME 8
1390 #define smul_ppmm(xh, xl, m0, m1) \
1391 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1392 #define SMUL_TIME 4
1393 #define sdiv_qrnnd(q, r, nh, nl, d) \
1394 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1395 #define UDIV_TIME 100
1396 #endif
1397 #endif /* 32-bit POWER architecture variants. */
1399 /* We should test _IBMR2 here when we add assembly support for the system
1400 vendor compilers. */
1401 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 && defined (_LP64)
1402 #if !defined (_LONG_LONG_LIMB)
1403 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1404 use adde etc only when not _LONG_LONG_LIMB. */
1405 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1406 do { \
1407 if (__builtin_constant_p (bh) && (bh) == 0) \
1408 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
1409 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1410 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1411 __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
1412 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1413 else \
1414 __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
1415 : "=r" (sh), "=&r" (sl) \
1416 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1417 } while (0)
1418 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1419 This might seem strange, but gcc folds away the dead code late. */
1420 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1421 do { \
1422 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1423 if (__builtin_constant_p (ah) && (ah) == 0) \
1424 __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2" \
1425 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1426 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1427 __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2" \
1428 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1429 else if (__builtin_constant_p (bh) && (bh) == 0) \
1430 __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2" \
1431 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1432 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1433 __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2" \
1434 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1435 else \
1436 __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2" \
1437 : "=r" (sh), "=&r" (sl) \
1438 : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \
1439 } else { \
1440 if (__builtin_constant_p (ah) && (ah) == 0) \
1441 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
1442 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1443 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1444 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
1445 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1446 else if (__builtin_constant_p (bh) && (bh) == 0) \
1447 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
1448 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1449 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1450 __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
1451 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1452 else \
1453 __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
1454 : "=r" (sh), "=&r" (sl) \
1455 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1456 } \
1457 } while (0)
1458 #endif /* ! _LONG_LONG_LIMB */
1459 #define count_leading_zeros(count, x) \
1460 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1461 #define COUNT_LEADING_ZEROS_0 64
1462 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1463 #define umul_ppmm(w1, w0, u, v) \
1464 do { \
1465 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1466 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1467 w1 = __ll >> 64; \
1468 w0 = __ll; \
1469 } while (0)
1470 #endif
1471 #if !defined (umul_ppmm)
1472 #define umul_ppmm(ph, pl, m0, m1) \
1473 do { \
1474 UDItype __m0 = (m0), __m1 = (m1); \
1475 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1476 (pl) = __m0 * __m1; \
1477 } while (0)
1478 #endif
1479 #define UMUL_TIME 15
1480 #define smul_ppmm(ph, pl, m0, m1) \
1481 do { \
1482 DItype __m0 = (m0), __m1 = (m1); \
1483 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1484 (pl) = __m0 * __m1; \
1485 } while (0)
1486 #define SMUL_TIME 14 /* ??? */
1487 #define UDIV_TIME 120 /* ??? */
1488 #endif /* 64-bit PowerPC. */
1490 #if defined (__pyr__) && W_TYPE_SIZE == 32
1491 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1492 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1493 : "=r" (sh), "=&r" (sl) \
1494 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1495 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1496 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1497 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1498 : "=r" (sh), "=&r" (sl) \
1499 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1500 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1501 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1502 #define umul_ppmm(w1, w0, u, v) \
1503 ({union {UDItype __ll; \
1504 struct {USItype __h, __l;} __i; \
1505 } __x; \
1506 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1507 : "=&r" (__x.__ll) \
1508 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1509 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1510 #endif /* __pyr__ */
1512 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1513 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1514 __asm__ ("a %1,%5\n\tae %0,%3" \
1515 : "=r" (sh), "=&r" (sl) \
1516 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1517 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1518 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1519 __asm__ ("s %1,%5\n\tse %0,%3" \
1520 : "=r" (sh), "=&r" (sl) \
1521 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1522 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1523 #define smul_ppmm(ph, pl, m0, m1) \
1524 __asm__ ( \
1525 "s r2,r2\n" \
1526 " mts r10,%2\n" \
1527 " m r2,%3\n" \
1528 " m r2,%3\n" \
1529 " m r2,%3\n" \
1530 " m r2,%3\n" \
1531 " m r2,%3\n" \
1532 " m r2,%3\n" \
1533 " m r2,%3\n" \
1534 " m r2,%3\n" \
1535 " m r2,%3\n" \
1536 " m r2,%3\n" \
1537 " m r2,%3\n" \
1538 " m r2,%3\n" \
1539 " m r2,%3\n" \
1540 " m r2,%3\n" \
1541 " m r2,%3\n" \
1542 " m r2,%3\n" \
1543 " cas %0,r2,r0\n" \
1544 " mfs r10,%1" \
1545 : "=r" (ph), "=r" (pl) \
1546 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1547 : "r2")
1548 #define UMUL_TIME 20
1549 #define UDIV_TIME 200
1550 #define count_leading_zeros(count, x) \
1551 do { \
1552 if ((x) >= 0x10000) \
1553 __asm__ ("clz %0,%1" \
1554 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1555 else \
1557 __asm__ ("clz %0,%1" \
1558 : "=r" (count) : "r" ((USItype)(x))); \
1559 (count) += 16; \
1561 } while (0)
1562 #endif /* RT/ROMP */
1564 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1565 #define umul_ppmm(w1, w0, u, v) \
1566 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1567 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1568 #define UMUL_TIME 5
1569 #endif
1571 #if defined (__sparc__) && W_TYPE_SIZE == 32
1572 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1573 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1574 : "=r" (sh), "=&r" (sl) \
1575 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1576 __CLOBBER_CC)
1577 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1578 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1579 : "=r" (sh), "=&r" (sl) \
1580 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1581 __CLOBBER_CC)
1582 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1583 doesn't define anything to indicate that to us; it only sets __sparcv8. */
1584 #if defined (__sparc_v9__) || defined (__sparcv9)
1585 /* Perhaps we should use floating-point operations here? */
1586 #if 0
1587 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1588 Perhaps we simply need to explicitly zero-extend the inputs? */
1589 #define umul_ppmm(w1, w0, u, v) \
1590 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1591 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1592 #else
1593 /* Use v8 umul until above bug is fixed. */
1594 #define umul_ppmm(w1, w0, u, v) \
1595 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1596 #endif
1597 /* Use a plain v8 divide for v9. */
1598 #define udiv_qrnnd(q, r, n1, n0, d) \
1599 do { \
1600 USItype __q; \
1601 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1602 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1603 (r) = (n0) - __q * (d); \
1604 (q) = __q; \
1605 } while (0)
1606 #else
1607 #if defined (__sparc_v8__) /* gcc normal */ \
1608 || defined (__sparcv8) /* gcc solaris */ \
1609 || HAVE_HOST_CPU_supersparc
1610 /* Don't match the immediate range because: 1) it is not often useful,
1611 2) the 'I' flag thinks of the range as a 13 bit signed interval,
1612 while we want to match a 13 bit interval, sign extended to 32 bits,
1613 but INTERPRETED AS UNSIGNED. */
1614 #define umul_ppmm(w1, w0, u, v) \
1615 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1616 #define UMUL_TIME 5
1618 #if HAVE_HOST_CPU_supersparc
1619 #define UDIV_TIME 60 /* SuperSPARC timing */
1620 #else
1621 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1622 dividends and will trap to the kernel for the rest. */
1623 #define udiv_qrnnd(q, r, n1, n0, d) \
1624 do { \
1625 USItype __q; \
1626 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1627 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1628 (r) = (n0) - __q * (d); \
1629 (q) = __q; \
1630 } while (0)
1631 #define UDIV_TIME 25
1632 #endif /* HAVE_HOST_CPU_supersparc */
1634 #else /* ! __sparc_v8__ */
1635 #if defined (__sparclite__)
1636 /* This has hardware multiply but not divide. It also has two additional
1637 instructions, scan (ffs from the high bit) and divscc. */
1638 #define umul_ppmm(w1, w0, u, v) \
1639 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1640 #define UMUL_TIME 5
1641 #define udiv_qrnnd(q, r, n1, n0, d) \
1642 __asm__ ("! Inlined udiv_qrnnd\n" \
1643 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1644 " tst %%g0\n" \
1645 " divscc %3,%4,%%g1\n" \
1646 " divscc %%g1,%4,%%g1\n" \
1647 " divscc %%g1,%4,%%g1\n" \
1648 " divscc %%g1,%4,%%g1\n" \
1649 " divscc %%g1,%4,%%g1\n" \
1650 " divscc %%g1,%4,%%g1\n" \
1651 " divscc %%g1,%4,%%g1\n" \
1652 " divscc %%g1,%4,%%g1\n" \
1653 " divscc %%g1,%4,%%g1\n" \
1654 " divscc %%g1,%4,%%g1\n" \
1655 " divscc %%g1,%4,%%g1\n" \
1656 " divscc %%g1,%4,%%g1\n" \
1657 " divscc %%g1,%4,%%g1\n" \
1658 " divscc %%g1,%4,%%g1\n" \
1659 " divscc %%g1,%4,%%g1\n" \
1660 " divscc %%g1,%4,%%g1\n" \
1661 " divscc %%g1,%4,%%g1\n" \
1662 " divscc %%g1,%4,%%g1\n" \
1663 " divscc %%g1,%4,%%g1\n" \
1664 " divscc %%g1,%4,%%g1\n" \
1665 " divscc %%g1,%4,%%g1\n" \
1666 " divscc %%g1,%4,%%g1\n" \
1667 " divscc %%g1,%4,%%g1\n" \
1668 " divscc %%g1,%4,%%g1\n" \
1669 " divscc %%g1,%4,%%g1\n" \
1670 " divscc %%g1,%4,%%g1\n" \
1671 " divscc %%g1,%4,%%g1\n" \
1672 " divscc %%g1,%4,%%g1\n" \
1673 " divscc %%g1,%4,%%g1\n" \
1674 " divscc %%g1,%4,%%g1\n" \
1675 " divscc %%g1,%4,%%g1\n" \
1676 " divscc %%g1,%4,%0\n" \
1677 " rd %%y,%1\n" \
1678 " bl,a 1f\n" \
1679 " add %1,%4,%1\n" \
1680 "1: ! End of inline udiv_qrnnd" \
1681 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1682 : "%g1" __AND_CLOBBER_CC)
1683 #define UDIV_TIME 37
1684 #define count_leading_zeros(count, x) \
1685 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1686 /* Early sparclites return 63 for an argument of 0, but they warn that future
1687 implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
1688 undefined. */
1689 #endif /* __sparclite__ */
1690 #endif /* __sparc_v8__ */
1691 #endif /* __sparc_v9__ */
1692 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1693 #ifndef umul_ppmm
1694 #define umul_ppmm(w1, w0, u, v) \
1695 __asm__ ("! Inlined umul_ppmm\n" \
1696 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1697 " sra %3,31,%%g2 ! Don't move this insn\n" \
1698 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1699 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1700 " mulscc %%g1,%3,%%g1\n" \
1701 " mulscc %%g1,%3,%%g1\n" \
1702 " mulscc %%g1,%3,%%g1\n" \
1703 " mulscc %%g1,%3,%%g1\n" \
1704 " mulscc %%g1,%3,%%g1\n" \
1705 " mulscc %%g1,%3,%%g1\n" \
1706 " mulscc %%g1,%3,%%g1\n" \
1707 " mulscc %%g1,%3,%%g1\n" \
1708 " mulscc %%g1,%3,%%g1\n" \
1709 " mulscc %%g1,%3,%%g1\n" \
1710 " mulscc %%g1,%3,%%g1\n" \
1711 " mulscc %%g1,%3,%%g1\n" \
1712 " mulscc %%g1,%3,%%g1\n" \
1713 " mulscc %%g1,%3,%%g1\n" \
1714 " mulscc %%g1,%3,%%g1\n" \
1715 " mulscc %%g1,%3,%%g1\n" \
1716 " mulscc %%g1,%3,%%g1\n" \
1717 " mulscc %%g1,%3,%%g1\n" \
1718 " mulscc %%g1,%3,%%g1\n" \
1719 " mulscc %%g1,%3,%%g1\n" \
1720 " mulscc %%g1,%3,%%g1\n" \
1721 " mulscc %%g1,%3,%%g1\n" \
1722 " mulscc %%g1,%3,%%g1\n" \
1723 " mulscc %%g1,%3,%%g1\n" \
1724 " mulscc %%g1,%3,%%g1\n" \
1725 " mulscc %%g1,%3,%%g1\n" \
1726 " mulscc %%g1,%3,%%g1\n" \
1727 " mulscc %%g1,%3,%%g1\n" \
1728 " mulscc %%g1,%3,%%g1\n" \
1729 " mulscc %%g1,%3,%%g1\n" \
1730 " mulscc %%g1,%3,%%g1\n" \
1731 " mulscc %%g1,%3,%%g1\n" \
1732 " mulscc %%g1,0,%%g1\n" \
1733 " add %%g1,%%g2,%0\n" \
1734 " rd %%y,%1" \
1735 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1736 : "%g1", "%g2" __AND_CLOBBER_CC)
1737 #define UMUL_TIME 39 /* 39 instructions */
1738 #endif
1739 #ifndef udiv_qrnnd
1740 #ifndef LONGLONG_STANDALONE
1741 #define udiv_qrnnd(q, r, n1, n0, d) \
1742 do { UWtype __r; \
1743 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1744 (r) = __r; \
1745 } while (0)
1746 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1747 #ifndef UDIV_TIME
1748 #define UDIV_TIME 140
1749 #endif
1750 #endif /* LONGLONG_STANDALONE */
1751 #endif /* udiv_qrnnd */
1752 #endif /* __sparc__ */
1754 #if (defined (__sparc_v9) || defined (__sparc_v9__)) && W_TYPE_SIZE == 64
1755 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1756 __asm__ ( \
1757 "addcc %r4,%5,%1\n" \
1758 " addccc %r6,%7,%%g0\n" \
1759 " addc %r2,%3,%0" \
1760 : "=r" (sh), "=&r" (sl) \
1761 : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
1762 "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1763 __CLOBBER_CC)
1764 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1765 __asm__ ( \
1766 "subcc %r4,%5,%1\n" \
1767 " subccc %r6,%7,%%g0\n" \
1768 " subc %r2,%3,%0" \
1769 : "=r" (sh), "=&r" (sl) \
1770 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
1771 "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
1772 __CLOBBER_CC)
1773 #endif
1775 #if defined (__vax__) && W_TYPE_SIZE == 32
1776 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1777 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1778 : "=g" (sh), "=&g" (sl) \
1779 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1780 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1781 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1782 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1783 : "=g" (sh), "=&g" (sl) \
1784 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1785 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1786 #define smul_ppmm(xh, xl, m0, m1) \
1787 do { \
1788 union {UDItype __ll; \
1789 struct {USItype __l, __h;} __i; \
1790 } __x; \
1791 USItype __m0 = (m0), __m1 = (m1); \
1792 __asm__ ("emul %1,%2,$0,%0" \
1793 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1794 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1795 } while (0)
1796 #define sdiv_qrnnd(q, r, n1, n0, d) \
1797 do { \
1798 union {DItype __ll; \
1799 struct {SItype __l, __h;} __i; \
1800 } __x; \
1801 __x.__i.__h = n1; __x.__i.__l = n0; \
1802 __asm__ ("ediv %3,%2,%0,%1" \
1803 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1804 } while (0)
1805 #if 0
1806 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1807 8800 maybe). */
1808 #define count_trailing_zeros(count,x) \
1809 do { \
1810 __asm__ ("ffs 0, 31, %1, %0" \
1811 : "=g" (count) \
1812 : "g" ((USItype) (x))); \
1813 } while (0)
1814 #endif
1815 #endif /* __vax__ */
1817 #if defined (__z8000__) && W_TYPE_SIZE == 16
1818 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1819 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1820 : "=r" (sh), "=&r" (sl) \
1821 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1822 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1823 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1824 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1825 : "=r" (sh), "=&r" (sl) \
1826 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1827 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1828 #define umul_ppmm(xh, xl, m0, m1) \
1829 do { \
1830 union {long int __ll; \
1831 struct {unsigned int __h, __l;} __i; \
1832 } __x; \
1833 unsigned int __m0 = (m0), __m1 = (m1); \
1834 __asm__ ("mult %S0,%H3" \
1835 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1836 : "%1" (m0), "rQR" (m1)); \
1837 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1838 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1839 + (((signed int) __m1 >> 15) & __m0)); \
1840 } while (0)
1841 #endif /* __z8000__ */
1843 #endif /* __GNUC__ */
1845 #endif /* NO_ASM */
1848 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */
1849 #if !defined (umul_ppmm) && defined (__umulsidi3)
1850 #define umul_ppmm(ph, pl, m0, m1) \
1851 { \
1852 UDWtype __ll = __umulsidi3 (m0, m1); \
1853 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1854 pl = (UWtype) __ll; \
1855 }
1856 #endif
1858 #if !defined (__umulsidi3)
1859 #define __umulsidi3(u, v) \
1860 ({UWtype __hi, __lo; \
1861 umul_ppmm (__hi, __lo, u, v); \
1862 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1863 #endif
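/* Illustrative sketch (not part of the original header): how the __umulsidi3
   fallback above composes a double-word product from umul_ppmm, assuming
   W_TYPE_SIZE == 32 and a UDWtype wide enough for 64 bits.  Kept under #if 0
   so it is never compiled. */
#if 0
static UDWtype
example_umulsidi3 (void)
{
  /* 0x10000 * 0x10000 == 0x100000000: the high UWtype is 1, the low is 0,
     and __umulsidi3 glues them back into a single UDWtype value. */
  return __umulsidi3 ((UWtype) 0x10000, (UWtype) 0x10000);
}
#endif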
1866 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1867 forms have "reversed" arguments, meaning the pointer is last, which
1868 sometimes allows better parameter passing, in particular on 64-bit
1869 hppa. */
1871 #define mpn_umul_ppmm __MPN(umul_ppmm)
1872 extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1874 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
1875 && ! defined (LONGLONG_STANDALONE)
1876 #define umul_ppmm(wh, wl, u, v) \
1877 do { \
1878 UWtype __umul_ppmm__p0; \
1879 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
1880 (wl) = __umul_ppmm__p0; \
1881 } while (0)
1882 #endif
1884 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
1885 extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1887 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
1888 && ! defined (LONGLONG_STANDALONE)
1889 #define umul_ppmm(wh, wl, u, v) \
1890 do { \
1891 UWtype __umul_ppmm__p0; \
1892 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
1893 (wl) = __umul_ppmm__p0; \
1894 } while (0)
1895 #endif
1897 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
1898 extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1900 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
1901 && ! defined (LONGLONG_STANDALONE)
1902 #define udiv_qrnnd(q, r, n1, n0, d) \
1903 do { \
1904 UWtype __udiv_qrnnd__r; \
1905 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
1906 (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
1907 (r) = __udiv_qrnnd__r; \
1908 } while (0)
1909 #endif
1911 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
1912 extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1914 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
1915 && ! defined (LONGLONG_STANDALONE)
1916 #define udiv_qrnnd(q, r, n1, n0, d) \
1917 do { \
1918 UWtype __udiv_qrnnd__r; \
1919 (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
1920 &__udiv_qrnnd__r); \
1921 (r) = __udiv_qrnnd__r; \
1922 } while (0)
1923 #endif
1926 /* If this machine has no inline assembler, use C macros. */
1928 #if !defined (add_ssaaaa)
1929 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1930 do { \
1931 UWtype __x; \
1932 __x = (al) + (bl); \
1933 (sh) = (ah) + (bh) + (__x < (al)); \
1934 (sl) = __x; \
1935 } while (0)
1936 #endif
1938 #if !defined (sub_ddmmss)
1939 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1940 do { \
1941 UWtype __x; \
1942 __x = (al) - (bl); \
1943 (sh) = (ah) - (bh) - ((al) < (bl)); \
1944 (sl) = __x; \
1945 } while (0)
1946 #endif
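/* Illustrative sketch (not part of the original header): how the plain C
   add_ssaaaa/sub_ddmmss above propagate carry and borrow between words,
   assuming W_TYPE_SIZE == 32.  Kept under #if 0 so it is never compiled. */
#if 0
static void
example_add_sub_carry (void)
{
  UWtype sh, sl;
  /* 0x00000001:0xFFFFFFFF + 0x00000000:0x00000001: the low word wraps to 0,
     so (__x < (al)) detects the carry; result is 0x00000002:0x00000000. */
  add_ssaaaa (sh, sl, (UWtype) 1, (UWtype) 0xFFFFFFFF, (UWtype) 0, (UWtype) 1);
  /* 0x00000001:0x00000000 - 0x00000000:0x00000001: the low word underflows,
     so ((al) < (bl)) detects the borrow; result is 0x00000000:0xFFFFFFFF. */
  sub_ddmmss (sh, sl, (UWtype) 1, (UWtype) 0, (UWtype) 0, (UWtype) 1);
}
#endif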
1948 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1949 smul_ppmm. */
1950 #if !defined (umul_ppmm) && defined (smul_ppmm)
1951 #define umul_ppmm(w1, w0, u, v) \
1952 do { \
1953 UWtype __w1; \
1954 UWtype __xm0 = (u), __xm1 = (v); \
1955 smul_ppmm (__w1, w0, __xm0, __xm1); \
1956 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
1957 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
1958 } while (0)
1959 #endif
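/* Illustrative sketch (not part of the original header): the correction term
   in the umul_ppmm-from-smul_ppmm macro above, assuming W_TYPE_SIZE == 32.
   Reading the operands as unsigned rather than signed changes only the high
   word, by v whenever u's top bit is set and by u whenever v's top bit is
   set.  Kept under #if 0 so it is never compiled. */
#if 0
static void
example_umul_from_smul (void)
{
  UWtype hi, lo;
  /* u = 0xFFFFFFFF (-1 as signed), v = 2: smul_ppmm yields high word
     0xFFFFFFFF (the product -2); adding v = 2 gives 1, the high word of the
     unsigned product 0xFFFFFFFF * 2 == 0x1FFFFFFFE. */
  umul_ppmm (hi, lo, (UWtype) 0xFFFFFFFF, (UWtype) 2);
}
#endif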
1961 /* If we still don't have umul_ppmm, define it using plain C.
1963 For reference, when this code is used for squaring (i.e. u and v identical
1964 expressions), gcc recognises __x1 and __x2 are the same and generates 3
1965 multiplies, not 4. The subsequent additions could be optimized a bit,
1966 but the only place GMP currently uses such a square is mpn_sqr_basecase,
1967 and chips obliged to use this generic C umul will have plenty of worse
1968 performance problems than a couple of extra instructions on the diagonal
1969 of sqr_basecase. */
1971 #if !defined (umul_ppmm)
1972 #define umul_ppmm(w1, w0, u, v) \
1973 do { \
1974 UWtype __x0, __x1, __x2, __x3; \
1975 UHWtype __ul, __vl, __uh, __vh; \
1976 UWtype __u = (u), __v = (v); \
1978 __ul = __ll_lowpart (__u); \
1979 __uh = __ll_highpart (__u); \
1980 __vl = __ll_lowpart (__v); \
1981 __vh = __ll_highpart (__v); \
1983 __x0 = (UWtype) __ul * __vl; \
1984 __x1 = (UWtype) __ul * __vh; \
1985 __x2 = (UWtype) __uh * __vl; \
1986 __x3 = (UWtype) __uh * __vh; \
1988 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
1989 __x1 += __x2; /* but this indeed can */ \
1990 if (__x1 < __x2) /* did we get it? */ \
1991 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
1993 (w1) = __x3 + __ll_highpart (__x1); \
1994 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
1995 } while (0)
1996 #endif
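/* Illustrative sketch (not part of the original header): a worked instance
   of the half-word decomposition used by the generic umul_ppmm above,
   assuming W_TYPE_SIZE == 32.  Kept under #if 0 so it is never compiled. */
#if 0
static void
example_generic_umul_ppmm (void)
{
  UWtype hi, lo;
  /* (2^32 - 1)^2 == 0xFFFFFFFE00000001: the four half-word products
     __x0..__x3 combine into hi == 0xFFFFFFFE and lo == 0x00000001. */
  umul_ppmm (hi, lo, (UWtype) 0xFFFFFFFF, (UWtype) 0xFFFFFFFF);
}
#endif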
1998 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
1999 exist in one form or another). */
2000 #if !defined (smul_ppmm)
2001 #define smul_ppmm(w1, w0, u, v) \
2002 do { \
2003 UWtype __w1; \
2004 UWtype __xm0 = (u), __xm1 = (v); \
2005 umul_ppmm (__w1, w0, __xm0, __xm1); \
2006 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2007 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2008 } while (0)
2009 #endif
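/* Illustrative sketch (not part of the original header): the inverse
   correction in the smul_ppmm-from-umul_ppmm macro above, assuming
   W_TYPE_SIZE == 32.  Kept under #if 0 so it is never compiled. */
#if 0
static void
example_smul_from_umul (void)
{
  UWtype hi, lo;
  /* u = 3, v = 0x80000000 (-2^31 as signed): umul_ppmm yields high word 1
     (unsigned product 0x180000000); subtracting u = 3 gives 0xFFFFFFFE, the
     high word of the signed product -0x180000000. */
  smul_ppmm (hi, lo, (UWtype) 3, (UWtype) 0x80000000);
}
#endif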
2011 /* Define this unconditionally, so it can be used for debugging. */
2012 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2013 do { \
2014 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2016 ASSERT ((d) != 0); \
2017 ASSERT ((n1) < (d)); \
2019 __d1 = __ll_highpart (d); \
2020 __d0 = __ll_lowpart (d); \
2022 __q1 = (n1) / __d1; \
2023 __r1 = (n1) - __q1 * __d1; \
2024 __m = __q1 * __d0; \
2025 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2026 if (__r1 < __m) \
2027 { \
2028 __q1--, __r1 += (d); \
2029 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2030 if (__r1 < __m) \
2031 __q1--, __r1 += (d); \
2032 } \
2033 __r1 -= __m; \
2035 __q0 = __r1 / __d1; \
2036 __r0 = __r1 - __q0 * __d1; \
2037 __m = __q0 * __d0; \
2038 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2039 if (__r0 < __m) \
2040 { \
2041 __q0--, __r0 += (d); \
2042 if (__r0 >= (d)) \
2043 if (__r0 < __m) \
2044 __q0--, __r0 += (d); \
2045 } \
2046 __r0 -= __m; \
2048 (q) = __q1 * __ll_B | __q0; \
2049 (r) = __r0; \
2050 } while (0)
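/* Illustrative sketch (not part of the original header): a worked instance
   of the schoolbook two-step division performed by __udiv_qrnnd_c above,
   assuming W_TYPE_SIZE == 32 and a normalized divisor (high bit set), as
   required when UDIV_NEEDS_NORMALIZATION is 1.  Kept under #if 0 so it is
   never compiled. */
#if 0
static void
example_udiv_qrnnd_c (void)
{
  UWtype q, r;
  /* n1:n0 = 0x00000002:0x00000001 (8589934593) divided by d = 0x80000001
     (2147483649): the half-word steps yield q == 3 and r == 0x7FFFFFFE
     (2147483646), with n1 < d as the macro requires. */
  __udiv_qrnnd_c (q, r, (UWtype) 2, (UWtype) 1, (UWtype) 0x80000001);
}
#endif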
2052 /* If the processor has no udiv_qrnnd but has sdiv_qrnnd, go through
2053 __udiv_w_sdiv (defined in libgcc or elsewhere). */
2054 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2055 #define udiv_qrnnd(q, r, nh, nl, d) \
2056 do { \
2057 UWtype __r; \
2058 (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
2059 (r) = __r; \
2060 } while (0)
2061 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2062 #endif
2064 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2065 #if !defined (udiv_qrnnd)
2066 #define UDIV_NEEDS_NORMALIZATION 1
2067 #define udiv_qrnnd __udiv_qrnnd_c
2068 #endif
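/* Illustrative sketch (not part of the original header): how a caller is
   expected to use udiv_qrnnd when UDIV_NEEDS_NORMALIZATION is 1 -- shift the
   divisor and dividend left until the divisor's high bit is set, divide, then
   shift the remainder back.  Assumes d != 0 and n1 < d.  Kept under #if 0 so
   it is never compiled. */
#if 0
static void
example_normalized_udiv (UWtype n1, UWtype n0, UWtype d, UWtype *qp, UWtype *rp)
{
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;
  *rp = r >> cnt;   /* undo the normalization of the remainder */
}
#endif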
2070 #if !defined (count_leading_zeros)
2071 #define count_leading_zeros(count, x) \
2072 do { \
2073 UWtype __xr = (x); \
2074 UWtype __a; \
2076 if (W_TYPE_SIZE == 32) \
2077 { \
2078 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2079 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2080 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2081 : 3*__BITS4 + 1); \
2082 } \
2083 else \
2084 { \
2085 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2086 if (((__xr >> __a) & 0xff) != 0) \
2087 break; \
2088 ++__a; \
2089 } \
2091 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2092 } while (0)
2093 /* This version gives a well-defined value for zero. */
2094 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2095 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2096 #define COUNT_LEADING_ZEROS_SLOW
2097 #endif
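/* Illustrative sketch (not part of the original header): what the generic
   count_leading_zeros above computes, assuming W_TYPE_SIZE == 32.  Kept
   under #if 0 so it is never compiled. */
#if 0
static void
example_count_leading_zeros (void)
{
  int count;
  count_leading_zeros (count, (UWtype) 1);           /* count == 31 */
  count_leading_zeros (count, (UWtype) 0x00010000);  /* count == 15 */
  count_leading_zeros (count, (UWtype) 0x80000000);  /* count == 0 */
}
#endif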
2099 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2100 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2101 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2102 #endif
2104 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2105 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2106 #endif
2108 #if !defined (count_trailing_zeros)
2109 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2110 /* Define count_trailing_zeros using an asm count_leading_zeros. */
2111 #define count_trailing_zeros(count, x) \
2112 do { \
2113 UWtype __ctz_x = (x); \
2114 UWtype __ctz_c; \
2115 ASSERT (__ctz_x != 0); \
2116 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2117 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2118 } while (0)
2119 #else
2120 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2121 We use clz_tab without ado, since the C count_leading_zeros above will have
2122 pulled it in. */
2123 #define count_trailing_zeros(count, x) \
2124 do { \
2125 UWtype __ctz_x = (x); \
2126 int __ctz_c; \
2128 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2129 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2130 else \
2131 { \
2132 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
2133 { \
2134 __ctz_x >>= 8; \
2135 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2136 break; \
2137 } \
2139 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
2140 } \
2141 } while (0)
2142 #endif
2143 #endif
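/* Illustrative sketch (not part of the original header): either
   count_trailing_zeros definition above isolates the lowest set bit with
   x & -x before counting, assuming W_TYPE_SIZE == 32.  Kept under #if 0 so
   it is never compiled. */
#if 0
static void
example_count_trailing_zeros (void)
{
  int count;
  /* x = 0x58 == 0b1011000: the lowest set bit is 0b1000, so count == 3.
     With the asm-based variant this is W_TYPE_SIZE - 1 - clz(0x8) ==
     32 - 1 - 28 == 3. */
  count_trailing_zeros (count, (UWtype) 0x58);
}
#endif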
2145 #ifndef UDIV_NEEDS_NORMALIZATION
2146 #define UDIV_NEEDS_NORMALIZATION 0
2147 #endif
2149 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2150 hence whether the latter should always be used. */
2151 #ifndef UDIV_PREINV_ALWAYS
2152 #define UDIV_PREINV_ALWAYS 0
2153 #endif
2155 /* Give defaults for UMUL_TIME and UDIV_TIME. */
2156 #ifndef UMUL_TIME
2157 #define UMUL_TIME 1
2158 #endif
2160 #ifndef UDIV_TIME
2161 #define UDIV_TIME UMUL_TIME
2162 #endif