src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or the SQUFOF algorithm, checking status of found factors
  54         again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 #include <config.h>
  88 #include <getopt.h>
  89 #include <stdio.h>
  90 #if HAVE_GMP
  91 # include <gmp.h>
  92 # if !HAVE_DECL_MPZ_INITS
  93 #  include <stdarg.h>
  94 # endif
  95 #endif
  96
  97 #include <assert.h>
  98
  99 #include "system.h"
 100 #include "error.h"
 101 #include "quote.h"
 102 #include "readtokens.h"
 103 #include "xstrtol.h"
 104
 105 /* The official name of this program (e.g., no 'g' prefix).  */
 106 #define PROGRAM_NAME "factor"
 107
 108 #define AUTHORS \
 109   proper_name ("Paul Rubin"),                                           \
 110   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 111   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 112
 113 /* Token delimiters when reading from a file.  */
 114 #define DELIM "\n\t "
 115
 116 #ifndef STAT_SQUFOF
 117 # define STAT_SQUFOF 0
 118 #endif
 119
 120 #ifndef USE_LONGLONG_H
 121 # define USE_LONGLONG_H 1
 122 #endif
 123
 124 #if USE_LONGLONG_H
 125
 126 /* Make definitions for longlong.h to make it do what it can do for us */
 127 # define W_TYPE_SIZE 64          /* bitcount for uintmax_t */
 128 # define UWtype  uintmax_t
 129 # define UHWtype unsigned long int
 130 # undef UDWtype
 131 # if HAVE_ATTRIBUTE_MODE
 132 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 133 typedef          int SItype     __attribute__ ((mode (SI)));
 134 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 135 typedef          int DItype     __attribute__ ((mode (DI)));
 136 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 137 # else
 138 typedef unsigned char UQItype;
 139 typedef          long SItype;
 140 typedef unsigned long int USItype;
 141 #  if HAVE_LONG_LONG
 142 typedef long long int DItype;
 143 typedef unsigned long long int UDItype;
 144 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 145 typedef long int DItype;
 146 typedef unsigned long int UDItype;
 147 #  endif
 148 # endif
 149 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 150 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 151 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 152 # ifndef __GMP_GNUC_PREREQ
 153 #  define __GMP_GNUC_PREREQ(a,b) 1
 154 # endif
 155 # if _ARCH_PPC
 156 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 157 # endif
 158 # include "longlong.h"
 159 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 160 const unsigned char factor_clz_tab[129] =
 161 {
 162   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 163   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 164   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 165   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 166   9
 167 };
 168 # endif
 169
 170 #else /* not USE_LONGLONG_H */
 171
 172 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 173 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 174 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 175 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 176
 177 #endif
 178
 179 #if !defined __clz_tab && !defined UHWtype
 180 /* Without this seemingly useless conditional, gcc -Wunused-macros
 181    warns that each of the two tested macros is unused on Fedora 18.
 182    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 183 #endif
 184
 185 enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
 186
 187 static enum alg_type alg;
 188
 189 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 190 #define MAX_NFACTS 26
 191
 192 enum
 193 {
 194   VERBOSE_OPTION = CHAR_MAX + 1
 195 };
 196
 197 static struct option const long_options[] =
 198 {
 199   {"verbose", no_argument, NULL, VERBOSE_OPTION},
 200   {GETOPT_HELP_OPTION_DECL},
 201   {GETOPT_VERSION_OPTION_DECL},
 202   {NULL, 0, NULL, 0}
 203 };
 204
 205 struct factors
 206 {
 207   uintmax_t     plarge[2]; /* Can have a single large factor */
 208   uintmax_t     p[MAX_NFACTS];
 209   unsigned char e[MAX_NFACTS];
 210   unsigned char nfactors;
 211 };
 212
 213 #if HAVE_GMP
 214 struct mp_factors
 215 {
 216   mpz_t             *p;
 217   unsigned long int *e;
 218   unsigned long int nfactors;
 219 };
 220 #endif
 221
 222 static void factor (uintmax_t, uintmax_t, struct factors *);
 223
 224 #ifndef umul_ppmm
 225 # define umul_ppmm(w1, w0, u, v)                                        \
 226   do {                                                                  \
 227     uintmax_t __x0, __x1, __x2, __x3;                                   \
 228     unsigned long int __ul, __vl, __uh, __vh;                           \
 229     uintmax_t __u = (u), __v = (v);                                     \
 230                                                                         \
 231     __ul = __ll_lowpart (__u);                                          \
 232     __uh = __ll_highpart (__u);                                         \
 233     __vl = __ll_lowpart (__v);                                          \
 234     __vh = __ll_highpart (__v);                                         \
 235                                                                         \
 236     __x0 = (uintmax_t) __ul * __vl;                                     \
 237     __x1 = (uintmax_t) __ul * __vh;                                     \
 238     __x2 = (uintmax_t) __uh * __vl;                                     \
 239     __x3 = (uintmax_t) __uh * __vh;                                     \
 240                                                                         \
 241     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 242     __x1 += __x2;               /* but this indeed can */               \
 243     if (__x1 < __x2)            /* did we get it? */                    \
 244       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 245                                                                         \
 246     (w1) = __x3 + __ll_highpart (__x1);                                 \
 247     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 248   } while (0)
 249 #endif
 250
 251 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 252 /* Define our own, not needing normalization. This function is
 253    currently not performance critical, so keep it simple. Similar to
 254    the mod macro below. */
 255 # undef udiv_qrnnd
 256 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 257   do {                                                                  \
 258     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 259                                                                         \
 260     assert ((n1) < (d));                                                \
 261     __d1 = (d); __d0 = 0;                                               \
 262     __r1 = (n1); __r0 = (n0);                                           \
 263     __q = 0;                                                            \
 264     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 265       {                                                                 \
 266         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 267         __q <<= 1;                                                      \
 268         if (ge2 (__r1, __r0, __d1, __d0))                               \
 269           {                                                             \
 270             __q++;                                                      \
 271             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 272           }                                                             \
 273       }                                                                 \
 274     (r) = __r0;                                                         \
 275     (q) = __q;                                                          \
 276   } while (0)
 277 #endif
 278
 279 #if !defined add_ssaaaa
 280 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 281   do {                                                                  \
 282     uintmax_t _add_x;                                                   \
 283     _add_x = (al) + (bl);                                               \
 284     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 285     (sl) = _add_x;                                                      \
 286   } while (0)
 287 #endif
 288
 289 #define rsh2(rh, rl, ah, al, cnt)                                       \
 290   do {                                                                  \
 291     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 292     (rh) = (ah) >> (cnt);                                               \
 293   } while (0)
 294
 295 #define lsh2(rh, rl, ah, al, cnt)                                       \
 296   do {                                                                  \
 297     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 298     (rl) = (al) << (cnt);                                               \
 299   } while (0)
 300
 301 #define ge2(ah, al, bh, bl)                                             \
 302   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 303
 304 #define gt2(ah, al, bh, bl)                                             \
 305   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 306
 307 #ifndef sub_ddmmss
 308 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 309   do {                                                                  \
 310     uintmax_t _cy;                                                      \
 311     _cy = (al) < (bl);                                                  \
 312     (rl) = (al) - (bl);                                                 \
 313     (rh) = (ah) - (bh) - _cy;                                           \
 314   } while (0)
 315 #endif
 316
 317 #ifndef count_leading_zeros
 318 # define count_leading_zeros(count, x) do {                             \
 319     uintmax_t __clz_x = (x);                                            \
 320     unsigned int __clz_c;                                               \
 321     for (__clz_c = 0;                                                   \
 322          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 323          __clz_c += 8)                                                  \
 324       __clz_x <<= 8;                                                    \
 325     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 326       __clz_x <<= 1;                                                    \
 327     (count) = __clz_c;                                                  \
 328   } while (0)
 329 #endif
 330
 331 #ifndef count_trailing_zeros
 332 # define count_trailing_zeros(count, x) do {                            \
 333     uintmax_t __ctz_x = (x);                                            \
 334     unsigned int __ctz_c = 0;                                           \
 335     while ((__ctz_x & 1) == 0)                                          \
 336       {                                                                 \
 337         __ctz_x >>= 1;                                                  \
 338         __ctz_c++;                                                      \
 339       }                                                                 \
 340     (count) = __ctz_c;                                                  \
 341   } while (0)
 342 #endif
 343
 344 /* Requires that a < n and b <= n */
 345 #define submod(r,a,b,n)                                                 \
 346   do {                                                                  \
 347     uintmax_t _t = - (uintmax_t) (a < b);                               \
 348     (r) = ((n) & _t) + (a) - (b);                                       \
 349   } while (0)
 350
 351 #define addmod(r,a,b,n)                                                 \
 352   submod ((r), (a), ((n) - (b)), (n))
 353
 354 /* Modular two-word addition and subtraction.  For performance reasons, the
 355    most significant bit of n1 must be clear.  The destination variables must be
 356    distinct from the mod operand.  */
 357 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 358   do {                                                                  \
 359     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 360     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 361       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 362   } while (0)
 363 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 364   do {                                                                  \
 365     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 366     if ((intmax_t) (r1) < 0)                                            \
 367       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 368   } while (0)
 369
 370 #define HIGHBIT_TO_MASK(x)                                              \
 371   (((intmax_t)-1 >> 1) < 0                                              \
 372    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 373    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 374       ? UINTMAX_MAX : (uintmax_t) 0))
 375
 376 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 377    Requires that d1 != 0.  */
 378 static uintmax_t
 379 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 380 {
 381   int cntd, cnta;
 382
 383   assert (d1 != 0);
 384
 385   if (a1 == 0)
 386     {
 387       *r1 = 0;
 388       return a0;
 389     }
 390
 391   count_leading_zeros (cntd, d1);
 392   count_leading_zeros (cnta, a1);
 393   int cnt = cntd - cnta;
 394   lsh2 (d1, d0, d1, d0, cnt);
 395   for (int i = 0; i < cnt; i++)
 396     {
 397       if (ge2 (a1, a0, d1, d0))
 398         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 399       rsh2 (d1, d0, d1, d0, 1);
 400     }
 401
 402   *r1 = a1;
 403   return a0;
 404 }
 405
 406 static uintmax_t _GL_ATTRIBUTE_CONST
 407 gcd_odd (uintmax_t a, uintmax_t b)
 408 {
 409   if ( (b & 1) == 0)
 410     {
 411       uintmax_t t = b;
 412       b = a;
 413       a = t;
 414     }
 415   if (a == 0)
 416     return b;
 417
 418   /* Take out least significant one bit, to make room for sign */
 419   b >>= 1;
 420
 421   for (;;)
 422     {
 423       uintmax_t t;
 424       uintmax_t bgta;
 425
 426       while ((a & 1) == 0)
 427         a >>= 1;
 428       a >>= 1;
 429
 430       t = a - b;
 431       if (t == 0)
 432         return (a << 1) + 1;
 433
 434       bgta = HIGHBIT_TO_MASK (t);
 435
 436       /* b <-- min (a, b) */
 437       b += (bgta & t);
 438
 439       /* a <-- |a - b| */
 440       a = (t ^ bgta) - bgta;
 441     }
 442 }
 443
 444 static uintmax_t
 445 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 446 {
 447   while ((a0 & 1) == 0)
 448     rsh2 (a1, a0, a1, a0, 1);
 449   while ((b0 & 1) == 0)
 450     rsh2 (b1, b0, b1, b0, 1);
 451
 452   for (;;)
 453     {
 454       if ((b1 | a1) == 0)
 455         {
 456           *r1 = 0;
 457           return gcd_odd (b0, a0);
 458         }
 459
 460       if (gt2 (a1, a0, b1, b0))
 461         {
 462           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 463           do
 464             rsh2 (a1, a0, a1, a0, 1);
 465           while ((a0 & 1) == 0);
 466         }
 467       else if (gt2 (b1, b0, a1, a0))
 468         {
 469           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 470           do
 471             rsh2 (b1, b0, b1, b0, 1);
 472           while ((b0 & 1) == 0);
 473         }
 474       else
 475         break;
 476     }
 477
 478   *r1 = a1;
 479   return a0;
 480 }
 481
 482 static void
 483 factor_insert_multiplicity (struct factors *factors,
 484                             uintmax_t prime, unsigned int m)
 485 {
 486   unsigned int nfactors = factors->nfactors;
 487   uintmax_t *p = factors->p;
 488   unsigned char *e = factors->e;
 489
 490   /* Locate position for insert new or increment e.  */
 491   int i;
 492   for (i = nfactors - 1; i >= 0; i--)
 493     {
 494       if (p[i] <= prime)
 495         break;
 496     }
 497
 498   if (i < 0 || p[i] != prime)
 499     {
 500       for (int j = nfactors - 1; j > i; j--)
 501         {
 502           p[j + 1] = p[j];
 503           e[j + 1] = e[j];
 504         }
 505       p[i + 1] = prime;
 506       e[i + 1] = m;
 507       factors->nfactors = nfactors + 1;
 508     }
 509   else
 510     {
 511       e[i] += m;
 512     }
 513 }
 514
 515 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 516
 517 static void
 518 factor_insert_large (struct factors *factors,
 519                      uintmax_t p1, uintmax_t p0)
 520 {
 521   if (p1 > 0)
 522     {
 523       assert (factors->plarge[1] == 0);
 524       factors->plarge[0] = p0;
 525       factors->plarge[1] = p1;
 526     }
 527   else
 528     factor_insert (factors, p0);
 529 }
 530
 531 #if HAVE_GMP
 532
 533 # if !HAVE_DECL_MPZ_INITS
 534
 535 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 536 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 537
 538 static void
 539 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 540 {
 541   va_list ap;
 542
 543   va_start (ap, mpz_single_init);
 544
 545   mpz_t *mpz;
 546   while ((mpz = va_arg (ap, mpz_t *)))
 547     mpz_single_init (*mpz);
 548
 549   va_end (ap);
 550 }
 551 # endif
 552
 553 static void mp_factor (mpz_t, struct mp_factors *);
 554
 555 static void
 556 mp_factor_init (struct mp_factors *factors)
 557 {
 558   factors->p = xmalloc (1);
 559   factors->e = xmalloc (1);
 560   factors->nfactors = 0;
 561 }
 562
 563 static void
 564 mp_factor_clear (struct mp_factors *factors)
 565 {
 566   for (unsigned int i = 0; i < factors->nfactors; i++)
 567     mpz_clear (factors->p[i]);
 568
 569   free (factors->p);
 570   free (factors->e);
 571 }
 572
 573 static void
 574 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 575 {
 576   unsigned long int nfactors = factors->nfactors;
 577   mpz_t         *p  = factors->p;
 578   unsigned long int *e  = factors->e;
 579   long i;
 580
 581   /* Locate position for insert new or increment e.  */
 582   for (i = nfactors - 1; i >= 0; i--)
 583     {
 584       if (mpz_cmp (p[i], prime) <= 0)
 585         break;
 586     }
 587
 588   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 589     {
 590       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 591       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 592
 593       mpz_init (p[nfactors]);
 594       for (long j = nfactors - 1; j > i; j--)
 595         {
 596           mpz_set (p[j + 1], p[j]);
 597           e[j + 1] = e[j];
 598         }
 599       mpz_set (p[i + 1], prime);
 600       e[i + 1] = 1;
 601
 602       factors->p = p;
 603       factors->e = e;
 604       factors->nfactors = nfactors + 1;
 605     }
 606   else
 607     {
 608       e[i] += 1;
 609     }
 610 }
 611
 612 static void
 613 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 614 {
 615   mpz_t pz;
 616
 617   mpz_init_set_ui (pz, prime);
 618   mp_factor_insert (factors, pz);
 619   mpz_clear (pz);
 620 }
 621 #endif /* HAVE_GMP */
 622
 623
 624 #define P(a,b,c,d) a,
 625 static const unsigned char primes_diff[] = {
 626 #include "primes.h"
 627 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 628 };
 629 #undef P
 630
 631 #define PRIMES_PTAB_ENTRIES \
 632   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 633
 634 #define P(a,b,c,d) b,
 635 static const unsigned char primes_diff8[] = {
 636 #include "primes.h"
 637 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 638 };
 639 #undef P
 640
 641 struct primes_dtab
 642 {
 643   uintmax_t binv, lim;
 644 };
 645
 646 #define P(a,b,c,d) {c,d},
 647 static const struct primes_dtab primes_dtab[] = {
 648 #include "primes.h"
 649 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 650 };
 651 #undef P
 652
 653 /* This flag is honored only in the GMP code. */
 654 static int verbose = 0;
 655
 656 /* Prove primality or run probabilistic tests.  */
 657 static bool flag_prove_primality = true;
 658
 659 /* Number of Miller-Rabin tests to run when not proving primality. */
 660 #define MR_REPS 25
 661
 662 #ifdef __GNUC__
 663 # define LIKELY(cond)    __builtin_expect ((cond), 1)
 664 # define UNLIKELY(cond)  __builtin_expect ((cond), 0)
 665 #else
 666 # define LIKELY(cond)    (cond)
 667 # define UNLIKELY(cond)  (cond)
 668 #endif
 669
 670 static void
 671 debug (char const *fmt, ...)
 672 {
 673   if (verbose)
 674     {
 675       va_list ap;
 676       va_start (ap, fmt);
 677       vfprintf (stderr, fmt, ap);
 678       va_end (ap);
 679     }
 680 }
 681
 682 static void
 683 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 684                       unsigned int off)
 685 {
 686   for (unsigned int j = 0; j < off; j++)
 687     p += primes_diff[i + j];
 688   factor_insert (factors, p);
 689 }
 690
 691 /* Trial division with odd primes uses the following trick.
 692
 693    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 694    consider the case t < B (this is the second loop below).
 695
 696    From our tables we get
 697
 698      binv = p^{-1} (mod B)
 699      lim = floor ( (B-1) / p ).
 700
 701    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 702    (and all quotients in this range occur for some t).
 703
 704    Then t = q * p is true also (mod B), and p is invertible we get
 705
 706      q = t * binv (mod B).
 707
 708    Next, assume that t is *not* divisible by p. Since multiplication
 709    by binv (mod B) is a one-to-one mapping,
 710
 711      t * binv (mod B) > lim,
 712
 713    because all the smaller values are already taken.
 714
 715    This can be summed up by saying that the function
 716
 717      q(t) = binv * t (mod B)
 718
 719    is a permutation of the range 0 <= t < B, with the curious property
 720    that it maps the multiples of p onto the range 0 <= q <= lim, in
 721    order, and the non-multiples of p onto the range lim < q < B.
 722  */
 723
 724 static uintmax_t
 725 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 726                        struct factors *factors)
 727 {
 728   if (t0 % 2 == 0)
 729     {
 730       unsigned int cnt;
 731
 732       if (t0 == 0)
 733         {
 734           count_trailing_zeros (cnt, t1);
 735           t0 = t1 >> cnt;
 736           t1 = 0;
 737           cnt += W_TYPE_SIZE;
 738         }
 739       else
 740         {
 741           count_trailing_zeros (cnt, t0);
 742           rsh2 (t1, t0, t1, t0, cnt);
 743         }
 744
 745       factor_insert_multiplicity (factors, 2, cnt);
 746     }
 747
 748   uintmax_t p = 3;
 749   unsigned int i;
 750   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 751     {
 752       for (;;)
 753         {
 754           uintmax_t q1, q0, hi, lo;
 755
 756           q0 = t0 * primes_dtab[i].binv;
 757           umul_ppmm (hi, lo, q0, p);
 758           if (hi > t1)
 759             break;
 760           hi = t1 - hi;
 761           q1 = hi * primes_dtab[i].binv;
 762           if (LIKELY (q1 > primes_dtab[i].lim))
 763             break;
 764           t1 = q1; t0 = q0;
 765           factor_insert (factors, p);
 766         }
 767       p += primes_diff[i + 1];
 768     }
 769   if (t1p)
 770     *t1p = t1;
 771
 772 #define DIVBLOCK(I)                                                     \
 773   do {                                                                  \
 774     for (;;)                                                            \
 775       {                                                                 \
 776         q = t0 * pd[I].binv;                                            \
 777         if (LIKELY (q > pd[I].lim))                                     \
 778           break;                                                        \
 779         t0 = q;                                                         \
 780         factor_insert_refind (factors, p, i + 1, I);                    \
 781       }                                                                 \
 782   } while (0)
 783
 784   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 785     {
 786       uintmax_t q;
 787       const struct primes_dtab *pd = &primes_dtab[i];
 788       DIVBLOCK (0);
 789       DIVBLOCK (1);
 790       DIVBLOCK (2);
 791       DIVBLOCK (3);
 792       DIVBLOCK (4);
 793       DIVBLOCK (5);
 794       DIVBLOCK (6);
 795       DIVBLOCK (7);
 796
 797       p += primes_diff8[i];
 798       if (p * p > t0)
 799         break;
 800     }
 801
 802   return t0;
 803 }
 804
 805 #if HAVE_GMP
 806 static void
 807 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 808 {
 809   mpz_t q;
 810   unsigned long int p;
 811
 812   debug ("[trial division] ");
 813
 814   mpz_init (q);
 815
 816   p = mpz_scan1 (t, 0);
 817   mpz_div_2exp (t, t, p);
 818   while (p)
 819     {
 820       mp_factor_insert_ui (factors, 2);
 821       --p;
 822     }
 823
 824   p = 3;
 825   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 826     {
 827       if (! mpz_divisible_ui_p (t, p))
 828         {
 829           p += primes_diff[i++];
 830           if (mpz_cmp_ui (t, p * p) < 0)
 831             break;
 832         }
 833       else
 834         {
 835           mpz_tdiv_q_ui (t, t, p);
 836           mp_factor_insert_ui (factors, p);
 837         }
 838     }
 839
 840   mpz_clear (q);
 841 }
 842 #endif
 843
 844 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 845 static const unsigned char  binvert_table[128] =
 846 {
 847   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 848   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 849   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 850   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 851   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 852   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 853   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 854   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 855   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 856   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 857   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 858   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 859   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 860   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 861   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 862   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 863 };
 864
 865 /* Compute n^(-1) mod B, using a Newton iteration.  */
 866 #define binv(inv,n)                                                     \
 867   do {                                                                  \
 868     uintmax_t  __n = (n);                                               \
 869     uintmax_t  __inv;                                                   \
 870                                                                         \
 871     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 872     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 873     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 874     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 875                                                                         \
 876     if (W_TYPE_SIZE > 64)                                               \
 877       {                                                                 \
 878         int  __invbits = 64;                                            \
 879         do {                                                            \
 880           __inv = 2 * __inv - __inv * __inv * __n;                      \
 881           __invbits *= 2;                                               \
 882         } while (__invbits < W_TYPE_SIZE);                              \
 883       }                                                                 \
 884                                                                         \
 885     (inv) = __inv;                                                      \
 886   } while (0)
 887
 888 /* q = u / d, assuming d|u.  */
 889 #define divexact_21(q1, q0, u1, u0, d)                                  \
 890   do {                                                                  \
 891     uintmax_t _di, _q0;                                                 \
 892     binv (_di, (d));                                                    \
 893     _q0 = (u0) * _di;                                                   \
 894     if ((u1) >= (d))                                                    \
 895       {                                                                 \
 896         uintmax_t _p1, _p0;                                             \
 897         umul_ppmm (_p1, _p0, _q0, d);                                   \
 898         (q1) = ((u1) - _p1) * _di;                                      \
 899         (q0) = _q0;                                                     \
 900       }                                                                 \
 901     else                                                                \
 902       {                                                                 \
 903         (q0) = _q0;                                                     \
 904         (q1) = 0;                                                       \
 905       }                                                                 \
 906   } while (0)
 907
 908 /* x B (mod n). */
 909 #define redcify(r_prim, r, n)                                           \
 910   do {                                                                  \
 911     uintmax_t _redcify_q ATTRIBUTE_UNUSED;                              \
 912     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 913   } while (0)
 914
 915 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 916 #define redcify2(r1, r0, x, n1, n0)                                     \
 917   do {                                                                  \
 918     uintmax_t _r1, _r0, _i;                                             \
 919     if ((x) < (n1))                                                     \
 920       {                                                                 \
 921         _r1 = (x); _r0 = 0;                                             \
 922         _i = W_TYPE_SIZE;                                               \
 923       }                                                                 \
 924     else                                                                \
 925       {                                                                 \
 926         _r1 = 0; _r0 = (x);                                             \
 927         _i = 2*W_TYPE_SIZE;                                             \
 928       }                                                                 \
 929     while (_i-- > 0)                                                    \
 930       {                                                                 \
 931         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 932         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 933           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 934       }                                                                 \
 935     (r1) = _r1;                                                         \
 936     (r0) = _r0;                                                         \
 937   } while (0)
 938
 939 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 940    Both a and b must be in redc form, the result will be in redc form too. */
 941 static inline uintmax_t
 942 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 943 {
 944   uintmax_t rh, rl, q, th, tl, xh;
 945
 946   umul_ppmm (rh, rl, a, b);
 947   q = rl * mi;
 948   umul_ppmm (th, tl, q, m);
 949   xh = rh - th;
 950   if (rh < th)
 951     xh += m;
 952
 953   return xh;
 954 }
 955
 956 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 957    Both a and b must be in redc form, the result will be in redc form too.
 958    For performance reasons, the most significant bit of m must be clear. */
 959 static uintmax_t
 960 mulredc2 (uintmax_t *r1p,
 961           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 962           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 963 {
 964   uintmax_t r1, r0, q, p1, p0, t1, t0, s1, s0;
 965   mi = -mi;
 966   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
 967   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
 968   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
 969
 970   /* First compute a0 * <b1, b0> B^{-1}
 971         +-----+
 972         |a0 b0|
 973      +--+--+--+
 974      |a0 b1|
 975      +--+--+--+
 976         |q0 m0|
 977      +--+--+--+
 978      |q0 m1|
 979     -+--+--+--+
 980      |r1|r0| 0|
 981      +--+--+--+
 982   */
 983   umul_ppmm (t1, t0, a0, b0);
 984   umul_ppmm (r1, r0, a0, b1);
 985   q = mi * t0;
 986   umul_ppmm (p1, p0, q, m0);
 987   umul_ppmm (s1, s0, q, m1);
 988   r0 += (t0 != 0); /* Carry */
 989   add_ssaaaa (r1, r0, r1, r0, 0, p1);
 990   add_ssaaaa (r1, r0, r1, r0, 0, t1);
 991   add_ssaaaa (r1, r0, r1, r0, s1, s0);
 992
 993   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
 994         +-----+
 995         |a1 b0|
 996         +--+--+
 997         |r1|r0|
 998      +--+--+--+
 999      |a1 b1|
1000      +--+--+--+
1001         |q1 m0|
1002      +--+--+--+
1003      |q1 m1|
1004     -+--+--+--+
1005      |r1|r0| 0|
1006      +--+--+--+
1007   */
1008   umul_ppmm (t1, t0, a1, b0);
1009   umul_ppmm (s1, s0, a1, b1);
1010   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1011   q = mi * t0;
1012   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1013   umul_ppmm (p1, p0, q, m0);
1014   umul_ppmm (s1, s0, q, m1);
1015   r0 += (t0 != 0); /* Carry */
1016   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1017   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1018   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1019
1020   if (ge2 (r1, r0, m1, m0))
1021     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1022
1023   *r1p = r1;
1024   return r0;
1025 }
1026
1027 static uintmax_t _GL_ATTRIBUTE_CONST
1028 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1029 {
1030   uintmax_t y = one;
1031
1032   if (e & 1)
1033     y = b;
1034
1035   while (e != 0)
1036     {
1037       b = mulredc (b, b, n, ni);
1038       e >>= 1;
1039
1040       if (e & 1)
1041         y = mulredc (y, b, n, ni);
1042     }
1043
1044   return y;
1045 }
1046
1047 static uintmax_t
1048 powm2 (uintmax_t *r1m,
1049        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1050        uintmax_t ni, const uintmax_t *one)
1051 {
1052   uintmax_t r1, r0, b1, b0, n1, n0;
1053   unsigned int i;
1054   uintmax_t e;
1055
1056   b0 = bp[0];
1057   b1 = bp[1];
1058   n0 = np[0];
1059   n1 = np[1];
1060
1061   r0 = one[0];
1062   r1 = one[1];
1063
1064   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1065     {
1066       if (e & 1)
1067         {
1068           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1069           r1 = *r1m;
1070         }
1071       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1072       b1 = *r1m;
1073     }
1074   for (e = ep[1]; e > 0; e >>= 1)
1075     {
1076       if (e & 1)
1077         {
1078           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1079           r1 = *r1m;
1080         }
1081       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1082       b1 = *r1m;
1083     }
1084   *r1m = r1;
1085   return r0;
1086 }
1087
1088 static bool _GL_ATTRIBUTE_CONST
1089 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1090              unsigned int k, uintmax_t one)
1091 {
1092   uintmax_t y = powm (b, q, n, ni, one);
1093
1094   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1095
1096   if (y == one || y == nm1)
1097     return true;
1098
1099   for (unsigned int i = 1; i < k; i++)
1100     {
1101       y = mulredc (y, y, n, ni);
1102
1103       if (y == nm1)
1104         return true;
1105       if (y == one)
1106         return false;
1107     }
1108   return false;
1109 }
1110
1111 static bool
1112 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1113               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1114 {
1115   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1116
1117   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1118   y1 = r1m;
1119
1120   if (y0 == one[0] && y1 == one[1])
1121     return true;
1122
1123   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1124
1125   if (y0 == nm1_0 && y1 == nm1_1)
1126     return true;
1127
1128   for (unsigned int i = 1; i < k; i++)
1129     {
1130       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1131       y1 = r1m;
1132
1133       if (y0 == nm1_0 && y1 == nm1_1)
1134         return true;
1135       if (y0 == one[0] && y1 == one[1])
1136         return false;
1137     }
1138   return false;
1139 }
1140
1141 #if HAVE_GMP
1142 static bool
1143 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1144                 mpz_srcptr q, unsigned long int k)
1145 {
1146   mpz_powm (y, x, q, n);
1147
1148   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1149     return true;
1150
1151   for (unsigned long int i = 1; i < k; i++)
1152     {
1153       mpz_powm_ui (y, y, 2, n);
1154       if (mpz_cmp (y, nm1) == 0)
1155         return true;
1156       if (mpz_cmp_ui (y, 1) == 0)
1157         return false;
1158     }
1159   return false;
1160 }
1161 #endif
1162
1163 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1164    have been observed.  The average seem to be about 2.  */
1165 static bool
1166 prime_p (uintmax_t n)
1167 {
1168   int k;
1169   bool is_prime;
1170   uintmax_t a_prim, one, ni;
1171   struct factors factors;
1172
1173   if (n <= 1)
1174     return false;
1175
1176   /* We have already casted out small primes. */
1177   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1178     return true;
1179
1180   /* Precomputation for Miller-Rabin.  */
1181   uintmax_t q = n - 1;
1182   for (k = 0; (q & 1) == 0; k++)
1183     q >>= 1;
1184
1185   uintmax_t a = 2;
1186   binv (ni, n);                 /* ni <- 1/n mod B */
1187   redcify (one, 1, n);
1188   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1189
1190   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1191   if (!millerrabin (n, ni, a_prim, q, k, one))
1192     return false;
1193
1194   if (flag_prove_primality)
1195     {
1196       /* Factor n-1 for Lucas.  */
1197       factor (0, n - 1, &factors);
1198     }
1199
1200   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1201      number composite.  */
1202   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1203     {
1204       if (flag_prove_primality)
1205         {
1206           is_prime = true;
1207           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1208             {
1209               is_prime
1210                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1211             }
1212         }
1213       else
1214         {
1215           /* After enough Miller-Rabin runs, be content. */
1216           is_prime = (r == MR_REPS - 1);
1217         }
1218
1219       if (is_prime)
1220         return true;
1221
1222       a += primes_diff[r];      /* Establish new base.  */
1223
1224       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1225          on most processors, since it avoids udiv_qrnnd.  If we go down the
1226          udiv_qrnnd_preinv path, this code should be replaced.  */
1227       {
1228         uintmax_t s1, s0;
1229         umul_ppmm (s1, s0, one, a);
1230         if (LIKELY (s1 == 0))
1231           a_prim = s0 % n;
1232         else
1233           {
1234             uintmax_t dummy ATTRIBUTE_UNUSED;
1235             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1236           }
1237       }
1238
1239       if (!millerrabin (n, ni, a_prim, q, k, one))
1240         return false;
1241     }
1242
1243   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1244   abort ();
1245 }
1246
1247 static bool
1248 prime2_p (uintmax_t n1, uintmax_t n0)
1249 {
1250   uintmax_t q[2], nm1[2];
1251   uintmax_t a_prim[2];
1252   uintmax_t one[2];
1253   uintmax_t na[2];
1254   uintmax_t ni;
1255   unsigned int k;
1256   struct factors factors;
1257
1258   if (n1 == 0)
1259     return prime_p (n0);
1260
1261   nm1[1] = n1 - (n0 == 0);
1262   nm1[0] = n0 - 1;
1263   if (nm1[0] == 0)
1264     {
1265       count_trailing_zeros (k, nm1[1]);
1266
1267       q[0] = nm1[1] >> k;
1268       q[1] = 0;
1269       k += W_TYPE_SIZE;
1270     }
1271   else
1272     {
1273       count_trailing_zeros (k, nm1[0]);
1274       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1275     }
1276
1277   uintmax_t a = 2;
1278   binv (ni, n0);
1279   redcify2 (one[1], one[0], 1, n1, n0);
1280   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1281
1282   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1283   na[0] = n0;
1284   na[1] = n1;
1285
1286   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1287     return false;
1288
1289   if (flag_prove_primality)
1290     {
1291       /* Factor n-1 for Lucas.  */
1292       factor (nm1[1], nm1[0], &factors);
1293     }
1294
1295   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1296      number composite.  */
1297   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1298     {
1299       bool is_prime;
1300       uintmax_t e[2], y[2];
1301
1302       if (flag_prove_primality)
1303         {
1304           is_prime = true;
1305           if (factors.plarge[1])
1306             {
1307               uintmax_t pi;
1308               binv (pi, factors.plarge[0]);
1309               e[0] = pi * nm1[0];
1310               e[1] = 0;
1311               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1312               is_prime = (y[0] != one[0] || y[1] != one[1]);
1313             }
1314           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1315             {
1316               /* FIXME: We always have the factor 2. Do we really need to
1317                  handle it here? We have done the same powering as part
1318                  of millerrabin. */
1319               if (factors.p[i] == 2)
1320                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1321               else
1322                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1323               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1324               is_prime = (y[0] != one[0] || y[1] != one[1]);
1325             }
1326         }
1327       else
1328         {
1329           /* After enough Miller-Rabin runs, be content. */
1330           is_prime = (r == MR_REPS - 1);
1331         }
1332
1333       if (is_prime)
1334         return true;
1335
1336       a += primes_diff[r];      /* Establish new base.  */
1337       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1338
1339       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1340         return false;
1341     }
1342
1343   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1344   abort ();
1345 }
1346
1347 #if HAVE_GMP
1348 static bool
1349 mp_prime_p (mpz_t n)
1350 {
1351   bool is_prime;
1352   mpz_t q, a, nm1, tmp;
1353   struct mp_factors factors;
1354
1355   if (mpz_cmp_ui (n, 1) <= 0)
1356     return false;
1357
1358   /* We have already casted out small primes. */
1359   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1360     return true;
1361
1362   mpz_inits (q, a, nm1, tmp, NULL);
1363
1364   /* Precomputation for Miller-Rabin.  */
1365   mpz_sub_ui (nm1, n, 1);
1366
1367   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1368   unsigned long int k = mpz_scan1 (nm1, 0);
1369   mpz_tdiv_q_2exp (q, nm1, k);
1370
1371   mpz_set_ui (a, 2);
1372
1373   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1374   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1375     {
1376       is_prime = false;
1377       goto ret2;
1378     }
1379
1380   if (flag_prove_primality)
1381     {
1382       /* Factor n-1 for Lucas.  */
1383       mpz_set (tmp, nm1);
1384       mp_factor (tmp, &factors);
1385     }
1386
1387   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1388      number composite.  */
1389   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1390     {
1391       if (flag_prove_primality)
1392         {
1393           is_prime = true;
1394           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1395             {
1396               mpz_divexact (tmp, nm1, factors.p[i]);
1397               mpz_powm (tmp, a, tmp, n);
1398               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1399             }
1400         }
1401       else
1402         {
1403           /* After enough Miller-Rabin runs, be content. */
1404           is_prime = (r == MR_REPS - 1);
1405         }
1406
1407       if (is_prime)
1408         goto ret1;
1409
1410       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1411
1412       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1413         {
1414           is_prime = false;
1415           goto ret1;
1416         }
1417     }
1418
1419   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1420   abort ();
1421
1422  ret1:
1423   if (flag_prove_primality)
1424     mp_factor_clear (&factors);
1425  ret2:
1426   mpz_clears (q, a, nm1, tmp, NULL);
1427
1428   return is_prime;
1429 }
1430 #endif
1431
1432 static void
1433 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1434                           struct factors *factors)
1435 {
1436   uintmax_t x, z, y, P, t, ni, g;
1437
1438   unsigned long int k = 1;
1439   unsigned long int l = 1;
1440
1441   redcify (P, 1, n);
1442   addmod (x, P, P, n);          /* i.e., redcify(2) */
1443   y = z = x;
1444
1445   while (n != 1)
1446     {
1447       assert (a < n);
1448
1449       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1450
1451       for (;;)
1452         {
1453           do
1454             {
1455               x = mulredc (x, x, n, ni);
1456               addmod (x, x, a, n);
1457
1458               submod (t, z, x, n);
1459               P = mulredc (P, t, n, ni);
1460
1461               if (k % 32 == 1)
1462                 {
1463                   if (gcd_odd (P, n) != 1)
1464                     goto factor_found;
1465                   y = x;
1466                 }
1467             }
1468           while (--k != 0);
1469
1470           z = x;
1471           k = l;
1472           l = 2 * l;
1473           for (unsigned long int i = 0; i < k; i++)
1474             {
1475               x = mulredc (x, x, n, ni);
1476               addmod (x, x, a, n);
1477             }
1478           y = x;
1479         }
1480
1481     factor_found:
1482       do
1483         {
1484           y = mulredc (y, y, n, ni);
1485           addmod (y, y, a, n);
1486
1487           submod (t, z, y, n);
1488           g = gcd_odd (t, n);
1489         }
1490       while (g == 1);
1491
1492       n = n / g;
1493
1494       if (!prime_p (g))
1495         factor_using_pollard_rho (g, a + 1, factors);
1496       else
1497         factor_insert (factors, g);
1498
1499       if (prime_p (n))
1500         {
1501           factor_insert (factors, n);
1502           break;
1503         }
1504
1505       x = x % n;
1506       z = z % n;
1507       y = y % n;
1508     }
1509 }
1510
1511 static void
1512 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1513                            struct factors *factors)
1514 {
1515   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1516
1517   unsigned long int k = 1;
1518   unsigned long int l = 1;
1519
1520   redcify2 (P1, P0, 1, n1, n0);
1521   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1522   y1 = z1 = x1;
1523   y0 = z0 = x0;
1524
1525   while (n1 != 0 || n0 != 1)
1526     {
1527       binv (ni, n0);
1528
1529       for (;;)
1530         {
1531           do
1532             {
1533               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1534               x1 = r1m;
1535               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1536
1537               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1538               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1539               P1 = r1m;
1540
1541               if (k % 32 == 1)
1542                 {
1543                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1544                   if (g1 != 0 || g0 != 1)
1545                     goto factor_found;
1546                   y1 = x1; y0 = x0;
1547                 }
1548             }
1549           while (--k != 0);
1550
1551           z1 = x1; z0 = x0;
1552           k = l;
1553           l = 2 * l;
1554           for (unsigned long int i = 0; i < k; i++)
1555             {
1556               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1557               x1 = r1m;
1558               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1559             }
1560           y1 = x1; y0 = x0;
1561         }
1562
1563     factor_found:
1564       do
1565         {
1566           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1567           y1 = r1m;
1568           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1569
1570           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1571           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1572         }
1573       while (g1 == 0 && g0 == 1);
1574
1575       if (g1 == 0)
1576         {
1577           /* The found factor is one word. */
1578           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1579
1580           if (!prime_p (g0))
1581             factor_using_pollard_rho (g0, a + 1, factors);
1582           else
1583             factor_insert (factors, g0);
1584         }
1585       else
1586         {
1587           /* The found factor is two words.  This is highly unlikely, thus hard
1588              to trigger.  Please be careful before you change this code!  */
1589           uintmax_t ginv;
1590
1591           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1592           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1593           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1594
1595           if (!prime2_p (g1, g0))
1596             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1597           else
1598             factor_insert_large (factors, g1, g0);
1599         }
1600
1601       if (n1 == 0)
1602         {
1603           if (prime_p (n0))
1604             {
1605               factor_insert (factors, n0);
1606               break;
1607             }
1608
1609           factor_using_pollard_rho (n0, a, factors);
1610           return;
1611         }
1612
1613       if (prime2_p (n1, n0))
1614         {
1615           factor_insert_large (factors, n1, n0);
1616           break;
1617         }
1618
1619       x0 = mod2 (&x1, x1, x0, n1, n0);
1620       z0 = mod2 (&z1, z1, z0, n1, n0);
1621       y0 = mod2 (&y1, y1, y0, n1, n0);
1622     }
1623 }
1624
1625 #if HAVE_GMP
1626 static void
1627 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1628                              struct mp_factors *factors)
1629 {
1630   mpz_t x, z, y, P;
1631   mpz_t t, t2;
1632
1633   debug ("[pollard-rho (%lu)] ", a);
1634
1635   mpz_inits (t, t2, NULL);
1636   mpz_init_set_si (y, 2);
1637   mpz_init_set_si (x, 2);
1638   mpz_init_set_si (z, 2);
1639   mpz_init_set_ui (P, 1);
1640
1641   unsigned long long int k = 1;
1642   unsigned long long int l = 1;
1643
1644   while (mpz_cmp_ui (n, 1) != 0)
1645     {
1646       for (;;)
1647         {
1648           do
1649             {
1650               mpz_mul (t, x, x);
1651               mpz_mod (x, t, n);
1652               mpz_add_ui (x, x, a);
1653
1654               mpz_sub (t, z, x);
1655               mpz_mul (t2, P, t);
1656               mpz_mod (P, t2, n);
1657
1658               if (k % 32 == 1)
1659                 {
1660                   mpz_gcd (t, P, n);
1661                   if (mpz_cmp_ui (t, 1) != 0)
1662                     goto factor_found;
1663                   mpz_set (y, x);
1664                 }
1665             }
1666           while (--k != 0);
1667
1668           mpz_set (z, x);
1669           k = l;
1670           l = 2 * l;
1671           for (unsigned long long int i = 0; i < k; i++)
1672             {
1673               mpz_mul (t, x, x);
1674               mpz_mod (x, t, n);
1675               mpz_add_ui (x, x, a);
1676             }
1677           mpz_set (y, x);
1678         }
1679
1680     factor_found:
1681       do
1682         {
1683           mpz_mul (t, y, y);
1684           mpz_mod (y, t, n);
1685           mpz_add_ui (y, y, a);
1686
1687           mpz_sub (t, z, y);
1688           mpz_gcd (t, t, n);
1689         }
1690       while (mpz_cmp_ui (t, 1) == 0);
1691
1692       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1693
1694       if (!mp_prime_p (t))
1695         {
1696           debug ("[composite factor--restarting pollard-rho] ");
1697           mp_factor_using_pollard_rho (t, a + 1, factors);
1698         }
1699       else
1700         {
1701           mp_factor_insert (factors, t);
1702         }
1703
1704       if (mp_prime_p (n))
1705         {
1706           mp_factor_insert (factors, n);
1707           break;
1708         }
1709
1710       mpz_mod (x, x, n);
1711       mpz_mod (z, z, n);
1712       mpz_mod (y, y, n);
1713     }
1714
1715   mpz_clears (P, t2, t, z, x, y, NULL);
1716 }
1717 #endif
1718
1719 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1720    algorithm is replaced, consider also returning the remainder. */
1721 static uintmax_t _GL_ATTRIBUTE_CONST
1722 isqrt (uintmax_t n)
1723 {
1724   uintmax_t x;
1725   unsigned c;
1726   if (n == 0)
1727     return 0;
1728
1729   count_leading_zeros (c, n);
1730
1731   /* Make x > sqrt(n). This will be invariant through the loop. */
1732   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1733
1734   for (;;)
1735     {
1736       uintmax_t y = (x + n/x) / 2;
1737       if (y >= x)
1738         return x;
1739
1740       x = y;
1741     }
1742 }
1743
1744 static uintmax_t _GL_ATTRIBUTE_CONST
1745 isqrt2 (uintmax_t nh, uintmax_t nl)
1746 {
1747   unsigned int shift;
1748   uintmax_t x;
1749
1750   /* Ensures the remainder fits in an uintmax_t. */
1751   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1752
1753   if (nh == 0)
1754     return isqrt (nl);
1755
1756   count_leading_zeros (shift, nh);
1757   shift &= ~1;
1758
1759   /* Make x > sqrt(n) */
1760   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1761   x <<= (W_TYPE_SIZE - shift) / 2;
1762
1763   /* Do we need more than one iteration? */
1764   for (;;)
1765     {
1766       uintmax_t r ATTRIBUTE_UNUSED;
1767       uintmax_t q, y;
1768       udiv_qrnnd (q, r, nh, nl, x);
1769       y = (x + q) / 2;
1770
1771       if (y >= x)
1772         {
1773           uintmax_t hi, lo;
1774           umul_ppmm (hi, lo, x + 1, x + 1);
1775           assert (gt2 (hi, lo, nh, nl));
1776
1777           umul_ppmm (hi, lo, x, x);
1778           assert (ge2 (nh, nl, hi, lo));
1779           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1780           assert (hi == 0);
1781
1782           return x;
1783         }
1784
1785       x = y;
1786     }
1787 }
1788
1789 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1790 #define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
1791 #define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
1792 #define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
1793 #define MAGIC11 0x23b
1794
1795 /* Returns the square root if the input is a square, otherwise 0. */
1796 static uintmax_t _GL_ATTRIBUTE_CONST
1797 is_square (uintmax_t x)
1798 {
1799   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1800      computing the square root. */
1801   if (((MAGIC64 >> (x & 63)) & 1)
1802       && ((MAGIC63 >> (x % 63)) & 1)
1803       /* Both 0 and 64 are squares mod (65) */
1804       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1805       && ((MAGIC11 >> (x % 11) & 1)))
1806     {
1807       uintmax_t r = isqrt (x);
1808       if (r*r == x)
1809         return r;
1810     }
1811   return 0;
1812 }
1813
1814 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1815 static const unsigned short invtab[0x81] =
1816   {
1817     0x200,
1818     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1819     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1820     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1821     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1822     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1823     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1824     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1825     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1826     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1827     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1828     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1829     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1830     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1831     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1832     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1833     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1834   };
1835
1836 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1837    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1838 #define div_smallq(q, r, u, d)                                          \
1839   do {                                                                  \
1840     if ((u) / 0x40 < (d))                                               \
1841       {                                                                 \
1842         int _cnt;                                                       \
1843         uintmax_t _dinv, _mask, _q, _r;                                 \
1844         count_leading_zeros (_cnt, (d));                                \
1845         _r = (u);                                                       \
1846         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1847           {                                                             \
1848             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1849             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1850           }                                                             \
1851         else                                                            \
1852           {                                                             \
1853             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1854             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1855           }                                                             \
1856         _r -= _q*(d);                                                   \
1857                                                                         \
1858         _mask = -(uintmax_t) (_r >= (d));                               \
1859         (r) = _r - (_mask & (d));                                       \
1860         (q) = _q - _mask;                                               \
1861         assert ( (q) * (d) + (r) == u);                                 \
1862       }                                                                 \
1863     else                                                                \
1864       {                                                                 \
1865         uintmax_t _q = (u) / (d);                                       \
1866         (r) = (u) - _q * (d);                                           \
1867         (q) = _q;                                                       \
1868       }                                                                 \
1869   } while (0)
1870
1871 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1872    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1873    with 3025 = 55^2.
1874
1875    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1876    representing G_0 = (-55, 2*4652, 8653).
1877
1878    In the notation of the paper:
1879
1880    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1881
1882    Put
1883
1884      t_0 = floor([q_0 + R_0] / S0) = 1
1885      R_1 = t_0 * S_0 - R_0 = 4001
1886      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1887 */
1888
1889 /* Multipliers, in order of efficiency:
1890    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1891    0.7317  3*5*7    =  105 = 1
1892    0.7820  3*5*11   =  165 = 1
1893    0.7872  3*5      =   15 = 3
1894    0.8101  3*7*11   =  231 = 3
1895    0.8155  3*7      =   21 = 1
1896    0.8284  5*7*11   =  385 = 1
1897    0.8339  5*7      =   35 = 3
1898    0.8716  3*11     =   33 = 1
1899    0.8774  3        =    3 = 3
1900    0.8913  5*11     =   55 = 3
1901    0.8972  5        =    5 = 1
1902    0.9233  7*11     =   77 = 1
1903    0.9295  7        =    7 = 3
1904    0.9934  11       =   11 = 3
1905 */
1906 #define QUEUE_SIZE 50
1907
1908 #if STAT_SQUFOF
1909 # define Q_FREQ_SIZE 50
1910 /* Element 0 keeps the total */
1911 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1912 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1913 #endif
1914
1915 /* Return true on success.  Expected to fail only for numbers
1916    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1917 static bool
1918 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1919 {
1920   /* Uses algorithm and notation from
1921
1922      SQUARE FORM FACTORIZATION
1923      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1924
1925      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1926    */
1927
1928   static const unsigned int multipliers_1[] =
1929     { /* = 1 (mod 4) */
1930       105, 165, 21, 385, 33, 5, 77, 1, 0
1931     };
1932   static const unsigned int multipliers_3[] =
1933     { /* = 3 (mod 4) */
1934       1155, 15, 231, 35, 3, 55, 7, 11, 0
1935     };
1936
1937   const unsigned int *m;
1938
1939   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1940
1941   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1942     return false;
1943
1944   uintmax_t sqrt_n = isqrt2 (n1, n0);
1945
1946   if (n0 == sqrt_n * sqrt_n)
1947     {
1948       uintmax_t p1, p0;
1949
1950       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1951       assert (p0 == n0);
1952
1953       if (n1 == p1)
1954         {
1955           if (prime_p (sqrt_n))
1956             factor_insert_multiplicity (factors, sqrt_n, 2);
1957           else
1958             {
1959               struct factors f;
1960
1961               f.nfactors = 0;
1962               if (!factor_using_squfof (0, sqrt_n, &f))
1963                 {
1964                   /* Try pollard rho instead */
1965                   factor_using_pollard_rho (sqrt_n, 1, &f);
1966                 }
1967               /* Duplicate the new factors */
1968               for (unsigned int i = 0; i < f.nfactors; i++)
1969                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
1970             }
1971           return true;
1972         }
1973     }
1974
1975   /* Select multipliers so we always get n * mu = 3 (mod 4) */
1976   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
1977        *m; m++)
1978     {
1979       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
1980       unsigned int i;
1981       unsigned int mu = *m;
1982       unsigned int qpos = 0;
1983
1984       assert (mu * n0 % 4 == 3);
1985
1986       /* In the notation of the paper, with mu * n == 3 (mod 4), we
1987          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
1988          I understand it, the necessary bound is 4 \mu^3 < n, or 32
1989          mu^3 < n.
1990
1991          However, this seems insufficient: With n = 37243139 and mu =
1992          105, we get a trivial factor, from the square 38809 = 197^2,
1993          without any corresponding Q earlier in the iteration.
1994
1995          Requiring 64 mu^3 < n seems sufficient. */
1996       if (n1 == 0)
1997         {
1998           if ((uintmax_t) mu*mu*mu >= n0 / 64)
1999             continue;
2000         }
2001       else
2002         {
2003           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2004             continue;
2005         }
2006       umul_ppmm (Dh, Dl, n0, mu);
2007       Dh += n1 * mu;
2008
2009       assert (Dl % 4 != 1);
2010       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2011
2012       S = isqrt2 (Dh, Dl);
2013
2014       Q1 = 1;
2015       P = S;
2016
2017       /* Square root remainder fits in one word, so ignore high part. */
2018       Q = Dl - P*P;
2019       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2020       L = isqrt (2*S);
2021       B = 2*L;
2022       L1 = mu * 2 * L;
2023
2024       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2025          4 D. */
2026
2027       for (i = 0; i <= B; i++)
2028         {
2029           uintmax_t q, P1, t, rem;
2030
2031           div_smallq (q, rem, S+P, Q);
2032           P1 = S - rem; /* P1 = q*Q - P */
2033
2034 #if STAT_SQUFOF
2035           assert (q > 0);
2036           q_freq[0]++;
2037           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2038 #endif
2039
2040           if (Q <= L1)
2041             {
2042               uintmax_t g = Q;
2043
2044               if ( (Q & 1) == 0)
2045                 g /= 2;
2046
2047               g /= gcd_odd (g, mu);
2048
2049               if (g <= L)
2050                 {
2051                   if (qpos >= QUEUE_SIZE)
2052                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2053                   queue[qpos].Q = g;
2054                   queue[qpos].P = P % g;
2055                   qpos++;
2056                 }
2057             }
2058
2059           /* I think the difference can be either sign, but mod
2060              2^W_TYPE_SIZE arithmetic should be fine. */
2061           t = Q1 + q * (P - P1);
2062           Q1 = Q;
2063           Q = t;
2064           P = P1;
2065
2066           if ( (i & 1) == 0)
2067             {
2068               uintmax_t r = is_square (Q);
2069               if (r)
2070                 {
2071                   for (unsigned int j = 0; j < qpos; j++)
2072                     {
2073                       if (queue[j].Q == r)
2074                         {
2075                           if (r == 1)
2076                             /* Traversed entire cycle. */
2077                             goto next_multiplier;
2078
2079                           /* Need the absolute value for divisibility test. */
2080                           if (P >= queue[j].P)
2081                             t = P - queue[j].P;
2082                           else
2083                             t = queue[j].P - P;
2084                           if (t % r == 0)
2085                             {
2086                               /* Delete entries up to and including entry
2087                                  j, which matched. */
2088                               memmove (queue, queue + j + 1,
2089                                        (qpos - j - 1) * sizeof (queue[0]));
2090                               qpos -= (j + 1);
2091                             }
2092                           goto next_i;
2093                         }
2094                     }
2095
2096                   /* We have found a square form, which should give a
2097                      factor. */
2098                   Q1 = r;
2099                   assert (S >= P); /* What signs are possible? */
2100                   P += r * ((S - P) / r);
2101
2102                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2103                      for the case D = 2N. */
2104                   /* Compute Q = (D - P*P) / Q1, but we need double
2105                      precision. */
2106                   uintmax_t hi, lo;
2107                   umul_ppmm (hi, lo, P, P);
2108                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2109                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2110                   assert (rem == 0);
2111
2112                   for (;;)
2113                     {
2114                       /* Note: There appears to by a typo in the paper,
2115                          Step 4a in the algorithm description says q <--
2116                          floor([S+P]/\hat Q), but looking at the equations
2117                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2118                          (In this code, \hat Q is Q1). */
2119                       div_smallq (q, rem, S+P, Q);
2120                       P1 = S - rem;     /* P1 = q*Q - P */
2121
2122 #if STAT_SQUFOF
2123                       q_freq[0]++;
2124                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2125 #endif
2126                       if (P == P1)
2127                         break;
2128                       t = Q1 + q * (P - P1);
2129                       Q1 = Q;
2130                       Q = t;
2131                       P = P1;
2132                     }
2133
2134                   if ( (Q & 1) == 0)
2135                     Q /= 2;
2136                   Q /= gcd_odd (Q, mu);
2137
2138                   assert (Q > 1 && (n1 || Q < n0));
2139
2140                   if (prime_p (Q))
2141                     factor_insert (factors, Q);
2142                   else if (!factor_using_squfof (0, Q, factors))
2143                     factor_using_pollard_rho (Q, 2, factors);
2144
2145                   divexact_21 (n1, n0, n1, n0, Q);
2146
2147                   if (prime2_p (n1, n0))
2148                     factor_insert_large (factors, n1, n0);
2149                   else
2150                     {
2151                       if (!factor_using_squfof (n1, n0, factors))
2152                         {
2153                           if (n1 == 0)
2154                             factor_using_pollard_rho (n0, 1, factors);
2155                           else
2156                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2157                         }
2158                     }
2159
2160                   return true;
2161                 }
2162             }
2163         next_i:;
2164         }
2165     next_multiplier:;
2166     }
2167   return false;
2168 }
2169
2170 static void
2171 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2172 {
2173   factors->nfactors = 0;
2174   factors->plarge[1] = 0;
2175
2176   if (t1 == 0 && t0 < 2)
2177     return;
2178
2179   t0 = factor_using_division (&t1, t1, t0, factors);
2180
2181   if (t1 == 0 && t0 < 2)
2182     return;
2183
2184   if (prime2_p (t1, t0))
2185     factor_insert_large (factors, t1, t0);
2186   else
2187     {
2188       if (alg == ALG_SQUFOF)
2189         if (factor_using_squfof (t1, t0, factors))
2190           return;
2191
2192       if (t1 == 0)
2193         factor_using_pollard_rho (t0, 1, factors);
2194       else
2195         factor_using_pollard_rho2 (t1, t0, 1, factors);
2196     }
2197 }
2198
2199 #if HAVE_GMP
2200 static void
2201 mp_factor (mpz_t t, struct mp_factors *factors)
2202 {
2203   mp_factor_init (factors);
2204
2205   if (mpz_sgn (t) != 0)
2206     {
2207       mp_factor_using_division (t, factors);
2208
2209       if (mpz_cmp_ui (t, 1) != 0)
2210         {
2211           debug ("[is number prime?] ");
2212           if (mp_prime_p (t))
2213             mp_factor_insert (factors, t);
2214           else
2215             mp_factor_using_pollard_rho (t, 1, factors);
2216         }
2217     }
2218 }
2219 #endif
2220
2221 static strtol_error
2222 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2223 {
2224   unsigned int lo_carry;
2225   uintmax_t hi = 0, lo = 0;
2226
2227   strtol_error err = LONGINT_INVALID;
2228
2229   /* Skip initial spaces and '+'.  */
2230   for (;;)
2231     {
2232       char c = *s;
2233       if (c == ' ')
2234         s++;
2235       else if (c == '+')
2236         {
2237           s++;
2238           break;
2239         }
2240       else
2241         break;
2242     }
2243
2244   /* Initial scan for invalid digits.  */
2245   const char *p = s;
2246   for (;;)
2247     {
2248       unsigned int c = *p++;
2249       if (c == 0)
2250         break;
2251
2252       if (UNLIKELY (!ISDIGIT (c)))
2253         {
2254           err = LONGINT_INVALID;
2255           break;
2256         }
2257
2258       err = LONGINT_OK;           /* we've seen at least one valid digit */
2259     }
2260
2261   for (;err == LONGINT_OK;)
2262     {
2263       unsigned int c = *s++;
2264       if (c == 0)
2265         break;
2266
2267       c -= '0';
2268
2269       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2270         {
2271           err = LONGINT_OVERFLOW;
2272           break;
2273         }
2274       hi = 10 * hi;
2275
2276       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2277       lo_carry += 10 * lo < 2 * lo;
2278
2279       lo = 10 * lo;
2280       lo += c;
2281
2282       lo_carry += lo < c;
2283       hi += lo_carry;
2284       if (UNLIKELY (hi < lo_carry))
2285         {
2286           err = LONGINT_OVERFLOW;
2287           break;
2288         }
2289     }
2290
2291   *hip = hi;
2292   *lop = lo;
2293
2294   return err;
2295 }
2296
2297 static void
2298 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2299 {
2300   uintmax_t q, r;
2301
2302   if (t1 == 0)
2303     printf ("%ju", t0);
2304   else
2305     {
2306       /* Use very plain code here since it seems hard to write fast code
2307          without assuming a specific word size.  */
2308       q = t1 / 1000000000;
2309       r = t1 % 1000000000;
2310       udiv_qrnnd (t0, r, r, t0, 1000000000);
2311       print_uintmaxes (q, t0);
2312       printf ("%09u", (int) r);
2313     }
2314 }
2315
2316 /* Single-precision factoring */
2317 static void
2318 print_factors_single (uintmax_t t1, uintmax_t t0)
2319 {
2320   struct factors factors;
2321
2322   print_uintmaxes (t1, t0);
2323   putchar (':');
2324
2325   factor (t1, t0, &factors);
2326
2327   for (unsigned int j = 0; j < factors.nfactors; j++)
2328     for (unsigned int k = 0; k < factors.e[j]; k++)
2329       {
2330         char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2331         putchar (' ');
2332         fputs (umaxtostr (factors.p[j], buf), stdout);
2333       }
2334
2335   if (factors.plarge[1])
2336     {
2337       putchar (' ');
2338       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2339     }
2340   putchar ('\n');
2341 }
2342
2343 /* Emit the factors of the indicated number.  If we have the option of using
2344    either algorithm, we select on the basis of the length of the number.
2345    For longer numbers, we prefer the MP algorithm even if the native algorithm
2346    has enough digits, because the algorithm is better.  The turnover point
2347    depends on the value.  */
2348 static bool
2349 print_factors (const char *input)
2350 {
2351   uintmax_t t1, t0;
2352
2353   /* Try converting the number to one or two words.  If it fails, use GMP or
2354      print an error message.  The 2nd condition checks that the most
2355      significant bit of the two-word number is clear, in a typesize neutral
2356      way.  */
2357   strtol_error err = strto2uintmax (&t1, &t0, input);
2358
2359   switch (err)
2360     {
2361     case LONGINT_OK:
2362       if (((t1 << 1) >> 1) == t1)
2363         {
2364           debug ("[%s]", _("using single-precision arithmetic"));
2365           print_factors_single (t1, t0);
2366           return true;
2367         }
2368       break;
2369
2370     case LONGINT_OVERFLOW:
2371       /* Try GMP.  */
2372       break;
2373
2374     default:
2375       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2376       return false;
2377     }
2378
2379 #if HAVE_GMP
2380   debug ("[%s]", _("using arbitrary-precision arithmetic"));
2381   mpz_t t;
2382   struct mp_factors factors;
2383
2384   mpz_init_set_str (t, input, 10);
2385
2386   gmp_printf ("%Zd:", t);
2387   mp_factor (t, &factors);
2388
2389   for (unsigned int j = 0; j < factors.nfactors; j++)
2390     for (unsigned int k = 0; k < factors.e[j]; k++)
2391       gmp_printf (" %Zd", factors.p[j]);
2392
2393   mp_factor_clear (&factors);
2394   mpz_clear (t);
2395   putchar ('\n');
2396   return true;
2397 #else
2398   error (0, 0, _("%s is too large"), quote (input));
2399   return false;
2400 #endif
2401 }
2402
2403 void
2404 usage (int status)
2405 {
2406   if (status != EXIT_SUCCESS)
2407     emit_try_help ();
2408   else
2409     {
2410       printf (_("\
2411 Usage: %s [NUMBER]...\n\
2412   or:  %s OPTION\n\
2413 "),
2414               program_name, program_name);
2415       fputs (_("\
2416 Print the prime factors of each specified integer NUMBER.  If none\n\
2417 are specified on the command line, read them from standard input.\n\
2418 \n\
2419 "), stdout);
2420       fputs (HELP_OPTION_DESCRIPTION, stdout);
2421       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2422       emit_ancillary_info ();
2423     }
2424   exit (status);
2425 }
2426
2427 static bool
2428 do_stdin (void)
2429 {
2430   bool ok = true;
2431   token_buffer tokenbuffer;
2432
2433   init_tokenbuffer (&tokenbuffer);
2434
2435   while (true)
2436     {
2437       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2438                                        &tokenbuffer);
2439       if (token_length == (size_t) -1)
2440         break;
2441       ok &= print_factors (tokenbuffer.buffer);
2442     }
2443   free (tokenbuffer.buffer);
2444
2445   return ok;
2446 }
2447
2448 int
2449 main (int argc, char **argv)
2450 {
2451   initialize_main (&argc, &argv);
2452   set_program_name (argv[0]);
2453   setlocale (LC_ALL, "");
2454   bindtextdomain (PACKAGE, LOCALEDIR);
2455   textdomain (PACKAGE);
2456
2457   atexit (close_stdout);
2458
2459   alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
2460
2461   int c;
2462   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2463     {
2464       switch (c)
2465         {
2466         case VERBOSE_OPTION:
2467           verbose = 1;
2468           break;
2469
2470         case 's':
2471           alg = ALG_SQUFOF;
2472           break;
2473
2474         case 'w':
2475           flag_prove_primality = false;
2476           break;
2477
2478         case_GETOPT_HELP_CHAR;
2479
2480         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2481
2482         default:
2483           usage (EXIT_FAILURE);
2484         }
2485     }
2486
2487 #if STAT_SQUFOF
2488   if (alg == ALG_SQUFOF)
2489     memset (q_freq, 0, sizeof (q_freq));
2490 #endif
2491
2492   bool ok;
2493   if (argc <= optind)
2494     ok = do_stdin ();
2495   else
2496     {
2497       ok = true;
2498       for (int i = optind; i < argc; i++)
2499         if (! print_factors (argv[i]))
2500           ok = false;
2501     }
2502
2503 #if STAT_SQUFOF
2504   if (alg == ALG_SQUFOF && q_freq[0] > 0)
2505     {
2506       double acc_f;
2507       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2508       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2509         {
2510           double f = (double) q_freq[i] / q_freq[0];
2511           acc_f += f;
2512           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2513                   100.0 * f, 100.0 * acc_f);
2514         }
2515     }
2516 #endif
2517
2518   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
2519 }