src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or the SQUFOF algorithm, checking status of found factors
  54         again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 #include <config.h>
  88 #include <getopt.h>
  89 #include <stdio.h>
  90 #if HAVE_GMP
  91 # include <gmp.h>
  92 # if !HAVE_DECL_MPZ_INITS
  93 #  include <stdarg.h>
  94 # endif
  95 #endif
  96
  97 #include <assert.h>
  98
  99 #include "system.h"
 100 #include "error.h"
 101 #include "quote.h"
 102 #include "readtokens.h"
 103 #include "xstrtol.h"
 104
 105 /* The official name of this program (e.g., no 'g' prefix).  */
 106 #define PROGRAM_NAME "factor"
 107
 108 #define AUTHORS \
 109   proper_name ("Paul Rubin"),                                           \
 110   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 111   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 112
 113 /* Token delimiters when reading from a file.  */
 114 #define DELIM "\n\t "
 115
 116 #ifndef STAT_SQUFOF
 117 # define STAT_SQUFOF 0
 118 #endif
 119
 120 #ifndef USE_LONGLONG_H
 121 /* With the way we use longlong.h, it's only safe to use
 122    when UWtype = UHWtype, as there were various cases
 123    (as can be seen in the history for longlong.h) where
 124    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 125    to avoid compile time or run time issues.  */
 126 # if LONG_MAX == INTMAX_MAX
 127 #  define USE_LONGLONG_H 1
 128 # endif
 129 #endif
 130
 131 #if USE_LONGLONG_H
 132
 133 /* Make definitions for longlong.h to make it do what it can do for us */
 134
 135 /* bitcount for uintmax_t */
 136 # if UINTMAX_MAX == UINT32_MAX
 137 #  define W_TYPE_SIZE 32
 138 # elif UINTMAX_MAX == UINT64_MAX
 139 #  define W_TYPE_SIZE 64
 140 # elif UINTMAX_MAX == UINT128_MAX
 141 #  define W_TYPE_SIZE 128
 142 # endif
 143
 144 # define UWtype  uintmax_t
 145 # define UHWtype unsigned long int
 146 # undef UDWtype
 147 # if HAVE_ATTRIBUTE_MODE
 148 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 149 typedef          int SItype     __attribute__ ((mode (SI)));
 150 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 151 typedef          int DItype     __attribute__ ((mode (DI)));
 152 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 153 # else
 154 typedef unsigned char UQItype;
 155 typedef          long SItype;
 156 typedef unsigned long int USItype;
 157 #  if HAVE_LONG_LONG_INT
 158 typedef long long int DItype;
 159 typedef unsigned long long int UDItype;
 160 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 161 typedef long int DItype;
 162 typedef unsigned long int UDItype;
 163 #  endif
 164 # endif
 165 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 166 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 167 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 168 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 169 # ifndef __GMP_GNUC_PREREQ
 170 #  define __GMP_GNUC_PREREQ(a,b) 1
 171 # endif
 172
 173 /* These stub macros are only used in longlong.h in certain system compiler
 174    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 175 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 176 ASSERT (1)
 177 __GMP_DECLSPEC
 178 # endif
 179
 180 # if _ARCH_PPC
 181 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 182 # endif
 183 # include "longlong.h"
 184 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 185 const unsigned char factor_clz_tab[129] =
 186 {
 187   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 188   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 189   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 190   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 191   9
 192 };
 193 # endif
 194
 195 #else /* not USE_LONGLONG_H */
 196
 197 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 198 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 199 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 200 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 201
 202 #endif
 203
 204 #if !defined __clz_tab && !defined UHWtype
 205 /* Without this seemingly useless conditional, gcc -Wunused-macros
 206    warns that each of the two tested macros is unused on Fedora 18.
 207    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 208 #endif
 209
 210 enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
 211
 212 static enum alg_type alg;
 213
 214 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 215 #define MAX_NFACTS 26
 216
 217 enum
 218 {
 219   VERBOSE_OPTION = CHAR_MAX + 1
 220 };
 221
 222 static struct option const long_options[] =
 223 {
 224   {"verbose", no_argument, NULL, VERBOSE_OPTION},
 225   {GETOPT_HELP_OPTION_DECL},
 226   {GETOPT_VERSION_OPTION_DECL},
 227   {NULL, 0, NULL, 0}
 228 };
 229
 230 struct factors
 231 {
 232   uintmax_t     plarge[2]; /* Can have a single large factor */
 233   uintmax_t     p[MAX_NFACTS];
 234   unsigned char e[MAX_NFACTS];
 235   unsigned char nfactors;
 236 };
 237
 238 #if HAVE_GMP
 239 struct mp_factors
 240 {
 241   mpz_t             *p;
 242   unsigned long int *e;
 243   unsigned long int nfactors;
 244 };
 245 #endif
 246
 247 static void factor (uintmax_t, uintmax_t, struct factors *);
 248
 249 #ifndef umul_ppmm
 250 # define umul_ppmm(w1, w0, u, v)                                        \
 251   do {                                                                  \
 252     uintmax_t __x0, __x1, __x2, __x3;                                   \
 253     unsigned long int __ul, __vl, __uh, __vh;                           \
 254     uintmax_t __u = (u), __v = (v);                                     \
 255                                                                         \
 256     __ul = __ll_lowpart (__u);                                          \
 257     __uh = __ll_highpart (__u);                                         \
 258     __vl = __ll_lowpart (__v);                                          \
 259     __vh = __ll_highpart (__v);                                         \
 260                                                                         \
 261     __x0 = (uintmax_t) __ul * __vl;                                     \
 262     __x1 = (uintmax_t) __ul * __vh;                                     \
 263     __x2 = (uintmax_t) __uh * __vl;                                     \
 264     __x3 = (uintmax_t) __uh * __vh;                                     \
 265                                                                         \
 266     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 267     __x1 += __x2;               /* but this indeed can */               \
 268     if (__x1 < __x2)            /* did we get it? */                    \
 269       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 270                                                                         \
 271     (w1) = __x3 + __ll_highpart (__x1);                                 \
 272     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 273   } while (0)
 274 #endif
 275
 276 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 277 /* Define our own, not needing normalization. This function is
 278    currently not performance critical, so keep it simple. Similar to
 279    the mod macro below. */
 280 # undef udiv_qrnnd
 281 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 282   do {                                                                  \
 283     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 284                                                                         \
 285     assert ((n1) < (d));                                                \
 286     __d1 = (d); __d0 = 0;                                               \
 287     __r1 = (n1); __r0 = (n0);                                           \
 288     __q = 0;                                                            \
 289     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 290       {                                                                 \
 291         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 292         __q <<= 1;                                                      \
 293         if (ge2 (__r1, __r0, __d1, __d0))                               \
 294           {                                                             \
 295             __q++;                                                      \
 296             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 297           }                                                             \
 298       }                                                                 \
 299     (r) = __r0;                                                         \
 300     (q) = __q;                                                          \
 301   } while (0)
 302 #endif
 303
 304 #if !defined add_ssaaaa
 305 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 306   do {                                                                  \
 307     uintmax_t _add_x;                                                   \
 308     _add_x = (al) + (bl);                                               \
 309     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 310     (sl) = _add_x;                                                      \
 311   } while (0)
 312 #endif
 313
 314 #define rsh2(rh, rl, ah, al, cnt)                                       \
 315   do {                                                                  \
 316     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 317     (rh) = (ah) >> (cnt);                                               \
 318   } while (0)
 319
 320 #define lsh2(rh, rl, ah, al, cnt)                                       \
 321   do {                                                                  \
 322     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 323     (rl) = (al) << (cnt);                                               \
 324   } while (0)
 325
 326 #define ge2(ah, al, bh, bl)                                             \
 327   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 328
 329 #define gt2(ah, al, bh, bl)                                             \
 330   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 331
 332 #ifndef sub_ddmmss
 333 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 334   do {                                                                  \
 335     uintmax_t _cy;                                                      \
 336     _cy = (al) < (bl);                                                  \
 337     (rl) = (al) - (bl);                                                 \
 338     (rh) = (ah) - (bh) - _cy;                                           \
 339   } while (0)
 340 #endif
 341
 342 #ifndef count_leading_zeros
 343 # define count_leading_zeros(count, x) do {                             \
 344     uintmax_t __clz_x = (x);                                            \
 345     unsigned int __clz_c;                                               \
 346     for (__clz_c = 0;                                                   \
 347          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 348          __clz_c += 8)                                                  \
 349       __clz_x <<= 8;                                                    \
 350     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 351       __clz_x <<= 1;                                                    \
 352     (count) = __clz_c;                                                  \
 353   } while (0)
 354 #endif
 355
 356 #ifndef count_trailing_zeros
 357 # define count_trailing_zeros(count, x) do {                            \
 358     uintmax_t __ctz_x = (x);                                            \
 359     unsigned int __ctz_c = 0;                                           \
 360     while ((__ctz_x & 1) == 0)                                          \
 361       {                                                                 \
 362         __ctz_x >>= 1;                                                  \
 363         __ctz_c++;                                                      \
 364       }                                                                 \
 365     (count) = __ctz_c;                                                  \
 366   } while (0)
 367 #endif
 368
 369 /* Requires that a < n and b <= n */
 370 #define submod(r,a,b,n)                                                 \
 371   do {                                                                  \
 372     uintmax_t _t = - (uintmax_t) (a < b);                               \
 373     (r) = ((n) & _t) + (a) - (b);                                       \
 374   } while (0)
 375
 376 #define addmod(r,a,b,n)                                                 \
 377   submod ((r), (a), ((n) - (b)), (n))
 378
 379 /* Modular two-word addition and subtraction.  For performance reasons, the
 380    most significant bit of n1 must be clear.  The destination variables must be
 381    distinct from the mod operand.  */
 382 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 383   do {                                                                  \
 384     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 385     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 386       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 387   } while (0)
 388 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 389   do {                                                                  \
 390     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 391     if ((intmax_t) (r1) < 0)                                            \
 392       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 393   } while (0)
 394
 395 #define HIGHBIT_TO_MASK(x)                                              \
 396   (((intmax_t)-1 >> 1) < 0                                              \
 397    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 398    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 399       ? UINTMAX_MAX : (uintmax_t) 0))
 400
 401 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 402    Requires that d1 != 0.  */
 403 static uintmax_t
 404 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 405 {
 406   int cntd, cnta;
 407
 408   assert (d1 != 0);
 409
 410   if (a1 == 0)
 411     {
 412       *r1 = 0;
 413       return a0;
 414     }
 415
 416   count_leading_zeros (cntd, d1);
 417   count_leading_zeros (cnta, a1);
 418   int cnt = cntd - cnta;
 419   lsh2 (d1, d0, d1, d0, cnt);
 420   for (int i = 0; i < cnt; i++)
 421     {
 422       if (ge2 (a1, a0, d1, d0))
 423         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 424       rsh2 (d1, d0, d1, d0, 1);
 425     }
 426
 427   *r1 = a1;
 428   return a0;
 429 }
 430
 431 static uintmax_t _GL_ATTRIBUTE_CONST
 432 gcd_odd (uintmax_t a, uintmax_t b)
 433 {
 434   if ( (b & 1) == 0)
 435     {
 436       uintmax_t t = b;
 437       b = a;
 438       a = t;
 439     }
 440   if (a == 0)
 441     return b;
 442
 443   /* Take out least significant one bit, to make room for sign */
 444   b >>= 1;
 445
 446   for (;;)
 447     {
 448       uintmax_t t;
 449       uintmax_t bgta;
 450
 451       while ((a & 1) == 0)
 452         a >>= 1;
 453       a >>= 1;
 454
 455       t = a - b;
 456       if (t == 0)
 457         return (a << 1) + 1;
 458
 459       bgta = HIGHBIT_TO_MASK (t);
 460
 461       /* b <-- min (a, b) */
 462       b += (bgta & t);
 463
 464       /* a <-- |a - b| */
 465       a = (t ^ bgta) - bgta;
 466     }
 467 }
 468
 469 static uintmax_t
 470 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 471 {
 472   while ((a0 & 1) == 0)
 473     rsh2 (a1, a0, a1, a0, 1);
 474   while ((b0 & 1) == 0)
 475     rsh2 (b1, b0, b1, b0, 1);
 476
 477   for (;;)
 478     {
 479       if ((b1 | a1) == 0)
 480         {
 481           *r1 = 0;
 482           return gcd_odd (b0, a0);
 483         }
 484
 485       if (gt2 (a1, a0, b1, b0))
 486         {
 487           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 488           do
 489             rsh2 (a1, a0, a1, a0, 1);
 490           while ((a0 & 1) == 0);
 491         }
 492       else if (gt2 (b1, b0, a1, a0))
 493         {
 494           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 495           do
 496             rsh2 (b1, b0, b1, b0, 1);
 497           while ((b0 & 1) == 0);
 498         }
 499       else
 500         break;
 501     }
 502
 503   *r1 = a1;
 504   return a0;
 505 }
 506
 507 static void
 508 factor_insert_multiplicity (struct factors *factors,
 509                             uintmax_t prime, unsigned int m)
 510 {
 511   unsigned int nfactors = factors->nfactors;
 512   uintmax_t *p = factors->p;
 513   unsigned char *e = factors->e;
 514
 515   /* Locate position for insert new or increment e.  */
 516   int i;
 517   for (i = nfactors - 1; i >= 0; i--)
 518     {
 519       if (p[i] <= prime)
 520         break;
 521     }
 522
 523   if (i < 0 || p[i] != prime)
 524     {
 525       for (int j = nfactors - 1; j > i; j--)
 526         {
 527           p[j + 1] = p[j];
 528           e[j + 1] = e[j];
 529         }
 530       p[i + 1] = prime;
 531       e[i + 1] = m;
 532       factors->nfactors = nfactors + 1;
 533     }
 534   else
 535     {
 536       e[i] += m;
 537     }
 538 }
 539
 540 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 541
 542 static void
 543 factor_insert_large (struct factors *factors,
 544                      uintmax_t p1, uintmax_t p0)
 545 {
 546   if (p1 > 0)
 547     {
 548       assert (factors->plarge[1] == 0);
 549       factors->plarge[0] = p0;
 550       factors->plarge[1] = p1;
 551     }
 552   else
 553     factor_insert (factors, p0);
 554 }
 555
 556 #if HAVE_GMP
 557
 558 # if !HAVE_DECL_MPZ_INITS
 559
 560 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 561 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 562
 563 static void
 564 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 565 {
 566   va_list ap;
 567
 568   va_start (ap, mpz_single_init);
 569
 570   mpz_t *mpz;
 571   while ((mpz = va_arg (ap, mpz_t *)))
 572     mpz_single_init (*mpz);
 573
 574   va_end (ap);
 575 }
 576 # endif
 577
 578 static void mp_factor (mpz_t, struct mp_factors *);
 579
 580 static void
 581 mp_factor_init (struct mp_factors *factors)
 582 {
 583   factors->p = NULL;
 584   factors->e = NULL;
 585   factors->nfactors = 0;
 586 }
 587
 588 static void
 589 mp_factor_clear (struct mp_factors *factors)
 590 {
 591   for (unsigned int i = 0; i < factors->nfactors; i++)
 592     mpz_clear (factors->p[i]);
 593
 594   free (factors->p);
 595   free (factors->e);
 596 }
 597
 598 static void
 599 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 600 {
 601   unsigned long int nfactors = factors->nfactors;
 602   mpz_t         *p  = factors->p;
 603   unsigned long int *e  = factors->e;
 604   long i;
 605
 606   /* Locate position for insert new or increment e.  */
 607   for (i = nfactors - 1; i >= 0; i--)
 608     {
 609       if (mpz_cmp (p[i], prime) <= 0)
 610         break;
 611     }
 612
 613   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 614     {
 615       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 616       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 617
 618       mpz_init (p[nfactors]);
 619       for (long j = nfactors - 1; j > i; j--)
 620         {
 621           mpz_set (p[j + 1], p[j]);
 622           e[j + 1] = e[j];
 623         }
 624       mpz_set (p[i + 1], prime);
 625       e[i + 1] = 1;
 626
 627       factors->p = p;
 628       factors->e = e;
 629       factors->nfactors = nfactors + 1;
 630     }
 631   else
 632     {
 633       e[i] += 1;
 634     }
 635 }
 636
 637 static void
 638 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 639 {
 640   mpz_t pz;
 641
 642   mpz_init_set_ui (pz, prime);
 643   mp_factor_insert (factors, pz);
 644   mpz_clear (pz);
 645 }
 646 #endif /* HAVE_GMP */
 647
 648
 649 /* Number of bits in an uintmax_t.  */
 650 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 651
 652 /* Verify that uintmax_t does not have holes in its representation.  */
 653 verify (UINTMAX_MAX >> (W - 1) == 1);
 654
 655 #define P(a,b,c,d) a,
 656 static const unsigned char primes_diff[] = {
 657 #include "primes.h"
 658 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 659 };
 660 #undef P
 661
 662 #define PRIMES_PTAB_ENTRIES \
 663   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 664
 665 #define P(a,b,c,d) b,
 666 static const unsigned char primes_diff8[] = {
 667 #include "primes.h"
 668 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 669 };
 670 #undef P
 671
 672 struct primes_dtab
 673 {
 674   uintmax_t binv, lim;
 675 };
 676
 677 #define P(a,b,c,d) {c,d},
 678 static const struct primes_dtab primes_dtab[] = {
 679 #include "primes.h"
 680 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 681 };
 682 #undef P
 683
 684 /* Verify that uintmax_t is not wider than
 685    the integers used to generate primes.h.  */
 686 verify (W <= WIDE_UINT_BITS);
 687
 688 /* This flag is honored only in the GMP code. */
 689 static int verbose = 0;
 690
 691 /* Prove primality or run probabilistic tests.  */
 692 static bool flag_prove_primality = true;
 693
 694 /* Number of Miller-Rabin tests to run when not proving primality. */
 695 #define MR_REPS 25
 696
 697 #ifdef __GNUC__
 698 # define LIKELY(cond)    __builtin_expect ((cond), 1)
 699 # define UNLIKELY(cond)  __builtin_expect ((cond), 0)
 700 #else
 701 # define LIKELY(cond)    (cond)
 702 # define UNLIKELY(cond)  (cond)
 703 #endif
 704
 705 static void
 706 debug (char const *fmt, ...)
 707 {
 708   if (verbose)
 709     {
 710       va_list ap;
 711       va_start (ap, fmt);
 712       vfprintf (stderr, fmt, ap);
 713       va_end (ap);
 714     }
 715 }
 716
 717 static void
 718 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 719                       unsigned int off)
 720 {
 721   for (unsigned int j = 0; j < off; j++)
 722     p += primes_diff[i + j];
 723   factor_insert (factors, p);
 724 }
 725
 726 /* Trial division with odd primes uses the following trick.
 727
 728    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 729    consider the case t < B (this is the second loop below).
 730
 731    From our tables we get
 732
 733      binv = p^{-1} (mod B)
 734      lim = floor ( (B-1) / p ).
 735
 736    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 737    (and all quotients in this range occur for some t).
 738
 739    Then t = q * p is true also (mod B), and p is invertible we get
 740
 741      q = t * binv (mod B).
 742
 743    Next, assume that t is *not* divisible by p. Since multiplication
 744    by binv (mod B) is a one-to-one mapping,
 745
 746      t * binv (mod B) > lim,
 747
 748    because all the smaller values are already taken.
 749
 750    This can be summed up by saying that the function
 751
 752      q(t) = binv * t (mod B)
 753
 754    is a permutation of the range 0 <= t < B, with the curious property
 755    that it maps the multiples of p onto the range 0 <= q <= lim, in
 756    order, and the non-multiples of p onto the range lim < q < B.
 757  */
 758
 759 static uintmax_t
 760 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 761                        struct factors *factors)
 762 {
 763   if (t0 % 2 == 0)
 764     {
 765       unsigned int cnt;
 766
 767       if (t0 == 0)
 768         {
 769           count_trailing_zeros (cnt, t1);
 770           t0 = t1 >> cnt;
 771           t1 = 0;
 772           cnt += W_TYPE_SIZE;
 773         }
 774       else
 775         {
 776           count_trailing_zeros (cnt, t0);
 777           rsh2 (t1, t0, t1, t0, cnt);
 778         }
 779
 780       factor_insert_multiplicity (factors, 2, cnt);
 781     }
 782
 783   uintmax_t p = 3;
 784   unsigned int i;
 785   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 786     {
 787       for (;;)
 788         {
 789           uintmax_t q1, q0, hi, lo ATTRIBUTE_UNUSED;
 790
 791           q0 = t0 * primes_dtab[i].binv;
 792           umul_ppmm (hi, lo, q0, p);
 793           if (hi > t1)
 794             break;
 795           hi = t1 - hi;
 796           q1 = hi * primes_dtab[i].binv;
 797           if (LIKELY (q1 > primes_dtab[i].lim))
 798             break;
 799           t1 = q1; t0 = q0;
 800           factor_insert (factors, p);
 801         }
 802       p += primes_diff[i + 1];
 803     }
 804   if (t1p)
 805     *t1p = t1;
 806
 807 #define DIVBLOCK(I)                                                     \
 808   do {                                                                  \
 809     for (;;)                                                            \
 810       {                                                                 \
 811         q = t0 * pd[I].binv;                                            \
 812         if (LIKELY (q > pd[I].lim))                                     \
 813           break;                                                        \
 814         t0 = q;                                                         \
 815         factor_insert_refind (factors, p, i + 1, I);                    \
 816       }                                                                 \
 817   } while (0)
 818
 819   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 820     {
 821       uintmax_t q;
 822       const struct primes_dtab *pd = &primes_dtab[i];
 823       DIVBLOCK (0);
 824       DIVBLOCK (1);
 825       DIVBLOCK (2);
 826       DIVBLOCK (3);
 827       DIVBLOCK (4);
 828       DIVBLOCK (5);
 829       DIVBLOCK (6);
 830       DIVBLOCK (7);
 831
 832       p += primes_diff8[i];
 833       if (p * p > t0)
 834         break;
 835     }
 836
 837   return t0;
 838 }
 839
 840 #if HAVE_GMP
 841 static void
 842 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 843 {
 844   mpz_t q;
 845   unsigned long int p;
 846
 847   debug ("[trial division] ");
 848
 849   mpz_init (q);
 850
 851   p = mpz_scan1 (t, 0);
 852   mpz_div_2exp (t, t, p);
 853   while (p)
 854     {
 855       mp_factor_insert_ui (factors, 2);
 856       --p;
 857     }
 858
 859   p = 3;
 860   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 861     {
 862       if (! mpz_divisible_ui_p (t, p))
 863         {
 864           p += primes_diff[i++];
 865           if (mpz_cmp_ui (t, p * p) < 0)
 866             break;
 867         }
 868       else
 869         {
 870           mpz_tdiv_q_ui (t, t, p);
 871           mp_factor_insert_ui (factors, p);
 872         }
 873     }
 874
 875   mpz_clear (q);
 876 }
 877 #endif
 878
 879 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 880 static const unsigned char  binvert_table[128] =
 881 {
 882   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 883   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 884   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 885   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 886   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 887   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 888   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 889   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 890   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 891   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 892   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 893   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 894   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 895   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 896   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 897   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 898 };
 899
 900 /* Compute n^(-1) mod B, using a Newton iteration.  */
 901 #define binv(inv,n)                                                     \
 902   do {                                                                  \
 903     uintmax_t  __n = (n);                                               \
 904     uintmax_t  __inv;                                                   \
 905                                                                         \
 906     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 907     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 908     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 909     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 910                                                                         \
 911     if (W_TYPE_SIZE > 64)                                               \
 912       {                                                                 \
 913         int  __invbits = 64;                                            \
 914         do {                                                            \
 915           __inv = 2 * __inv - __inv * __inv * __n;                      \
 916           __invbits *= 2;                                               \
 917         } while (__invbits < W_TYPE_SIZE);                              \
 918       }                                                                 \
 919                                                                         \
 920     (inv) = __inv;                                                      \
 921   } while (0)
 922
 923 /* q = u / d, assuming d|u.  */
 924 #define divexact_21(q1, q0, u1, u0, d)                                  \
 925   do {                                                                  \
 926     uintmax_t _di, _q0;                                                 \
 927     binv (_di, (d));                                                    \
 928     _q0 = (u0) * _di;                                                   \
 929     if ((u1) >= (d))                                                    \
 930       {                                                                 \
 931         uintmax_t _p1, _p0 ATTRIBUTE_UNUSED;                            \
 932         umul_ppmm (_p1, _p0, _q0, d);                                   \
 933         (q1) = ((u1) - _p1) * _di;                                      \
 934         (q0) = _q0;                                                     \
 935       }                                                                 \
 936     else                                                                \
 937       {                                                                 \
 938         (q0) = _q0;                                                     \
 939         (q1) = 0;                                                       \
 940       }                                                                 \
 941   } while (0)
 942
 943 /* x B (mod n). */
 944 #define redcify(r_prim, r, n)                                           \
 945   do {                                                                  \
 946     uintmax_t _redcify_q ATTRIBUTE_UNUSED;                              \
 947     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 948   } while (0)
 949
 950 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 951 #define redcify2(r1, r0, x, n1, n0)                                     \
 952   do {                                                                  \
 953     uintmax_t _r1, _r0, _i;                                             \
 954     if ((x) < (n1))                                                     \
 955       {                                                                 \
 956         _r1 = (x); _r0 = 0;                                             \
 957         _i = W_TYPE_SIZE;                                               \
 958       }                                                                 \
 959     else                                                                \
 960       {                                                                 \
 961         _r1 = 0; _r0 = (x);                                             \
 962         _i = 2*W_TYPE_SIZE;                                             \
 963       }                                                                 \
 964     while (_i-- > 0)                                                    \
 965       {                                                                 \
 966         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 967         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 968           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 969       }                                                                 \
 970     (r1) = _r1;                                                         \
 971     (r0) = _r0;                                                         \
 972   } while (0)
 973
 974 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 975    Both a and b must be in redc form, the result will be in redc form too. */
 976 static inline uintmax_t
 977 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 978 {
 979   uintmax_t rh, rl, q, th, tl ATTRIBUTE_UNUSED, xh;
 980
 981   umul_ppmm (rh, rl, a, b);
 982   q = rl * mi;
 983   umul_ppmm (th, tl, q, m);
 984   xh = rh - th;
 985   if (rh < th)
 986     xh += m;
 987
 988   return xh;
 989 }
 990
 991 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 992    Both a and b must be in redc form, the result will be in redc form too.
 993    For performance reasons, the most significant bit of m must be clear. */
 994 static uintmax_t
 995 mulredc2 (uintmax_t *r1p,
 996           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 997           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 998 {
 999   uintmax_t r1, r0, q, p1, p0 ATTRIBUTE_UNUSED, t1, t0, s1, s0;
1000   mi = -mi;
1001   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
1002   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
1003   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
1004
1005   /* First compute a0 * <b1, b0> B^{-1}
1006         +-----+
1007         |a0 b0|
1008      +--+--+--+
1009      |a0 b1|
1010      +--+--+--+
1011         |q0 m0|
1012      +--+--+--+
1013      |q0 m1|
1014     -+--+--+--+
1015      |r1|r0| 0|
1016      +--+--+--+
1017   */
1018   umul_ppmm (t1, t0, a0, b0);
1019   umul_ppmm (r1, r0, a0, b1);
1020   q = mi * t0;
1021   umul_ppmm (p1, p0, q, m0);
1022   umul_ppmm (s1, s0, q, m1);
1023   r0 += (t0 != 0); /* Carry */
1024   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1025   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1026   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1027
1028   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1029         +-----+
1030         |a1 b0|
1031         +--+--+
1032         |r1|r0|
1033      +--+--+--+
1034      |a1 b1|
1035      +--+--+--+
1036         |q1 m0|
1037      +--+--+--+
1038      |q1 m1|
1039     -+--+--+--+
1040      |r1|r0| 0|
1041      +--+--+--+
1042   */
1043   umul_ppmm (t1, t0, a1, b0);
1044   umul_ppmm (s1, s0, a1, b1);
1045   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1046   q = mi * t0;
1047   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1048   umul_ppmm (p1, p0, q, m0);
1049   umul_ppmm (s1, s0, q, m1);
1050   r0 += (t0 != 0); /* Carry */
1051   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1052   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1053   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1054
1055   if (ge2 (r1, r0, m1, m0))
1056     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1057
1058   *r1p = r1;
1059   return r0;
1060 }
1061
1062 static uintmax_t _GL_ATTRIBUTE_CONST
1063 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1064 {
1065   uintmax_t y = one;
1066
1067   if (e & 1)
1068     y = b;
1069
1070   while (e != 0)
1071     {
1072       b = mulredc (b, b, n, ni);
1073       e >>= 1;
1074
1075       if (e & 1)
1076         y = mulredc (y, b, n, ni);
1077     }
1078
1079   return y;
1080 }
1081
1082 static uintmax_t
1083 powm2 (uintmax_t *r1m,
1084        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1085        uintmax_t ni, const uintmax_t *one)
1086 {
1087   uintmax_t r1, r0, b1, b0, n1, n0;
1088   unsigned int i;
1089   uintmax_t e;
1090
1091   b0 = bp[0];
1092   b1 = bp[1];
1093   n0 = np[0];
1094   n1 = np[1];
1095
1096   r0 = one[0];
1097   r1 = one[1];
1098
1099   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1100     {
1101       if (e & 1)
1102         {
1103           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1104           r1 = *r1m;
1105         }
1106       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1107       b1 = *r1m;
1108     }
1109   for (e = ep[1]; e > 0; e >>= 1)
1110     {
1111       if (e & 1)
1112         {
1113           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1114           r1 = *r1m;
1115         }
1116       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1117       b1 = *r1m;
1118     }
1119   *r1m = r1;
1120   return r0;
1121 }
1122
1123 static bool _GL_ATTRIBUTE_CONST
1124 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1125              unsigned int k, uintmax_t one)
1126 {
1127   uintmax_t y = powm (b, q, n, ni, one);
1128
1129   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1130
1131   if (y == one || y == nm1)
1132     return true;
1133
1134   for (unsigned int i = 1; i < k; i++)
1135     {
1136       y = mulredc (y, y, n, ni);
1137
1138       if (y == nm1)
1139         return true;
1140       if (y == one)
1141         return false;
1142     }
1143   return false;
1144 }
1145
1146 static bool
1147 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1148               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1149 {
1150   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1151
1152   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1153   y1 = r1m;
1154
1155   if (y0 == one[0] && y1 == one[1])
1156     return true;
1157
1158   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1159
1160   if (y0 == nm1_0 && y1 == nm1_1)
1161     return true;
1162
1163   for (unsigned int i = 1; i < k; i++)
1164     {
1165       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1166       y1 = r1m;
1167
1168       if (y0 == nm1_0 && y1 == nm1_1)
1169         return true;
1170       if (y0 == one[0] && y1 == one[1])
1171         return false;
1172     }
1173   return false;
1174 }
1175
1176 #if HAVE_GMP
1177 static bool
1178 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1179                 mpz_srcptr q, unsigned long int k)
1180 {
1181   mpz_powm (y, x, q, n);
1182
1183   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1184     return true;
1185
1186   for (unsigned long int i = 1; i < k; i++)
1187     {
1188       mpz_powm_ui (y, y, 2, n);
1189       if (mpz_cmp (y, nm1) == 0)
1190         return true;
1191       if (mpz_cmp_ui (y, 1) == 0)
1192         return false;
1193     }
1194   return false;
1195 }
1196 #endif
1197
1198 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1199    have been observed.  The average seem to be about 2.  */
1200 static bool
1201 prime_p (uintmax_t n)
1202 {
1203   int k;
1204   bool is_prime;
1205   uintmax_t a_prim, one, ni;
1206   struct factors factors;
1207
1208   if (n <= 1)
1209     return false;
1210
1211   /* We have already casted out small primes. */
1212   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1213     return true;
1214
1215   /* Precomputation for Miller-Rabin.  */
1216   uintmax_t q = n - 1;
1217   for (k = 0; (q & 1) == 0; k++)
1218     q >>= 1;
1219
1220   uintmax_t a = 2;
1221   binv (ni, n);                 /* ni <- 1/n mod B */
1222   redcify (one, 1, n);
1223   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1224
1225   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1226   if (!millerrabin (n, ni, a_prim, q, k, one))
1227     return false;
1228
1229   if (flag_prove_primality)
1230     {
1231       /* Factor n-1 for Lucas.  */
1232       factor (0, n - 1, &factors);
1233     }
1234
1235   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1236      number composite.  */
1237   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1238     {
1239       if (flag_prove_primality)
1240         {
1241           is_prime = true;
1242           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1243             {
1244               is_prime
1245                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1246             }
1247         }
1248       else
1249         {
1250           /* After enough Miller-Rabin runs, be content. */
1251           is_prime = (r == MR_REPS - 1);
1252         }
1253
1254       if (is_prime)
1255         return true;
1256
1257       a += primes_diff[r];      /* Establish new base.  */
1258
1259       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1260          on most processors, since it avoids udiv_qrnnd.  If we go down the
1261          udiv_qrnnd_preinv path, this code should be replaced.  */
1262       {
1263         uintmax_t s1, s0;
1264         umul_ppmm (s1, s0, one, a);
1265         if (LIKELY (s1 == 0))
1266           a_prim = s0 % n;
1267         else
1268           {
1269             uintmax_t dummy ATTRIBUTE_UNUSED;
1270             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1271           }
1272       }
1273
1274       if (!millerrabin (n, ni, a_prim, q, k, one))
1275         return false;
1276     }
1277
1278   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1279   abort ();
1280 }
1281
1282 static bool
1283 prime2_p (uintmax_t n1, uintmax_t n0)
1284 {
1285   uintmax_t q[2], nm1[2];
1286   uintmax_t a_prim[2];
1287   uintmax_t one[2];
1288   uintmax_t na[2];
1289   uintmax_t ni;
1290   unsigned int k;
1291   struct factors factors;
1292
1293   if (n1 == 0)
1294     return prime_p (n0);
1295
1296   nm1[1] = n1 - (n0 == 0);
1297   nm1[0] = n0 - 1;
1298   if (nm1[0] == 0)
1299     {
1300       count_trailing_zeros (k, nm1[1]);
1301
1302       q[0] = nm1[1] >> k;
1303       q[1] = 0;
1304       k += W_TYPE_SIZE;
1305     }
1306   else
1307     {
1308       count_trailing_zeros (k, nm1[0]);
1309       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1310     }
1311
1312   uintmax_t a = 2;
1313   binv (ni, n0);
1314   redcify2 (one[1], one[0], 1, n1, n0);
1315   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1316
1317   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1318   na[0] = n0;
1319   na[1] = n1;
1320
1321   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1322     return false;
1323
1324   if (flag_prove_primality)
1325     {
1326       /* Factor n-1 for Lucas.  */
1327       factor (nm1[1], nm1[0], &factors);
1328     }
1329
1330   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1331      number composite.  */
1332   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1333     {
1334       bool is_prime;
1335       uintmax_t e[2], y[2];
1336
1337       if (flag_prove_primality)
1338         {
1339           is_prime = true;
1340           if (factors.plarge[1])
1341             {
1342               uintmax_t pi;
1343               binv (pi, factors.plarge[0]);
1344               e[0] = pi * nm1[0];
1345               e[1] = 0;
1346               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1347               is_prime = (y[0] != one[0] || y[1] != one[1]);
1348             }
1349           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1350             {
1351               /* FIXME: We always have the factor 2. Do we really need to
1352                  handle it here? We have done the same powering as part
1353                  of millerrabin. */
1354               if (factors.p[i] == 2)
1355                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1356               else
1357                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1358               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1359               is_prime = (y[0] != one[0] || y[1] != one[1]);
1360             }
1361         }
1362       else
1363         {
1364           /* After enough Miller-Rabin runs, be content. */
1365           is_prime = (r == MR_REPS - 1);
1366         }
1367
1368       if (is_prime)
1369         return true;
1370
1371       a += primes_diff[r];      /* Establish new base.  */
1372       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1373
1374       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1375         return false;
1376     }
1377
1378   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1379   abort ();
1380 }
1381
1382 #if HAVE_GMP
1383 static bool
1384 mp_prime_p (mpz_t n)
1385 {
1386   bool is_prime;
1387   mpz_t q, a, nm1, tmp;
1388   struct mp_factors factors;
1389
1390   if (mpz_cmp_ui (n, 1) <= 0)
1391     return false;
1392
1393   /* We have already casted out small primes. */
1394   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1395     return true;
1396
1397   mpz_inits (q, a, nm1, tmp, NULL);
1398
1399   /* Precomputation for Miller-Rabin.  */
1400   mpz_sub_ui (nm1, n, 1);
1401
1402   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1403   unsigned long int k = mpz_scan1 (nm1, 0);
1404   mpz_tdiv_q_2exp (q, nm1, k);
1405
1406   mpz_set_ui (a, 2);
1407
1408   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1409   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1410     {
1411       is_prime = false;
1412       goto ret2;
1413     }
1414
1415   if (flag_prove_primality)
1416     {
1417       /* Factor n-1 for Lucas.  */
1418       mpz_set (tmp, nm1);
1419       mp_factor (tmp, &factors);
1420     }
1421
1422   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1423      number composite.  */
1424   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1425     {
1426       if (flag_prove_primality)
1427         {
1428           is_prime = true;
1429           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1430             {
1431               mpz_divexact (tmp, nm1, factors.p[i]);
1432               mpz_powm (tmp, a, tmp, n);
1433               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1434             }
1435         }
1436       else
1437         {
1438           /* After enough Miller-Rabin runs, be content. */
1439           is_prime = (r == MR_REPS - 1);
1440         }
1441
1442       if (is_prime)
1443         goto ret1;
1444
1445       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1446
1447       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1448         {
1449           is_prime = false;
1450           goto ret1;
1451         }
1452     }
1453
1454   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1455   abort ();
1456
1457  ret1:
1458   if (flag_prove_primality)
1459     mp_factor_clear (&factors);
1460  ret2:
1461   mpz_clears (q, a, nm1, tmp, NULL);
1462
1463   return is_prime;
1464 }
1465 #endif
1466
1467 static void
1468 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1469                           struct factors *factors)
1470 {
1471   uintmax_t x, z, y, P, t, ni, g;
1472
1473   unsigned long int k = 1;
1474   unsigned long int l = 1;
1475
1476   redcify (P, 1, n);
1477   addmod (x, P, P, n);          /* i.e., redcify(2) */
1478   y = z = x;
1479
1480   while (n != 1)
1481     {
1482       assert (a < n);
1483
1484       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1485
1486       for (;;)
1487         {
1488           do
1489             {
1490               x = mulredc (x, x, n, ni);
1491               addmod (x, x, a, n);
1492
1493               submod (t, z, x, n);
1494               P = mulredc (P, t, n, ni);
1495
1496               if (k % 32 == 1)
1497                 {
1498                   if (gcd_odd (P, n) != 1)
1499                     goto factor_found;
1500                   y = x;
1501                 }
1502             }
1503           while (--k != 0);
1504
1505           z = x;
1506           k = l;
1507           l = 2 * l;
1508           for (unsigned long int i = 0; i < k; i++)
1509             {
1510               x = mulredc (x, x, n, ni);
1511               addmod (x, x, a, n);
1512             }
1513           y = x;
1514         }
1515
1516     factor_found:
1517       do
1518         {
1519           y = mulredc (y, y, n, ni);
1520           addmod (y, y, a, n);
1521
1522           submod (t, z, y, n);
1523           g = gcd_odd (t, n);
1524         }
1525       while (g == 1);
1526
1527       n = n / g;
1528
1529       if (!prime_p (g))
1530         factor_using_pollard_rho (g, a + 1, factors);
1531       else
1532         factor_insert (factors, g);
1533
1534       if (prime_p (n))
1535         {
1536           factor_insert (factors, n);
1537           break;
1538         }
1539
1540       x = x % n;
1541       z = z % n;
1542       y = y % n;
1543     }
1544 }
1545
1546 static void
1547 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1548                            struct factors *factors)
1549 {
1550   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1551
1552   unsigned long int k = 1;
1553   unsigned long int l = 1;
1554
1555   redcify2 (P1, P0, 1, n1, n0);
1556   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1557   y1 = z1 = x1;
1558   y0 = z0 = x0;
1559
1560   while (n1 != 0 || n0 != 1)
1561     {
1562       binv (ni, n0);
1563
1564       for (;;)
1565         {
1566           do
1567             {
1568               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1569               x1 = r1m;
1570               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1571
1572               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1573               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1574               P1 = r1m;
1575
1576               if (k % 32 == 1)
1577                 {
1578                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1579                   if (g1 != 0 || g0 != 1)
1580                     goto factor_found;
1581                   y1 = x1; y0 = x0;
1582                 }
1583             }
1584           while (--k != 0);
1585
1586           z1 = x1; z0 = x0;
1587           k = l;
1588           l = 2 * l;
1589           for (unsigned long int i = 0; i < k; i++)
1590             {
1591               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1592               x1 = r1m;
1593               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1594             }
1595           y1 = x1; y0 = x0;
1596         }
1597
1598     factor_found:
1599       do
1600         {
1601           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1602           y1 = r1m;
1603           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1604
1605           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1606           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1607         }
1608       while (g1 == 0 && g0 == 1);
1609
1610       if (g1 == 0)
1611         {
1612           /* The found factor is one word. */
1613           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1614
1615           if (!prime_p (g0))
1616             factor_using_pollard_rho (g0, a + 1, factors);
1617           else
1618             factor_insert (factors, g0);
1619         }
1620       else
1621         {
1622           /* The found factor is two words.  This is highly unlikely, thus hard
1623              to trigger.  Please be careful before you change this code!  */
1624           uintmax_t ginv;
1625
1626           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1627           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1628           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1629
1630           if (!prime2_p (g1, g0))
1631             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1632           else
1633             factor_insert_large (factors, g1, g0);
1634         }
1635
1636       if (n1 == 0)
1637         {
1638           if (prime_p (n0))
1639             {
1640               factor_insert (factors, n0);
1641               break;
1642             }
1643
1644           factor_using_pollard_rho (n0, a, factors);
1645           return;
1646         }
1647
1648       if (prime2_p (n1, n0))
1649         {
1650           factor_insert_large (factors, n1, n0);
1651           break;
1652         }
1653
1654       x0 = mod2 (&x1, x1, x0, n1, n0);
1655       z0 = mod2 (&z1, z1, z0, n1, n0);
1656       y0 = mod2 (&y1, y1, y0, n1, n0);
1657     }
1658 }
1659
1660 #if HAVE_GMP
1661 static void
1662 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1663                              struct mp_factors *factors)
1664 {
1665   mpz_t x, z, y, P;
1666   mpz_t t, t2;
1667
1668   debug ("[pollard-rho (%lu)] ", a);
1669
1670   mpz_inits (t, t2, NULL);
1671   mpz_init_set_si (y, 2);
1672   mpz_init_set_si (x, 2);
1673   mpz_init_set_si (z, 2);
1674   mpz_init_set_ui (P, 1);
1675
1676   unsigned long long int k = 1;
1677   unsigned long long int l = 1;
1678
1679   while (mpz_cmp_ui (n, 1) != 0)
1680     {
1681       for (;;)
1682         {
1683           do
1684             {
1685               mpz_mul (t, x, x);
1686               mpz_mod (x, t, n);
1687               mpz_add_ui (x, x, a);
1688
1689               mpz_sub (t, z, x);
1690               mpz_mul (t2, P, t);
1691               mpz_mod (P, t2, n);
1692
1693               if (k % 32 == 1)
1694                 {
1695                   mpz_gcd (t, P, n);
1696                   if (mpz_cmp_ui (t, 1) != 0)
1697                     goto factor_found;
1698                   mpz_set (y, x);
1699                 }
1700             }
1701           while (--k != 0);
1702
1703           mpz_set (z, x);
1704           k = l;
1705           l = 2 * l;
1706           for (unsigned long long int i = 0; i < k; i++)
1707             {
1708               mpz_mul (t, x, x);
1709               mpz_mod (x, t, n);
1710               mpz_add_ui (x, x, a);
1711             }
1712           mpz_set (y, x);
1713         }
1714
1715     factor_found:
1716       do
1717         {
1718           mpz_mul (t, y, y);
1719           mpz_mod (y, t, n);
1720           mpz_add_ui (y, y, a);
1721
1722           mpz_sub (t, z, y);
1723           mpz_gcd (t, t, n);
1724         }
1725       while (mpz_cmp_ui (t, 1) == 0);
1726
1727       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1728
1729       if (!mp_prime_p (t))
1730         {
1731           debug ("[composite factor--restarting pollard-rho] ");
1732           mp_factor_using_pollard_rho (t, a + 1, factors);
1733         }
1734       else
1735         {
1736           mp_factor_insert (factors, t);
1737         }
1738
1739       if (mp_prime_p (n))
1740         {
1741           mp_factor_insert (factors, n);
1742           break;
1743         }
1744
1745       mpz_mod (x, x, n);
1746       mpz_mod (z, z, n);
1747       mpz_mod (y, y, n);
1748     }
1749
1750   mpz_clears (P, t2, t, z, x, y, NULL);
1751 }
1752 #endif
1753
1754 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1755    algorithm is replaced, consider also returning the remainder. */
1756 static uintmax_t _GL_ATTRIBUTE_CONST
1757 isqrt (uintmax_t n)
1758 {
1759   uintmax_t x;
1760   unsigned c;
1761   if (n == 0)
1762     return 0;
1763
1764   count_leading_zeros (c, n);
1765
1766   /* Make x > sqrt(n). This will be invariant through the loop. */
1767   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1768
1769   for (;;)
1770     {
1771       uintmax_t y = (x + n/x) / 2;
1772       if (y >= x)
1773         return x;
1774
1775       x = y;
1776     }
1777 }
1778
1779 static uintmax_t _GL_ATTRIBUTE_CONST
1780 isqrt2 (uintmax_t nh, uintmax_t nl)
1781 {
1782   unsigned int shift;
1783   uintmax_t x;
1784
1785   /* Ensures the remainder fits in an uintmax_t. */
1786   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1787
1788   if (nh == 0)
1789     return isqrt (nl);
1790
1791   count_leading_zeros (shift, nh);
1792   shift &= ~1;
1793
1794   /* Make x > sqrt(n) */
1795   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1796   x <<= (W_TYPE_SIZE - shift) / 2;
1797
1798   /* Do we need more than one iteration? */
1799   for (;;)
1800     {
1801       uintmax_t r ATTRIBUTE_UNUSED;
1802       uintmax_t q, y;
1803       udiv_qrnnd (q, r, nh, nl, x);
1804       y = (x + q) / 2;
1805
1806       if (y >= x)
1807         {
1808           uintmax_t hi, lo;
1809           umul_ppmm (hi, lo, x + 1, x + 1);
1810           assert (gt2 (hi, lo, nh, nl));
1811
1812           umul_ppmm (hi, lo, x, x);
1813           assert (ge2 (nh, nl, hi, lo));
1814           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1815           assert (hi == 0);
1816
1817           return x;
1818         }
1819
1820       x = y;
1821     }
1822 }
1823
1824 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1825 #define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
1826 #define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
1827 #define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
1828 #define MAGIC11 0x23b
1829
1830 /* Return the square root if the input is a square, otherwise 0. */
1831 static uintmax_t _GL_ATTRIBUTE_CONST
1832 is_square (uintmax_t x)
1833 {
1834   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1835      computing the square root. */
1836   if (((MAGIC64 >> (x & 63)) & 1)
1837       && ((MAGIC63 >> (x % 63)) & 1)
1838       /* Both 0 and 64 are squares mod (65) */
1839       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1840       && ((MAGIC11 >> (x % 11) & 1)))
1841     {
1842       uintmax_t r = isqrt (x);
1843       if (r*r == x)
1844         return r;
1845     }
1846   return 0;
1847 }
1848
1849 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1850 static const unsigned short invtab[0x81] =
1851   {
1852     0x200,
1853     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1854     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1855     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1856     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1857     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1858     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1859     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1860     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1861     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1862     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1863     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1864     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1865     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1866     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1867     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1868     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1869   };
1870
1871 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1872    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1873 #define div_smallq(q, r, u, d)                                          \
1874   do {                                                                  \
1875     if ((u) / 0x40 < (d))                                               \
1876       {                                                                 \
1877         int _cnt;                                                       \
1878         uintmax_t _dinv, _mask, _q, _r;                                 \
1879         count_leading_zeros (_cnt, (d));                                \
1880         _r = (u);                                                       \
1881         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1882           {                                                             \
1883             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1884             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1885           }                                                             \
1886         else                                                            \
1887           {                                                             \
1888             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1889             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1890           }                                                             \
1891         _r -= _q*(d);                                                   \
1892                                                                         \
1893         _mask = -(uintmax_t) (_r >= (d));                               \
1894         (r) = _r - (_mask & (d));                                       \
1895         (q) = _q - _mask;                                               \
1896         assert ( (q) * (d) + (r) == u);                                 \
1897       }                                                                 \
1898     else                                                                \
1899       {                                                                 \
1900         uintmax_t _q = (u) / (d);                                       \
1901         (r) = (u) - _q * (d);                                           \
1902         (q) = _q;                                                       \
1903       }                                                                 \
1904   } while (0)
1905
1906 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1907    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1908    with 3025 = 55^2.
1909
1910    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1911    representing G_0 = (-55, 2*4652, 8653).
1912
1913    In the notation of the paper:
1914
1915    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1916
1917    Put
1918
1919      t_0 = floor([q_0 + R_0] / S0) = 1
1920      R_1 = t_0 * S_0 - R_0 = 4001
1921      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1922 */
1923
1924 /* Multipliers, in order of efficiency:
1925    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1926    0.7317  3*5*7    =  105 = 1
1927    0.7820  3*5*11   =  165 = 1
1928    0.7872  3*5      =   15 = 3
1929    0.8101  3*7*11   =  231 = 3
1930    0.8155  3*7      =   21 = 1
1931    0.8284  5*7*11   =  385 = 1
1932    0.8339  5*7      =   35 = 3
1933    0.8716  3*11     =   33 = 1
1934    0.8774  3        =    3 = 3
1935    0.8913  5*11     =   55 = 3
1936    0.8972  5        =    5 = 1
1937    0.9233  7*11     =   77 = 1
1938    0.9295  7        =    7 = 3
1939    0.9934  11       =   11 = 3
1940 */
1941 #define QUEUE_SIZE 50
1942
1943 #if STAT_SQUFOF
1944 # define Q_FREQ_SIZE 50
1945 /* Element 0 keeps the total */
1946 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1947 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1948 #endif
1949
1950 /* Return true on success.  Expected to fail only for numbers
1951    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1952 static bool
1953 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1954 {
1955   /* Uses algorithm and notation from
1956
1957      SQUARE FORM FACTORIZATION
1958      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1959
1960      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1961    */
1962
1963   static const unsigned int multipliers_1[] =
1964     { /* = 1 (mod 4) */
1965       105, 165, 21, 385, 33, 5, 77, 1, 0
1966     };
1967   static const unsigned int multipliers_3[] =
1968     { /* = 3 (mod 4) */
1969       1155, 15, 231, 35, 3, 55, 7, 11, 0
1970     };
1971
1972   const unsigned int *m;
1973
1974   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1975
1976   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1977     return false;
1978
1979   uintmax_t sqrt_n = isqrt2 (n1, n0);
1980
1981   if (n0 == sqrt_n * sqrt_n)
1982     {
1983       uintmax_t p1, p0;
1984
1985       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1986       assert (p0 == n0);
1987
1988       if (n1 == p1)
1989         {
1990           if (prime_p (sqrt_n))
1991             factor_insert_multiplicity (factors, sqrt_n, 2);
1992           else
1993             {
1994               struct factors f;
1995
1996               f.nfactors = 0;
1997               if (!factor_using_squfof (0, sqrt_n, &f))
1998                 {
1999                   /* Try pollard rho instead */
2000                   factor_using_pollard_rho (sqrt_n, 1, &f);
2001                 }
2002               /* Duplicate the new factors */
2003               for (unsigned int i = 0; i < f.nfactors; i++)
2004                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
2005             }
2006           return true;
2007         }
2008     }
2009
2010   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2011   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2012        *m; m++)
2013     {
2014       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2015       unsigned int i;
2016       unsigned int mu = *m;
2017       unsigned int qpos = 0;
2018
2019       assert (mu * n0 % 4 == 3);
2020
2021       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2022          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2023          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2024          mu^3 < n.
2025
2026          However, this seems insufficient: With n = 37243139 and mu =
2027          105, we get a trivial factor, from the square 38809 = 197^2,
2028          without any corresponding Q earlier in the iteration.
2029
2030          Requiring 64 mu^3 < n seems sufficient. */
2031       if (n1 == 0)
2032         {
2033           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2034             continue;
2035         }
2036       else
2037         {
2038           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2039             continue;
2040         }
2041       umul_ppmm (Dh, Dl, n0, mu);
2042       Dh += n1 * mu;
2043
2044       assert (Dl % 4 != 1);
2045       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2046
2047       S = isqrt2 (Dh, Dl);
2048
2049       Q1 = 1;
2050       P = S;
2051
2052       /* Square root remainder fits in one word, so ignore high part. */
2053       Q = Dl - P*P;
2054       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2055       L = isqrt (2*S);
2056       B = 2*L;
2057       L1 = mu * 2 * L;
2058
2059       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2060          4 D. */
2061
2062       for (i = 0; i <= B; i++)
2063         {
2064           uintmax_t q, P1, t, rem;
2065
2066           div_smallq (q, rem, S+P, Q);
2067           P1 = S - rem; /* P1 = q*Q - P */
2068
2069 #if STAT_SQUFOF
2070           assert (q > 0);
2071           q_freq[0]++;
2072           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2073 #endif
2074
2075           if (Q <= L1)
2076             {
2077               uintmax_t g = Q;
2078
2079               if ( (Q & 1) == 0)
2080                 g /= 2;
2081
2082               g /= gcd_odd (g, mu);
2083
2084               if (g <= L)
2085                 {
2086                   if (qpos >= QUEUE_SIZE)
2087                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2088                   queue[qpos].Q = g;
2089                   queue[qpos].P = P % g;
2090                   qpos++;
2091                 }
2092             }
2093
2094           /* I think the difference can be either sign, but mod
2095              2^W_TYPE_SIZE arithmetic should be fine. */
2096           t = Q1 + q * (P - P1);
2097           Q1 = Q;
2098           Q = t;
2099           P = P1;
2100
2101           if ( (i & 1) == 0)
2102             {
2103               uintmax_t r = is_square (Q);
2104               if (r)
2105                 {
2106                   for (unsigned int j = 0; j < qpos; j++)
2107                     {
2108                       if (queue[j].Q == r)
2109                         {
2110                           if (r == 1)
2111                             /* Traversed entire cycle. */
2112                             goto next_multiplier;
2113
2114                           /* Need the absolute value for divisibility test. */
2115                           if (P >= queue[j].P)
2116                             t = P - queue[j].P;
2117                           else
2118                             t = queue[j].P - P;
2119                           if (t % r == 0)
2120                             {
2121                               /* Delete entries up to and including entry
2122                                  j, which matched. */
2123                               memmove (queue, queue + j + 1,
2124                                        (qpos - j - 1) * sizeof (queue[0]));
2125                               qpos -= (j + 1);
2126                             }
2127                           goto next_i;
2128                         }
2129                     }
2130
2131                   /* We have found a square form, which should give a
2132                      factor. */
2133                   Q1 = r;
2134                   assert (S >= P); /* What signs are possible? */
2135                   P += r * ((S - P) / r);
2136
2137                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2138                      for the case D = 2N. */
2139                   /* Compute Q = (D - P*P) / Q1, but we need double
2140                      precision. */
2141                   uintmax_t hi, lo;
2142                   umul_ppmm (hi, lo, P, P);
2143                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2144                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2145                   assert (rem == 0);
2146
2147                   for (;;)
2148                     {
2149                       /* Note: There appears to by a typo in the paper,
2150                          Step 4a in the algorithm description says q <--
2151                          floor([S+P]/\hat Q), but looking at the equations
2152                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2153                          (In this code, \hat Q is Q1). */
2154                       div_smallq (q, rem, S+P, Q);
2155                       P1 = S - rem;     /* P1 = q*Q - P */
2156
2157 #if STAT_SQUFOF
2158                       q_freq[0]++;
2159                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2160 #endif
2161                       if (P == P1)
2162                         break;
2163                       t = Q1 + q * (P - P1);
2164                       Q1 = Q;
2165                       Q = t;
2166                       P = P1;
2167                     }
2168
2169                   if ( (Q & 1) == 0)
2170                     Q /= 2;
2171                   Q /= gcd_odd (Q, mu);
2172
2173                   assert (Q > 1 && (n1 || Q < n0));
2174
2175                   if (prime_p (Q))
2176                     factor_insert (factors, Q);
2177                   else if (!factor_using_squfof (0, Q, factors))
2178                     factor_using_pollard_rho (Q, 2, factors);
2179
2180                   divexact_21 (n1, n0, n1, n0, Q);
2181
2182                   if (prime2_p (n1, n0))
2183                     factor_insert_large (factors, n1, n0);
2184                   else
2185                     {
2186                       if (!factor_using_squfof (n1, n0, factors))
2187                         {
2188                           if (n1 == 0)
2189                             factor_using_pollard_rho (n0, 1, factors);
2190                           else
2191                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2192                         }
2193                     }
2194
2195                   return true;
2196                 }
2197             }
2198         next_i:;
2199         }
2200     next_multiplier:;
2201     }
2202   return false;
2203 }
2204
2205 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2206    results in FACTORS.  Use the algorithm selected by the global ALG.  */
2207 static void
2208 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2209 {
2210   factors->nfactors = 0;
2211   factors->plarge[1] = 0;
2212
2213   if (t1 == 0 && t0 < 2)
2214     return;
2215
2216   t0 = factor_using_division (&t1, t1, t0, factors);
2217
2218   if (t1 == 0 && t0 < 2)
2219     return;
2220
2221   if (prime2_p (t1, t0))
2222     factor_insert_large (factors, t1, t0);
2223   else
2224     {
2225       if (alg == ALG_SQUFOF)
2226         if (factor_using_squfof (t1, t0, factors))
2227           return;
2228
2229       if (t1 == 0)
2230         factor_using_pollard_rho (t0, 1, factors);
2231       else
2232         factor_using_pollard_rho2 (t1, t0, 1, factors);
2233     }
2234 }
2235
2236 #if HAVE_GMP
2237 /* Use Pollard-rho to compute the prime factors of
2238    arbitrary-precision T, and put the results in FACTORS.  */
2239 static void
2240 mp_factor (mpz_t t, struct mp_factors *factors)
2241 {
2242   mp_factor_init (factors);
2243
2244   if (mpz_sgn (t) != 0)
2245     {
2246       mp_factor_using_division (t, factors);
2247
2248       if (mpz_cmp_ui (t, 1) != 0)
2249         {
2250           debug ("[is number prime?] ");
2251           if (mp_prime_p (t))
2252             mp_factor_insert (factors, t);
2253           else
2254             mp_factor_using_pollard_rho (t, 1, factors);
2255         }
2256     }
2257 }
2258 #endif
2259
2260 static strtol_error
2261 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2262 {
2263   unsigned int lo_carry;
2264   uintmax_t hi = 0, lo = 0;
2265
2266   strtol_error err = LONGINT_INVALID;
2267
2268   /* Skip initial spaces and '+'.  */
2269   for (;;)
2270     {
2271       char c = *s;
2272       if (c == ' ')
2273         s++;
2274       else if (c == '+')
2275         {
2276           s++;
2277           break;
2278         }
2279       else
2280         break;
2281     }
2282
2283   /* Initial scan for invalid digits.  */
2284   const char *p = s;
2285   for (;;)
2286     {
2287       unsigned int c = *p++;
2288       if (c == 0)
2289         break;
2290
2291       if (UNLIKELY (!ISDIGIT (c)))
2292         {
2293           err = LONGINT_INVALID;
2294           break;
2295         }
2296
2297       err = LONGINT_OK;           /* we've seen at least one valid digit */
2298     }
2299
2300   for (;err == LONGINT_OK;)
2301     {
2302       unsigned int c = *s++;
2303       if (c == 0)
2304         break;
2305
2306       c -= '0';
2307
2308       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2309         {
2310           err = LONGINT_OVERFLOW;
2311           break;
2312         }
2313       hi = 10 * hi;
2314
2315       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2316       lo_carry += 10 * lo < 2 * lo;
2317
2318       lo = 10 * lo;
2319       lo += c;
2320
2321       lo_carry += lo < c;
2322       hi += lo_carry;
2323       if (UNLIKELY (hi < lo_carry))
2324         {
2325           err = LONGINT_OVERFLOW;
2326           break;
2327         }
2328     }
2329
2330   *hip = hi;
2331   *lop = lo;
2332
2333   return err;
2334 }
2335
2336 static void
2337 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2338 {
2339   uintmax_t q, r;
2340
2341   if (t1 == 0)
2342     printf ("%"PRIuMAX, t0);
2343   else
2344     {
2345       /* Use very plain code here since it seems hard to write fast code
2346          without assuming a specific word size.  */
2347       q = t1 / 1000000000;
2348       r = t1 % 1000000000;
2349       udiv_qrnnd (t0, r, r, t0, 1000000000);
2350       print_uintmaxes (q, t0);
2351       printf ("%09u", (int) r);
2352     }
2353 }
2354
2355 /* Single-precision factoring */
2356 static void
2357 print_factors_single (uintmax_t t1, uintmax_t t0)
2358 {
2359   struct factors factors;
2360
2361   print_uintmaxes (t1, t0);
2362   putchar (':');
2363
2364   factor (t1, t0, &factors);
2365
2366   for (unsigned int j = 0; j < factors.nfactors; j++)
2367     for (unsigned int k = 0; k < factors.e[j]; k++)
2368       {
2369         char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2370         putchar (' ');
2371         fputs (umaxtostr (factors.p[j], buf), stdout);
2372       }
2373
2374   if (factors.plarge[1])
2375     {
2376       putchar (' ');
2377       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2378     }
2379   putchar ('\n');
2380 }
2381
2382 /* Emit the factors of the indicated number.  If we have the option of using
2383    either algorithm, we select on the basis of the length of the number.
2384    For longer numbers, we prefer the MP algorithm even if the native algorithm
2385    has enough digits, because the algorithm is better.  The turnover point
2386    depends on the value.  */
2387 static bool
2388 print_factors (const char *input)
2389 {
2390   uintmax_t t1, t0;
2391
2392   /* Try converting the number to one or two words.  If it fails, use GMP or
2393      print an error message.  The 2nd condition checks that the most
2394      significant bit of the two-word number is clear, in a typesize neutral
2395      way.  */
2396   strtol_error err = strto2uintmax (&t1, &t0, input);
2397
2398   switch (err)
2399     {
2400     case LONGINT_OK:
2401       if (((t1 << 1) >> 1) == t1)
2402         {
2403           debug ("[%s]", _("using single-precision arithmetic"));
2404           print_factors_single (t1, t0);
2405           return true;
2406         }
2407       break;
2408
2409     case LONGINT_OVERFLOW:
2410       /* Try GMP.  */
2411       break;
2412
2413     default:
2414       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2415       return false;
2416     }
2417
2418 #if HAVE_GMP
2419   debug ("[%s]", _("using arbitrary-precision arithmetic"));
2420   mpz_t t;
2421   struct mp_factors factors;
2422
2423   mpz_init_set_str (t, input, 10);
2424
2425   gmp_printf ("%Zd:", t);
2426   mp_factor (t, &factors);
2427
2428   for (unsigned int j = 0; j < factors.nfactors; j++)
2429     for (unsigned int k = 0; k < factors.e[j]; k++)
2430       gmp_printf (" %Zd", factors.p[j]);
2431
2432   mp_factor_clear (&factors);
2433   mpz_clear (t);
2434   putchar ('\n');
2435   return true;
2436 #else
2437   error (0, 0, _("%s is too large"), quote (input));
2438   return false;
2439 #endif
2440 }
2441
2442 void
2443 usage (int status)
2444 {
2445   if (status != EXIT_SUCCESS)
2446     emit_try_help ();
2447   else
2448     {
2449       printf (_("\
2450 Usage: %s [NUMBER]...\n\
2451   or:  %s OPTION\n\
2452 "),
2453               program_name, program_name);
2454       fputs (_("\
2455 Print the prime factors of each specified integer NUMBER.  If none\n\
2456 are specified on the command line, read them from standard input.\n\
2457 \n\
2458 "), stdout);
2459       fputs (HELP_OPTION_DESCRIPTION, stdout);
2460       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2461       emit_ancillary_info ();
2462     }
2463   exit (status);
2464 }
2465
2466 static bool
2467 do_stdin (void)
2468 {
2469   bool ok = true;
2470   token_buffer tokenbuffer;
2471
2472   init_tokenbuffer (&tokenbuffer);
2473
2474   while (true)
2475     {
2476       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2477                                        &tokenbuffer);
2478       if (token_length == (size_t) -1)
2479         break;
2480       ok &= print_factors (tokenbuffer.buffer);
2481     }
2482   free (tokenbuffer.buffer);
2483
2484   return ok;
2485 }
2486
2487 int
2488 main (int argc, char **argv)
2489 {
2490   initialize_main (&argc, &argv);
2491   set_program_name (argv[0]);
2492   setlocale (LC_ALL, "");
2493   bindtextdomain (PACKAGE, LOCALEDIR);
2494   textdomain (PACKAGE);
2495
2496   atexit (close_stdout);
2497
2498   alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
2499
2500   int c;
2501   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2502     {
2503       switch (c)
2504         {
2505         case VERBOSE_OPTION:
2506           verbose = 1;
2507           break;
2508
2509         case 's':
2510           alg = ALG_SQUFOF;
2511           break;
2512
2513         case 'w':
2514           flag_prove_primality = false;
2515           break;
2516
2517         case_GETOPT_HELP_CHAR;
2518
2519         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2520
2521         default:
2522           usage (EXIT_FAILURE);
2523         }
2524     }
2525
2526 #if STAT_SQUFOF
2527   if (alg == ALG_SQUFOF)
2528     memset (q_freq, 0, sizeof (q_freq));
2529 #endif
2530
2531   bool ok;
2532   if (argc <= optind)
2533     ok = do_stdin ();
2534   else
2535     {
2536       ok = true;
2537       for (int i = optind; i < argc; i++)
2538         if (! print_factors (argv[i]))
2539           ok = false;
2540     }
2541
2542 #if STAT_SQUFOF
2543   if (alg == ALG_SQUFOF && q_freq[0] > 0)
2544     {
2545       double acc_f;
2546       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2547       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2548         {
2549           double f = (double) q_freq[i] / q_freq[0];
2550           acc_f += f;
2551           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2552                   100.0 * f, 100.0 * acc_f);
2553         }
2554     }
2555 #endif
2556
2557   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
2558 }