src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or the SQUFOF algorithm, checking status of found factors
  54         again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 #include <config.h>
  88 #include <getopt.h>
  89 #include <stdio.h>
  90 #if HAVE_GMP
  91 # include <gmp.h>
  92 # if !HAVE_DECL_MPZ_INITS
  93 #  include <stdarg.h>
  94 # endif
  95 #endif
  96
  97 #include <assert.h>
  98
  99 #include "system.h"
 100 #include "error.h"
 101 #include "quote.h"
 102 #include "readtokens.h"
 103 #include "xstrtol.h"
 104
 105 /* The official name of this program (e.g., no 'g' prefix).  */
 106 #define PROGRAM_NAME "factor"
 107
 108 #define AUTHORS \
 109   proper_name ("Paul Rubin"),                                           \
 110   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 111   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 112
 113 /* Token delimiters when reading from a file.  */
 114 #define DELIM "\n\t "
 115
 116 #ifndef STAT_SQUFOF
 117 # define STAT_SQUFOF 0
 118 #endif
 119
 120 #ifndef USE_LONGLONG_H
 121 /* With the way we use longlong.h, it's only safe to use
 122    when UWtype = UHWtype, as there were various cases
 123    (as can be seen in the history for longlong.h) where
 124    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 125    to avoid compile time or run time issues.  */
 126 # if LONG_MAX == INTMAX_MAX
 127 #  define USE_LONGLONG_H 1
 128 # endif
 129 #endif
 130
 131 #if USE_LONGLONG_H
 132
 133 /* Make definitions for longlong.h to make it do what it can do for us */
 134
 135 /* bitcount for uintmax_t */
 136 # if UINTMAX_MAX == UINT32_MAX
 137 #  define W_TYPE_SIZE 32
 138 # elif UINTMAX_MAX == UINT64_MAX
 139 #  define W_TYPE_SIZE 64
 140 # elif UINTMAX_MAX == UINT128_MAX
 141 #  define W_TYPE_SIZE 128
 142 # endif
 143
 144 # define UWtype  uintmax_t
 145 # define UHWtype unsigned long int
 146 # undef UDWtype
 147 # if HAVE_ATTRIBUTE_MODE
 148 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 149 typedef          int SItype     __attribute__ ((mode (SI)));
 150 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 151 typedef          int DItype     __attribute__ ((mode (DI)));
 152 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 153 # else
 154 typedef unsigned char UQItype;
 155 typedef          long SItype;
 156 typedef unsigned long int USItype;
 157 #  if HAVE_LONG_LONG_INT
 158 typedef long long int DItype;
 159 typedef unsigned long long int UDItype;
 160 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 161 typedef long int DItype;
 162 typedef unsigned long int UDItype;
 163 #  endif
 164 # endif
 165 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 166 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 167 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 168 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 169 # ifndef __GMP_GNUC_PREREQ
 170 #  define __GMP_GNUC_PREREQ(a,b) 1
 171 # endif
 172
 173 /* These stub macros are only used in longlong.h in certain system compiler
 174    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 175 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 176 ASSERT (1)
 177 __GMP_DECLSPEC
 178 # endif
 179
 180 # if _ARCH_PPC
 181 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 182 # endif
 183 # include "longlong.h"
 184 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 185 const unsigned char factor_clz_tab[129] =
 186 {
 187   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 188   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 189   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 190   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 191   9
 192 };
 193 # endif
 194
 195 #else /* not USE_LONGLONG_H */
 196
 197 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 198 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 199 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 200 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 201
 202 #endif
 203
 204 #if !defined __clz_tab && !defined UHWtype
 205 /* Without this seemingly useless conditional, gcc -Wunused-macros
 206    warns that each of the two tested macros is unused on Fedora 18.
 207    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 208 #endif
 209
 210 enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
 211
 212 static enum alg_type alg;
 213
 214 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 215 #define MAX_NFACTS 26
 216
 217 enum
 218 {
 219   DEV_DEBUG_OPTION = CHAR_MAX + 1
 220 };
 221
 222 static struct option const long_options[] =
 223 {
 224   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 225   {GETOPT_HELP_OPTION_DECL},
 226   {GETOPT_VERSION_OPTION_DECL},
 227   {NULL, 0, NULL, 0}
 228 };
 229
 230 struct factors
 231 {
 232   uintmax_t     plarge[2]; /* Can have a single large factor */
 233   uintmax_t     p[MAX_NFACTS];
 234   unsigned char e[MAX_NFACTS];
 235   unsigned char nfactors;
 236 };
 237
 238 #if HAVE_GMP
 239 struct mp_factors
 240 {
 241   mpz_t             *p;
 242   unsigned long int *e;
 243   unsigned long int nfactors;
 244 };
 245 #endif
 246
 247 static void factor (uintmax_t, uintmax_t, struct factors *);
 248
 249 #ifndef umul_ppmm
 250 # define umul_ppmm(w1, w0, u, v)                                        \
 251   do {                                                                  \
 252     uintmax_t __x0, __x1, __x2, __x3;                                   \
 253     unsigned long int __ul, __vl, __uh, __vh;                           \
 254     uintmax_t __u = (u), __v = (v);                                     \
 255                                                                         \
 256     __ul = __ll_lowpart (__u);                                          \
 257     __uh = __ll_highpart (__u);                                         \
 258     __vl = __ll_lowpart (__v);                                          \
 259     __vh = __ll_highpart (__v);                                         \
 260                                                                         \
 261     __x0 = (uintmax_t) __ul * __vl;                                     \
 262     __x1 = (uintmax_t) __ul * __vh;                                     \
 263     __x2 = (uintmax_t) __uh * __vl;                                     \
 264     __x3 = (uintmax_t) __uh * __vh;                                     \
 265                                                                         \
 266     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 267     __x1 += __x2;               /* but this indeed can */               \
 268     if (__x1 < __x2)            /* did we get it? */                    \
 269       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 270                                                                         \
 271     (w1) = __x3 + __ll_highpart (__x1);                                 \
 272     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 273   } while (0)
 274 #endif
 275
 276 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 277 /* Define our own, not needing normalization. This function is
 278    currently not performance critical, so keep it simple. Similar to
 279    the mod macro below. */
 280 # undef udiv_qrnnd
 281 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 282   do {                                                                  \
 283     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 284                                                                         \
 285     assert ((n1) < (d));                                                \
 286     __d1 = (d); __d0 = 0;                                               \
 287     __r1 = (n1); __r0 = (n0);                                           \
 288     __q = 0;                                                            \
 289     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 290       {                                                                 \
 291         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 292         __q <<= 1;                                                      \
 293         if (ge2 (__r1, __r0, __d1, __d0))                               \
 294           {                                                             \
 295             __q++;                                                      \
 296             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 297           }                                                             \
 298       }                                                                 \
 299     (r) = __r0;                                                         \
 300     (q) = __q;                                                          \
 301   } while (0)
 302 #endif
 303
 304 #if !defined add_ssaaaa
 305 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 306   do {                                                                  \
 307     uintmax_t _add_x;                                                   \
 308     _add_x = (al) + (bl);                                               \
 309     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 310     (sl) = _add_x;                                                      \
 311   } while (0)
 312 #endif
 313
 314 #define rsh2(rh, rl, ah, al, cnt)                                       \
 315   do {                                                                  \
 316     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 317     (rh) = (ah) >> (cnt);                                               \
 318   } while (0)
 319
 320 #define lsh2(rh, rl, ah, al, cnt)                                       \
 321   do {                                                                  \
 322     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 323     (rl) = (al) << (cnt);                                               \
 324   } while (0)
 325
 326 #define ge2(ah, al, bh, bl)                                             \
 327   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 328
 329 #define gt2(ah, al, bh, bl)                                             \
 330   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 331
 332 #ifndef sub_ddmmss
 333 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 334   do {                                                                  \
 335     uintmax_t _cy;                                                      \
 336     _cy = (al) < (bl);                                                  \
 337     (rl) = (al) - (bl);                                                 \
 338     (rh) = (ah) - (bh) - _cy;                                           \
 339   } while (0)
 340 #endif
 341
 342 #ifndef count_leading_zeros
 343 # define count_leading_zeros(count, x) do {                             \
 344     uintmax_t __clz_x = (x);                                            \
 345     unsigned int __clz_c;                                               \
 346     for (__clz_c = 0;                                                   \
 347          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 348          __clz_c += 8)                                                  \
 349       __clz_x <<= 8;                                                    \
 350     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 351       __clz_x <<= 1;                                                    \
 352     (count) = __clz_c;                                                  \
 353   } while (0)
 354 #endif
 355
 356 #ifndef count_trailing_zeros
 357 # define count_trailing_zeros(count, x) do {                            \
 358     uintmax_t __ctz_x = (x);                                            \
 359     unsigned int __ctz_c = 0;                                           \
 360     while ((__ctz_x & 1) == 0)                                          \
 361       {                                                                 \
 362         __ctz_x >>= 1;                                                  \
 363         __ctz_c++;                                                      \
 364       }                                                                 \
 365     (count) = __ctz_c;                                                  \
 366   } while (0)
 367 #endif
 368
 369 /* Requires that a < n and b <= n */
 370 #define submod(r,a,b,n)                                                 \
 371   do {                                                                  \
 372     uintmax_t _t = - (uintmax_t) (a < b);                               \
 373     (r) = ((n) & _t) + (a) - (b);                                       \
 374   } while (0)
 375
 376 #define addmod(r,a,b,n)                                                 \
 377   submod ((r), (a), ((n) - (b)), (n))
 378
 379 /* Modular two-word addition and subtraction.  For performance reasons, the
 380    most significant bit of n1 must be clear.  The destination variables must be
 381    distinct from the mod operand.  */
 382 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 383   do {                                                                  \
 384     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 385     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 386       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 387   } while (0)
 388 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 389   do {                                                                  \
 390     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 391     if ((intmax_t) (r1) < 0)                                            \
 392       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 393   } while (0)
 394
 395 #define HIGHBIT_TO_MASK(x)                                              \
 396   (((intmax_t)-1 >> 1) < 0                                              \
 397    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 398    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 399       ? UINTMAX_MAX : (uintmax_t) 0))
 400
 401 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 402    Requires that d1 != 0.  */
 403 static uintmax_t
 404 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 405 {
 406   int cntd, cnta;
 407
 408   assert (d1 != 0);
 409
 410   if (a1 == 0)
 411     {
 412       *r1 = 0;
 413       return a0;
 414     }
 415
 416   count_leading_zeros (cntd, d1);
 417   count_leading_zeros (cnta, a1);
 418   int cnt = cntd - cnta;
 419   lsh2 (d1, d0, d1, d0, cnt);
 420   for (int i = 0; i < cnt; i++)
 421     {
 422       if (ge2 (a1, a0, d1, d0))
 423         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 424       rsh2 (d1, d0, d1, d0, 1);
 425     }
 426
 427   *r1 = a1;
 428   return a0;
 429 }
 430
 431 static uintmax_t _GL_ATTRIBUTE_CONST
 432 gcd_odd (uintmax_t a, uintmax_t b)
 433 {
 434   if ( (b & 1) == 0)
 435     {
 436       uintmax_t t = b;
 437       b = a;
 438       a = t;
 439     }
 440   if (a == 0)
 441     return b;
 442
 443   /* Take out least significant one bit, to make room for sign */
 444   b >>= 1;
 445
 446   for (;;)
 447     {
 448       uintmax_t t;
 449       uintmax_t bgta;
 450
 451       while ((a & 1) == 0)
 452         a >>= 1;
 453       a >>= 1;
 454
 455       t = a - b;
 456       if (t == 0)
 457         return (a << 1) + 1;
 458
 459       bgta = HIGHBIT_TO_MASK (t);
 460
 461       /* b <-- min (a, b) */
 462       b += (bgta & t);
 463
 464       /* a <-- |a - b| */
 465       a = (t ^ bgta) - bgta;
 466     }
 467 }
 468
 469 static uintmax_t
 470 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 471 {
 472   while ((a0 & 1) == 0)
 473     rsh2 (a1, a0, a1, a0, 1);
 474   while ((b0 & 1) == 0)
 475     rsh2 (b1, b0, b1, b0, 1);
 476
 477   for (;;)
 478     {
 479       if ((b1 | a1) == 0)
 480         {
 481           *r1 = 0;
 482           return gcd_odd (b0, a0);
 483         }
 484
 485       if (gt2 (a1, a0, b1, b0))
 486         {
 487           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 488           do
 489             rsh2 (a1, a0, a1, a0, 1);
 490           while ((a0 & 1) == 0);
 491         }
 492       else if (gt2 (b1, b0, a1, a0))
 493         {
 494           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 495           do
 496             rsh2 (b1, b0, b1, b0, 1);
 497           while ((b0 & 1) == 0);
 498         }
 499       else
 500         break;
 501     }
 502
 503   *r1 = a1;
 504   return a0;
 505 }
 506
 507 static void
 508 factor_insert_multiplicity (struct factors *factors,
 509                             uintmax_t prime, unsigned int m)
 510 {
 511   unsigned int nfactors = factors->nfactors;
 512   uintmax_t *p = factors->p;
 513   unsigned char *e = factors->e;
 514
 515   /* Locate position for insert new or increment e.  */
 516   int i;
 517   for (i = nfactors - 1; i >= 0; i--)
 518     {
 519       if (p[i] <= prime)
 520         break;
 521     }
 522
 523   if (i < 0 || p[i] != prime)
 524     {
 525       for (int j = nfactors - 1; j > i; j--)
 526         {
 527           p[j + 1] = p[j];
 528           e[j + 1] = e[j];
 529         }
 530       p[i + 1] = prime;
 531       e[i + 1] = m;
 532       factors->nfactors = nfactors + 1;
 533     }
 534   else
 535     {
 536       e[i] += m;
 537     }
 538 }
 539
 540 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 541
 542 static void
 543 factor_insert_large (struct factors *factors,
 544                      uintmax_t p1, uintmax_t p0)
 545 {
 546   if (p1 > 0)
 547     {
 548       assert (factors->plarge[1] == 0);
 549       factors->plarge[0] = p0;
 550       factors->plarge[1] = p1;
 551     }
 552   else
 553     factor_insert (factors, p0);
 554 }
 555
 556 #if HAVE_GMP
 557
 558 # if !HAVE_DECL_MPZ_INITS
 559
 560 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 561 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 562
 563 static void
 564 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 565 {
 566   va_list ap;
 567
 568   va_start (ap, mpz_single_init);
 569
 570   mpz_t *mpz;
 571   while ((mpz = va_arg (ap, mpz_t *)))
 572     mpz_single_init (*mpz);
 573
 574   va_end (ap);
 575 }
 576 # endif
 577
 578 static void mp_factor (mpz_t, struct mp_factors *);
 579
 580 static void
 581 mp_factor_init (struct mp_factors *factors)
 582 {
 583   factors->p = NULL;
 584   factors->e = NULL;
 585   factors->nfactors = 0;
 586 }
 587
 588 static void
 589 mp_factor_clear (struct mp_factors *factors)
 590 {
 591   for (unsigned int i = 0; i < factors->nfactors; i++)
 592     mpz_clear (factors->p[i]);
 593
 594   free (factors->p);
 595   free (factors->e);
 596 }
 597
 598 static void
 599 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 600 {
 601   unsigned long int nfactors = factors->nfactors;
 602   mpz_t         *p  = factors->p;
 603   unsigned long int *e  = factors->e;
 604   long i;
 605
 606   /* Locate position for insert new or increment e.  */
 607   for (i = nfactors - 1; i >= 0; i--)
 608     {
 609       if (mpz_cmp (p[i], prime) <= 0)
 610         break;
 611     }
 612
 613   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 614     {
 615       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 616       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 617
 618       mpz_init (p[nfactors]);
 619       for (long j = nfactors - 1; j > i; j--)
 620         {
 621           mpz_set (p[j + 1], p[j]);
 622           e[j + 1] = e[j];
 623         }
 624       mpz_set (p[i + 1], prime);
 625       e[i + 1] = 1;
 626
 627       factors->p = p;
 628       factors->e = e;
 629       factors->nfactors = nfactors + 1;
 630     }
 631   else
 632     {
 633       e[i] += 1;
 634     }
 635 }
 636
 637 static void
 638 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 639 {
 640   mpz_t pz;
 641
 642   mpz_init_set_ui (pz, prime);
 643   mp_factor_insert (factors, pz);
 644   mpz_clear (pz);
 645 }
 646 #endif /* HAVE_GMP */
 647
 648
 649 /* Number of bits in an uintmax_t.  */
 650 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 651
 652 /* Verify that uintmax_t does not have holes in its representation.  */
 653 verify (UINTMAX_MAX >> (W - 1) == 1);
 654
 655 #define P(a,b,c,d) a,
 656 static const unsigned char primes_diff[] = {
 657 #include "primes.h"
 658 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 659 };
 660 #undef P
 661
 662 #define PRIMES_PTAB_ENTRIES \
 663   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 664
 665 #define P(a,b,c,d) b,
 666 static const unsigned char primes_diff8[] = {
 667 #include "primes.h"
 668 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 669 };
 670 #undef P
 671
 672 struct primes_dtab
 673 {
 674   uintmax_t binv, lim;
 675 };
 676
 677 #define P(a,b,c,d) {c,d},
 678 static const struct primes_dtab primes_dtab[] = {
 679 #include "primes.h"
 680 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 681 };
 682 #undef P
 683
 684 /* Verify that uintmax_t is not wider than
 685    the integers used to generate primes.h.  */
 686 verify (W <= WIDE_UINT_BITS);
 687
 688 /* debugging for developers.  Enables devmsg().
 689    This flag is used only in the GMP code.  */
 690 static bool dev_debug = false;
 691
 692 /* Like error(0, 0, ...), but without an implicit newline.
 693    Also a noop unless the global DEV_DEBUG is set.
 694    TODO: Replace with variadic macro in system.h or
 695    move to a separate module.  */
 696 static inline void
 697 devmsg (char const *fmt, ...)
 698 {
 699   if (dev_debug)
 700     {
 701       va_list ap;
 702       va_start (ap, fmt);
 703       vfprintf (stderr, fmt, ap);
 704       va_end (ap);
 705     }
 706 }
 707
 708 /* Prove primality or run probabilistic tests.  */
 709 static bool flag_prove_primality = true;
 710
 711 /* Number of Miller-Rabin tests to run when not proving primality. */
 712 #define MR_REPS 25
 713
 714 #ifdef __GNUC__
 715 # define LIKELY(cond)    __builtin_expect ((cond), 1)
 716 # define UNLIKELY(cond)  __builtin_expect ((cond), 0)
 717 #else
 718 # define LIKELY(cond)    (cond)
 719 # define UNLIKELY(cond)  (cond)
 720 #endif
 721
 722 static void
 723 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 724                       unsigned int off)
 725 {
 726   for (unsigned int j = 0; j < off; j++)
 727     p += primes_diff[i + j];
 728   factor_insert (factors, p);
 729 }
 730
 731 /* Trial division with odd primes uses the following trick.
 732
 733    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 734    consider the case t < B (this is the second loop below).
 735
 736    From our tables we get
 737
 738      binv = p^{-1} (mod B)
 739      lim = floor ( (B-1) / p ).
 740
 741    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 742    (and all quotients in this range occur for some t).
 743
 744    Then t = q * p is true also (mod B), and p is invertible we get
 745
 746      q = t * binv (mod B).
 747
 748    Next, assume that t is *not* divisible by p. Since multiplication
 749    by binv (mod B) is a one-to-one mapping,
 750
 751      t * binv (mod B) > lim,
 752
 753    because all the smaller values are already taken.
 754
 755    This can be summed up by saying that the function
 756
 757      q(t) = binv * t (mod B)
 758
 759    is a permutation of the range 0 <= t < B, with the curious property
 760    that it maps the multiples of p onto the range 0 <= q <= lim, in
 761    order, and the non-multiples of p onto the range lim < q < B.
 762  */
 763
 764 static uintmax_t
 765 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 766                        struct factors *factors)
 767 {
 768   if (t0 % 2 == 0)
 769     {
 770       unsigned int cnt;
 771
 772       if (t0 == 0)
 773         {
 774           count_trailing_zeros (cnt, t1);
 775           t0 = t1 >> cnt;
 776           t1 = 0;
 777           cnt += W_TYPE_SIZE;
 778         }
 779       else
 780         {
 781           count_trailing_zeros (cnt, t0);
 782           rsh2 (t1, t0, t1, t0, cnt);
 783         }
 784
 785       factor_insert_multiplicity (factors, 2, cnt);
 786     }
 787
 788   uintmax_t p = 3;
 789   unsigned int i;
 790   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 791     {
 792       for (;;)
 793         {
 794           uintmax_t q1, q0, hi, lo _GL_UNUSED;
 795
 796           q0 = t0 * primes_dtab[i].binv;
 797           umul_ppmm (hi, lo, q0, p);
 798           if (hi > t1)
 799             break;
 800           hi = t1 - hi;
 801           q1 = hi * primes_dtab[i].binv;
 802           if (LIKELY (q1 > primes_dtab[i].lim))
 803             break;
 804           t1 = q1; t0 = q0;
 805           factor_insert (factors, p);
 806         }
 807       p += primes_diff[i + 1];
 808     }
 809   if (t1p)
 810     *t1p = t1;
 811
 812 #define DIVBLOCK(I)                                                     \
 813   do {                                                                  \
 814     for (;;)                                                            \
 815       {                                                                 \
 816         q = t0 * pd[I].binv;                                            \
 817         if (LIKELY (q > pd[I].lim))                                     \
 818           break;                                                        \
 819         t0 = q;                                                         \
 820         factor_insert_refind (factors, p, i + 1, I);                    \
 821       }                                                                 \
 822   } while (0)
 823
 824   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 825     {
 826       uintmax_t q;
 827       const struct primes_dtab *pd = &primes_dtab[i];
 828       DIVBLOCK (0);
 829       DIVBLOCK (1);
 830       DIVBLOCK (2);
 831       DIVBLOCK (3);
 832       DIVBLOCK (4);
 833       DIVBLOCK (5);
 834       DIVBLOCK (6);
 835       DIVBLOCK (7);
 836
 837       p += primes_diff8[i];
 838       if (p * p > t0)
 839         break;
 840     }
 841
 842   return t0;
 843 }
 844
 845 #if HAVE_GMP
 846 static void
 847 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 848 {
 849   mpz_t q;
 850   unsigned long int p;
 851
 852   devmsg ("[trial division] ");
 853
 854   mpz_init (q);
 855
 856   p = mpz_scan1 (t, 0);
 857   mpz_div_2exp (t, t, p);
 858   while (p)
 859     {
 860       mp_factor_insert_ui (factors, 2);
 861       --p;
 862     }
 863
 864   p = 3;
 865   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 866     {
 867       if (! mpz_divisible_ui_p (t, p))
 868         {
 869           p += primes_diff[i++];
 870           if (mpz_cmp_ui (t, p * p) < 0)
 871             break;
 872         }
 873       else
 874         {
 875           mpz_tdiv_q_ui (t, t, p);
 876           mp_factor_insert_ui (factors, p);
 877         }
 878     }
 879
 880   mpz_clear (q);
 881 }
 882 #endif
 883
 884 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 885 static const unsigned char  binvert_table[128] =
 886 {
 887   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 888   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 889   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 890   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 891   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 892   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 893   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 894   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 895   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 896   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 897   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 898   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 899   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 900   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 901   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 902   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 903 };
 904
 905 /* Compute n^(-1) mod B, using a Newton iteration.  */
 906 #define binv(inv,n)                                                     \
 907   do {                                                                  \
 908     uintmax_t  __n = (n);                                               \
 909     uintmax_t  __inv;                                                   \
 910                                                                         \
 911     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 912     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 913     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 914     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 915                                                                         \
 916     if (W_TYPE_SIZE > 64)                                               \
 917       {                                                                 \
 918         int  __invbits = 64;                                            \
 919         do {                                                            \
 920           __inv = 2 * __inv - __inv * __inv * __n;                      \
 921           __invbits *= 2;                                               \
 922         } while (__invbits < W_TYPE_SIZE);                              \
 923       }                                                                 \
 924                                                                         \
 925     (inv) = __inv;                                                      \
 926   } while (0)
 927
 928 /* q = u / d, assuming d|u.  */
 929 #define divexact_21(q1, q0, u1, u0, d)                                  \
 930   do {                                                                  \
 931     uintmax_t _di, _q0;                                                 \
 932     binv (_di, (d));                                                    \
 933     _q0 = (u0) * _di;                                                   \
 934     if ((u1) >= (d))                                                    \
 935       {                                                                 \
 936         uintmax_t _p1, _p0 _GL_UNUSED;                            \
 937         umul_ppmm (_p1, _p0, _q0, d);                                   \
 938         (q1) = ((u1) - _p1) * _di;                                      \
 939         (q0) = _q0;                                                     \
 940       }                                                                 \
 941     else                                                                \
 942       {                                                                 \
 943         (q0) = _q0;                                                     \
 944         (q1) = 0;                                                       \
 945       }                                                                 \
 946   } while (0)
 947
 948 /* x B (mod n). */
 949 #define redcify(r_prim, r, n)                                           \
 950   do {                                                                  \
 951     uintmax_t _redcify_q _GL_UNUSED;                              \
 952     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 953   } while (0)
 954
 955 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 956 #define redcify2(r1, r0, x, n1, n0)                                     \
 957   do {                                                                  \
 958     uintmax_t _r1, _r0, _i;                                             \
 959     if ((x) < (n1))                                                     \
 960       {                                                                 \
 961         _r1 = (x); _r0 = 0;                                             \
 962         _i = W_TYPE_SIZE;                                               \
 963       }                                                                 \
 964     else                                                                \
 965       {                                                                 \
 966         _r1 = 0; _r0 = (x);                                             \
 967         _i = 2*W_TYPE_SIZE;                                             \
 968       }                                                                 \
 969     while (_i-- > 0)                                                    \
 970       {                                                                 \
 971         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 972         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 973           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 974       }                                                                 \
 975     (r1) = _r1;                                                         \
 976     (r0) = _r0;                                                         \
 977   } while (0)
 978
 979 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 980    Both a and b must be in redc form, the result will be in redc form too. */
 981 static inline uintmax_t
 982 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 983 {
 984   uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
 985
 986   umul_ppmm (rh, rl, a, b);
 987   q = rl * mi;
 988   umul_ppmm (th, tl, q, m);
 989   xh = rh - th;
 990   if (rh < th)
 991     xh += m;
 992
 993   return xh;
 994 }
 995
 996 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 997    Both a and b must be in redc form, the result will be in redc form too.
 998    For performance reasons, the most significant bit of m must be clear. */
 999 static uintmax_t
1000 mulredc2 (uintmax_t *r1p,
1001           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
1002           uintmax_t m1, uintmax_t m0, uintmax_t mi)
1003 {
1004   uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
1005   mi = -mi;
1006   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
1007   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
1008   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
1009
1010   /* First compute a0 * <b1, b0> B^{-1}
1011         +-----+
1012         |a0 b0|
1013      +--+--+--+
1014      |a0 b1|
1015      +--+--+--+
1016         |q0 m0|
1017      +--+--+--+
1018      |q0 m1|
1019     -+--+--+--+
1020      |r1|r0| 0|
1021      +--+--+--+
1022   */
1023   umul_ppmm (t1, t0, a0, b0);
1024   umul_ppmm (r1, r0, a0, b1);
1025   q = mi * t0;
1026   umul_ppmm (p1, p0, q, m0);
1027   umul_ppmm (s1, s0, q, m1);
1028   r0 += (t0 != 0); /* Carry */
1029   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1030   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1031   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1032
1033   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1034         +-----+
1035         |a1 b0|
1036         +--+--+
1037         |r1|r0|
1038      +--+--+--+
1039      |a1 b1|
1040      +--+--+--+
1041         |q1 m0|
1042      +--+--+--+
1043      |q1 m1|
1044     -+--+--+--+
1045      |r1|r0| 0|
1046      +--+--+--+
1047   */
1048   umul_ppmm (t1, t0, a1, b0);
1049   umul_ppmm (s1, s0, a1, b1);
1050   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1051   q = mi * t0;
1052   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1053   umul_ppmm (p1, p0, q, m0);
1054   umul_ppmm (s1, s0, q, m1);
1055   r0 += (t0 != 0); /* Carry */
1056   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1057   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1058   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1059
1060   if (ge2 (r1, r0, m1, m0))
1061     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1062
1063   *r1p = r1;
1064   return r0;
1065 }
1066
1067 static uintmax_t _GL_ATTRIBUTE_CONST
1068 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1069 {
1070   uintmax_t y = one;
1071
1072   if (e & 1)
1073     y = b;
1074
1075   while (e != 0)
1076     {
1077       b = mulredc (b, b, n, ni);
1078       e >>= 1;
1079
1080       if (e & 1)
1081         y = mulredc (y, b, n, ni);
1082     }
1083
1084   return y;
1085 }
1086
1087 static uintmax_t
1088 powm2 (uintmax_t *r1m,
1089        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1090        uintmax_t ni, const uintmax_t *one)
1091 {
1092   uintmax_t r1, r0, b1, b0, n1, n0;
1093   unsigned int i;
1094   uintmax_t e;
1095
1096   b0 = bp[0];
1097   b1 = bp[1];
1098   n0 = np[0];
1099   n1 = np[1];
1100
1101   r0 = one[0];
1102   r1 = one[1];
1103
1104   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1105     {
1106       if (e & 1)
1107         {
1108           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1109           r1 = *r1m;
1110         }
1111       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1112       b1 = *r1m;
1113     }
1114   for (e = ep[1]; e > 0; e >>= 1)
1115     {
1116       if (e & 1)
1117         {
1118           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1119           r1 = *r1m;
1120         }
1121       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1122       b1 = *r1m;
1123     }
1124   *r1m = r1;
1125   return r0;
1126 }
1127
1128 static bool _GL_ATTRIBUTE_CONST
1129 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1130              unsigned int k, uintmax_t one)
1131 {
1132   uintmax_t y = powm (b, q, n, ni, one);
1133
1134   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1135
1136   if (y == one || y == nm1)
1137     return true;
1138
1139   for (unsigned int i = 1; i < k; i++)
1140     {
1141       y = mulredc (y, y, n, ni);
1142
1143       if (y == nm1)
1144         return true;
1145       if (y == one)
1146         return false;
1147     }
1148   return false;
1149 }
1150
1151 static bool
1152 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1153               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1154 {
1155   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1156
1157   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1158   y1 = r1m;
1159
1160   if (y0 == one[0] && y1 == one[1])
1161     return true;
1162
1163   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1164
1165   if (y0 == nm1_0 && y1 == nm1_1)
1166     return true;
1167
1168   for (unsigned int i = 1; i < k; i++)
1169     {
1170       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1171       y1 = r1m;
1172
1173       if (y0 == nm1_0 && y1 == nm1_1)
1174         return true;
1175       if (y0 == one[0] && y1 == one[1])
1176         return false;
1177     }
1178   return false;
1179 }
1180
1181 #if HAVE_GMP
1182 static bool
1183 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1184                 mpz_srcptr q, unsigned long int k)
1185 {
1186   mpz_powm (y, x, q, n);
1187
1188   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1189     return true;
1190
1191   for (unsigned long int i = 1; i < k; i++)
1192     {
1193       mpz_powm_ui (y, y, 2, n);
1194       if (mpz_cmp (y, nm1) == 0)
1195         return true;
1196       if (mpz_cmp_ui (y, 1) == 0)
1197         return false;
1198     }
1199   return false;
1200 }
1201 #endif
1202
1203 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1204    have been observed.  The average seem to be about 2.  */
1205 static bool
1206 prime_p (uintmax_t n)
1207 {
1208   int k;
1209   bool is_prime;
1210   uintmax_t a_prim, one, ni;
1211   struct factors factors;
1212
1213   if (n <= 1)
1214     return false;
1215
1216   /* We have already casted out small primes. */
1217   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1218     return true;
1219
1220   /* Precomputation for Miller-Rabin.  */
1221   uintmax_t q = n - 1;
1222   for (k = 0; (q & 1) == 0; k++)
1223     q >>= 1;
1224
1225   uintmax_t a = 2;
1226   binv (ni, n);                 /* ni <- 1/n mod B */
1227   redcify (one, 1, n);
1228   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1229
1230   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1231   if (!millerrabin (n, ni, a_prim, q, k, one))
1232     return false;
1233
1234   if (flag_prove_primality)
1235     {
1236       /* Factor n-1 for Lucas.  */
1237       factor (0, n - 1, &factors);
1238     }
1239
1240   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1241      number composite.  */
1242   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1243     {
1244       if (flag_prove_primality)
1245         {
1246           is_prime = true;
1247           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1248             {
1249               is_prime
1250                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1251             }
1252         }
1253       else
1254         {
1255           /* After enough Miller-Rabin runs, be content. */
1256           is_prime = (r == MR_REPS - 1);
1257         }
1258
1259       if (is_prime)
1260         return true;
1261
1262       a += primes_diff[r];      /* Establish new base.  */
1263
1264       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1265          on most processors, since it avoids udiv_qrnnd.  If we go down the
1266          udiv_qrnnd_preinv path, this code should be replaced.  */
1267       {
1268         uintmax_t s1, s0;
1269         umul_ppmm (s1, s0, one, a);
1270         if (LIKELY (s1 == 0))
1271           a_prim = s0 % n;
1272         else
1273           {
1274             uintmax_t dummy _GL_UNUSED;
1275             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1276           }
1277       }
1278
1279       if (!millerrabin (n, ni, a_prim, q, k, one))
1280         return false;
1281     }
1282
1283   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1284   abort ();
1285 }
1286
1287 static bool
1288 prime2_p (uintmax_t n1, uintmax_t n0)
1289 {
1290   uintmax_t q[2], nm1[2];
1291   uintmax_t a_prim[2];
1292   uintmax_t one[2];
1293   uintmax_t na[2];
1294   uintmax_t ni;
1295   unsigned int k;
1296   struct factors factors;
1297
1298   if (n1 == 0)
1299     return prime_p (n0);
1300
1301   nm1[1] = n1 - (n0 == 0);
1302   nm1[0] = n0 - 1;
1303   if (nm1[0] == 0)
1304     {
1305       count_trailing_zeros (k, nm1[1]);
1306
1307       q[0] = nm1[1] >> k;
1308       q[1] = 0;
1309       k += W_TYPE_SIZE;
1310     }
1311   else
1312     {
1313       count_trailing_zeros (k, nm1[0]);
1314       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1315     }
1316
1317   uintmax_t a = 2;
1318   binv (ni, n0);
1319   redcify2 (one[1], one[0], 1, n1, n0);
1320   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1321
1322   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1323   na[0] = n0;
1324   na[1] = n1;
1325
1326   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1327     return false;
1328
1329   if (flag_prove_primality)
1330     {
1331       /* Factor n-1 for Lucas.  */
1332       factor (nm1[1], nm1[0], &factors);
1333     }
1334
1335   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1336      number composite.  */
1337   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1338     {
1339       bool is_prime;
1340       uintmax_t e[2], y[2];
1341
1342       if (flag_prove_primality)
1343         {
1344           is_prime = true;
1345           if (factors.plarge[1])
1346             {
1347               uintmax_t pi;
1348               binv (pi, factors.plarge[0]);
1349               e[0] = pi * nm1[0];
1350               e[1] = 0;
1351               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1352               is_prime = (y[0] != one[0] || y[1] != one[1]);
1353             }
1354           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1355             {
1356               /* FIXME: We always have the factor 2. Do we really need to
1357                  handle it here? We have done the same powering as part
1358                  of millerrabin. */
1359               if (factors.p[i] == 2)
1360                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1361               else
1362                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1363               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1364               is_prime = (y[0] != one[0] || y[1] != one[1]);
1365             }
1366         }
1367       else
1368         {
1369           /* After enough Miller-Rabin runs, be content. */
1370           is_prime = (r == MR_REPS - 1);
1371         }
1372
1373       if (is_prime)
1374         return true;
1375
1376       a += primes_diff[r];      /* Establish new base.  */
1377       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1378
1379       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1380         return false;
1381     }
1382
1383   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1384   abort ();
1385 }
1386
1387 #if HAVE_GMP
1388 static bool
1389 mp_prime_p (mpz_t n)
1390 {
1391   bool is_prime;
1392   mpz_t q, a, nm1, tmp;
1393   struct mp_factors factors;
1394
1395   if (mpz_cmp_ui (n, 1) <= 0)
1396     return false;
1397
1398   /* We have already casted out small primes. */
1399   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1400     return true;
1401
1402   mpz_inits (q, a, nm1, tmp, NULL);
1403
1404   /* Precomputation for Miller-Rabin.  */
1405   mpz_sub_ui (nm1, n, 1);
1406
1407   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1408   unsigned long int k = mpz_scan1 (nm1, 0);
1409   mpz_tdiv_q_2exp (q, nm1, k);
1410
1411   mpz_set_ui (a, 2);
1412
1413   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1414   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1415     {
1416       is_prime = false;
1417       goto ret2;
1418     }
1419
1420   if (flag_prove_primality)
1421     {
1422       /* Factor n-1 for Lucas.  */
1423       mpz_set (tmp, nm1);
1424       mp_factor (tmp, &factors);
1425     }
1426
1427   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1428      number composite.  */
1429   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1430     {
1431       if (flag_prove_primality)
1432         {
1433           is_prime = true;
1434           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1435             {
1436               mpz_divexact (tmp, nm1, factors.p[i]);
1437               mpz_powm (tmp, a, tmp, n);
1438               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1439             }
1440         }
1441       else
1442         {
1443           /* After enough Miller-Rabin runs, be content. */
1444           is_prime = (r == MR_REPS - 1);
1445         }
1446
1447       if (is_prime)
1448         goto ret1;
1449
1450       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1451
1452       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1453         {
1454           is_prime = false;
1455           goto ret1;
1456         }
1457     }
1458
1459   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1460   abort ();
1461
1462  ret1:
1463   if (flag_prove_primality)
1464     mp_factor_clear (&factors);
1465  ret2:
1466   mpz_clears (q, a, nm1, tmp, NULL);
1467
1468   return is_prime;
1469 }
1470 #endif
1471
1472 static void
1473 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1474                           struct factors *factors)
1475 {
1476   uintmax_t x, z, y, P, t, ni, g;
1477
1478   unsigned long int k = 1;
1479   unsigned long int l = 1;
1480
1481   redcify (P, 1, n);
1482   addmod (x, P, P, n);          /* i.e., redcify(2) */
1483   y = z = x;
1484
1485   while (n != 1)
1486     {
1487       assert (a < n);
1488
1489       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1490
1491       for (;;)
1492         {
1493           do
1494             {
1495               x = mulredc (x, x, n, ni);
1496               addmod (x, x, a, n);
1497
1498               submod (t, z, x, n);
1499               P = mulredc (P, t, n, ni);
1500
1501               if (k % 32 == 1)
1502                 {
1503                   if (gcd_odd (P, n) != 1)
1504                     goto factor_found;
1505                   y = x;
1506                 }
1507             }
1508           while (--k != 0);
1509
1510           z = x;
1511           k = l;
1512           l = 2 * l;
1513           for (unsigned long int i = 0; i < k; i++)
1514             {
1515               x = mulredc (x, x, n, ni);
1516               addmod (x, x, a, n);
1517             }
1518           y = x;
1519         }
1520
1521     factor_found:
1522       do
1523         {
1524           y = mulredc (y, y, n, ni);
1525           addmod (y, y, a, n);
1526
1527           submod (t, z, y, n);
1528           g = gcd_odd (t, n);
1529         }
1530       while (g == 1);
1531
1532       n = n / g;
1533
1534       if (!prime_p (g))
1535         factor_using_pollard_rho (g, a + 1, factors);
1536       else
1537         factor_insert (factors, g);
1538
1539       if (prime_p (n))
1540         {
1541           factor_insert (factors, n);
1542           break;
1543         }
1544
1545       x = x % n;
1546       z = z % n;
1547       y = y % n;
1548     }
1549 }
1550
1551 static void
1552 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1553                            struct factors *factors)
1554 {
1555   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1556
1557   unsigned long int k = 1;
1558   unsigned long int l = 1;
1559
1560   redcify2 (P1, P0, 1, n1, n0);
1561   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1562   y1 = z1 = x1;
1563   y0 = z0 = x0;
1564
1565   while (n1 != 0 || n0 != 1)
1566     {
1567       binv (ni, n0);
1568
1569       for (;;)
1570         {
1571           do
1572             {
1573               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1574               x1 = r1m;
1575               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1576
1577               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1578               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1579               P1 = r1m;
1580
1581               if (k % 32 == 1)
1582                 {
1583                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1584                   if (g1 != 0 || g0 != 1)
1585                     goto factor_found;
1586                   y1 = x1; y0 = x0;
1587                 }
1588             }
1589           while (--k != 0);
1590
1591           z1 = x1; z0 = x0;
1592           k = l;
1593           l = 2 * l;
1594           for (unsigned long int i = 0; i < k; i++)
1595             {
1596               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1597               x1 = r1m;
1598               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1599             }
1600           y1 = x1; y0 = x0;
1601         }
1602
1603     factor_found:
1604       do
1605         {
1606           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1607           y1 = r1m;
1608           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1609
1610           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1611           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1612         }
1613       while (g1 == 0 && g0 == 1);
1614
1615       if (g1 == 0)
1616         {
1617           /* The found factor is one word. */
1618           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1619
1620           if (!prime_p (g0))
1621             factor_using_pollard_rho (g0, a + 1, factors);
1622           else
1623             factor_insert (factors, g0);
1624         }
1625       else
1626         {
1627           /* The found factor is two words.  This is highly unlikely, thus hard
1628              to trigger.  Please be careful before you change this code!  */
1629           uintmax_t ginv;
1630
1631           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1632           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1633           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1634
1635           if (!prime2_p (g1, g0))
1636             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1637           else
1638             factor_insert_large (factors, g1, g0);
1639         }
1640
1641       if (n1 == 0)
1642         {
1643           if (prime_p (n0))
1644             {
1645               factor_insert (factors, n0);
1646               break;
1647             }
1648
1649           factor_using_pollard_rho (n0, a, factors);
1650           return;
1651         }
1652
1653       if (prime2_p (n1, n0))
1654         {
1655           factor_insert_large (factors, n1, n0);
1656           break;
1657         }
1658
1659       x0 = mod2 (&x1, x1, x0, n1, n0);
1660       z0 = mod2 (&z1, z1, z0, n1, n0);
1661       y0 = mod2 (&y1, y1, y0, n1, n0);
1662     }
1663 }
1664
1665 #if HAVE_GMP
1666 static void
1667 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1668                              struct mp_factors *factors)
1669 {
1670   mpz_t x, z, y, P;
1671   mpz_t t, t2;
1672
1673   devmsg ("[pollard-rho (%lu)] ", a);
1674
1675   mpz_inits (t, t2, NULL);
1676   mpz_init_set_si (y, 2);
1677   mpz_init_set_si (x, 2);
1678   mpz_init_set_si (z, 2);
1679   mpz_init_set_ui (P, 1);
1680
1681   unsigned long long int k = 1;
1682   unsigned long long int l = 1;
1683
1684   while (mpz_cmp_ui (n, 1) != 0)
1685     {
1686       for (;;)
1687         {
1688           do
1689             {
1690               mpz_mul (t, x, x);
1691               mpz_mod (x, t, n);
1692               mpz_add_ui (x, x, a);
1693
1694               mpz_sub (t, z, x);
1695               mpz_mul (t2, P, t);
1696               mpz_mod (P, t2, n);
1697
1698               if (k % 32 == 1)
1699                 {
1700                   mpz_gcd (t, P, n);
1701                   if (mpz_cmp_ui (t, 1) != 0)
1702                     goto factor_found;
1703                   mpz_set (y, x);
1704                 }
1705             }
1706           while (--k != 0);
1707
1708           mpz_set (z, x);
1709           k = l;
1710           l = 2 * l;
1711           for (unsigned long long int i = 0; i < k; i++)
1712             {
1713               mpz_mul (t, x, x);
1714               mpz_mod (x, t, n);
1715               mpz_add_ui (x, x, a);
1716             }
1717           mpz_set (y, x);
1718         }
1719
1720     factor_found:
1721       do
1722         {
1723           mpz_mul (t, y, y);
1724           mpz_mod (y, t, n);
1725           mpz_add_ui (y, y, a);
1726
1727           mpz_sub (t, z, y);
1728           mpz_gcd (t, t, n);
1729         }
1730       while (mpz_cmp_ui (t, 1) == 0);
1731
1732       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1733
1734       if (!mp_prime_p (t))
1735         {
1736           devmsg ("[composite factor--restarting pollard-rho] ");
1737           mp_factor_using_pollard_rho (t, a + 1, factors);
1738         }
1739       else
1740         {
1741           mp_factor_insert (factors, t);
1742         }
1743
1744       if (mp_prime_p (n))
1745         {
1746           mp_factor_insert (factors, n);
1747           break;
1748         }
1749
1750       mpz_mod (x, x, n);
1751       mpz_mod (z, z, n);
1752       mpz_mod (y, y, n);
1753     }
1754
1755   mpz_clears (P, t2, t, z, x, y, NULL);
1756 }
1757 #endif
1758
1759 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1760    algorithm is replaced, consider also returning the remainder. */
1761 static uintmax_t _GL_ATTRIBUTE_CONST
1762 isqrt (uintmax_t n)
1763 {
1764   uintmax_t x;
1765   unsigned c;
1766   if (n == 0)
1767     return 0;
1768
1769   count_leading_zeros (c, n);
1770
1771   /* Make x > sqrt(n). This will be invariant through the loop. */
1772   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1773
1774   for (;;)
1775     {
1776       uintmax_t y = (x + n/x) / 2;
1777       if (y >= x)
1778         return x;
1779
1780       x = y;
1781     }
1782 }
1783
1784 static uintmax_t _GL_ATTRIBUTE_CONST
1785 isqrt2 (uintmax_t nh, uintmax_t nl)
1786 {
1787   unsigned int shift;
1788   uintmax_t x;
1789
1790   /* Ensures the remainder fits in an uintmax_t. */
1791   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1792
1793   if (nh == 0)
1794     return isqrt (nl);
1795
1796   count_leading_zeros (shift, nh);
1797   shift &= ~1;
1798
1799   /* Make x > sqrt(n) */
1800   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1801   x <<= (W_TYPE_SIZE - shift) / 2;
1802
1803   /* Do we need more than one iteration? */
1804   for (;;)
1805     {
1806       uintmax_t r _GL_UNUSED;
1807       uintmax_t q, y;
1808       udiv_qrnnd (q, r, nh, nl, x);
1809       y = (x + q) / 2;
1810
1811       if (y >= x)
1812         {
1813           uintmax_t hi, lo;
1814           umul_ppmm (hi, lo, x + 1, x + 1);
1815           assert (gt2 (hi, lo, nh, nl));
1816
1817           umul_ppmm (hi, lo, x, x);
1818           assert (ge2 (nh, nl, hi, lo));
1819           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1820           assert (hi == 0);
1821
1822           return x;
1823         }
1824
1825       x = y;
1826     }
1827 }
1828
1829 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1830 #define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
1831 #define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
1832 #define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
1833 #define MAGIC11 0x23b
1834
1835 /* Return the square root if the input is a square, otherwise 0. */
1836 static uintmax_t _GL_ATTRIBUTE_CONST
1837 is_square (uintmax_t x)
1838 {
1839   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1840      computing the square root. */
1841   if (((MAGIC64 >> (x & 63)) & 1)
1842       && ((MAGIC63 >> (x % 63)) & 1)
1843       /* Both 0 and 64 are squares mod (65) */
1844       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1845       && ((MAGIC11 >> (x % 11) & 1)))
1846     {
1847       uintmax_t r = isqrt (x);
1848       if (r*r == x)
1849         return r;
1850     }
1851   return 0;
1852 }
1853
1854 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1855 static const unsigned short invtab[0x81] =
1856   {
1857     0x200,
1858     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1859     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1860     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1861     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1862     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1863     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1864     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1865     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1866     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1867     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1868     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1869     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1870     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1871     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1872     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1873     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1874   };
1875
1876 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1877    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1878 #define div_smallq(q, r, u, d)                                          \
1879   do {                                                                  \
1880     if ((u) / 0x40 < (d))                                               \
1881       {                                                                 \
1882         int _cnt;                                                       \
1883         uintmax_t _dinv, _mask, _q, _r;                                 \
1884         count_leading_zeros (_cnt, (d));                                \
1885         _r = (u);                                                       \
1886         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1887           {                                                             \
1888             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1889             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1890           }                                                             \
1891         else                                                            \
1892           {                                                             \
1893             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1894             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1895           }                                                             \
1896         _r -= _q*(d);                                                   \
1897                                                                         \
1898         _mask = -(uintmax_t) (_r >= (d));                               \
1899         (r) = _r - (_mask & (d));                                       \
1900         (q) = _q - _mask;                                               \
1901         assert ( (q) * (d) + (r) == u);                                 \
1902       }                                                                 \
1903     else                                                                \
1904       {                                                                 \
1905         uintmax_t _q = (u) / (d);                                       \
1906         (r) = (u) - _q * (d);                                           \
1907         (q) = _q;                                                       \
1908       }                                                                 \
1909   } while (0)
1910
1911 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1912    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1913    with 3025 = 55^2.
1914
1915    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1916    representing G_0 = (-55, 2*4652, 8653).
1917
1918    In the notation of the paper:
1919
1920    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1921
1922    Put
1923
1924      t_0 = floor([q_0 + R_0] / S0) = 1
1925      R_1 = t_0 * S_0 - R_0 = 4001
1926      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1927 */
1928
1929 /* Multipliers, in order of efficiency:
1930    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1931    0.7317  3*5*7    =  105 = 1
1932    0.7820  3*5*11   =  165 = 1
1933    0.7872  3*5      =   15 = 3
1934    0.8101  3*7*11   =  231 = 3
1935    0.8155  3*7      =   21 = 1
1936    0.8284  5*7*11   =  385 = 1
1937    0.8339  5*7      =   35 = 3
1938    0.8716  3*11     =   33 = 1
1939    0.8774  3        =    3 = 3
1940    0.8913  5*11     =   55 = 3
1941    0.8972  5        =    5 = 1
1942    0.9233  7*11     =   77 = 1
1943    0.9295  7        =    7 = 3
1944    0.9934  11       =   11 = 3
1945 */
1946 #define QUEUE_SIZE 50
1947
1948 #if STAT_SQUFOF
1949 # define Q_FREQ_SIZE 50
1950 /* Element 0 keeps the total */
1951 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1952 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1953 #endif
1954
1955 /* Return true on success.  Expected to fail only for numbers
1956    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1957 static bool
1958 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1959 {
1960   /* Uses algorithm and notation from
1961
1962      SQUARE FORM FACTORIZATION
1963      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1964
1965      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1966    */
1967
1968   static const unsigned int multipliers_1[] =
1969     { /* = 1 (mod 4) */
1970       105, 165, 21, 385, 33, 5, 77, 1, 0
1971     };
1972   static const unsigned int multipliers_3[] =
1973     { /* = 3 (mod 4) */
1974       1155, 15, 231, 35, 3, 55, 7, 11, 0
1975     };
1976
1977   const unsigned int *m;
1978
1979   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1980
1981   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1982     return false;
1983
1984   uintmax_t sqrt_n = isqrt2 (n1, n0);
1985
1986   if (n0 == sqrt_n * sqrt_n)
1987     {
1988       uintmax_t p1, p0;
1989
1990       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1991       assert (p0 == n0);
1992
1993       if (n1 == p1)
1994         {
1995           if (prime_p (sqrt_n))
1996             factor_insert_multiplicity (factors, sqrt_n, 2);
1997           else
1998             {
1999               struct factors f;
2000
2001               f.nfactors = 0;
2002               if (!factor_using_squfof (0, sqrt_n, &f))
2003                 {
2004                   /* Try pollard rho instead */
2005                   factor_using_pollard_rho (sqrt_n, 1, &f);
2006                 }
2007               /* Duplicate the new factors */
2008               for (unsigned int i = 0; i < f.nfactors; i++)
2009                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
2010             }
2011           return true;
2012         }
2013     }
2014
2015   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2016   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2017        *m; m++)
2018     {
2019       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2020       unsigned int i;
2021       unsigned int mu = *m;
2022       unsigned int qpos = 0;
2023
2024       assert (mu * n0 % 4 == 3);
2025
2026       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2027          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2028          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2029          mu^3 < n.
2030
2031          However, this seems insufficient: With n = 37243139 and mu =
2032          105, we get a trivial factor, from the square 38809 = 197^2,
2033          without any corresponding Q earlier in the iteration.
2034
2035          Requiring 64 mu^3 < n seems sufficient. */
2036       if (n1 == 0)
2037         {
2038           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2039             continue;
2040         }
2041       else
2042         {
2043           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2044             continue;
2045         }
2046       umul_ppmm (Dh, Dl, n0, mu);
2047       Dh += n1 * mu;
2048
2049       assert (Dl % 4 != 1);
2050       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2051
2052       S = isqrt2 (Dh, Dl);
2053
2054       Q1 = 1;
2055       P = S;
2056
2057       /* Square root remainder fits in one word, so ignore high part. */
2058       Q = Dl - P*P;
2059       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2060       L = isqrt (2*S);
2061       B = 2*L;
2062       L1 = mu * 2 * L;
2063
2064       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2065          4 D. */
2066
2067       for (i = 0; i <= B; i++)
2068         {
2069           uintmax_t q, P1, t, rem;
2070
2071           div_smallq (q, rem, S+P, Q);
2072           P1 = S - rem; /* P1 = q*Q - P */
2073
2074 #if STAT_SQUFOF
2075           assert (q > 0);
2076           q_freq[0]++;
2077           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2078 #endif
2079
2080           if (Q <= L1)
2081             {
2082               uintmax_t g = Q;
2083
2084               if ( (Q & 1) == 0)
2085                 g /= 2;
2086
2087               g /= gcd_odd (g, mu);
2088
2089               if (g <= L)
2090                 {
2091                   if (qpos >= QUEUE_SIZE)
2092                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2093                   queue[qpos].Q = g;
2094                   queue[qpos].P = P % g;
2095                   qpos++;
2096                 }
2097             }
2098
2099           /* I think the difference can be either sign, but mod
2100              2^W_TYPE_SIZE arithmetic should be fine. */
2101           t = Q1 + q * (P - P1);
2102           Q1 = Q;
2103           Q = t;
2104           P = P1;
2105
2106           if ( (i & 1) == 0)
2107             {
2108               uintmax_t r = is_square (Q);
2109               if (r)
2110                 {
2111                   for (unsigned int j = 0; j < qpos; j++)
2112                     {
2113                       if (queue[j].Q == r)
2114                         {
2115                           if (r == 1)
2116                             /* Traversed entire cycle. */
2117                             goto next_multiplier;
2118
2119                           /* Need the absolute value for divisibility test. */
2120                           if (P >= queue[j].P)
2121                             t = P - queue[j].P;
2122                           else
2123                             t = queue[j].P - P;
2124                           if (t % r == 0)
2125                             {
2126                               /* Delete entries up to and including entry
2127                                  j, which matched. */
2128                               memmove (queue, queue + j + 1,
2129                                        (qpos - j - 1) * sizeof (queue[0]));
2130                               qpos -= (j + 1);
2131                             }
2132                           goto next_i;
2133                         }
2134                     }
2135
2136                   /* We have found a square form, which should give a
2137                      factor. */
2138                   Q1 = r;
2139                   assert (S >= P); /* What signs are possible? */
2140                   P += r * ((S - P) / r);
2141
2142                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2143                      for the case D = 2N. */
2144                   /* Compute Q = (D - P*P) / Q1, but we need double
2145                      precision. */
2146                   uintmax_t hi, lo;
2147                   umul_ppmm (hi, lo, P, P);
2148                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2149                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2150                   assert (rem == 0);
2151
2152                   for (;;)
2153                     {
2154                       /* Note: There appears to by a typo in the paper,
2155                          Step 4a in the algorithm description says q <--
2156                          floor([S+P]/\hat Q), but looking at the equations
2157                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2158                          (In this code, \hat Q is Q1). */
2159                       div_smallq (q, rem, S+P, Q);
2160                       P1 = S - rem;     /* P1 = q*Q - P */
2161
2162 #if STAT_SQUFOF
2163                       q_freq[0]++;
2164                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2165 #endif
2166                       if (P == P1)
2167                         break;
2168                       t = Q1 + q * (P - P1);
2169                       Q1 = Q;
2170                       Q = t;
2171                       P = P1;
2172                     }
2173
2174                   if ( (Q & 1) == 0)
2175                     Q /= 2;
2176                   Q /= gcd_odd (Q, mu);
2177
2178                   assert (Q > 1 && (n1 || Q < n0));
2179
2180                   if (prime_p (Q))
2181                     factor_insert (factors, Q);
2182                   else if (!factor_using_squfof (0, Q, factors))
2183                     factor_using_pollard_rho (Q, 2, factors);
2184
2185                   divexact_21 (n1, n0, n1, n0, Q);
2186
2187                   if (prime2_p (n1, n0))
2188                     factor_insert_large (factors, n1, n0);
2189                   else
2190                     {
2191                       if (!factor_using_squfof (n1, n0, factors))
2192                         {
2193                           if (n1 == 0)
2194                             factor_using_pollard_rho (n0, 1, factors);
2195                           else
2196                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2197                         }
2198                     }
2199
2200                   return true;
2201                 }
2202             }
2203         next_i:;
2204         }
2205     next_multiplier:;
2206     }
2207   return false;
2208 }
2209
2210 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2211    results in FACTORS.  Use the algorithm selected by the global ALG.  */
2212 static void
2213 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2214 {
2215   factors->nfactors = 0;
2216   factors->plarge[1] = 0;
2217
2218   if (t1 == 0 && t0 < 2)
2219     return;
2220
2221   t0 = factor_using_division (&t1, t1, t0, factors);
2222
2223   if (t1 == 0 && t0 < 2)
2224     return;
2225
2226   if (prime2_p (t1, t0))
2227     factor_insert_large (factors, t1, t0);
2228   else
2229     {
2230       if (alg == ALG_SQUFOF)
2231         if (factor_using_squfof (t1, t0, factors))
2232           return;
2233
2234       if (t1 == 0)
2235         factor_using_pollard_rho (t0, 1, factors);
2236       else
2237         factor_using_pollard_rho2 (t1, t0, 1, factors);
2238     }
2239 }
2240
2241 #if HAVE_GMP
2242 /* Use Pollard-rho to compute the prime factors of
2243    arbitrary-precision T, and put the results in FACTORS.  */
2244 static void
2245 mp_factor (mpz_t t, struct mp_factors *factors)
2246 {
2247   mp_factor_init (factors);
2248
2249   if (mpz_sgn (t) != 0)
2250     {
2251       mp_factor_using_division (t, factors);
2252
2253       if (mpz_cmp_ui (t, 1) != 0)
2254         {
2255           devmsg ("[is number prime?] ");
2256           if (mp_prime_p (t))
2257             mp_factor_insert (factors, t);
2258           else
2259             mp_factor_using_pollard_rho (t, 1, factors);
2260         }
2261     }
2262 }
2263 #endif
2264
2265 static strtol_error
2266 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2267 {
2268   unsigned int lo_carry;
2269   uintmax_t hi = 0, lo = 0;
2270
2271   strtol_error err = LONGINT_INVALID;
2272
2273   /* Skip initial spaces and '+'.  */
2274   for (;;)
2275     {
2276       char c = *s;
2277       if (c == ' ')
2278         s++;
2279       else if (c == '+')
2280         {
2281           s++;
2282           break;
2283         }
2284       else
2285         break;
2286     }
2287
2288   /* Initial scan for invalid digits.  */
2289   const char *p = s;
2290   for (;;)
2291     {
2292       unsigned int c = *p++;
2293       if (c == 0)
2294         break;
2295
2296       if (UNLIKELY (!ISDIGIT (c)))
2297         {
2298           err = LONGINT_INVALID;
2299           break;
2300         }
2301
2302       err = LONGINT_OK;           /* we've seen at least one valid digit */
2303     }
2304
2305   for (;err == LONGINT_OK;)
2306     {
2307       unsigned int c = *s++;
2308       if (c == 0)
2309         break;
2310
2311       c -= '0';
2312
2313       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2314         {
2315           err = LONGINT_OVERFLOW;
2316           break;
2317         }
2318       hi = 10 * hi;
2319
2320       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2321       lo_carry += 10 * lo < 2 * lo;
2322
2323       lo = 10 * lo;
2324       lo += c;
2325
2326       lo_carry += lo < c;
2327       hi += lo_carry;
2328       if (UNLIKELY (hi < lo_carry))
2329         {
2330           err = LONGINT_OVERFLOW;
2331           break;
2332         }
2333     }
2334
2335   *hip = hi;
2336   *lop = lo;
2337
2338   return err;
2339 }
2340
2341 static void
2342 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2343 {
2344   uintmax_t q, r;
2345
2346   if (t1 == 0)
2347     printf ("%"PRIuMAX, t0);
2348   else
2349     {
2350       /* Use very plain code here since it seems hard to write fast code
2351          without assuming a specific word size.  */
2352       q = t1 / 1000000000;
2353       r = t1 % 1000000000;
2354       udiv_qrnnd (t0, r, r, t0, 1000000000);
2355       print_uintmaxes (q, t0);
2356       printf ("%09u", (int) r);
2357     }
2358 }
2359
2360 /* Single-precision factoring */
2361 static void
2362 print_factors_single (uintmax_t t1, uintmax_t t0)
2363 {
2364   struct factors factors;
2365
2366   print_uintmaxes (t1, t0);
2367   putchar (':');
2368
2369   factor (t1, t0, &factors);
2370
2371   for (unsigned int j = 0; j < factors.nfactors; j++)
2372     for (unsigned int k = 0; k < factors.e[j]; k++)
2373       {
2374         char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2375         putchar (' ');
2376         fputs (umaxtostr (factors.p[j], buf), stdout);
2377       }
2378
2379   if (factors.plarge[1])
2380     {
2381       putchar (' ');
2382       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2383     }
2384   putchar ('\n');
2385 }
2386
2387 /* Emit the factors of the indicated number.  If we have the option of using
2388    either algorithm, we select on the basis of the length of the number.
2389    For longer numbers, we prefer the MP algorithm even if the native algorithm
2390    has enough digits, because the algorithm is better.  The turnover point
2391    depends on the value.  */
2392 static bool
2393 print_factors (const char *input)
2394 {
2395   uintmax_t t1, t0;
2396
2397   /* Try converting the number to one or two words.  If it fails, use GMP or
2398      print an error message.  The 2nd condition checks that the most
2399      significant bit of the two-word number is clear, in a typesize neutral
2400      way.  */
2401   strtol_error err = strto2uintmax (&t1, &t0, input);
2402
2403   switch (err)
2404     {
2405     case LONGINT_OK:
2406       if (((t1 << 1) >> 1) == t1)
2407         {
2408           devmsg ("[using single-precision arithmetic] ");
2409           print_factors_single (t1, t0);
2410           return true;
2411         }
2412       break;
2413
2414     case LONGINT_OVERFLOW:
2415       /* Try GMP.  */
2416       break;
2417
2418     default:
2419       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2420       return false;
2421     }
2422
2423 #if HAVE_GMP
2424   devmsg ("[using arbitrary-precision arithmetic] ");
2425   mpz_t t;
2426   struct mp_factors factors;
2427
2428   mpz_init_set_str (t, input, 10);
2429
2430   gmp_printf ("%Zd:", t);
2431   mp_factor (t, &factors);
2432
2433   for (unsigned int j = 0; j < factors.nfactors; j++)
2434     for (unsigned int k = 0; k < factors.e[j]; k++)
2435       gmp_printf (" %Zd", factors.p[j]);
2436
2437   mp_factor_clear (&factors);
2438   mpz_clear (t);
2439   putchar ('\n');
2440   return true;
2441 #else
2442   error (0, 0, _("%s is too large"), quote (input));
2443   return false;
2444 #endif
2445 }
2446
2447 void
2448 usage (int status)
2449 {
2450   if (status != EXIT_SUCCESS)
2451     emit_try_help ();
2452   else
2453     {
2454       printf (_("\
2455 Usage: %s [NUMBER]...\n\
2456   or:  %s OPTION\n\
2457 "),
2458               program_name, program_name);
2459       fputs (_("\
2460 Print the prime factors of each specified integer NUMBER.  If none\n\
2461 are specified on the command line, read them from standard input.\n\
2462 \n\
2463 "), stdout);
2464       fputs (HELP_OPTION_DESCRIPTION, stdout);
2465       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2466       emit_ancillary_info ();
2467     }
2468   exit (status);
2469 }
2470
2471 static bool
2472 do_stdin (void)
2473 {
2474   bool ok = true;
2475   token_buffer tokenbuffer;
2476
2477   init_tokenbuffer (&tokenbuffer);
2478
2479   while (true)
2480     {
2481       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2482                                        &tokenbuffer);
2483       if (token_length == (size_t) -1)
2484         break;
2485       ok &= print_factors (tokenbuffer.buffer);
2486     }
2487   free (tokenbuffer.buffer);
2488
2489   return ok;
2490 }
2491
2492 int
2493 main (int argc, char **argv)
2494 {
2495   initialize_main (&argc, &argv);
2496   set_program_name (argv[0]);
2497   setlocale (LC_ALL, "");
2498   bindtextdomain (PACKAGE, LOCALEDIR);
2499   textdomain (PACKAGE);
2500
2501   atexit (close_stdout);
2502
2503   alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
2504
2505   int c;
2506   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2507     {
2508       switch (c)
2509         {
2510         case DEV_DEBUG_OPTION:
2511           dev_debug = true;
2512           break;
2513
2514         case 's':
2515           alg = ALG_SQUFOF;
2516           break;
2517
2518         case 'w':
2519           flag_prove_primality = false;
2520           break;
2521
2522         case_GETOPT_HELP_CHAR;
2523
2524         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2525
2526         default:
2527           usage (EXIT_FAILURE);
2528         }
2529     }
2530
2531 #if STAT_SQUFOF
2532   if (alg == ALG_SQUFOF)
2533     memset (q_freq, 0, sizeof (q_freq));
2534 #endif
2535
2536   bool ok;
2537   if (argc <= optind)
2538     ok = do_stdin ();
2539   else
2540     {
2541       ok = true;
2542       for (int i = optind; i < argc; i++)
2543         if (! print_factors (argv[i]))
2544           ok = false;
2545     }
2546
2547 #if STAT_SQUFOF
2548   if (alg == ALG_SQUFOF && q_freq[0] > 0)
2549     {
2550       double acc_f;
2551       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2552       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2553         {
2554           double f = (double) q_freq[i] / q_freq[0];
2555           acc_f += f;
2556           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2557                   100.0 * f, 100.0 * acc_f);
2558         }
2559     }
2560 #endif
2561
2562   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
2563 }