src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2022 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38   Algorithm:
  39
  40     (1) Perform trial division using a small primes table, but without hardware
  41         division since the primes table store inverses modulo the word base.
  42         (The GMP variant of this code doesn't make use of the precomputed
  43         inverses, but instead relies on GMP for fast divisibility testing.)
  44     (2) Check the nature of any non-factored part using Miller-Rabin for
  45         detecting composites, and Lucas for detecting primes.
  46     (3) Factor any remaining composite part using the Pollard-Brent rho
  47         algorithm or if USE_SQUFOF is defined to 1, try that first.
  48         Status of found factors are checked again using Miller-Rabin and Lucas.
  49
  50     We prefer using Hensel norm in the divisions, not the more familiar
  51     Euclidian norm, since the former leads to much faster code.  In the
  52     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  53     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  54     reductions mod n.
  55
  56     The GMP code uses an algorithm that can be considerably slower;
  57     for example, on a circa-2017 Intel Xeon Silver 4116, factoring
  58     2^{127}-3 takes about 50 ms with the two-word algorithm but would
  59     take about 750 ms with the GMP code.
  60
  61   Improvements:
  62
  63     * Use modular inverses also for exact division in the Lucas code, and
  64       elsewhere.  A problem is to locate the inverses not from an index, but
  65       from a prime.  We might instead compute the inverse on-the-fly.
  66
  67     * Tune trial division table size (not forgetting that this is a standalone
  68       program where the table will be read from secondary storage for
  69       each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #include <gmp.h>
 108 #include <assert.h>
 109
 110 #include "system.h"
 111 #include "die.h"
 112 #include "error.h"
 113 #include "full-write.h"
 114 #include "quote.h"
 115 #include "readtokens.h"
 116 #include "xstrtol.h"
 117
 118 /* The official name of this program (e.g., no 'g' prefix).  */
 119 #define PROGRAM_NAME "factor"
 120
 121 #define AUTHORS \
 122   proper_name ("Paul Rubin"),                                           \
 123   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 124   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 125
 126 /* Token delimiters when reading from a file.  */
 127 #define DELIM "\n\t "
 128
 129 #ifndef USE_LONGLONG_H
 130 /* With the way we use longlong.h, it's only safe to use
 131    when UWtype = UHWtype, as there were various cases
 132    (as can be seen in the history for longlong.h) where
 133    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 134    to avoid compile time or run time issues.  */
 135 # if LONG_MAX == INTMAX_MAX
 136 #  define USE_LONGLONG_H 1
 137 # endif
 138 #endif
 139
 140 #if USE_LONGLONG_H
 141
 142 /* Make definitions for longlong.h to make it do what it can do for us */
 143
 144 /* bitcount for uintmax_t */
 145 # if UINTMAX_MAX == UINT32_MAX
 146 #  define W_TYPE_SIZE 32
 147 # elif UINTMAX_MAX == UINT64_MAX
 148 #  define W_TYPE_SIZE 64
 149 # elif UINTMAX_MAX == UINT128_MAX
 150 #  define W_TYPE_SIZE 128
 151 # endif
 152
 153 # define UWtype  uintmax_t
 154 # define UHWtype unsigned long int
 155 # undef UDWtype
 156 # if HAVE_ATTRIBUTE_MODE
 157 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 158 typedef          int SItype     __attribute__ ((mode (SI)));
 159 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 160 typedef          int DItype     __attribute__ ((mode (DI)));
 161 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 162 # else
 163 typedef unsigned char UQItype;
 164 typedef          long SItype;
 165 typedef unsigned long int USItype;
 166 #  if HAVE_LONG_LONG_INT
 167 typedef long long int DItype;
 168 typedef unsigned long long int UDItype;
 169 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 170 typedef long int DItype;
 171 typedef unsigned long int UDItype;
 172 #  endif
 173 # endif
 174 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 175 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 176 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 177 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 178 # ifndef __GMP_GNUC_PREREQ
 179 #  define __GMP_GNUC_PREREQ(a,b) 1
 180 # endif
 181
 182 /* These stub macros are only used in longlong.h in certain system compiler
 183    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 184 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 185 ASSERT (1)
 186 __GMP_DECLSPEC
 187 # endif
 188
 189 # if _ARCH_PPC
 190 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 191 # endif
 192 # include "longlong.h"
 193 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 194 const unsigned char factor_clz_tab[129] =
 195 {
 196   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 197   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 198   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 199   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 200   9
 201 };
 202 # endif
 203
 204 #else /* not USE_LONGLONG_H */
 205
 206 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 207 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 208 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 209 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 210
 211 #endif
 212
 213 #if !defined __clz_tab && !defined UHWtype
 214 /* Without this seemingly useless conditional, gcc -Wunused-macros
 215    warns that each of the two tested macros is unused on Fedora 18.
 216    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 217 #endif
 218
 219 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 220 #define MAX_NFACTS 26
 221
 222 enum
 223 {
 224   DEV_DEBUG_OPTION = CHAR_MAX + 1
 225 };
 226
 227 static struct option const long_options[] =
 228 {
 229   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 230   {GETOPT_HELP_OPTION_DECL},
 231   {GETOPT_VERSION_OPTION_DECL},
 232   {NULL, 0, NULL, 0}
 233 };
 234
 235 struct factors
 236 {
 237   uintmax_t     plarge[2]; /* Can have a single large factor */
 238   uintmax_t     p[MAX_NFACTS];
 239   unsigned char e[MAX_NFACTS];
 240   unsigned char nfactors;
 241 };
 242
 243 struct mp_factors
 244 {
 245   mpz_t             *p;
 246   unsigned long int *e;
 247   unsigned long int nfactors;
 248 };
 249
 250 static void factor (uintmax_t, uintmax_t, struct factors *);
 251
 252 #ifndef umul_ppmm
 253 # define umul_ppmm(w1, w0, u, v)                                        \
 254   do {                                                                  \
 255     uintmax_t __x0, __x1, __x2, __x3;                                   \
 256     unsigned long int __ul, __vl, __uh, __vh;                           \
 257     uintmax_t __u = (u), __v = (v);                                     \
 258                                                                         \
 259     __ul = __ll_lowpart (__u);                                          \
 260     __uh = __ll_highpart (__u);                                         \
 261     __vl = __ll_lowpart (__v);                                          \
 262     __vh = __ll_highpart (__v);                                         \
 263                                                                         \
 264     __x0 = (uintmax_t) __ul * __vl;                                     \
 265     __x1 = (uintmax_t) __ul * __vh;                                     \
 266     __x2 = (uintmax_t) __uh * __vl;                                     \
 267     __x3 = (uintmax_t) __uh * __vh;                                     \
 268                                                                         \
 269     __x1 += __ll_highpart (__x0);/* This can't give carry.  */          \
 270     __x1 += __x2;               /* But this indeed can.  */             \
 271     if (__x1 < __x2)            /* Did we get it?  */                   \
 272       __x3 += __ll_B;           /* Yes, add it in the proper pos.  */   \
 273                                                                         \
 274     (w1) = __x3 + __ll_highpart (__x1);                                 \
 275     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 276   } while (0)
 277 #endif
 278
 279 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 280 /* Define our own, not needing normalization.  This function is
 281    currently not performance critical, so keep it simple.  Similar to
 282    the mod macro below.  */
 283 # undef udiv_qrnnd
 284 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 285   do {                                                                  \
 286     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 287                                                                         \
 288     assert ((n1) < (d));                                                \
 289     __d1 = (d); __d0 = 0;                                               \
 290     __r1 = (n1); __r0 = (n0);                                           \
 291     __q = 0;                                                            \
 292     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 293       {                                                                 \
 294         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 295         __q <<= 1;                                                      \
 296         if (ge2 (__r1, __r0, __d1, __d0))                               \
 297           {                                                             \
 298             __q++;                                                      \
 299             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 300           }                                                             \
 301       }                                                                 \
 302     (r) = __r0;                                                         \
 303     (q) = __q;                                                          \
 304   } while (0)
 305 #endif
 306
 307 #if !defined add_ssaaaa
 308 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 309   do {                                                                  \
 310     uintmax_t _add_x;                                                   \
 311     _add_x = (al) + (bl);                                               \
 312     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 313     (sl) = _add_x;                                                      \
 314   } while (0)
 315 #endif
 316
 317 #define rsh2(rh, rl, ah, al, cnt)                                       \
 318   do {                                                                  \
 319     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 320     (rh) = (ah) >> (cnt);                                               \
 321   } while (0)
 322
 323 #define lsh2(rh, rl, ah, al, cnt)                                       \
 324   do {                                                                  \
 325     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 326     (rl) = (al) << (cnt);                                               \
 327   } while (0)
 328
 329 #define ge2(ah, al, bh, bl)                                             \
 330   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 331
 332 #define gt2(ah, al, bh, bl)                                             \
 333   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 334
 335 #ifndef sub_ddmmss
 336 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 337   do {                                                                  \
 338     uintmax_t _cy;                                                      \
 339     _cy = (al) < (bl);                                                  \
 340     (rl) = (al) - (bl);                                                 \
 341     (rh) = (ah) - (bh) - _cy;                                           \
 342   } while (0)
 343 #endif
 344
 345 #ifndef count_leading_zeros
 346 # define count_leading_zeros(count, x) do {                             \
 347     uintmax_t __clz_x = (x);                                            \
 348     unsigned int __clz_c;                                               \
 349     for (__clz_c = 0;                                                   \
 350          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 351          __clz_c += 8)                                                  \
 352       __clz_x <<= 8;                                                    \
 353     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 354       __clz_x <<= 1;                                                    \
 355     (count) = __clz_c;                                                  \
 356   } while (0)
 357 #endif
 358
 359 #ifndef count_trailing_zeros
 360 # define count_trailing_zeros(count, x) do {                            \
 361     uintmax_t __ctz_x = (x);                                            \
 362     unsigned int __ctz_c = 0;                                           \
 363     while ((__ctz_x & 1) == 0)                                          \
 364       {                                                                 \
 365         __ctz_x >>= 1;                                                  \
 366         __ctz_c++;                                                      \
 367       }                                                                 \
 368     (count) = __ctz_c;                                                  \
 369   } while (0)
 370 #endif
 371
 372 /* Requires that a < n and b <= n */
 373 #define submod(r,a,b,n)                                                 \
 374   do {                                                                  \
 375     uintmax_t _t = - (uintmax_t) (a < b);                               \
 376     (r) = ((n) & _t) + (a) - (b);                                       \
 377   } while (0)
 378
 379 #define addmod(r,a,b,n)                                                 \
 380   submod ((r), (a), ((n) - (b)), (n))
 381
 382 /* Modular two-word addition and subtraction.  For performance reasons, the
 383    most significant bit of n1 must be clear.  The destination variables must be
 384    distinct from the mod operand.  */
 385 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 386   do {                                                                  \
 387     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 388     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 389       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 390   } while (0)
 391 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 392   do {                                                                  \
 393     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 394     if ((intmax_t) (r1) < 0)                                            \
 395       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 396   } while (0)
 397
 398 #define HIGHBIT_TO_MASK(x)                                              \
 399   (((intmax_t)-1 >> 1) < 0                                              \
 400    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 401    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 402       ? UINTMAX_MAX : (uintmax_t) 0))
 403
 404 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 405    Requires that d1 != 0.  */
 406 static uintmax_t
 407 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 408 {
 409   int cntd, cnta;
 410
 411   assert (d1 != 0);
 412
 413   if (a1 == 0)
 414     {
 415       *r1 = 0;
 416       return a0;
 417     }
 418
 419   count_leading_zeros (cntd, d1);
 420   count_leading_zeros (cnta, a1);
 421   int cnt = cntd - cnta;
 422   lsh2 (d1, d0, d1, d0, cnt);
 423   for (int i = 0; i < cnt; i++)
 424     {
 425       if (ge2 (a1, a0, d1, d0))
 426         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 427       rsh2 (d1, d0, d1, d0, 1);
 428     }
 429
 430   *r1 = a1;
 431   return a0;
 432 }
 433
 434 ATTRIBUTE_CONST
 435 static uintmax_t
 436 gcd_odd (uintmax_t a, uintmax_t b)
 437 {
 438   if ((b & 1) == 0)
 439     {
 440       uintmax_t t = b;
 441       b = a;
 442       a = t;
 443     }
 444   if (a == 0)
 445     return b;
 446
 447   /* Take out least significant one bit, to make room for sign */
 448   b >>= 1;
 449
 450   for (;;)
 451     {
 452       uintmax_t t;
 453       uintmax_t bgta;
 454
 455       while ((a & 1) == 0)
 456         a >>= 1;
 457       a >>= 1;
 458
 459       t = a - b;
 460       if (t == 0)
 461         return (a << 1) + 1;
 462
 463       bgta = HIGHBIT_TO_MASK (t);
 464
 465       /* b <-- min (a, b) */
 466       b += (bgta & t);
 467
 468       /* a <-- |a - b| */
 469       a = (t ^ bgta) - bgta;
 470     }
 471 }
 472
 473 static uintmax_t
 474 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 475 {
 476   assert (b0 & 1);
 477
 478   if ((a0 | a1) == 0)
 479     {
 480       *r1 = b1;
 481       return b0;
 482     }
 483
 484   while ((a0 & 1) == 0)
 485     rsh2 (a1, a0, a1, a0, 1);
 486
 487   for (;;)
 488     {
 489       if ((b1 | a1) == 0)
 490         {
 491           *r1 = 0;
 492           return gcd_odd (b0, a0);
 493         }
 494
 495       if (gt2 (a1, a0, b1, b0))
 496         {
 497           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 498           do
 499             rsh2 (a1, a0, a1, a0, 1);
 500           while ((a0 & 1) == 0);
 501         }
 502       else if (gt2 (b1, b0, a1, a0))
 503         {
 504           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 505           do
 506             rsh2 (b1, b0, b1, b0, 1);
 507           while ((b0 & 1) == 0);
 508         }
 509       else
 510         break;
 511     }
 512
 513   *r1 = a1;
 514   return a0;
 515 }
 516
 517 static void
 518 factor_insert_multiplicity (struct factors *factors,
 519                             uintmax_t prime, unsigned int m)
 520 {
 521   unsigned int nfactors = factors->nfactors;
 522   uintmax_t *p = factors->p;
 523   unsigned char *e = factors->e;
 524
 525   /* Locate position for insert new or increment e.  */
 526   int i;
 527   for (i = nfactors - 1; i >= 0; i--)
 528     {
 529       if (p[i] <= prime)
 530         break;
 531     }
 532
 533   if (i < 0 || p[i] != prime)
 534     {
 535       for (int j = nfactors - 1; j > i; j--)
 536         {
 537           p[j + 1] = p[j];
 538           e[j + 1] = e[j];
 539         }
 540       p[i + 1] = prime;
 541       e[i + 1] = m;
 542       factors->nfactors = nfactors + 1;
 543     }
 544   else
 545     {
 546       e[i] += m;
 547     }
 548 }
 549
 550 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 551
 552 static void
 553 factor_insert_large (struct factors *factors,
 554                      uintmax_t p1, uintmax_t p0)
 555 {
 556   if (p1 > 0)
 557     {
 558       assert (factors->plarge[1] == 0);
 559       factors->plarge[0] = p0;
 560       factors->plarge[1] = p1;
 561     }
 562   else
 563     factor_insert (factors, p0);
 564 }
 565
 566 #ifndef mpz_inits
 567
 568 # include <stdarg.h>
 569
 570 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 571 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 572
 573 static void
 574 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 575 {
 576   va_list ap;
 577
 578   va_start (ap, mpz_single_init);
 579
 580   mpz_t *mpz;
 581   while ((mpz = va_arg (ap, mpz_t *)))
 582     mpz_single_init (*mpz);
 583
 584   va_end (ap);
 585 }
 586 #endif
 587
 588 static void mp_factor (mpz_t, struct mp_factors *);
 589
 590 static void
 591 mp_factor_init (struct mp_factors *factors)
 592 {
 593   factors->p = NULL;
 594   factors->e = NULL;
 595   factors->nfactors = 0;
 596 }
 597
 598 static void
 599 mp_factor_clear (struct mp_factors *factors)
 600 {
 601   for (unsigned int i = 0; i < factors->nfactors; i++)
 602     mpz_clear (factors->p[i]);
 603
 604   free (factors->p);
 605   free (factors->e);
 606 }
 607
 608 static void
 609 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 610 {
 611   unsigned long int nfactors = factors->nfactors;
 612   mpz_t         *p  = factors->p;
 613   unsigned long int *e  = factors->e;
 614   long i;
 615
 616   /* Locate position for insert new or increment e.  */
 617   for (i = nfactors - 1; i >= 0; i--)
 618     {
 619       if (mpz_cmp (p[i], prime) <= 0)
 620         break;
 621     }
 622
 623   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 624     {
 625       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 626       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 627
 628       mpz_init (p[nfactors]);
 629       for (long j = nfactors - 1; j > i; j--)
 630         {
 631           mpz_set (p[j + 1], p[j]);
 632           e[j + 1] = e[j];
 633         }
 634       mpz_set (p[i + 1], prime);
 635       e[i + 1] = 1;
 636
 637       factors->p = p;
 638       factors->e = e;
 639       factors->nfactors = nfactors + 1;
 640     }
 641   else
 642     {
 643       e[i] += 1;
 644     }
 645 }
 646
 647 static void
 648 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 649 {
 650   mpz_t pz;
 651
 652   mpz_init_set_ui (pz, prime);
 653   mp_factor_insert (factors, pz);
 654   mpz_clear (pz);
 655 }
 656
 657
 658 /* Number of bits in an uintmax_t.  */
 659 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 660
 661 /* Verify that uintmax_t does not have holes in its representation.  */
 662 verify (UINTMAX_MAX >> (W - 1) == 1);
 663
 664 #define P(a,b,c,d) a,
 665 static const unsigned char primes_diff[] = {
 666 #include "primes.h"
 667 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 668 };
 669 #undef P
 670
 671 #define PRIMES_PTAB_ENTRIES \
 672   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 673
 674 #define P(a,b,c,d) b,
 675 static const unsigned char primes_diff8[] = {
 676 #include "primes.h"
 677 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 678 };
 679 #undef P
 680
 681 struct primes_dtab
 682 {
 683   uintmax_t binv, lim;
 684 };
 685
 686 #define P(a,b,c,d) {c,d},
 687 static const struct primes_dtab primes_dtab[] = {
 688 #include "primes.h"
 689 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 690 };
 691 #undef P
 692
 693 /* Verify that uintmax_t is not wider than
 694    the integers used to generate primes.h.  */
 695 verify (W <= WIDE_UINT_BITS);
 696
 697 /* debugging for developers.  Enables devmsg().
 698    This flag is used only in the GMP code.  */
 699 static bool dev_debug = false;
 700
 701 /* Prove primality or run probabilistic tests.  */
 702 static bool flag_prove_primality = PROVE_PRIMALITY;
 703
 704 /* Number of Miller-Rabin tests to run when not proving primality.  */
 705 #define MR_REPS 25
 706
 707 static void
 708 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 709                       unsigned int off)
 710 {
 711   for (unsigned int j = 0; j < off; j++)
 712     p += primes_diff[i + j];
 713   factor_insert (factors, p);
 714 }
 715
 716 /* Trial division with odd primes uses the following trick.
 717
 718    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}.  For simplicity,
 719    consider the case t < B (this is the second loop below).
 720
 721    From our tables we get
 722
 723      binv = p^{-1} (mod B)
 724      lim = floor ((B-1) / p).
 725
 726    First assume that t is a multiple of p, t = q * p.  Then 0 <= q <= lim
 727    (and all quotients in this range occur for some t).
 728
 729    Then t = q * p is true also (mod B), and p is invertible we get
 730
 731      q = t * binv (mod B).
 732
 733    Next, assume that t is *not* divisible by p.  Since multiplication
 734    by binv (mod B) is a one-to-one mapping,
 735
 736      t * binv (mod B) > lim,
 737
 738    because all the smaller values are already taken.
 739
 740    This can be summed up by saying that the function
 741
 742      q(t) = binv * t (mod B)
 743
 744    is a permutation of the range 0 <= t < B, with the curious property
 745    that it maps the multiples of p onto the range 0 <= q <= lim, in
 746    order, and the non-multiples of p onto the range lim < q < B.
 747  */
 748
 749 static uintmax_t
 750 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 751                        struct factors *factors)
 752 {
 753   if (t0 % 2 == 0)
 754     {
 755       unsigned int cnt;
 756
 757       if (t0 == 0)
 758         {
 759           count_trailing_zeros (cnt, t1);
 760           t0 = t1 >> cnt;
 761           t1 = 0;
 762           cnt += W_TYPE_SIZE;
 763         }
 764       else
 765         {
 766           count_trailing_zeros (cnt, t0);
 767           rsh2 (t1, t0, t1, t0, cnt);
 768         }
 769
 770       factor_insert_multiplicity (factors, 2, cnt);
 771     }
 772
 773   uintmax_t p = 3;
 774   unsigned int i;
 775   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 776     {
 777       for (;;)
 778         {
 779           uintmax_t q1, q0, hi;
 780           MAYBE_UNUSED uintmax_t lo;
 781
 782           q0 = t0 * primes_dtab[i].binv;
 783           umul_ppmm (hi, lo, q0, p);
 784           if (hi > t1)
 785             break;
 786           hi = t1 - hi;
 787           q1 = hi * primes_dtab[i].binv;
 788           if (LIKELY (q1 > primes_dtab[i].lim))
 789             break;
 790           t1 = q1; t0 = q0;
 791           factor_insert (factors, p);
 792         }
 793       p += primes_diff[i + 1];
 794     }
 795   if (t1p)
 796     *t1p = t1;
 797
 798 #define DIVBLOCK(I)                                                     \
 799   do {                                                                  \
 800     for (;;)                                                            \
 801       {                                                                 \
 802         q = t0 * pd[I].binv;                                            \
 803         if (LIKELY (q > pd[I].lim))                                     \
 804           break;                                                        \
 805         t0 = q;                                                         \
 806         factor_insert_refind (factors, p, i + 1, I);                    \
 807       }                                                                 \
 808   } while (0)
 809
 810   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 811     {
 812       uintmax_t q;
 813       const struct primes_dtab *pd = &primes_dtab[i];
 814       DIVBLOCK (0);
 815       DIVBLOCK (1);
 816       DIVBLOCK (2);
 817       DIVBLOCK (3);
 818       DIVBLOCK (4);
 819       DIVBLOCK (5);
 820       DIVBLOCK (6);
 821       DIVBLOCK (7);
 822
 823       p += primes_diff8[i];
 824       if (p * p > t0)
 825         break;
 826     }
 827
 828   return t0;
 829 }
 830
 831 static void
 832 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 833 {
 834   mpz_t q;
 835   unsigned long int p;
 836
 837   devmsg ("[trial division] ");
 838
 839   mpz_init (q);
 840
 841   p = mpz_scan1 (t, 0);
 842   mpz_fdiv_q_2exp (t, t, p);
 843   while (p)
 844     {
 845       mp_factor_insert_ui (factors, 2);
 846       --p;
 847     }
 848
 849   p = 3;
 850   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 851     {
 852       if (! mpz_divisible_ui_p (t, p))
 853         {
 854           p += primes_diff[i++];
 855           if (mpz_cmp_ui (t, p * p) < 0)
 856             break;
 857         }
 858       else
 859         {
 860           mpz_tdiv_q_ui (t, t, p);
 861           mp_factor_insert_ui (factors, p);
 862         }
 863     }
 864
 865   mpz_clear (q);
 866 }
 867
 868 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 869 static const unsigned char  binvert_table[128] =
 870 {
 871   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 872   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 873   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 874   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 875   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 876   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 877   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 878   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 879   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 880   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 881   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 882   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 883   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 884   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 885   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 886   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 887 };
 888
 889 /* Compute n^(-1) mod B, using a Newton iteration.  */
 890 #define binv(inv,n)                                                     \
 891   do {                                                                  \
 892     uintmax_t  __n = (n);                                               \
 893     uintmax_t  __inv;                                                   \
 894                                                                         \
 895     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 896     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 897     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 898     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 899                                                                         \
 900     if (W_TYPE_SIZE > 64)                                               \
 901       {                                                                 \
 902         int  __invbits = 64;                                            \
 903         do {                                                            \
 904           __inv = 2 * __inv - __inv * __inv * __n;                      \
 905           __invbits *= 2;                                               \
 906         } while (__invbits < W_TYPE_SIZE);                              \
 907       }                                                                 \
 908                                                                         \
 909     (inv) = __inv;                                                      \
 910   } while (0)
 911
 912 /* q = u / d, assuming d|u.  */
 913 #define divexact_21(q1, q0, u1, u0, d)                                  \
 914   do {                                                                  \
 915     uintmax_t _di, _q0;                                                 \
 916     binv (_di, (d));                                                    \
 917     _q0 = (u0) * _di;                                                   \
 918     if ((u1) >= (d))                                                    \
 919       {                                                                 \
 920         uintmax_t _p1;                                                  \
 921         MAYBE_UNUSED intmax_t _p0;                                      \
 922         umul_ppmm (_p1, _p0, _q0, d);                                   \
 923         (q1) = ((u1) - _p1) * _di;                                      \
 924         (q0) = _q0;                                                     \
 925       }                                                                 \
 926     else                                                                \
 927       {                                                                 \
 928         (q0) = _q0;                                                     \
 929         (q1) = 0;                                                       \
 930       }                                                                 \
 931   } while (0)
 932
 933 /* x B (mod n).  */
 934 #define redcify(r_prim, r, n)                                           \
 935   do {                                                                  \
 936     MAYBE_UNUSED uintmax_t _redcify_q;                                  \
 937     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 938   } while (0)
 939
 940 /* x B^2 (mod n).  Requires x > 0, n1 < B/2.  */
 941 #define redcify2(r1, r0, x, n1, n0)                                     \
 942   do {                                                                  \
 943     uintmax_t _r1, _r0, _i;                                             \
 944     if ((x) < (n1))                                                     \
 945       {                                                                 \
 946         _r1 = (x); _r0 = 0;                                             \
 947         _i = W_TYPE_SIZE;                                               \
 948       }                                                                 \
 949     else                                                                \
 950       {                                                                 \
 951         _r1 = 0; _r0 = (x);                                             \
 952         _i = 2 * W_TYPE_SIZE;                                           \
 953       }                                                                 \
 954     while (_i-- > 0)                                                    \
 955       {                                                                 \
 956         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 957         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 958           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 959       }                                                                 \
 960     (r1) = _r1;                                                         \
 961     (r0) = _r0;                                                         \
 962   } while (0)
 963
 964 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 965    Both a and b must be in redc form, the result will be in redc form too.  */
 966 static inline uintmax_t
 967 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 968 {
 969   uintmax_t rh, rl, q, th, xh;
 970   MAYBE_UNUSED uintmax_t tl;
 971
 972   umul_ppmm (rh, rl, a, b);
 973   q = rl * mi;
 974   umul_ppmm (th, tl, q, m);
 975   xh = rh - th;
 976   if (rh < th)
 977     xh += m;
 978
 979   return xh;
 980 }
 981
 982 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 983    Both a and b must be in redc form, the result will be in redc form too.
 984    For performance reasons, the most significant bit of m must be clear.  */
 985 static uintmax_t
 986 mulredc2 (uintmax_t *r1p,
 987           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 988           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 989 {
 990   uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
 991   MAYBE_UNUSED uintmax_t p0;
 992   mi = -mi;
 993   assert ((a1 >> (W_TYPE_SIZE - 1)) == 0);
 994   assert ((b1 >> (W_TYPE_SIZE - 1)) == 0);
 995   assert ((m1 >> (W_TYPE_SIZE - 1)) == 0);
 996
 997   /* First compute a0 * <b1, b0> B^{-1}
 998         +-----+
 999         |a0 b0|
1000      +--+--+--+
1001      |a0 b1|
1002      +--+--+--+
1003         |q0 m0|
1004      +--+--+--+
1005      |q0 m1|
1006     -+--+--+--+
1007      |r1|r0| 0|
1008      +--+--+--+
1009   */
1010   umul_ppmm (t1, t0, a0, b0);
1011   umul_ppmm (r1, r0, a0, b1);
1012   q = mi * t0;
1013   umul_ppmm (p1, p0, q, m0);
1014   umul_ppmm (s1, s0, q, m1);
1015   r0 += (t0 != 0); /* Carry */
1016   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1017   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1018   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1019
1020   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1021         +-----+
1022         |a1 b0|
1023         +--+--+
1024         |r1|r0|
1025      +--+--+--+
1026      |a1 b1|
1027      +--+--+--+
1028         |q1 m0|
1029      +--+--+--+
1030      |q1 m1|
1031     -+--+--+--+
1032      |r1|r0| 0|
1033      +--+--+--+
1034   */
1035   umul_ppmm (t1, t0, a1, b0);
1036   umul_ppmm (s1, s0, a1, b1);
1037   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1038   q = mi * t0;
1039   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1040   umul_ppmm (p1, p0, q, m0);
1041   umul_ppmm (s1, s0, q, m1);
1042   r0 += (t0 != 0); /* Carry */
1043   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1044   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1045   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1046
1047   if (ge2 (r1, r0, m1, m0))
1048     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1049
1050   *r1p = r1;
1051   return r0;
1052 }
1053
1054 ATTRIBUTE_CONST
1055 static uintmax_t
1056 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1057 {
1058   uintmax_t y = one;
1059
1060   if (e & 1)
1061     y = b;
1062
1063   while (e != 0)
1064     {
1065       b = mulredc (b, b, n, ni);
1066       e >>= 1;
1067
1068       if (e & 1)
1069         y = mulredc (y, b, n, ni);
1070     }
1071
1072   return y;
1073 }
1074
1075 static uintmax_t
1076 powm2 (uintmax_t *r1m,
1077        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1078        uintmax_t ni, const uintmax_t *one)
1079 {
1080   uintmax_t r1, r0, b1, b0, n1, n0;
1081   unsigned int i;
1082   uintmax_t e;
1083
1084   b0 = bp[0];
1085   b1 = bp[1];
1086   n0 = np[0];
1087   n1 = np[1];
1088
1089   r0 = one[0];
1090   r1 = one[1];
1091
1092   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1093     {
1094       if (e & 1)
1095         {
1096           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1097           r1 = *r1m;
1098         }
1099       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1100       b1 = *r1m;
1101     }
1102   for (e = ep[1]; e > 0; e >>= 1)
1103     {
1104       if (e & 1)
1105         {
1106           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1107           r1 = *r1m;
1108         }
1109       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1110       b1 = *r1m;
1111     }
1112   *r1m = r1;
1113   return r0;
1114 }
1115
1116 ATTRIBUTE_CONST
1117 static bool
1118 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1119              unsigned int k, uintmax_t one)
1120 {
1121   uintmax_t y = powm (b, q, n, ni, one);
1122
1123   uintmax_t nm1 = n - one;      /* -1, but in redc representation.  */
1124
1125   if (y == one || y == nm1)
1126     return true;
1127
1128   for (unsigned int i = 1; i < k; i++)
1129     {
1130       y = mulredc (y, y, n, ni);
1131
1132       if (y == nm1)
1133         return true;
1134       if (y == one)
1135         return false;
1136     }
1137   return false;
1138 }
1139
1140 ATTRIBUTE_PURE static bool
1141 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1142               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1143 {
1144   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1145
1146   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1147   y1 = r1m;
1148
1149   if (y0 == one[0] && y1 == one[1])
1150     return true;
1151
1152   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1153
1154   if (y0 == nm1_0 && y1 == nm1_1)
1155     return true;
1156
1157   for (unsigned int i = 1; i < k; i++)
1158     {
1159       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1160       y1 = r1m;
1161
1162       if (y0 == nm1_0 && y1 == nm1_1)
1163         return true;
1164       if (y0 == one[0] && y1 == one[1])
1165         return false;
1166     }
1167   return false;
1168 }
1169
1170 static bool
1171 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1172                 mpz_srcptr q, unsigned long int k)
1173 {
1174   mpz_powm (y, x, q, n);
1175
1176   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1177     return true;
1178
1179   for (unsigned long int i = 1; i < k; i++)
1180     {
1181       mpz_powm_ui (y, y, 2, n);
1182       if (mpz_cmp (y, nm1) == 0)
1183         return true;
1184       if (mpz_cmp_ui (y, 1) == 0)
1185         return false;
1186     }
1187   return false;
1188 }
1189
1190 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1191    have been observed.  The average seem to be about 2.  */
1192 static bool
1193 prime_p (uintmax_t n)
1194 {
1195   int k;
1196   bool is_prime;
1197   uintmax_t a_prim, one, ni;
1198   struct factors factors;
1199
1200   if (n <= 1)
1201     return false;
1202
1203   /* We have already casted out small primes.  */
1204   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1205     return true;
1206
1207   /* Precomputation for Miller-Rabin.  */
1208   uintmax_t q = n - 1;
1209   for (k = 0; (q & 1) == 0; k++)
1210     q >>= 1;
1211
1212   uintmax_t a = 2;
1213   binv (ni, n);                 /* ni <- 1/n mod B */
1214   redcify (one, 1, n);
1215   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1216
1217   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1218   if (!millerrabin (n, ni, a_prim, q, k, one))
1219     return false;
1220
1221   if (flag_prove_primality)
1222     {
1223       /* Factor n-1 for Lucas.  */
1224       factor (0, n - 1, &factors);
1225     }
1226
1227   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1228      number composite.  */
1229   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1230     {
1231       if (flag_prove_primality)
1232         {
1233           is_prime = true;
1234           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1235             {
1236               is_prime
1237                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1238             }
1239         }
1240       else
1241         {
1242           /* After enough Miller-Rabin runs, be content.  */
1243           is_prime = (r == MR_REPS - 1);
1244         }
1245
1246       if (is_prime)
1247         return true;
1248
1249       a += primes_diff[r];      /* Establish new base.  */
1250
1251       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1252          on most processors, since it avoids udiv_qrnnd.  If we go down the
1253          udiv_qrnnd_preinv path, this code should be replaced.  */
1254       {
1255         uintmax_t s1, s0;
1256         umul_ppmm (s1, s0, one, a);
1257         if (LIKELY (s1 == 0))
1258           a_prim = s0 % n;
1259         else
1260           {
1261             MAYBE_UNUSED uintmax_t dummy;
1262             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1263           }
1264       }
1265
1266       if (!millerrabin (n, ni, a_prim, q, k, one))
1267         return false;
1268     }
1269
1270   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1271   abort ();
1272 }
1273
1274 static bool
1275 prime2_p (uintmax_t n1, uintmax_t n0)
1276 {
1277   uintmax_t q[2], nm1[2];
1278   uintmax_t a_prim[2];
1279   uintmax_t one[2];
1280   uintmax_t na[2];
1281   uintmax_t ni;
1282   unsigned int k;
1283   struct factors factors;
1284
1285   if (n1 == 0)
1286     return prime_p (n0);
1287
1288   nm1[1] = n1 - (n0 == 0);
1289   nm1[0] = n0 - 1;
1290   if (nm1[0] == 0)
1291     {
1292       count_trailing_zeros (k, nm1[1]);
1293
1294       q[0] = nm1[1] >> k;
1295       q[1] = 0;
1296       k += W_TYPE_SIZE;
1297     }
1298   else
1299     {
1300       count_trailing_zeros (k, nm1[0]);
1301       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1302     }
1303
1304   uintmax_t a = 2;
1305   binv (ni, n0);
1306   redcify2 (one[1], one[0], 1, n1, n0);
1307   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1308
1309   /* FIXME: Use scalars or pointers in arguments?  Some consistency needed.  */
1310   na[0] = n0;
1311   na[1] = n1;
1312
1313   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1314     return false;
1315
1316   if (flag_prove_primality)
1317     {
1318       /* Factor n-1 for Lucas.  */
1319       factor (nm1[1], nm1[0], &factors);
1320     }
1321
1322   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323      number composite.  */
1324   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1325     {
1326       bool is_prime;
1327       uintmax_t e[2], y[2];
1328
1329       if (flag_prove_primality)
1330         {
1331           is_prime = true;
1332           if (factors.plarge[1])
1333             {
1334               uintmax_t pi;
1335               binv (pi, factors.plarge[0]);
1336               e[0] = pi * nm1[0];
1337               e[1] = 0;
1338               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1339               is_prime = (y[0] != one[0] || y[1] != one[1]);
1340             }
1341           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1342             {
1343               /* FIXME: We always have the factor 2.  Do we really need to
1344                  handle it here?  We have done the same powering as part
1345                  of millerrabin.  */
1346               if (factors.p[i] == 2)
1347                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1348               else
1349                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1350               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1351               is_prime = (y[0] != one[0] || y[1] != one[1]);
1352             }
1353         }
1354       else
1355         {
1356           /* After enough Miller-Rabin runs, be content.  */
1357           is_prime = (r == MR_REPS - 1);
1358         }
1359
1360       if (is_prime)
1361         return true;
1362
1363       a += primes_diff[r];      /* Establish new base.  */
1364       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1365
1366       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1367         return false;
1368     }
1369
1370   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1371   abort ();
1372 }
1373
1374 static bool
1375 mp_prime_p (mpz_t n)
1376 {
1377   bool is_prime;
1378   mpz_t q, a, nm1, tmp;
1379   struct mp_factors factors;
1380
1381   if (mpz_cmp_ui (n, 1) <= 0)
1382     return false;
1383
1384   /* We have already casted out small primes.  */
1385   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1386     return true;
1387
1388   mpz_inits (q, a, nm1, tmp, NULL);
1389
1390   /* Precomputation for Miller-Rabin.  */
1391   mpz_sub_ui (nm1, n, 1);
1392
1393   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1394   unsigned long int k = mpz_scan1 (nm1, 0);
1395   mpz_tdiv_q_2exp (q, nm1, k);
1396
1397   mpz_set_ui (a, 2);
1398
1399   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1400   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1401     {
1402       is_prime = false;
1403       goto ret2;
1404     }
1405
1406   if (flag_prove_primality)
1407     {
1408       /* Factor n-1 for Lucas.  */
1409       mpz_set (tmp, nm1);
1410       mp_factor (tmp, &factors);
1411     }
1412
1413   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1414      number composite.  */
1415   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1416     {
1417       if (flag_prove_primality)
1418         {
1419           is_prime = true;
1420           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1421             {
1422               mpz_divexact (tmp, nm1, factors.p[i]);
1423               mpz_powm (tmp, a, tmp, n);
1424               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1425             }
1426         }
1427       else
1428         {
1429           /* After enough Miller-Rabin runs, be content.  */
1430           is_prime = (r == MR_REPS - 1);
1431         }
1432
1433       if (is_prime)
1434         goto ret1;
1435
1436       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1437
1438       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1439         {
1440           is_prime = false;
1441           goto ret1;
1442         }
1443     }
1444
1445   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1446   abort ();
1447
1448  ret1:
1449   if (flag_prove_primality)
1450     mp_factor_clear (&factors);
1451  ret2:
1452   mpz_clears (q, a, nm1, tmp, NULL);
1453
1454   return is_prime;
1455 }
1456
1457 static void
1458 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1459                           struct factors *factors)
1460 {
1461   uintmax_t x, z, y, P, t, ni, g;
1462
1463   unsigned long int k = 1;
1464   unsigned long int l = 1;
1465
1466   redcify (P, 1, n);
1467   addmod (x, P, P, n);          /* i.e., redcify(2) */
1468   y = z = x;
1469
1470   while (n != 1)
1471     {
1472       assert (a < n);
1473
1474       binv (ni, n);             /* FIXME: when could we use old 'ni' value?  */
1475
1476       for (;;)
1477         {
1478           do
1479             {
1480               x = mulredc (x, x, n, ni);
1481               addmod (x, x, a, n);
1482
1483               submod (t, z, x, n);
1484               P = mulredc (P, t, n, ni);
1485
1486               if (k % 32 == 1)
1487                 {
1488                   if (gcd_odd (P, n) != 1)
1489                     goto factor_found;
1490                   y = x;
1491                 }
1492             }
1493           while (--k != 0);
1494
1495           z = x;
1496           k = l;
1497           l = 2 * l;
1498           for (unsigned long int i = 0; i < k; i++)
1499             {
1500               x = mulredc (x, x, n, ni);
1501               addmod (x, x, a, n);
1502             }
1503           y = x;
1504         }
1505
1506     factor_found:
1507       do
1508         {
1509           y = mulredc (y, y, n, ni);
1510           addmod (y, y, a, n);
1511
1512           submod (t, z, y, n);
1513           g = gcd_odd (t, n);
1514         }
1515       while (g == 1);
1516
1517       if (n == g)
1518         {
1519           /* Found n itself as factor.  Restart with different params.  */
1520           factor_using_pollard_rho (n, a + 1, factors);
1521           return;
1522         }
1523
1524       n = n / g;
1525
1526       if (!prime_p (g))
1527         factor_using_pollard_rho (g, a + 1, factors);
1528       else
1529         factor_insert (factors, g);
1530
1531       if (prime_p (n))
1532         {
1533           factor_insert (factors, n);
1534           break;
1535         }
1536
1537       x = x % n;
1538       z = z % n;
1539       y = y % n;
1540     }
1541 }
1542
1543 static void
1544 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1545                            struct factors *factors)
1546 {
1547   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1548
1549   unsigned long int k = 1;
1550   unsigned long int l = 1;
1551
1552   redcify2 (P1, P0, 1, n1, n0);
1553   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1554   y1 = z1 = x1;
1555   y0 = z0 = x0;
1556
1557   while (n1 != 0 || n0 != 1)
1558     {
1559       binv (ni, n0);
1560
1561       for (;;)
1562         {
1563           do
1564             {
1565               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1566               x1 = r1m;
1567               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1568
1569               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1570               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1571               P1 = r1m;
1572
1573               if (k % 32 == 1)
1574                 {
1575                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1576                   if (g1 != 0 || g0 != 1)
1577                     goto factor_found;
1578                   y1 = x1; y0 = x0;
1579                 }
1580             }
1581           while (--k != 0);
1582
1583           z1 = x1; z0 = x0;
1584           k = l;
1585           l = 2 * l;
1586           for (unsigned long int i = 0; i < k; i++)
1587             {
1588               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1589               x1 = r1m;
1590               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1591             }
1592           y1 = x1; y0 = x0;
1593         }
1594
1595     factor_found:
1596       do
1597         {
1598           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1599           y1 = r1m;
1600           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1601
1602           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1603           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1604         }
1605       while (g1 == 0 && g0 == 1);
1606
1607       if (g1 == 0)
1608         {
1609           /* The found factor is one word, and > 1.  */
1610           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1611
1612           if (!prime_p (g0))
1613             factor_using_pollard_rho (g0, a + 1, factors);
1614           else
1615             factor_insert (factors, g0);
1616         }
1617       else
1618         {
1619           /* The found factor is two words.  This is highly unlikely, thus hard
1620              to trigger.  Please be careful before you change this code!  */
1621           uintmax_t ginv;
1622
1623           if (n1 == g1 && n0 == g0)
1624             {
1625               /* Found n itself as factor.  Restart with different params.  */
1626               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1627               return;
1628             }
1629
1630           /* Compute n = n / g.  Since the result will fit one word,
1631              we can compute the quotient modulo B, ignoring the high
1632              divisor word.  */
1633           binv (ginv, g0);
1634           n0 = ginv * n0;
1635           n1 = 0;
1636
1637           if (!prime2_p (g1, g0))
1638             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1639           else
1640             factor_insert_large (factors, g1, g0);
1641         }
1642
1643       if (n1 == 0)
1644         {
1645           if (prime_p (n0))
1646             {
1647               factor_insert (factors, n0);
1648               break;
1649             }
1650
1651           factor_using_pollard_rho (n0, a, factors);
1652           return;
1653         }
1654
1655       if (prime2_p (n1, n0))
1656         {
1657           factor_insert_large (factors, n1, n0);
1658           break;
1659         }
1660
1661       x0 = mod2 (&x1, x1, x0, n1, n0);
1662       z0 = mod2 (&z1, z1, z0, n1, n0);
1663       y0 = mod2 (&y1, y1, y0, n1, n0);
1664     }
1665 }
1666
1667 static void
1668 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1669                              struct mp_factors *factors)
1670 {
1671   mpz_t x, z, y, P;
1672   mpz_t t, t2;
1673
1674   devmsg ("[pollard-rho (%lu)] ", a);
1675
1676   mpz_inits (t, t2, NULL);
1677   mpz_init_set_si (y, 2);
1678   mpz_init_set_si (x, 2);
1679   mpz_init_set_si (z, 2);
1680   mpz_init_set_ui (P, 1);
1681
1682   unsigned long long int k = 1;
1683   unsigned long long int l = 1;
1684
1685   while (mpz_cmp_ui (n, 1) != 0)
1686     {
1687       for (;;)
1688         {
1689           do
1690             {
1691               mpz_mul (t, x, x);
1692               mpz_mod (x, t, n);
1693               mpz_add_ui (x, x, a);
1694
1695               mpz_sub (t, z, x);
1696               mpz_mul (t2, P, t);
1697               mpz_mod (P, t2, n);
1698
1699               if (k % 32 == 1)
1700                 {
1701                   mpz_gcd (t, P, n);
1702                   if (mpz_cmp_ui (t, 1) != 0)
1703                     goto factor_found;
1704                   mpz_set (y, x);
1705                 }
1706             }
1707           while (--k != 0);
1708
1709           mpz_set (z, x);
1710           k = l;
1711           l = 2 * l;
1712           for (unsigned long long int i = 0; i < k; i++)
1713             {
1714               mpz_mul (t, x, x);
1715               mpz_mod (x, t, n);
1716               mpz_add_ui (x, x, a);
1717             }
1718           mpz_set (y, x);
1719         }
1720
1721     factor_found:
1722       do
1723         {
1724           mpz_mul (t, y, y);
1725           mpz_mod (y, t, n);
1726           mpz_add_ui (y, y, a);
1727
1728           mpz_sub (t, z, y);
1729           mpz_gcd (t, t, n);
1730         }
1731       while (mpz_cmp_ui (t, 1) == 0);
1732
1733       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1734
1735       if (!mp_prime_p (t))
1736         {
1737           devmsg ("[composite factor--restarting pollard-rho] ");
1738           mp_factor_using_pollard_rho (t, a + 1, factors);
1739         }
1740       else
1741         {
1742           mp_factor_insert (factors, t);
1743         }
1744
1745       if (mp_prime_p (n))
1746         {
1747           mp_factor_insert (factors, n);
1748           break;
1749         }
1750
1751       mpz_mod (x, x, n);
1752       mpz_mod (z, z, n);
1753       mpz_mod (y, y, n);
1754     }
1755
1756   mpz_clears (P, t2, t, z, x, y, NULL);
1757 }
1758
1759 #if USE_SQUFOF
1760 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1761    algorithm is replaced, consider also returning the remainder.  */
1762 ATTRIBUTE_CONST
1763 static uintmax_t
1764 isqrt (uintmax_t n)
1765 {
1766   uintmax_t x;
1767   unsigned c;
1768   if (n == 0)
1769     return 0;
1770
1771   count_leading_zeros (c, n);
1772
1773   /* Make x > sqrt(n).  This will be invariant through the loop.  */
1774   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1775
1776   for (;;)
1777     {
1778       uintmax_t y = (x + n / x) / 2;
1779       if (y >= x)
1780         return x;
1781
1782       x = y;
1783     }
1784 }
1785
1786 ATTRIBUTE_CONST
1787 static uintmax_t
1788 isqrt2 (uintmax_t nh, uintmax_t nl)
1789 {
1790   unsigned int shift;
1791   uintmax_t x;
1792
1793   /* Ensures the remainder fits in an uintmax_t.  */
1794   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1795
1796   if (nh == 0)
1797     return isqrt (nl);
1798
1799   count_leading_zeros (shift, nh);
1800   shift &= ~1;
1801
1802   /* Make x > sqrt (n).  */
1803   x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1804   x <<= (W_TYPE_SIZE - shift) / 2;
1805
1806   /* Do we need more than one iteration?  */
1807   for (;;)
1808     {
1809       MAYBE_UNUSED uintmax_t r;
1810       uintmax_t q, y;
1811       udiv_qrnnd (q, r, nh, nl, x);
1812       y = (x + q) / 2;
1813
1814       if (y >= x)
1815         {
1816           uintmax_t hi, lo;
1817           umul_ppmm (hi, lo, x + 1, x + 1);
1818           assert (gt2 (hi, lo, nh, nl));
1819
1820           umul_ppmm (hi, lo, x, x);
1821           assert (ge2 (nh, nl, hi, lo));
1822           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1823           assert (hi == 0);
1824
1825           return x;
1826         }
1827
1828       x = y;
1829     }
1830 }
1831
1832 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N.  */
1833 # define MAGIC64 0x0202021202030213ULL
1834 # define MAGIC63 0x0402483012450293ULL
1835 # define MAGIC65 0x218a019866014613ULL
1836 # define MAGIC11 0x23b
1837
1838 /* Return the square root if the input is a square, otherwise 0.  */
1839 ATTRIBUTE_CONST
1840 static uintmax_t
1841 is_square (uintmax_t x)
1842 {
1843   /* Uses the tests suggested by Cohen.  Excludes 99% of the non-squares before
1844      computing the square root.  */
1845   if (((MAGIC64 >> (x & 63)) & 1)
1846       && ((MAGIC63 >> (x % 63)) & 1)
1847       /* Both 0 and 64 are squares mod (65).  */
1848       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1849       && ((MAGIC11 >> (x % 11) & 1)))
1850     {
1851       uintmax_t r = isqrt (x);
1852       if (r * r == x)
1853         return r;
1854     }
1855   return 0;
1856 }
1857
1858 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1859 static const unsigned short invtab[0x81] =
1860   {
1861     0x200,
1862     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1863     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1864     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1865     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1866     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1867     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1868     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1869     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1870     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1871     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1872     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1873     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1874     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1875     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1876     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1877     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1878   };
1879
1880 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1881    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1882 # define div_smallq(q, r, u, d)                                          \
1883   do {                                                                  \
1884     if ((u) / 0x40 < (d))                                               \
1885       {                                                                 \
1886         int _cnt;                                                       \
1887         uintmax_t _dinv, _mask, _q, _r;                                 \
1888         count_leading_zeros (_cnt, (d));                                \
1889         _r = (u);                                                       \
1890         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1891           {                                                             \
1892             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1893             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1894           }                                                             \
1895         else                                                            \
1896           {                                                             \
1897             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1898             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1899           }                                                             \
1900         _r -= _q * (d);                                                 \
1901                                                                         \
1902         _mask = -(uintmax_t) (_r >= (d));                               \
1903         (r) = _r - (_mask & (d));                                       \
1904         (q) = _q - _mask;                                               \
1905         assert ((q) * (d) + (r) == u);                                  \
1906       }                                                                 \
1907     else                                                                \
1908       {                                                                 \
1909         uintmax_t _q = (u) / (d);                                       \
1910         (r) = (u) - _q * (d);                                           \
1911         (q) = _q;                                                       \
1912       }                                                                 \
1913   } while (0)
1914
1915 /* Notes: Example N = 22117019.  After first phase we find Q1 = 6314, Q
1916    = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1917    with 3025 = 55^2.
1918
1919    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1920    representing G_0 = (-55, 2 * 4652, 8653).
1921
1922    In the notation of the paper:
1923
1924    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1925
1926    Put
1927
1928      t_0 = floor([q_0 + R_0] / S0) = 1
1929      R_1 = t_0 * S_0 - R_0 = 4001
1930      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1931 */
1932
1933 /* Multipliers, in order of efficiency:
1934    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1935    0.7317  3*5*7    =  105 = 1
1936    0.7820  3*5*11   =  165 = 1
1937    0.7872  3*5      =   15 = 3
1938    0.8101  3*7*11   =  231 = 3
1939    0.8155  3*7      =   21 = 1
1940    0.8284  5*7*11   =  385 = 1
1941    0.8339  5*7      =   35 = 3
1942    0.8716  3*11     =   33 = 1
1943    0.8774  3        =    3 = 3
1944    0.8913  5*11     =   55 = 3
1945    0.8972  5        =    5 = 1
1946    0.9233  7*11     =   77 = 1
1947    0.9295  7        =    7 = 3
1948    0.9934  11       =   11 = 3
1949 */
1950 # define QUEUE_SIZE 50
1951 #endif
1952
1953 #if STAT_SQUFOF
1954 # define Q_FREQ_SIZE 50
1955 /* Element 0 keeps the total */
1956 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1957 #endif
1958
1959 #if USE_SQUFOF
1960 /* Return true on success.  Expected to fail only for numbers
1961    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit.  */
1962 static bool
1963 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1964 {
1965   /* Uses algorithm and notation from
1966
1967      SQUARE FORM FACTORIZATION
1968      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1969
1970      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1971    */
1972
1973   static const unsigned int multipliers_1[] =
1974     { /* = 1 (mod 4) */
1975       105, 165, 21, 385, 33, 5, 77, 1, 0
1976     };
1977   static const unsigned int multipliers_3[] =
1978     { /* = 3 (mod 4) */
1979       1155, 15, 231, 35, 3, 55, 7, 11, 0
1980     };
1981
1982   const unsigned int *m;
1983
1984   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1985
1986   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1987     return false;
1988
1989   uintmax_t sqrt_n = isqrt2 (n1, n0);
1990
1991   if (n0 == sqrt_n * sqrt_n)
1992     {
1993       uintmax_t p1, p0;
1994
1995       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1996       assert (p0 == n0);
1997
1998       if (n1 == p1)
1999         {
2000           if (prime_p (sqrt_n))
2001             factor_insert_multiplicity (factors, sqrt_n, 2);
2002           else
2003             {
2004               struct factors f;
2005
2006               f.nfactors = 0;
2007               if (!factor_using_squfof (0, sqrt_n, &f))
2008                 {
2009                   /* Try pollard rho instead */
2010                   factor_using_pollard_rho (sqrt_n, 1, &f);
2011                 }
2012               /* Duplicate the new factors */
2013               for (unsigned int i = 0; i < f.nfactors; i++)
2014                 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
2015             }
2016           return true;
2017         }
2018     }
2019
2020   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2021   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2022        *m; m++)
2023     {
2024       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2025       unsigned int i;
2026       unsigned int mu = *m;
2027       unsigned int qpos = 0;
2028
2029       assert (mu * n0 % 4 == 3);
2030
2031       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2032          get \Delta = 4 mu * n, and the paper's \mu is 2 mu.  As far as
2033          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2034          mu^3 < n.
2035
2036          However, this seems insufficient: With n = 37243139 and mu =
2037          105, we get a trivial factor, from the square 38809 = 197^2,
2038          without any corresponding Q earlier in the iteration.
2039
2040          Requiring 64 mu^3 < n seems sufficient.  */
2041       if (n1 == 0)
2042         {
2043           if ((uintmax_t) mu * mu * mu >= n0 / 64)
2044             continue;
2045         }
2046       else
2047         {
2048           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2049             continue;
2050         }
2051       umul_ppmm (Dh, Dl, n0, mu);
2052       Dh += n1 * mu;
2053
2054       assert (Dl % 4 != 1);
2055       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2056
2057       S = isqrt2 (Dh, Dl);
2058
2059       Q1 = 1;
2060       P = S;
2061
2062       /* Square root remainder fits in one word, so ignore high part.  */
2063       Q = Dl - P * P;
2064       /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))?  */
2065       L = isqrt (2 * S);
2066       B = 2 * L;
2067       L1 = mu * 2 * L;
2068
2069       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2070          4 D.  */
2071
2072       for (i = 0; i <= B; i++)
2073         {
2074           uintmax_t q, P1, t, rem;
2075
2076           div_smallq (q, rem, S + P, Q);
2077           P1 = S - rem; /* P1 = q*Q - P */
2078
2079           IF_LINT (assert (q > 0 && Q > 0));
2080
2081 # if STAT_SQUFOF
2082           q_freq[0]++;
2083           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2084 # endif
2085
2086           if (Q <= L1)
2087             {
2088               uintmax_t g = Q;
2089
2090               if ((Q & 1) == 0)
2091                 g /= 2;
2092
2093               g /= gcd_odd (g, mu);
2094
2095               if (g <= L)
2096                 {
2097                   if (qpos >= QUEUE_SIZE)
2098                     die (EXIT_FAILURE, 0, _("squfof queue overflow"));
2099                   queue[qpos].Q = g;
2100                   queue[qpos].P = P % g;
2101                   qpos++;
2102                 }
2103             }
2104
2105           /* I think the difference can be either sign, but mod
2106              2^W_TYPE_SIZE arithmetic should be fine.  */
2107           t = Q1 + q * (P - P1);
2108           Q1 = Q;
2109           Q = t;
2110           P = P1;
2111
2112           if ((i & 1) == 0)
2113             {
2114               uintmax_t r = is_square (Q);
2115               if (r)
2116                 {
2117                   for (unsigned int j = 0; j < qpos; j++)
2118                     {
2119                       if (queue[j].Q == r)
2120                         {
2121                           if (r == 1)
2122                             /* Traversed entire cycle.  */
2123                             goto next_multiplier;
2124
2125                           /* Need the absolute value for divisibility test.  */
2126                           if (P >= queue[j].P)
2127                             t = P - queue[j].P;
2128                           else
2129                             t = queue[j].P - P;
2130                           if (t % r == 0)
2131                             {
2132                               /* Delete entries up to and including entry
2133                                  j, which matched.  */
2134                               memmove (queue, queue + j + 1,
2135                                        (qpos - j - 1) * sizeof (queue[0]));
2136                               qpos -= (j + 1);
2137                             }
2138                           goto next_i;
2139                         }
2140                     }
2141
2142                   /* We have found a square form, which should give a
2143                      factor.  */
2144                   Q1 = r;
2145                   assert (S >= P); /* What signs are possible?  */
2146                   P += r * ((S - P) / r);
2147
2148                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2149                      for the case D = 2N.  */
2150                   /* Compute Q = (D - P*P) / Q1, but we need double
2151                      precision.  */
2152                   uintmax_t hi, lo;
2153                   umul_ppmm (hi, lo, P, P);
2154                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2155                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2156                   assert (rem == 0);
2157
2158                   for (;;)
2159                     {
2160                       /* Note: There appears to by a typo in the paper,
2161                          Step 4a in the algorithm description says q <--
2162                          floor([S+P]/\hat Q), but looking at the equations
2163                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2164                          (In this code, \hat Q is Q1).  */
2165                       div_smallq (q, rem, S + P, Q);
2166                       P1 = S - rem;     /* P1 = q*Q - P */
2167
2168 # if STAT_SQUFOF
2169                       q_freq[0]++;
2170                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2171 # endif
2172                       if (P == P1)
2173                         break;
2174                       t = Q1 + q * (P - P1);
2175                       Q1 = Q;
2176                       Q = t;
2177                       P = P1;
2178                     }
2179
2180                   if ((Q & 1) == 0)
2181                     Q /= 2;
2182                   Q /= gcd_odd (Q, mu);
2183
2184                   assert (Q > 1 && (n1 || Q < n0));
2185
2186                   if (prime_p (Q))
2187                     factor_insert (factors, Q);
2188                   else if (!factor_using_squfof (0, Q, factors))
2189                     factor_using_pollard_rho (Q, 2, factors);
2190
2191                   divexact_21 (n1, n0, n1, n0, Q);
2192
2193                   if (prime2_p (n1, n0))
2194                     factor_insert_large (factors, n1, n0);
2195                   else
2196                     {
2197                       if (!factor_using_squfof (n1, n0, factors))
2198                         {
2199                           if (n1 == 0)
2200                             factor_using_pollard_rho (n0, 1, factors);
2201                           else
2202                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2203                         }
2204                     }
2205
2206                   return true;
2207                 }
2208             }
2209         next_i:;
2210         }
2211     next_multiplier:;
2212     }
2213   return false;
2214 }
2215 #endif
2216
2217 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2218    results in FACTORS.  */
2219 static void
2220 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2221 {
2222   factors->nfactors = 0;
2223   factors->plarge[1] = 0;
2224
2225   if (t1 == 0 && t0 < 2)
2226     return;
2227
2228   t0 = factor_using_division (&t1, t1, t0, factors);
2229
2230   if (t1 == 0 && t0 < 2)
2231     return;
2232
2233   if (prime2_p (t1, t0))
2234     factor_insert_large (factors, t1, t0);
2235   else
2236     {
2237 #if USE_SQUFOF
2238       if (factor_using_squfof (t1, t0, factors))
2239         return;
2240 #endif
2241
2242       if (t1 == 0)
2243         factor_using_pollard_rho (t0, 1, factors);
2244       else
2245         factor_using_pollard_rho2 (t1, t0, 1, factors);
2246     }
2247 }
2248
2249 /* Use Pollard-rho to compute the prime factors of
2250    arbitrary-precision T, and put the results in FACTORS.  */
2251 static void
2252 mp_factor (mpz_t t, struct mp_factors *factors)
2253 {
2254   mp_factor_init (factors);
2255
2256   if (mpz_sgn (t) != 0)
2257     {
2258       mp_factor_using_division (t, factors);
2259
2260       if (mpz_cmp_ui (t, 1) != 0)
2261         {
2262           devmsg ("[is number prime?] ");
2263           if (mp_prime_p (t))
2264             mp_factor_insert (factors, t);
2265           else
2266             mp_factor_using_pollard_rho (t, 1, factors);
2267         }
2268     }
2269 }
2270
2271 static strtol_error
2272 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2273 {
2274   unsigned int lo_carry;
2275   uintmax_t hi = 0, lo = 0;
2276
2277   strtol_error err = LONGINT_INVALID;
2278
2279   /* Initial scan for invalid digits.  */
2280   char const *p = s;
2281   for (;;)
2282     {
2283       unsigned int c = *p++;
2284       if (c == 0)
2285         break;
2286
2287       if (UNLIKELY (!ISDIGIT (c)))
2288         {
2289           err = LONGINT_INVALID;
2290           break;
2291         }
2292
2293       err = LONGINT_OK;           /* we've seen at least one valid digit */
2294     }
2295
2296   while (err == LONGINT_OK)
2297     {
2298       unsigned int c = *s++;
2299       if (c == 0)
2300         break;
2301
2302       c -= '0';
2303
2304       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2305         {
2306           err = LONGINT_OVERFLOW;
2307           break;
2308         }
2309       hi = 10 * hi;
2310
2311       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2312       lo_carry += 10 * lo < 2 * lo;
2313
2314       lo = 10 * lo;
2315       lo += c;
2316
2317       lo_carry += lo < c;
2318       hi += lo_carry;
2319       if (UNLIKELY (hi < lo_carry))
2320         {
2321           err = LONGINT_OVERFLOW;
2322           break;
2323         }
2324     }
2325
2326   *hip = hi;
2327   *lop = lo;
2328
2329   return err;
2330 }
2331
2332 /* Structure and routines for buffering and outputting full lines,
2333    to support parallel operation efficiently.  */
2334 static struct lbuf_
2335 {
2336   char *buf;
2337   char *end;
2338 } lbuf;
2339
2340 /* 512 is chosen to give good performance,
2341    and also is the max guaranteed size that
2342    consumers can read atomically through pipes.
2343    Also it's big enough to cater for max line length
2344    even with 128 bit uintmax_t.  */
2345 #define FACTOR_PIPE_BUF 512
2346
2347 static void
2348 lbuf_alloc (void)
2349 {
2350   if (lbuf.buf)
2351     return;
2352
2353   /* Double to ensure enough space for
2354      previous numbers + next number.  */
2355   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2356   lbuf.end = lbuf.buf;
2357 }
2358
2359 /* Write complete LBUF to standard output.  */
2360 static void
2361 lbuf_flush (void)
2362 {
2363   size_t size = lbuf.end - lbuf.buf;
2364   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2365     die (EXIT_FAILURE, errno, "%s", _("write error"));
2366   lbuf.end = lbuf.buf;
2367 }
2368
2369 /* Add a character C to LBUF and if it's a newline
2370    and enough bytes are already buffered,
2371    then write atomically to standard output.  */
2372 static void
2373 lbuf_putc (char c)
2374 {
2375   *lbuf.end++ = c;
2376
2377   if (c == '\n')
2378     {
2379       size_t buffered = lbuf.end - lbuf.buf;
2380
2381       /* Provide immediate output for interactive use.  */
2382       static int line_buffered = -1;
2383       if (line_buffered == -1)
2384         line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2385       if (line_buffered)
2386         lbuf_flush ();
2387       else if (buffered >= FACTOR_PIPE_BUF)
2388         {
2389           /* Write output in <= PIPE_BUF chunks
2390              so consumers can read atomically.  */
2391           char const *tend = lbuf.end;
2392
2393           /* Since a umaxint_t's factors must fit in 512
2394              we're guaranteed to find a newline here.  */
2395           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2396           while (*--tlend != '\n');
2397           tlend++;
2398
2399           lbuf.end = tlend;
2400           lbuf_flush ();
2401
2402           /* Buffer the remainder.  */
2403           memcpy (lbuf.buf, tlend, tend - tlend);
2404           lbuf.end = lbuf.buf + (tend - tlend);
2405         }
2406     }
2407 }
2408
2409 /* Buffer an int to the internal LBUF.  */
2410 static void
2411 lbuf_putint (uintmax_t i, size_t min_width)
2412 {
2413   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2414   char const *umaxstr = umaxtostr (i, buf);
2415   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2416   size_t z = width;
2417
2418   for (; z < min_width; z++)
2419     *lbuf.end++ = '0';
2420
2421   memcpy (lbuf.end, umaxstr, width);
2422   lbuf.end += width;
2423 }
2424
2425 static void
2426 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2427 {
2428   uintmax_t q, r;
2429
2430   if (t1 == 0)
2431     lbuf_putint (t0, 0);
2432   else
2433     {
2434       /* Use very plain code here since it seems hard to write fast code
2435          without assuming a specific word size.  */
2436       q = t1 / 1000000000;
2437       r = t1 % 1000000000;
2438       udiv_qrnnd (t0, r, r, t0, 1000000000);
2439       print_uintmaxes (q, t0);
2440       lbuf_putint (r, 9);
2441     }
2442 }
2443
2444 /* Single-precision factoring */
2445 static void
2446 print_factors_single (uintmax_t t1, uintmax_t t0)
2447 {
2448   struct factors factors;
2449
2450   print_uintmaxes (t1, t0);
2451   lbuf_putc (':');
2452
2453   factor (t1, t0, &factors);
2454
2455   for (unsigned int j = 0; j < factors.nfactors; j++)
2456     for (unsigned int k = 0; k < factors.e[j]; k++)
2457       {
2458         lbuf_putc (' ');
2459         print_uintmaxes (0, factors.p[j]);
2460       }
2461
2462   if (factors.plarge[1])
2463     {
2464       lbuf_putc (' ');
2465       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2466     }
2467
2468   lbuf_putc ('\n');
2469 }
2470
2471 /* Emit the factors of the indicated number.  If we have the option of using
2472    either algorithm, we select on the basis of the length of the number.
2473    For longer numbers, we prefer the MP algorithm even if the native algorithm
2474    has enough digits, because the algorithm is better.  The turnover point
2475    depends on the value.  */
2476 static bool
2477 print_factors (char const *input)
2478 {
2479   /* Skip initial spaces and '+'.  */
2480   char const *str = input;
2481   while (*str == ' ')
2482     str++;
2483   str += *str == '+';
2484
2485   uintmax_t t1, t0;
2486
2487   /* Try converting the number to one or two words.  If it fails, use GMP or
2488      print an error message.  The 2nd condition checks that the most
2489      significant bit of the two-word number is clear, in a typesize neutral
2490      way.  */
2491   strtol_error err = strto2uintmax (&t1, &t0, str);
2492
2493   switch (err)
2494     {
2495     case LONGINT_OK:
2496       if (((t1 << 1) >> 1) == t1)
2497         {
2498           devmsg ("[using single-precision arithmetic] ");
2499           print_factors_single (t1, t0);
2500           return true;
2501         }
2502       break;
2503
2504     case LONGINT_OVERFLOW:
2505       /* Try GMP.  */
2506       break;
2507
2508     default:
2509       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2510       return false;
2511     }
2512
2513   devmsg ("[using arbitrary-precision arithmetic] ");
2514   mpz_t t;
2515   struct mp_factors factors;
2516
2517   mpz_init_set_str (t, str, 10);
2518
2519   mpz_out_str (stdout, 10, t);
2520   putchar (':');
2521   mp_factor (t, &factors);
2522
2523   for (unsigned int j = 0; j < factors.nfactors; j++)
2524     for (unsigned int k = 0; k < factors.e[j]; k++)
2525       {
2526         putchar (' ');
2527         mpz_out_str (stdout, 10, factors.p[j]);
2528       }
2529
2530   mp_factor_clear (&factors);
2531   mpz_clear (t);
2532   putchar ('\n');
2533   fflush (stdout);
2534   return true;
2535 }
2536
2537 void
2538 usage (int status)
2539 {
2540   if (status != EXIT_SUCCESS)
2541     emit_try_help ();
2542   else
2543     {
2544       printf (_("\
2545 Usage: %s [NUMBER]...\n\
2546   or:  %s OPTION\n\
2547 "),
2548               program_name, program_name);
2549       fputs (_("\
2550 Print the prime factors of each specified integer NUMBER.  If none\n\
2551 are specified on the command line, read them from standard input.\n\
2552 \n\
2553 "), stdout);
2554       fputs (HELP_OPTION_DESCRIPTION, stdout);
2555       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2556       emit_ancillary_info (PROGRAM_NAME);
2557     }
2558   exit (status);
2559 }
2560
2561 static bool
2562 do_stdin (void)
2563 {
2564   bool ok = true;
2565   token_buffer tokenbuffer;
2566
2567   init_tokenbuffer (&tokenbuffer);
2568
2569   while (true)
2570     {
2571       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2572                                        &tokenbuffer);
2573       if (token_length == (size_t) -1)
2574         break;
2575       ok &= print_factors (tokenbuffer.buffer);
2576     }
2577   free (tokenbuffer.buffer);
2578
2579   return ok;
2580 }
2581
2582 int
2583 main (int argc, char **argv)
2584 {
2585   initialize_main (&argc, &argv);
2586   set_program_name (argv[0]);
2587   setlocale (LC_ALL, "");
2588   bindtextdomain (PACKAGE, LOCALEDIR);
2589   textdomain (PACKAGE);
2590
2591   lbuf_alloc ();
2592   atexit (close_stdout);
2593   atexit (lbuf_flush);
2594
2595   int c;
2596   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2597     {
2598       switch (c)
2599         {
2600         case DEV_DEBUG_OPTION:
2601           dev_debug = true;
2602           break;
2603
2604         case_GETOPT_HELP_CHAR;
2605
2606         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2607
2608         default:
2609           usage (EXIT_FAILURE);
2610         }
2611     }
2612
2613 #if STAT_SQUFOF
2614   memset (q_freq, 0, sizeof (q_freq));
2615 #endif
2616
2617   bool ok;
2618   if (argc <= optind)
2619     ok = do_stdin ();
2620   else
2621     {
2622       ok = true;
2623       for (int i = optind; i < argc; i++)
2624         if (! print_factors (argv[i]))
2625           ok = false;
2626     }
2627
2628 #if STAT_SQUFOF
2629   if (q_freq[0] > 0)
2630     {
2631       double acc_f;
2632       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2633       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2634         {
2635           double f = (double) q_freq[i] / q_freq[0];
2636           acc_f += f;
2637           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2638                   100.0 * f, 100.0 * acc_f);
2639         }
2640     }
2641 #endif
2642
2643   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2644 }